In [1]:
import pandas as pd

# Path to your CSV
path = "/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/select_repos/webapps_large_scan.csv"

# Read
df = pd.read_csv(path)

# Stable sort so rows with type == 'webapp' come first, keep all rows/columns, preserve original order within groups
is_webapp = df["type"].astype(str).str.strip().str.lower().eq("webapp").astype(int)
df_sorted = df.assign(_is_webapp=is_webapp).sort_values(
    by="_is_webapp", ascending=False, kind="mergesort"
).drop(columns="_is_webapp")

# Write back to the same file
df_sorted.to_csv(path, index=False)


In [6]:
import pandas as pd

# Paths
path_first = "/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/select_repos/webapps_large_scan.csv"
path_second = "/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/select_repos/webapps_large_scan_second_try.csv"

# Read both files
df_first = pd.read_csv(path_first)
df_second = pd.read_csv(path_second)

# Get sets of repo names
first_repos = set(df_first["repo_full_name"].dropna().str.strip())
second_repos = set(df_second["repo_full_name"].dropna().str.strip())

# Find new ones
new_repos = second_repos - first_repos

# Print results
print(f"Number of new entries in the second file: {len(new_repos)}")
print("\nSample of new repo names:")
print(list(new_repos)[:10])  # show up to 10

# Find repos missing in the second file
missing_repos = first_repos - second_repos

# Print results
print(f"Number of entries in the first file but not in the second: {len(missing_repos)}")
print("\nSample of missing repo names:")
#print(list(missing_repos)[:10])  # show up to 10


Number of new entries in the second file: 0

Sample of new repo names:
[]
Number of entries in the first file but not in the second: 35

Sample of missing repo names:


In [13]:
import pandas as pd

# Paths
path_first = "/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/select_repos/webapps_large_scan.csv"
path_second = "/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/select_repos/webapps_large_scan_second_try.csv"

# Read both files
df_first = pd.read_csv(path_first)
df_second = pd.read_csv(path_second)

# Normalize repo names for reliable comparison
df_first["repo_full_name"] = df_first["repo_full_name"].str.strip()
df_second["repo_full_name"] = df_second["repo_full_name"].str.strip()

# 1️⃣ Count rows in the second file where type == 'webapp'
webapps_in_second = (df_second["type"].astype(str).str.lower() == "webapp").sum()

# 2️⃣ Find rows in the first file that are NOT in the second and have type == 'webapp'
missing_repos = set(df_first["repo_full_name"]) - set(df_second["repo_full_name"])
missing_webapps = df_first[df_first["repo_full_name"].isin(missing_repos) & (df_first["type"].astype(str).str.lower() == "webapp")]

# Results
print(f"Number of 'webapp' rows in the second file: {webapps_in_second}")
print(f"Number of 'webapp' rows missing from the second file: {len(missing_webapps)}")
print(missing_webapps["repo_full_name"].to_list())


Number of 'webapp' rows in the second file: 72
Number of 'webapp' rows missing from the second file: 2
['BerriAI/litellm', 'PrefectHQ/prefect']


In [14]:
import pandas as pd

# Path to your file
path_second = "/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/select_repos/webapps_large_scan_second_try.csv"

# Read CSV
df = pd.read_csv(path_second)

# Columns to analyze
cols = ["contributors", "stars", "python_pct", "commits_last_90d"]

# Convert columns to numeric (in case they were read as strings)
df[cols] = df[cols].apply(pd.to_numeric, errors="coerce")

# Compute stats
summary = df[cols].agg(["min", "max", "mean"]).T.round(2)

# Display result
print("📊 Summary statistics for numeric columns:")
print(summary)


📊 Summary statistics for numeric columns:
                      min       max      mean
contributors        10.00    3910.0    307.30
stars             5104.00  178887.0  20963.78
python_pct          50.62     100.0     83.32
commits_last_90d    10.00    4793.0    268.78


In [16]:
import numpy as np

q50 = contributors.median()
q75 = contributors.quantile(0.75)
q90 = contributors.quantile(0.9)
print(f"Median: {q50}, 75th percentile: {q75}, 90th percentile: {q90}")


Median: 157.5, 75th percentile: 324.0, 90th percentile: 701.6000000000008


In [17]:
import pandas as pd

# Path to your second file
path_second = "/Users/danarapp/Desktop/energypattern-keyword-search/processing_pipeline/select_repos/webapps_large_scan_second_try.csv"

# Read CSV
df = pd.read_csv(path_second)

# Ensure numeric columns are actually numeric
numeric_cols = ["stars", "commits_last_90d", "python_pct"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# Compute percentiles
percentiles = [0.5, 0.75, 0.9]
summary = df[numeric_cols].quantile(percentiles).T
summary.columns = ["Median", "75th percentile", "90th percentile"]
summary = summary.round(2)

print("📊 Percentile Summary (Stars, Commits, Python %):")
print(summary)


📊 Percentile Summary (Stars, Commits, Python %):
                   Median  75th percentile  90th percentile
stars             12553.0         23412.00         42934.70
commits_last_90d    107.0           230.25           671.10
python_pct           88.5            98.44            99.85
