In [None]:
curl "https://api.github.com/search/repositories?q=language:python+stars:>100&per_page=10&page=85"

curl "https://api.github.com/search/repositories?q=language:python+stars:%3E200+fork:false+pushed:%3E=2023-01-01&per_page=100&page=1"




In [None]:
 
import os
os.environ["GITHUB_ACCESS_TOKEN"] = GITHUB_ACCESS_TOKEN
os.environ["HUGGINGFACE_ACCESS_TOKEN"] = HUGGINGFACE_ACCESS_TOKEN

In [None]:
import requests
import pandas as pd
import time
import urllib.parse
import os

# Constants
GITHUB_API_URL = "https://api.github.com"
PER_PAGE = 100  # Maximum allowed by GitHub API
ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN")  # Set your GitHub access token
TOPICS = ["data-science", "machine-learning", "ai", "api", "python"]  # Replace with your topics ["nlp"]
MIN_STAR = 100  # Define your minimum star count threshold

# Headers for authentication
headers = {}
if ACCESS_TOKEN:
    headers = {"Authorization": f"token {ACCESS_TOKEN}"}

# Function to collect repository data
def collect_repo_data():
    repos_data = []
    for topic in TOPICS:
        print(f"Processing topic: {topic}")
        stars_upper_limit = 100000  # Start with a very high star count
        while True:
            page = 1
            first_page = True
            total_pages = 1
            last_star_count = None  # To keep track of the last repository's star count
            while page <= total_pages:
                query = f"language:python stars:<{stars_upper_limit} fork:false topic:{topic} pushed:>=2023-01-01"
                params = {
                    "q": query,
                    "sort": "stars",
                    "order": "desc",
                    "per_page": PER_PAGE,
                    "page": page
                }

                response = requests.get(f"{GITHUB_API_URL}/search/repositories", headers=headers, params=params)
                if response.status_code != 200:
                    print(f"Failed to fetch repositories: {response.status_code}")
                    print(response.json())
                    break

                data = response.json()
                if first_page:
                    total_count = data.get('total_count', 0)
                    total_pages = min((total_count + PER_PAGE - 1) // PER_PAGE, 10)  # API caps at 1000 results
                    print(f"Total repositories found: {total_count}. Total pages: {total_pages}")
                    first_page = False

                items = data.get("items", [])
                if not items:
                    print(f"No repositories found on page {page}")
                    break

                for repo in items:
                    star_count = repo["stargazers_count"]
                    # Stop if the star count is below MIN_STAR
                    if star_count < MIN_STAR:
                        print(f"Reached repositories with stars less than {MIN_STAR}. Moving to next topic.")
                        break
                    repo_data = {
                        "repo_id": repo["id"],
                        "repo_name": repo["name"],
                        "full_name": repo["full_name"],
                        "owner_login": repo["owner"]["login"],
                        "repo_url": repo["html_url"],
                        "description": repo["description"],
                        "primary_language": repo["language"],
                        "topics": repo.get("topics", []),
                        "license": repo["license"]["name"] if repo["license"] else None,
                        "created_at": repo["created_at"],
                        "updated_at": repo["updated_at"],
                        "pushed_at": repo["pushed_at"],
                        "size": repo["size"],
                        "stargazers_count": star_count,
                        "watchers_count": repo["watchers_count"],
                        "forks_count": repo["forks_count"],
                        "open_issues_count": repo["open_issues_count"],
                        "default_branch": repo["default_branch"],
                        "score": repo.get("score"),
                        "is_fork": repo["fork"],
                        "visibility": repo.get("visibility", "public"),
                        "topic": topic  # Include the topic
                    }
                    repos_data.append(repo_data)
                    last_star_count = star_count
                if star_count < MIN_STAR:
                    break
        

                print(f"Completed page {page}/{total_pages} for topic {topic} with stars upper limit {stars_upper_limit}")
                page += 1
                time.sleep(1.5)  # Sleep to respect API rate limits
                
            if star_count < MIN_STAR:
                break

            # If total_count is less than 1000, we've retrieved all repositories for this stars_upper_limit
            if total_count < 1000:
                print(f"All repositories fetched for stars less than {stars_upper_limit}.")
                break
            elif last_star_count is not None:
                # Update the stars_upper_limit to be less than the last star count
                stars_upper_limit = last_star_count - 1
                print(f"Adjusting stars upper limit to {stars_upper_limit} and continuing.")
            else:
                # No more repositories to fetch
                break
        print(f"Finished processing topic: {topic}")

    return repos_data


repos_data = collect_repo_data()

# Save repos_data to CSV
repos_df = pd.DataFrame(repos_data)
repos_df = repos_df.drop_duplicates(subset=["full_name"])
repos_df.to_csv("repositories.csv", index=False)

Processing topic: data-science
Total repositories found: 4063. Total pages: 10
Completed page 1/10 for topic data-science with stars upper limit 100000
Completed page 2/10 for topic data-science with stars upper limit 100000
Completed page 3/10 for topic data-science with stars upper limit 100000
Completed page 4/10 for topic data-science with stars upper limit 100000
Reached repositories with stars less than 100. Moving to next topic.
Finished processing topic: data-science
Processing topic: machine-learning
Total repositories found: 15624. Total pages: 10
Completed page 1/10 for topic machine-learning with stars upper limit 100000
Completed page 2/10 for topic machine-learning with stars upper limit 100000
Completed page 3/10 for topic machine-learning with stars upper limit 100000
Completed page 4/10 for topic machine-learning with stars upper limit 100000
Completed page 5/10 for topic machine-learning with stars upper limit 100000
Completed page 6/10 for topic machine-learning with

In [12]:
import pandas as pd
import requests
import time
import os
from tqdm import tqdm
ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN")
GITHUB_API_URL = "https://api.github.com"
headers = {}
if ACCESS_TOKEN:
    headers = {"Authorization": f"token {ACCESS_TOKEN}"}

# Function to fetch READMEs for a list of repositories
def fetch_readmes(repo_full_names):
    readme_errors = []
    if not os.path.exists("readmes"):
        os.makedirs("readmes")

    for repo_full_name in tqdm(repo_full_names):

        #skip if readme already exists
        if os.path.exists(f"readmes/{repo_full_name.replace('/', '_')}.md"):
            continue


        readme_url = f"{GITHUB_API_URL}/repos/{repo_full_name}/readme"

        # Create a copy of headers and add 'Accept' header
        readme_headers = headers.copy()
        readme_headers['Accept'] = 'application/vnd.github.v3.raw'

        readme_resp = requests.get(readme_url, headers=readme_headers)

        if readme_resp.status_code == 200:
            # Save README to a file named after the repo full name
            # Replace slashes in the full name to make it a valid filename
            safe_name = repo_full_name.replace("/", "_")
            with open(f"readmes/{safe_name}.md", "w", encoding="utf-8") as f:
                f.write(readme_resp.text)
        else:
            readme_errors.append((repo_full_name, readme_resp.status_code))
            continue  # Skip repositories without README

        time.sleep(0.2)  # Sleep to respect API rate limits

    return readme_errors

repos_df = pd.read_csv("repositories.csv")

In [13]:
fetch_readmes(repos_df.full_name.values.tolist())

100%|██████████| 9483/9483 [44:35<00:00,  3.54it/s]  


[('sksalahuddin2828/Pandas_Numpy_Matplotlib_Plotly', 404),
 ('oliveirabruno01/babyagi-asi', 404),
 ('engineer-man/youtube', 404),
 ('artemonsh/fastapi_course', 404),
 ('BotoX/xiaomi-m365-firmware-patcher', 404),
 ('georgemarshall/django-cryptography', 403),
 ('django-cas-ng/django-cas-ng', 403),
 ('nazrulworld/fhir.resources', 403),
 ('schmiph2/pysepm', 403),
 ('tuxity/insta-unfollower', 403),
 ('szastupov/aiotg', 403),
 ('Aura-healthcare/hrv-analysis', 403),
 ('xxyzz/WordDumb', 403),
 ('N4S4/synology-api', 403),
 ('gem/oq-engine', 403),
 ('odysseusmax/tg-index', 403),
 ('mikeroyal/ISP-Guide', 403),
 ('the4thdoctor/pg_chameleon', 403),
 ('pymtl/pymtl3', 403),
 ('FreeLanguageTools/vocabsieve', 403),
 ('django-cms/django-classy-tags', 403),
 ('vanheeringen-lab/genomepy', 403),
 ('vertica/vertica-python', 403),
 ('wallds/NoVmpy', 403),
 ('p0dalirius/ipsourcebypass', 403),
 ('nottheswimmer/pytago', 403),
 ('ultrasecurity/TeleKiller', 403),
 ('webrecorder/warcio', 403),
 ('lschoe/mpyc', 403

In [1]:
#read csv
import pandas as pd
repos_df = pd.read_csv("repositories.csv")

readme_text = []

#for every repo in the csv, get the readme
for index,row in repos_df.iterrows():
    repo_full_name = row["full_name"]
    safe_name = repo_full_name.replace("/", "_")

    #open the readme file
    try:
        with open(f"readmes/{safe_name}.md", "r", encoding="utf-8") as f:
            readme = f.read()
            readme_text.append(readme)
    except:
        readme_text.append(None)


In [2]:
repos_df["readme_text"] = readme_text
repos_df = repos_df.dropna(subset=["readme_text"])

In [3]:
import re
import numpy as np

#count the occurences of "^- " of each readme
lists_count = repos_df["readme_text"].str.count(r"^- ", flags=re.MULTILINE)
#log then normalize it
lists_count = np.log1p(1 + lists_count)
lists_count = (lists_count - lists_count.min()) / (lists_count.max() - lists_count.min())
repos_df["lists_count"] = lists_count

#count hypperlinks
hyperlinks_count = repos_df["readme_text"].str.count(r"http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", flags=re.MULTILINE)
#log then normalize it
hyperlinks_count = np.log1p(1 + hyperlinks_count)
hyperlinks_count = (hyperlinks_count - hyperlinks_count.min()) / (hyperlinks_count.max() - hyperlinks_count.min())
repos_df["hyperlinks_count"] = hyperlinks_count

#number of images ending in .png, .jpg, .jpeg, .gif (url or local path) in .md ir .rst
#[image] (https://example.com/image.png)
#image:: https://example.com/image.png
#<.*src.*=.*\.(png|jpg|jpeg|gif|svg).*>
images_count = repos_df["readme_text"].str.count(r"!\[.*\]\(.*\.(png|jpg|jpeg|gif|svg)\)", flags=re.MULTILINE) + \
            repos_df["readme_text"].str.count(r"image::.*\.(png|jpg|jpeg|gif|svg)", flags=re.MULTILINE) + \
            repos_df["readme_text"].str.count(r"<.*src.*=.*\.(png|jpg|jpeg|gif|svg).*>", flags=re.MULTILINE)
#log then normalize it
images_count = np.log1p(1 + images_count)
images_count = (images_count - images_count.min()) / (images_count.max() - images_count.min())
repos_df["images_count"] = images_count

#number of code blocks divide by 2 and round down
code_blocks_count = (repos_df["readme_text"].str.count(r"^```", flags=re.MULTILINE) // 2).astype(int)
#log then normalize it
code_blocks_count = np.log1p(1 + code_blocks_count)
code_blocks_count = (code_blocks_count - code_blocks_count.min()) / (code_blocks_count.max() - code_blocks_count.min())
repos_df["code_blocks_count"] = code_blocks_count

#number of new lines with content
content_lines_count = repos_df["readme_text"].str.count(r"^.*[^\s]", flags=re.MULTILINE)
#log then normalize it
content_lines_count = np.log1p(1 + content_lines_count)
content_lines_count = (content_lines_count - content_lines_count.min()) / (content_lines_count.max() - content_lines_count.min())
repos_df["content_lines_count"] = content_lines_count


# Section keyword search
# start with #
# 1 if contains atleast 1 keyword, 0 otherwise


#Contributing section
contributing_keywords = ["contribut"]
repos_df[f"keyword_contributing"] = False
for keyword in contributing_keywords:
    repos_df[f"keyword_contributing"] = repos_df[f"keyword_contributing"] | repos_df["readme_text"].str.contains(f"#+ .*{keyword}", flags=re.IGNORECASE)

    

# Getting Started section
getting_started_keywords = ["getting started", "installation", "quick start", "quickstart", "setup", "usage", "example", "examples", "demo", "demos"]
repos_df[f"keyword_getting_started"] = False
for keyword in getting_started_keywords:
    repos_df[f"keyword_getting_started"] = repos_df[f"keyword_getting_started"] | repos_df["readme_text"].str.contains(f"#+ .*{keyword}", flags=re.IGNORECASE)

# License section
license_keywords = ["license"]  
repos_df[f"keyword_license"] = False
for keyword in license_keywords:
    repos_df[f"keyword_license"] = repos_df[f"keyword_license"] | repos_df["readme_text"].str.contains(f"#+ .*{keyword}", flags=re.IGNORECASE)


# Flesch Reading Score
#textstat.flesch_reading_ease(test_data)
import textstat
flesch_reading_scores = repos_df["readme_text"].apply(lambda x: textstat.flesch_reading_ease(x) if x else None)
#normalize
flesch_reading_scores = (flesch_reading_scores - flesch_reading_scores.min()) / (flesch_reading_scores.max() - flesch_reading_scores.min())
repos_df["flesch_reading_scores"] = flesch_reading_scores


# number of headers
headers_count = repos_df["readme_text"].str.count(r"#+", flags=re.MULTILINE)
#log then normalize
headers_count = np.log1p(1 + headers_count)
headers_count = (headers_count - headers_count.min()) / (headers_count.max() - headers_count.min())
repos_df["headers_count"] = headers_count

KeyError: 'contributing'

# Dependant Extraction

In [8]:
stars_ = np.log1p(repos_df["stargazers_count"])
stars_ = (stars_ - stars_.min()) / (stars_.max() - stars_.min())
repos_df['stargazers_count_normalized'] = stars_

forks_ = np.log1p(repos_df["forks_count"])
forks_ = (forks_ - forks_.min()) / (forks_.max() - forks_.min())
repos_df['forks_count_normalized'] = forks_

watchers_ = np.log1p(repos_df["watchers_count"])
watchers_ = (watchers_ - watchers_.min()) / (watchers_.max() - watchers_.min())
repos_df['watchers_count_normalized'] = watchers_

heurstics = (stars_ + forks_ + watchers_) / 3
heurstics = (heurstics - heurstics.min()) / (heurstics.max() - heurstics.min())
repos_df['popularity_heurstic'] = heurstics

In [38]:
dependant_columns = ["stargazers_count_normalized", "forks_count_normalized", "watchers_count_normalized", "popularity_heurstic"]
dependant_columns = dependant_columns[-1:]
dependant_columns

['popularity_heurstic']

In [41]:
#Independant Columns normalized between 0 and 1
count_independent_columns = ["lists_count", "hyperlinks_count", "images_count", "code_blocks_count", "content_lines_count", "headers_count"]
#score Indecpend column
score_independent_columns = ["flesch_reading_scores"]
#keyword independant columns#binary 0 or 1
keyword_independent_columns = ["keyword_contributing", "keyword_getting_started", "keyword_license"]

dependant_columns = ["stargazers_count_normalized", "forks_count_normalized", "watchers_count_normalized", "popularity_heurstic"]
dependant_columns = dependant_columns[-1:]
dependant_columns

\
independent_columns = count_independent_columns + score_independent_columns + keyword_independent_columns


score_dict={"counts":{ column:{} for column in count_independent_columns}
            ,
       "keyword":{ column:{} for column in keyword_independent_columns
       },
       "scores":{ column:{} for column in score_independent_columns
       }}


repos_df = repos_df.dropna(subset=independent_columns + dependant_columns)
# Pearson correlation
from sklearn.feature_selection import r_regression

for dependant in dependant_columns[-1:]:
    score = r_regression(repos_df[independent_columns].values, repos_df[dependant].values)

    for i, column in enumerate(independent_columns):
        print(f"Pearson correlation between {column} and {dependant}: {round(score[i],3)}")

        if column in count_independent_columns:
            score_dict["counts"][column]['Pearsons'] = round(score[i],3)
        elif column in keyword_independent_columns:
            score_dict["keyword"][column]['Pearsons'] = round(score[i],3)
        elif column in score_independent_columns:
            score_dict["scores"][column]['Pearsons'] = round(score[i],3)
        


# Cliffs Delta


#Fishers Exact Test
# import scipy.stats as stats

# odd_ratio, p_value = stats.fisher_exact([repos_df["content_lines_count"], repos_df["stargazers_count_normalized"]])


#logistic regression feature important extraction
from sklearn.linear_model import LinearRegression

X = repos_df[independent_columns].values
y = repos_df[dependant_columns[0]].values

clf = LinearRegression().fit(X, y)
clf.coef_  # logistic regression feature importance extraction

for i, column in enumerate(independent_columns):
    print(f"Linear regression coefficient for {column}: {round(clf.coef_[i],3)}")

    if column in count_independent_columns:
        score_dict["counts"][column]['LinearRegression'] = round(clf.coef_[i],3)
    elif column in keyword_independent_columns:
        score_dict["keyword"][column]['LinearRegression'] = round(clf.coef_[i],3)
    elif column in score_independent_columns:
        score_dict["scores"][column]['LinearRegression'] = round(clf.coef_[i],3)


# random forest feature importance extraction

from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor().fit(X, y)
clf.feature_importances_  # random forest feature importance extraction

for i, column in enumerate(independent_columns):
    print(f"Random forest feature importance for {column}: {round(clf.feature_importances_[i],3)}")

    if column in count_independent_columns:
        score_dict["counts"][column]["Random Forest"] = round(clf.feature_importances_[i],3)
    elif column in keyword_independent_columns:
        score_dict["keyword"][column]["Random Forest"] = round(clf.feature_importances_[i],3)
    elif column in score_independent_columns:
        score_dict["scores"][column]["Random Forest"] = round(clf.feature_importances_[i],3)


Pearson correlation between lists_count and popularity_heurstic: 0.049
Pearson correlation between hyperlinks_count and popularity_heurstic: 0.237
Pearson correlation between images_count and popularity_heurstic: 0.16
Pearson correlation between code_blocks_count and popularity_heurstic: -0.081
Pearson correlation between content_lines_count and popularity_heurstic: 0.083
Pearson correlation between headers_count and popularity_heurstic: -0.016
Pearson correlation between flesch_reading_scores and popularity_heurstic: -0.038
Pearson correlation between keyword_contributing and popularity_heurstic: 0.055
Pearson correlation between keyword_getting_started and popularity_heurstic: -0.109
Pearson correlation between keyword_license and popularity_heurstic: -0.005
Linear regression coefficient for lists_count: 0.014
Linear regression coefficient for hyperlinks_count: 0.342
Linear regression coefficient for images_count: 0.091
Linear regression coefficient for code_blocks_count: -0.064
Line

In [42]:
# convert to pands dataframe with mulit index
# rows is the score type
# columns is the feature (with subcolumns for each method)

score_df = pd.DataFrame()
for score_type, score_data in score_dict.items():
    df_sub = pd.DataFrame(score_data).T
    df_sub = df_sub.reset_index()
    df_sub.insert(0, "score_type", score_type)
    
    score_df = pd.concat([score_df, df_sub], axis=0)
score_df.groupby(["score_type",'index']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pearsons,LinearRegression,Random Forest
score_type,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
counts,code_blocks_count,-0.081,-0.064,0.085
counts,content_lines_count,0.083,0.021,0.173
counts,headers_count,-0.016,-0.067,0.126
counts,hyperlinks_count,0.237,0.342,0.181
counts,images_count,0.16,0.091,0.093
counts,lists_count,0.049,0.014,0.094
keyword,keyword_contributing,0.055,0.017,0.017
keyword,keyword_getting_started,-0.109,-0.03,0.016
keyword,keyword_license,-0.005,-0.003,0.017
scores,flesch_reading_scores,-0.038,0.141,0.196
