In [None]:
import requests
import pandas as pd
import time
import urllib.parse
import os

os.environ["GITHUB_ACCESS_TOKEN"] = GITHUB_ACCESS_TOKEN
os.environ["HUGGINGFACE_ACCESS_TOKEN"] = HUGGINGFACE_ACCESS_TOKEN



keywords = {
  "keywords": {
    "getting-started": {
      "gh-header-keywords": [
        "getting started",
        "installation",
        "quick start",
        "quickstart",
        "setup",
        "usage",
        "example",
        "demo"
      ],
      "hf-header-keywords": [
        "how to use",
        "usage",
        "inference",
        "example usage",
        "sample code",
        "model usage",
        "usage example"
      ],
      "gh-content-keywords": [
        "install",
        "pip",
        "requirements",
        "dependency",
        "clone",
        "git clone",
        "build",
        "run",
        "execute",
        "usage",
        "command line",
        "code snippet",
        "environment",
        "virtualenv",
        "conda",
        "script",
        "examples",
        "usage instructions",
        "dependencies",
        "install instructions"
      ],
      "hf-content-keywords": [
        "import",
        "from transformers import",
        "pipeline",
        "tokenizer",
        "model",
        "generate",
        "inference",
        "code snippet",
        "PyTorch",
        "TensorFlow",
        "example",
        "Hugging Face Hub",
        "task",
        "fine-tune",
        "load model",
        "preprocess",
        "prompt",
        "output",
        "usage example",
        "inference example"
      ]
    },
    "contributing": {
      "gh-header-keywords": [
        "contributing",
        "contribution",
        "contribute",
        "pull request",
        "bug report",
        "issue"
      ],
      "hf-header-keywords": [
        "how to contribute",
        "contribute",
        "report issues",
        "feedback",
        "suggestions",
        "contact",
        "collaborate",
        "acknowledgements"
      ],
      "gh-content-keywords": [
        "fork",
        "pull request",
        "issue tracker",
        "guidelines",
        "code style",
        "testing",
        "documentation",
        "contributing guide",
        "report bugs",
        "feature requests",
        "collaboration",
        "development",
        "submit",
        "branch",
        "merge",
        "code review",
        "community",
        "issues",
        "bug reports",
        "commit"
      ],
      "hf-content-keywords": [
        "issues",
        "contact",
        "suggestions",
        "improvements",
        "collaboration",
        "email",
        "open an issue",
        "feedback",
        "bug report",
        "help",
        "reach out",
        "community",
        "discussion",
        "contribution",
        "pull request",
        "modify",
        "enhance",
        "questions",
        "support",
        "contact author"
      ]
    },
    "license": {
      "gh-header-keywords": [
        "license",
        "licence",
        "copy right"
      ],
      "hf-header-keywords": [
        "license",
        "licence",
        "copyright",
        "terms",
        "usage terms",
        "legal",
        "rights"
      ],
      "gh-content-keywords": [
        "MIT",
        "Apache",
        "GPL",
        "BSD",
        "terms",
        "conditions",
        "distribution",
        "modification",
        "use",
        "commercial use",
        "liability",
        "warranty",
        "limitations",
        "rights",
        "reproduction",
        "software license",
        "license text",
        "open source",
        "copying",
        "proprietary"
      ],
      "hf-content-keywords": [
        "MIT",
        "Apache",
        "BSD",
        "GPL",
        "terms",
        "conditions",
        "use",
        "limitations",
        "copyright",
        "redistribution",
        "open source",
        "commercial use",
        "non-commercial",
        "Creative Commons",
        "CC BY",
        "license text",
        "restrictions",
        "public domain",
        "responsibility",
        "liability"
      ]
    }
  }
}

# Constants
GITHUB_API_URL = "https://api.github.com"
PER_PAGE = 100  # Maximum allowed by GitHub API
ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN")  # Set your GitHub access token
TOPICS = ["data-science", "machine-learning", "ai", "api", "python","pytorch","data-science","computer-vision", "tensorflow", "llm", "artificial-intelligence"]  # Replace with your topics ["nlp"]
MIN_STAR = 90  # Define your minimum star count threshold


In [None]:


# Headers for authentication
headers = {}
if ACCESS_TOKEN:
    headers = {"Authorization": f"token {ACCESS_TOKEN}"}

# Function to collect repository data
def collect_repo_data():
    repos_data = []
    for topic in TOPICS:
        print(f"Processing topic: {topic}")
        stars_upper_limit = 500000  # Start with a very high star count
        while True:
            page = 1
            first_page = True
            total_pages = 1
            last_star_count = None  # To keep track of the last repository's star count
            while page <= total_pages:
                query = f"language:python stars:<{stars_upper_limit} fork:false topic:{topic} pushed:>=2023-01-01"
                params = {
                    "q": query,
                    "sort": "stars",
                    "order": "desc",
                    "per_page": PER_PAGE,
                    "page": page
                }

                response = requests.get(f"{GITHUB_API_URL}/search/repositories", headers=headers, params=params)
                if response.status_code != 200:
                    print(f"Failed to fetch repositories: {response.status_code}")
                    print(response.json())
                    break

                data = response.json()
                if first_page:
                    total_count = data.get('total_count', 0)
                    total_pages = min((total_count + PER_PAGE - 1) // PER_PAGE, 10)  # API caps at 1000 results
                    print(f"Total repositories found: {total_count}. Total pages: {total_pages}")
                    first_page = False

                items = data.get("items", [])
                if not items:
                    print(f"No repositories found on page {page}")
                    break

                for repo in items:
                    star_count = repo["stargazers_count"]
                    # Stop if the star count is below MIN_STAR
                    if star_count < MIN_STAR:
                        print(f"Reached repositories with stars less than {MIN_STAR}. Moving to next topic.")
                        break
                    repo_data = {
                        "repo_id": repo["id"],
                        "repo_name": repo["name"],
                        "full_name": repo["full_name"],
                        "owner_login": repo["owner"]["login"],
                        "repo_url": repo["html_url"],
                        "description": repo["description"],
                        "primary_language": repo["language"],
                        "topics": repo.get("topics", []),
                        "license": repo["license"]["name"] if repo["license"] else None,
                        "created_at": repo["created_at"],
                        "updated_at": repo["updated_at"],
                        "pushed_at": repo["pushed_at"],
                        "size": repo["size"],
                        "stargazers_count": star_count,
                        "watchers_count": repo["watchers_count"],
                        "forks_count": repo["forks_count"],
                        "open_issues_count": repo["open_issues_count"],
                        "default_branch": repo["default_branch"],
                        "score": repo.get("score"),
                        "is_fork": repo["fork"],
                        "visibility": repo.get("visibility", "public"),
                        "topic": topic  # Include the topic
                    }
                    repos_data.append(repo_data)
                    last_star_count = star_count
                if star_count < MIN_STAR:
                    break
        

                print(f"Completed page {page}/{total_pages} for topic {topic} with stars upper limit {stars_upper_limit}")
                page += 1
                time.sleep(1.5)  # Sleep to respect API rate limits
                
            if star_count < MIN_STAR:
                break

            # If total_count is less than 1000, we've retrieved all repositories for this stars_upper_limit
            if total_count < 1000:
                print(f"All repositories fetched for stars less than {stars_upper_limit}.")
                break
            elif last_star_count is not None:
                # Update the stars_upper_limit to be less than the last star count
                stars_upper_limit = last_star_count - 1
                print(f"Adjusting stars upper limit to {stars_upper_limit} and continuing.")
            else:
                # No more repositories to fetch
                break
        print(f"Finished processing topic: {topic}")

    return repos_data


repos_data = collect_repo_data()

# Save repos_data to CSV
repos_df = pd.DataFrame(repos_data)
repos_df = repos_df.drop_duplicates(subset=["full_name"])
repos_df.to_csv("repositories_.csv", index=False)

Processing topic: data-science
Total repositories found: 4211. Total pages: 10
Completed page 1/10 for topic data-science with stars upper limit 500000
Completed page 2/10 for topic data-science with stars upper limit 500000
Completed page 3/10 for topic data-science with stars upper limit 500000
Completed page 4/10 for topic data-science with stars upper limit 500000
Reached repositories with stars less than 90. Moving to next topic.
Finished processing topic: data-science
Processing topic: machine-learning
Total repositories found: 16174. Total pages: 10
Completed page 1/10 for topic machine-learning with stars upper limit 500000
Completed page 2/10 for topic machine-learning with stars upper limit 500000
Completed page 3/10 for topic machine-learning with stars upper limit 500000
Completed page 4/10 for topic machine-learning with stars upper limit 500000
Completed page 5/10 for topic machine-learning with stars upper limit 500000
Completed page 6/10 for topic machine-learning with 

In [None]:
import pandas as pd
import requests
import time
import os
from tqdm import tqdm
headers = {}
if ACCESS_TOKEN:
    headers = {"Authorization": f"token {ACCESS_TOKEN}"}

# Function to fetch READMEs for a list of repositories
def fetch_readmes(repo_full_names):
    readme_errors = []
    if not os.path.exists("readmes"):
        os.makedirs("readmes")

    for repo_full_name in tqdm(repo_full_names):

        #skip if readme already exists
        if os.path.exists(f"readmes/{repo_full_name.replace('/', '_')}.md"):
            continue


        readme_url = f"{GITHUB_API_URL}/repos/{repo_full_name}/readme"

        # Create a copy of headers and add 'Accept' header
        readme_headers = headers.copy()
        readme_headers['Accept'] = 'application/vnd.github.v3.raw'

        readme_resp = requests.get(readme_url, headers=readme_headers)

        if readme_resp.status_code == 200:
            # Save README to a file named after the repo full name
            # Replace slashes in the full name to make it a valid filename
            safe_name = repo_full_name.replace("/", "_")
            with open(f"readmes/{safe_name}.md", "w", encoding="utf-8") as f:
                f.write(readme_resp.text)
        else:
            readme_errors.append((repo_full_name, readme_resp.status_code))
            continue  # Skip repositories without README

        time.sleep(0.2)  # Sleep to respect API rate limits

    return readme_errors

repos_df = pd.read_csv("repositories.csv")

In [6]:
fetch_readmes(repos_df.full_name.values.tolist())

100%|██████████| 12751/12751 [45:35<00:00,  4.66it/s] 


[('sksalahuddin2828/Pandas_Numpy_Matplotlib_Plotly', 404),
 ('oliveirabruno01/babyagi-asi', 404),
 ('engineer-man/youtube', 404),
 ('BotoX/xiaomi-m365-firmware-patcher', 404),
 ('Artfunkel/BlenderSourceTools', 404),
 ('lilydjwg/winterpy', 404),
 ('K-G-PRAJWAL/Python-Projects', 404),
 ('fmartinou/tydom2mqtt', 404),
 ('JonasSchult/Mask3D', 403),
 ('emidan19/deep-tempest', 403),
 ('xlang-ai/UnifiedSKG', 403),
 ('snap-stanford/deepsnap', 403),
 ('wuji3/visiondk', 403),
 ('AmazingDD/daisyRec', 403),
 ('kuanghuei/SCAN', 403),
 ('pyg-team/pytorch-frame', 403),
 ('nomic-ai/contrastors', 403),
 ('tatp22/multidim-positional-encoding', 403),
 ('metaopt/torchopt', 403),
 ('kjsman/stable-diffusion-pytorch', 403),
 ('hkchengrex/STCN', 403),
 ('voidful/TextRL', 403),
 ('IntelLabs/bayesian-torch', 403),
 ('clcarwin/convert_torch_to_pytorch', 403),
 ('akanimax/pro_gan_pytorch', 403),
 ('v-iashin/video_features', 403),
 ('610265158/Peppa_Pig_Face_Landmark', 403),
 ('bradyz/cross_view_transformers', 403)

In [None]:
#read csv
import pandas as pd
repos_df = pd.read_csv("repositories.csv")

readme_text = []

#for every repo in the csv, get the readme
for index,row in repos_df.iterrows():
    repo_full_name = row["full_name"]
    safe_name = repo_full_name.replace("/", "_")

    #open the readme file
    try:
        with open(f"readmes/{safe_name}.md", "r", encoding="utf-8") as f:
            readme = f.read()
            readme_text.append(readme)
    except:
        readme_text.append(None)

repos_df["readme_text"] = readme_text
repos_df = repos_df.dropna(subset=["readme_text"])

from readmepp import ReadMe

predictor = ReadMe(lang='en')
predictor.model.to('cuda')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import re
import numpy as np

#count the occurences of "^- " of each readme
lists_count = repos_df["readme_text"].str.count(r"^- ", flags=re.MULTILINE)
#log then normalize it
lists_count = np.log1p(1 + lists_count)
lists_count = (lists_count - lists_count.min()) / (lists_count.max() - lists_count.min())
repos_df["lists_count"] = lists_count

#count hypperlinks
hyperlinks_count = repos_df["readme_text"].str.count(r"http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", flags=re.MULTILINE)
#log then normalize it
hyperlinks_count = np.log1p(1 + hyperlinks_count)
hyperlinks_count = (hyperlinks_count - hyperlinks_count.min()) / (hyperlinks_count.max() - hyperlinks_count.min())
repos_df["hyperlinks_count"] = hyperlinks_count

#number of images ending in .png, .jpg, .jpeg, .gif (url or local path) in .md ir .rst
#[image] (https://example.com/image.png)
#image:: https://example.com/image.png
#<.*src.*=.*\.(png|jpg|jpeg|gif|svg).*>
images_count = repos_df["readme_text"].str.count(r"!\[.*\]\(.*\.(png|jpg|jpeg|gif|svg)\)", flags=re.MULTILINE) + \
            repos_df["readme_text"].str.count(r"image::.*\.(png|jpg|jpeg|gif|svg)", flags=re.MULTILINE) + \
            repos_df["readme_text"].str.count(r"<.*src.*=.*\.(png|jpg|jpeg|gif|svg).*>", flags=re.MULTILINE)
#log then normalize it
images_count = np.log1p(1 + images_count)
images_count = (images_count - images_count.min()) / (images_count.max() - images_count.min())
repos_df["images_count"] = images_count

#number of code blocks divide by 2 and round down
code_blocks_count = (repos_df["readme_text"].str.count(r"^```", flags=re.MULTILINE) // 2).astype(int)
#log then normalize it
code_blocks_count = np.log1p(1 + code_blocks_count)
code_blocks_count = (code_blocks_count - code_blocks_count.min()) / (code_blocks_count.max() - code_blocks_count.min())
repos_df["code_blocks_count"] = code_blocks_count

#number of new lines with content
content_lines_count = repos_df["readme_text"].str.count(r"^.*[^\s]", flags=re.MULTILINE)
#log then normalize it
content_lines_count = np.log1p(1 + content_lines_count)
content_lines_count = (content_lines_count - content_lines_count.min()) / (content_lines_count.max() - content_lines_count.min())
repos_df["content_lines_count"] = content_lines_count



# Flesch Reading Score
#textstat.flesch_reading_ease(test_data)
import textstat
flesch_reading_scores = repos_df["readme_text"].apply(lambda x: textstat.flesch_reading_ease(x) if x else None)
#normalize
flesch_reading_scores = (flesch_reading_scores - flesch_reading_scores.min()) / (flesch_reading_scores.max() - flesch_reading_scores.min())
repos_df["flesch_reading_scores"] = flesch_reading_scores


# number of headers
headers_count = repos_df["readme_text"].str.count(r"#+", flags=re.MULTILINE)
#log then normalize
headers_count = np.log1p(1 + headers_count)
headers_count = (headers_count - headers_count.min()) / (headers_count.max() - headers_count.min())
repos_df["headers_count"] = headers_count

In [5]:
repos_df["readmepp"] = repos_df["readme_text"].apply(lambda x: predictor.predict(x) if x else None)

In [9]:
import re
import pandas as pd

def parse_markdown_headers(text):
    '''Parse markdown text into a list of (header_level, header_text, content) tuples'''
    lines = text.split('\n')
    headers = []
    current_header = None
    current_content = []
    current_level = None
    for line in lines:
        header_match = re.match(r'^(#{1,3})\s+(.*)', line)
        if header_match:
            # Save the current content if any
            if current_header is not None:
                headers.append((current_level, current_header, '\n'.join(current_content)))
            # Start a new header
            current_level = len(header_match.group(1))
            current_header = header_match.group(2)
            current_content = []
        else:
            if current_header is not None:
                current_content.append(line)
    # Save the last header content
    if current_header is not None:
        headers.append((current_level, current_header, '\n'.join(current_content)))
    return headers

def process_readme(text, keywords_subset,content_subset, levels=(1,2,3)):
    """
    Process a single README text.
    Returns:
        header_found: True if at least one header keyword is found in headers of specified levels.
        percentage: Percentage of keywords found in content under matching headers.
    """
    headers = parse_markdown_headers(text)
    # Initialize
    header_found = False
    content_text = ''
    index = 0
    for level, header_text, content in headers:
        if level in levels:
            # Check if any of the keywords are present in the header text
            if any(kw.lower() in header_text.lower() for kw in keywords_subset):
                header_found = True
                #then collect all header with lower level and stop untill equal or higher level
                content_text += '\n' + content
                for next_level, next_header_text, next_content in headers[index+1:]:
                    if next_level <= level:
                        break
                    content_text += '\n' + next_content

        index += 1
    # If no matching headers, percentage is 0
    if not header_found or not keywords_subset:
        return header_found, 0.0
    # Now compute the percentage of keywords found in content_text
    total_keywords = len(content_subset)
    found_keywords = sum(1 for kw in content_subset if kw.lower() in content_text.lower())
    percentage = found_keywords / total_keywords
    return header_found, percentage

# Modify your main loop
readme_type = "gh"  # "gh" or "hf"
for keyword_category in keywords["keywords"]:
    print(keyword_category)

    # Initialize columns
    header_column = f"header_{keyword_category}"
    percentage_column = f"percentage_{keyword_category}"
    #repos_df[header_column] = False  # This will be the header_found boolean
    #repos_df[percentage_column] = 0.0  # This will be the keyword percentage

    keywords_subset = keywords["keywords"][keyword_category][f"{readme_type}-header-keywords"]
    content_subset = keywords["keywords"][keyword_category][f"{readme_type}-content-keywords"]

    # Apply the function to each row
    def process_row(row):
        text = row['readme_text']
        header_found, percentage = process_readme(text, keywords_subset, content_subset)
        return pd.Series({header_column: header_found, percentage_column: percentage})

    repos_df[[header_column, percentage_column]] = repos_df.apply(process_row, axis=1)

getting-started
contributing
license


# Dependant Extraction

In [12]:
stars_ = np.log1p(repos_df["stargazers_count"])
stars_ = (stars_ - stars_.min()) / (stars_.max() - stars_.min())
repos_df['stargazers_count_normalized'] = stars_

forks_ = np.log1p(repos_df["forks_count"])
forks_ = (forks_ - forks_.min()) / (forks_.max() - forks_.min())
repos_df['forks_count_normalized'] = forks_

# watchers_ = np.log1p(repos_df["watchers_count"])
# watchers_ = (watchers_ - watchers_.min()) / (watchers_.max() - watchers_.min())
# repos_df['watchers_count_normalized'] = watchers_

heurstics = repos_df["forks_count"]/repos_df["stargazers_count"]
heurstics = (heurstics - heurstics.min()) / (heurstics.max() - heurstics.min())
repos_df['onboarding_normalized'] = heurstics

repos_df[['readmepp_normalized']] = (repos_df[['readmepp']] - 1)/5

In [13]:
#Independant Columns normalized between 0 and 1
count_independent_columns = ["lists_count", "hyperlinks_count", "images_count", "code_blocks_count", "content_lines_count", "headers_count"]
#score Indecpend column
score_independent_columns = ["flesch_reading_scores" , "readmepp_normalized"]
#keyword independant columns#binary 0 or 1
keyword_independent_columns = ["header_getting-started", "header_contributing", "header_license"]
content_indenpendant_columns = ["percentage_getting-started", "percentage_contributing", "percentage_license"]

dependant_columns = ["stargazers_count_normalized", "forks_count_normalized", "onboarding_normalized"]
#dependant_columns = dependant_columns[-1:]


independent_columns = count_independent_columns + score_independent_columns + keyword_independent_columns + content_indenpendant_columns


score_dict={"counts":{ column:{} for column in count_independent_columns}
            ,
       "keyword":{ column:{} for column in keyword_independent_columns},
       "content": { column:{} for column in content_indenpendant_columns},
       "scores":{ column:{} for column in score_independent_columns}
       }


repos_df = repos_df.dropna(subset=independent_columns + dependant_columns)
# Pearson correlation
from sklearn.feature_selection import r_regression

for dependant in dependant_columns:
    score = r_regression(repos_df[independent_columns].values, repos_df[dependant].values)

    for i, column in enumerate(independent_columns):
        print(f"Pearson correlation between {column} and {dependant}: {round(score[i],3)}")

        if column in count_independent_columns:
            score_dict["counts"][column][dependant] = round(score[i],3)
        elif column in keyword_independent_columns:
            score_dict["keyword"][column][dependant] = round(score[i],3)
        elif column in score_independent_columns:
            score_dict["scores"][column][dependant] = round(score[i],3)

for dependant in dependant_columns:
    for i in range(len(content_indenpendant_columns)):
        content_col = content_indenpendant_columns[i]
        keyword_col = keyword_independent_columns[i]
        repos_df_sub = repos_df[repos_df[keyword_col] == True]
        
        score = r_regression(repos_df_sub[[content_col]].values, repos_df_sub[dependant].values)
        print(f"Pearson correlation between {content_col} and {dependant}: {round(score[0],3)}")
        score_dict["content"][content_col][dependant] = round(score[0],3)
    


# Cliffs Delta


#Fishers Exact Test
# import scipy.stats as stats

# odd_ratio, p_value = stats.fisher_exact([repos_df["content_lines_count"], repos_df["stargazers_count_normalized"]])


#logistic regression feature important extraction
# from sklearn.linear_model import LinearRegression

# X = repos_df[independent_columns].values
# y = repos_df[dependant_columns[0]].values

# clf = LinearRegression().fit(X, y)
# clf.coef_  # logistic regression feature importance extraction

# for i, column in enumerate(independent_columns):
#     print(f"Linear regression coefficient for {column}: {round(clf.coef_[i],3)}")

#     if column in count_independent_columns:
#         score_dict["counts"][column]['LinearRegression'] = round(clf.coef_[i],3)
#     elif column in keyword_independent_columns:
#         score_dict["keyword"][column]['LinearRegression'] = round(clf.coef_[i],3)
#     elif column in score_independent_columns:
#         score_dict["scores"][column]['LinearRegression'] = round(clf.coef_[i],3)


# random forest feature importance extraction

# from sklearn.ensemble import RandomForestRegressor

# clf = RandomForestRegressor().fit(X, y)
# clf.feature_importances_  # random forest feature importance extraction

# for i, column in enumerate(independent_columns):
#     print(f"Random forest feature importance for {column}: {round(clf.feature_importances_[i],3)}")

#     if column in count_independent_columns:
#         score_dict["counts"][column]["Random Forest"] = round(clf.feature_importances_[i],3)
#     elif column in keyword_independent_columns:
#         score_dict["keyword"][column]["Random Forest"] = round(clf.feature_importances_[i],3)
#     elif column in score_independent_columns:
#         score_dict["scores"][column]["Random Forest"] = round(clf.feature_importances_[i],3)


Pearson correlation between lists_count and stargazers_count_normalized: 0.106
Pearson correlation between hyperlinks_count and stargazers_count_normalized: 0.319
Pearson correlation between images_count and stargazers_count_normalized: 0.197
Pearson correlation between code_blocks_count and stargazers_count_normalized: 0.038
Pearson correlation between content_lines_count and stargazers_count_normalized: 0.176
Pearson correlation between headers_count and stargazers_count_normalized: 0.102
Pearson correlation between flesch_reading_scores and stargazers_count_normalized: -0.023
Pearson correlation between readmepp_normalized and stargazers_count_normalized: 0.02
Pearson correlation between header_getting-started and stargazers_count_normalized: -0.016
Pearson correlation between header_contributing and stargazers_count_normalized: 0.078
Pearson correlation between header_license and stargazers_count_normalized: 0.029
Pearson correlation between percentage_getting-started and stargazer

In [16]:
# convert to pands dataframe with mulit index
# rows is the score type
# columns is the feature (with subcolumns for each method)

score_df = pd.DataFrame()
for score_type, score_data in score_dict.items():
    df_sub = pd.DataFrame(score_data).T
    df_sub = df_sub.reset_index()
    df_sub.insert(0, "score_type", score_type)
    
    score_df = pd.concat([score_df, df_sub], axis=0)

sort_array = [2] * len(content_indenpendant_columns) + [0] * len(count_independent_columns) + [1] * len(keyword_independent_columns) + [3] * len(score_independent_columns)
df_group = score_df.groupby(["score_type",'index']).mean()
df_group["sort"] = sort_array
df_group = df_group.sort_values(by=["sort"]).drop(columns="sort")
df_group

Unnamed: 0_level_0,Unnamed: 1_level_0,stargazers_count_normalized,forks_count_normalized,onboarding_normalized
score_type,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
counts,code_blocks_count,0.038,-0.07,-0.111
counts,content_lines_count,0.176,0.059,-0.132
counts,headers_count,0.102,-0.002,-0.105
counts,hyperlinks_count,0.319,0.24,-0.064
counts,images_count,0.197,0.138,-0.048
counts,lists_count,0.106,0.043,-0.055
keyword,header_contributing,0.078,0.047,-0.024
keyword,header_getting-started,-0.016,-0.105,-0.106
keyword,header_license,0.029,0.01,0.001
content,percentage_contributing,0.018,0.035,0.04


In [27]:
# repos_df["header_getting-started"].sum()
# repos_df["header_contributing"].sum()
# repos_df["header_license"].sum()
print(f"getting-started: {repos_df['header_getting-started'].sum()}")
print(f"contributing: {repos_df['header_contributing'].sum()}")
print(f"license: {repos_df['header_license'].sum()}")

getting-started: 6281
contributing: 2573
license: 2283


In [32]:
from readmepp import ReadMe

predictor = ReadMe(lang='en')


ModuleNotFoundError: No module named 'torch'

In [28]:
predictor.predict("hello how are you")

NameError: name 'predictor' is not defined

In [37]:
import torch
#reload torch module

torch.cuda.is_available()

False

In [None]:
pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu124