In [2]:
# Import libraries
import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from urllib.parse import urlparse

In [3]:
# Load the processed relational dataset (stored as Parquet, not JSON) into a pandas DataFrame
df = pd.read_parquet('Data/Processed/procesed_relational_dataset.parquet')

In [4]:
# Working table: the three identifying columns, plus the path component of the
# GitHub URL (e.g. "/owner/repo") and an empty placeholder for the score.
new_df = df.loc[:, ['github_link', 'project_name', 'project_url']].copy()
new_df['link'] = df['github_link'].map(lambda github_url: urlparse(github_url).path)
new_df['maintenance_score'] = None

new_df

Unnamed: 0,github_link,project_name,project_url,link,maintenance_score
0,https://github.com/0-k/netto,netto,0-k%2Fnetto,/0-k/netto,
1,https://github.com/00001h/gamegui,GameGui,00001h%2Fgamegui,/00001h/gamegui,
2,https://github.com/007gzs/apiutils,apiutils,007gzs%2Fapiutils,/007gzs/apiutils,
3,https://github.com/007gzs/avatars,avatars,007gzs%2Favatars,/007gzs/avatars,
4,https://github.com/007gzs/baijiayun,baijiayun,007gzs%2Fbaijiayun,/007gzs/baijiayun,
...,...,...,...,...,...
30499,https://github.com/zzzsochi/aiohttp_traversal,aiohttp_traversal,zzzsochi%2Faiohttp_traversal,/zzzsochi/aiohttp_traversal,
30500,https://github.com/zzzsochi/cats,cats,zzzsochi%2Fcats,/zzzsochi/cats,
30501,https://github.com/zzzsochi/includer,includer,zzzsochi%2Fincluder,/zzzsochi/includer,
30502,https://github.com/zzzsochi/rpio-server,rpio-server,zzzsochi%2Frpio-server,/zzzsochi/rpio-server,


In [17]:
# Function to make the API request
def get_scorecard_info(project_url, timeout=30):
    """Fetch the Scorecard API result for a single GitHub repository.

    Parameters
    ----------
    project_url : str
        Repository path including the leading slash (e.g. "/owner/repo");
        it is appended to the "github.com" prefix of the API URL.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get``. Without one, a
        stalled connection would block the calling loop indefinitely.

    Returns
    -------
    dict or None
        The decoded JSON body when the API answers with HTTP 200; ``None``
        for any other status code, a network failure, or an undecodable
        body (a message is printed in each of those cases).
    """
    url = f"https://api.securityscorecards.dev/projects/github.com{project_url}/"
    headers = {
        'Accept': 'application/json'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts must not abort a long batch run.
        print(f"Request failed for {project_url}: {exc}")
        return None

    if response.status_code == 200:
        try:
            return response.json()
        except ValueError:
            # requests' JSON decode error is a ValueError subclass.
            print(f"Invalid JSON in response: {project_url}")
            return None
    elif response.status_code == 403:
        print("API rate limit exceeded.")
        return None
    else:
        print(f"Error {response.status_code}: {project_url}")
        return None

# Iterate over each project and make the API request
results = []
for index, row in new_df.iterrows():
    project_url = row['link']
    result = get_scorecard_info(project_url)
    if not result:
        # Request failed or returned nothing; leave the score unset.
        continue

    # Pick the first check named 'Maintained' without mutating the payload;
    # a response lacking 'checks' is treated as having no such check.
    maintained = next(
        (check for check in result.get('checks', []) if check.get('name') == 'Maintained'),
        None,
    )
    if maintained is None:
        continue

    new_df.at[index, 'maintenance_score'] = maintained['score']
    print(maintained['reason'])
    # Record the outcome so that results_df below is actually populated.
    results.append({
        'link': project_url,
        'maintenance_score': maintained['score'],
        'reason': maintained['reason'],
    })

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)

3 commit(s) and 1 issue activity found in the last 90 days -- score normalized to 3
7 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 5
9 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 7
0 commit(s) and 3 issue activity found in the last 90 days -- score normalized to 2
30 commit(s) and 3 issue activity found in the last 90 days -- score normalized to 10
4 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 3
30 commit(s) and 4 issue activity found in the last 90 days -- score normalized to 10
0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0
0 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 0
7 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 5
5 commit(s) and 0 issue activity found in the last 90 days -- score normalized to 4
30 commit(s) and 23 issue activity found in the last 90 days -- score no

In [18]:
# Persist the projects table (including its maintenance_score column) as Parquet.
new_df.to_parquet('Data/Processed/maintenance_score.parquet')

# DON'T EXECUTE THE CODE AFTER THIS CELL
# NOTE(review): the reason for this warning is not stated in the notebook — confirm before running later cells.

In [5]:
# Load the previously saved maintenance scores.
# NOTE(review): the columns look like monthly snapshots ('06-2024', '07-2024') — confirm.
maintenance_score = pd.read_parquet('Data/Processed/maintenance_score_30k.parquet')
maintenance_score

Unnamed: 0,06-2024,07-2024
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
30499,0,0
30500,0,0
30501,0,0
30502,0,0


In [6]:
def count_maintenance_scores(df, column='07-2024'):
    """Count how many rows carry each distinct value in a score column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the score column.
    column : str, optional
        Name of the column to tally. Defaults to '07-2024', the column the
        notebook analyses, so existing single-argument calls are unchanged.

    Returns
    -------
    pandas.Series
        Result of ``value_counts()`` on the column: index = score value,
        value = number of rows, sorted by descending count.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    score_counts = df[column].value_counts()
    return score_counts

# Tally the score distribution of the maintenance_score DataFrame and display it
scores = count_maintenance_scores(maintenance_score)
scores

07-2024
0     28113
10      873
1       381
2       244
5       217
3       211
4       143
6       106
7        79
8        76
9        61
Name: count, dtype: int64

In [14]:
# Fill the score column from the '07-2024' column of the saved scores.
# Assigning a Series aligns on the row index, so both frames must share the same index.
new_df['maintenance_score'] = maintenance_score['07-2024']
new_df

Unnamed: 0,github_link,project_name,project_url,link,maintenance_score
0,https://github.com/0-k/netto,netto,0-k%2Fnetto,/0-k/netto,0
1,https://github.com/00001h/gamegui,GameGui,00001h%2Fgamegui,/00001h/gamegui,0
2,https://github.com/007gzs/apiutils,apiutils,007gzs%2Fapiutils,/007gzs/apiutils,0
3,https://github.com/007gzs/avatars,avatars,007gzs%2Favatars,/007gzs/avatars,0
4,https://github.com/007gzs/baijiayun,baijiayun,007gzs%2Fbaijiayun,/007gzs/baijiayun,0
...,...,...,...,...,...
30499,https://github.com/zzzsochi/aiohttp_traversal,aiohttp_traversal,zzzsochi%2Faiohttp_traversal,/zzzsochi/aiohttp_traversal,0
30500,https://github.com/zzzsochi/cats,cats,zzzsochi%2Fcats,/zzzsochi/cats,0
30501,https://github.com/zzzsochi/includer,includer,zzzsochi%2Fincluder,/zzzsochi/includer,0
30502,https://github.com/zzzsochi/rpio-server,rpio-server,zzzsochi%2Frpio-server,/zzzsochi/rpio-server,0


In [15]:
def get_samples(df, n=10, random_state=42):
    """Draw a fixed-size random sample of rows for every maintenance score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'maintenance_score' column.
    n : int, optional
        Number of rows to draw per distinct score (default 10).
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` so the draw is reproducible.

    Returns
    -------
    pandas.DataFrame
        The per-score samples concatenated with a fresh 0..N-1 index.
        Scores with fewer than ``n`` rows are skipped (a message is printed).
        An empty frame with the same columns is returned when no score
        has enough rows.
    """
    samples = []
    unique_scores = df['maintenance_score'].unique()

    for score in unique_scores:
        score_df = df[df['maintenance_score'] == score]
        if len(score_df) >= n:  # Ensure there are at least n rows
            score_samples = score_df.sample(n=n, random_state=random_state)
            samples.append(score_samples)
        else:
            print(f"Not enough samples for maintenance_score = {score}")

    if not samples:
        # pd.concat raises ValueError on an empty list.
        return df.iloc[0:0].copy()
    return pd.concat(samples, ignore_index=True)

# Sample from new_df (the projects table with its maintenance_score column)
samples_df = get_samples(new_df)
samples_df

Unnamed: 0,github_link,project_name,project_url,link,maintenance_score
0,https://github.com/yhat/busby,busby,yhat%2Fbusby,/yhat/busby,0
1,https://github.com/ababic/django-cogwheels,django-cogwheels,ababic%2Fdjango-cogwheels,/ababic/django-cogwheels,0
2,https://github.com/liamblake/hcvote,hcvote,liamblake%2Fhcvote,/liamblake/hcvote,0
3,https://github.com/nikalexis/django_htmx_ui_ad...,django_htmx_ui_adminlte,nikalexis%2Fdjango_htmx_ui_adminlte,/nikalexis/django_htmx_ui_adminlte,0
4,https://github.com/mhauru/ncon,ncon,mhauru%2Fncon,/mhauru/ncon,0
...,...,...,...,...,...
105,https://github.com/technion-kishony-lab/quibbler,quibbler,technion-kishony-lab%2Fquibbler,/technion-kishony-lab/quibbler,9
106,https://github.com/molssi-seamm/torchani_step,torchani_step,molssi-seamm%2Ftorchani_step,/molssi-seamm/torchani_step,9
107,https://github.com/rpdelaney/dumbpw,dumbpw,rpdelaney%2Fdumbpw,/rpdelaney/dumbpw,9
108,https://github.com/dmulholl/pyro,pyro,dmulholl%2Fpyro,/dmulholl/pyro,9


In [16]:
# Export the sampled repositories to CSV without the index column.
# Requires `samples_df` to exist in the kernel namespace.
samples_df.to_csv("Data/Processed/sample_repositories_updated.csv", index=False)