In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [16]:
# Function to fill the new dataframe with commit counts
def fill_counts(row, row_index, df, information):
    for entry in row:
        year_month = f"{entry['month']:02d}-{entry['year']}"
        if year_month in df.columns:
            df.at[row_index, year_month] = entry[information]


def implement_months(repository):
    if 'date_month' in repository.columns:
        # Convert 'date_month' column to datetime format
        repository['date_month'] = pd.to_datetime(repository['date_month'])

        # Extract year and month from the 'date_month' column
        repository['year'] = repository['date_month'].dt.year
        repository['month'] = repository['date_month'].dt.month

        repository.drop(columns=['date_month'], inplace=True)
    
    repository = repository.sort_values(by=['year', 'month'], ascending=True)
    repository.reset_index(inplace=True, drop=True)
    repository['month'] = repository['month'].astype(str).str.zfill(2)
    repository['date'] = repository['month'].astype(str) + '-' + repository['year'].astype(str)

    if repository.empty:
        return None

    # Create a complete date range from the minimum to maximum dates in the original data
    # max and min values likely to be a fixed value for all repositories
    min_year, min_month = repository['year'].iloc[0], repository['month'].iloc[0]
    max_year, max_month = repository['year'].iloc[-1], repository['month'].iloc[-1]
    min_date = f"{min_month}-{min_year}"
    max_date = f"{max_month}-{max_year}"
    date_range = pd.date_range(start=min_date, end=max_date, freq='MS')

    # Create a DataFrame from the date range
    date_df = pd.DataFrame({'date': date_range})

    # Extract year and month from the date range
    date_df['year'] = date_df['date'].dt.year
    date_df['month'] = date_df['date'].dt.month.astype(str).str.zfill(2)

    # Convert the date column to the same format as in your original DataFrame
    date_df['date'] = date_df['date'].dt.strftime('%m-%Y')

    # Merge the original DataFrame with the date DataFrame to fill in missing values
    repository = pd.merge(date_df, repository, on=['year', 'month', 'date'], how='left')

    repository = repository.fillna(0)

    return repository

def array_to_duration(repository, column):
    repository["duration"] = repository[column].apply(lambda x: x[0] * 30 + x[1] + x[2] / (24 * 3600) + x[3] / (24 * 3600 * 10 ** 9) if x is not None and len(x) == 4 else 9999.9999)
    repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
    repository.drop(columns=[column], inplace=True)
    return repository

def extract_comments_and_issues(json_data):
    issue_df = json_data[['issue.createdAt', 'issue.creatorRole', 'comments']].copy()
    issue_df['issue.createdAt'] = issue_df['issue.createdAt'].apply(lambda x: pd.to_datetime(x))
    issue_df['month'] = issue_df['issue.createdAt'].dt.month
    issue_df['year'] = issue_df['issue.createdAt'].dt.year
    issue_df['date'] = issue_df['month'].astype(str).str.zfill(2) + '-' + issue_df['year'].astype(str)
    issue_df = issue_df.rename(columns={'issue.creatorRole': 'creatorRole'})
    issue_df = issue_df.drop(columns=['issue.createdAt'])

    comments_list = []
    for comments in issue_df['comments']:
        comments_list.extend(comments)
    issue_df = issue_df.drop(columns=['comments'])

    if comments_list != []:    
        comments_df = pd.json_normalize(comments_list)
        comments_df['createdAt'] = comments_df['createdAt'].apply(lambda x: pd.to_datetime(x))
        comments_df['month'] = comments_df['createdAt'].dt.month
        comments_df['year'] = comments_df['createdAt'].dt.year
        comments_df = comments_df.drop(columns=['createdAt', 'creator'])

        issue_df = pd.concat([issue_df, comments_df]).reset_index(drop=True)
        

    # Filtering valid roles
    valid_roles = ['COLLABORATOR', 'MEMBER', 'OWNER']
    issue_df = issue_df[issue_df['creatorRole'].isin(valid_roles)]

    grouped_counts = issue_df.groupby(['month', 'year']).size().reset_index(name='sum')
    # grouped_counts['month'] = grouped_counts['month'].astype('Int64')
    # grouped_counts['year'] = grouped_counts['year'].astype('Int64')
    grouped_counts['sum'] = grouped_counts['sum'].astype('Int64')
    grouped_counts = implement_months(grouped_counts)
    return grouped_counts

def calculate_three_month_score(df):
    score = df.sum(axis=1)
    return score

def check_all_true(df):
    return df.apply(lambda row: row[df.columns[0]] and row[df.columns[1]] and row[df.columns[2]], axis=1)

## Reading the JSON file

In [17]:
df = pd.read_json("../../01_input/json/pypi_metrics_file_36k_updated.json")
data_df = df.transpose()

data_df.reset_index(inplace=True)
data_df.rename(columns={'index': 'github_link'}, inplace=True)
data_df = data_df.reindex(columns=['project_name', 'github_link', 'project_url', 'project_id', 'metric_results'])

# Extract parameters from metric_results column
df = pd.json_normalize(data_df['metric_results'])

# Merge the two dataframes
data_df = pd.concat([data_df, df], axis=1)

# Assuming your DataFrame is named data_df
data_df = data_df.map(lambda x: np.nan if isinstance(x, list) and len(x) == 0 else x)
df = data_df
df


Unnamed: 0,project_name,github_link,project_url,project_id,metric_results,get_commits_per_month,get_avg_issue_close_time_per_month,get_avg_pull_request_close_time_per_month,get_new_issue_author_count_per_month,get_new_pull_request_author_count_per_month,get_avg_issue_response_time_per_month,get_avg_pull_request_merge_time_per_month,get_closed_issues_per_month,get_closed_pull_requests_per_month,get_commits_count_by_author_descending,get_label_issue_and_pull_request_count,get_issue_author_comment_count,get_discussion_author_comment_count,get_project_information,get_issues_and_issue_comments
0,netto,https://github.com/0-k/netto,0-k%2Fnetto,R_kgDOIU4I5A,"{'get_commits_per_month': [{'year': 2022, 'mon...","[{'year': 2022, 'month': 11, 'COUNT(c)': 47}, ...",,,,,,,,,"[{'author_login': '0-k', 'commit_count': 65}]","[{'l.name': 'help wanted', 'issue_label_count'...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
1,GameGui,https://github.com/00001h/gamegui,00001h%2Fgamegui,R_kgDOGTAsgw,"{'get_commits_per_month': [{'year': 2022, 'mon...","[{'year': 2022, 'month': 1, 'COUNT(c)': 18}, {...",,,,,,,,,"[{'author_login': '00001H', 'commit_count': 53}]","[{'l.name': 'Focus on', 'issue_label_count': 0...",,,"[{'isArchived': True, 'archivedAt': '2023-03-2...",
2,apiutils,https://github.com/007gzs/apiutils,007gzs%2Fapiutils,MDEwOlJlcG9zaXRvcnkyMzI3MDY3OTU=,"{'get_commits_per_month': [{'year': 2020, 'mon...","[{'year': 2020, 'month': 8, 'COUNT(c)': 5}, {'...",,,,,,,,,"[{'author_login': '007gzs', 'commit_count': 35...","[{'l.name': 'invalid', 'issue_label_count': 0,...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
3,avatars,https://github.com/007gzs/avatars,007gzs%2Favatars,MDEwOlJlcG9zaXRvcnkyMjU3NTA3Mjc=,"{'get_commits_per_month': [{'year': 2019, 'mon...","[{'year': 2019, 'month': 12, 'COUNT(c)': 11}]",,,,,,,,,"[{'author_login': '007gzs', 'commit_count': 11}]","[{'l.name': 'bug', 'issue_label_count': 0, 'pu...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
4,baijiayun,https://github.com/007gzs/baijiayun,007gzs%2Fbaijiayun,MDEwOlJlcG9zaXRvcnkyNjQwODM1NTQ=,"{'get_commits_per_month': [{'year': 2020, 'mon...","[{'year': 2020, 'month': 5, 'COUNT(c)': 7}]",,,,,,,,,"[{'author_login': 'default', 'commit_count': 6...","[{'l.name': 'enhancement', 'issue_label_count'...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36682,cats,https://github.com/zzzsochi/cats,zzzsochi%2Fcats,MDEwOlJlcG9zaXRvcnk0OTA5NzIxMA==,"{'get_commits_per_month': [{'year': 2016, 'mon...","[{'year': 2016, 'month': 1, 'COUNT(c)': 6}]",,,"[{'date_month': '2016-03-01T00:00:00+00:00', '...",,"[{'year': 2016, 'month': 3, 'avg_response_time...",,"[{'date_month': '2016-03-01T00:00:00+00:00', '...",,"[{'author_login': 'zzzsochi', 'commit_count': 6}]","[{'l.name': 'bug', 'issue_label_count': 0, 'pu...","[{'u.login': 'magniff', 'comment_count': 2}, {...",,"[{'isArchived': False, 'archivedAt': '0001-01-...","[{'issue': {'creatorRole': 'NONE', 'title': 'C..."
36683,includer,https://github.com/zzzsochi/includer,zzzsochi%2Fincluder,MDEwOlJlcG9zaXRvcnkzNjYwNjI5OA==,"{'get_commits_per_month': [{'year': 2015, 'mon...","[{'year': 2015, 'month': 11, 'COUNT(c)': 1}, {...",,,,,,,,,"[{'author_login': 'zzzsochi', 'commit_count': 9}]","[{'l.name': 'duplicate', 'issue_label_count': ...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
36684,rpio-server,https://github.com/zzzsochi/rpio-server,zzzsochi%2Frpio-server,MDEwOlJlcG9zaXRvcnk0MTgwMDcxOA==,"{'get_commits_per_month': [{'year': 2016, 'mon...","[{'year': 2016, 'month': 7, 'COUNT(c)': 1}, {'...",,,,,,,,,"[{'author_login': 'zzzsochi', 'commit_count': 7}]","[{'l.name': 'help wanted', 'issue_label_count'...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
36685,zini,https://github.com/zzzsochi/zini,zzzsochi%2Fzini,MDEwOlJlcG9zaXRvcnk0OTk1MTYyOQ==,"{'get_commits_per_month': [{'year': 2016, 'mon...","[{'year': 2016, 'month': 1, 'COUNT(c)': 16}, {...",,"[{'year': 2017, 'month': 4, 'AVG(open_duration...",,"[{'date_month': '2017-04-01T00:00:00+00:00', '...",,"[{'year': 2017, 'month': 4, 'avg_merge_duratio...",,"[{'date_month': '2017-04-01T00:00:00+00:00', '...","[{'author_login': 'zzzsochi', 'commit_count': ...","[{'l.name': 'invalid', 'issue_label_count': 0,...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",


In [18]:
# Define the start and end dates (we are getting three months before of the starting date, because each month should consider the activities based on the last 90 days)
start_year, start_month = 2022, 11
end_year, end_month = 2023, 12

# Generate the list of months between start and end dates
months = pd.date_range(start=f"{start_month}-{start_year}", end=f"{end_month}-{end_year}", freq='MS').strftime("%m-%Y").tolist()

In [19]:
commit_per_month = df['get_commits_per_month']

# Create a new dataframe with months as columns
commit_per_month_structured = pd.DataFrame(index=commit_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(commit_per_month)):
    fill_counts(commit_per_month.iloc[i], i, commit_per_month_structured, 'COUNT(c)')

commit_per_month_structured.fillna(0, inplace=True)
commit_per_month_structured

  commit_per_month_structured.fillna(0, inplace=True)


Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,47,5,0,0,0,0,6,0,0,0,0,0,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36682,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36683,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36684,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36685,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
issues = df["get_issues_and_issue_comments"]

# Create a new dataframe with months as columns
issues_structured = pd.DataFrame(index=issues.index, columns=months)

# # Apply the function to each row
for i in range(len(issues)):
    # Skip if the entry is None or NaN
    if issues[i] is None or (isinstance(issues[i], float) and np.isnan(issues[i])):
        continue
    # print(i)
    # # Check if issues[i] is a list before using json_normalize
    # if not isinstance(issues[i], list):
    #     continue
    inp = pd.json_normalize(issues[i])
    df_entry = extract_comments_and_issues(inp)
    if df_entry is not None:
        # print(i)
        for j in df_entry['date']:
            if j in issues_structured.columns:
                issues_structured.at[i, j] = df_entry[df_entry['date'] == j]['sum'].values[0]

issues_structured.fillna(0, inplace=True)
issues_structured

  issues_structured.fillna(0, inplace=True)


Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36682,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36683,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36684,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36685,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
project_information = pd.json_normalize(df['get_project_information'].apply(lambda x: x[0] if x is not None else None))
# Convert "archivedAt" and "createdAt" columns to datetime type
project_information["archivedAt"] = project_information["archivedAt"].apply(lambda x: pd.to_datetime(x) if x != "0001-01-01T01:01:01+00:00" else pd.to_datetime("1970-01-01T00:00:00+00:00"))
project_information["createdAt"] = pd.to_datetime(project_information["createdAt"])

# Extract year and month
project_information["create_year"] = project_information["createdAt"].dt.year.astype('Int64')
project_information["create_month"] = project_information["createdAt"].dt.month.astype('Int64')
project_information["archive_year"] = project_information["archivedAt"].dt.year.astype('Int64')
project_information["archive_month"] = project_information["archivedAt"].dt.month.astype('Int64')
isArchived = project_information["isArchived"].astype('Int64')
project_information.drop(columns=['archivedAt', 'createdAt'], inplace=True)

project_information
# project_information = project_information.astype(int)
project_information

Unnamed: 0,isArchived,create_year,create_month,archive_year,archive_month
0,False,2022,10,1970,1
1,True,2021,10,2023,3
2,False,2020,1,1970,1
3,False,2019,12,1970,1
4,False,2020,5,1970,1
...,...,...,...,...,...
36682,False,2016,1,1970,1
36683,False,2015,5,1970,1
36684,False,2015,9,1970,1
36685,False,2016,1,1970,1


In [22]:
# Create a new dataframe with months as columns
project_information_structured = pd.DataFrame(index=project_information.index, columns=months)

def fill_dataframe(df1, start_year, start_month, end_year, end_month):
    # Generate the months for the second table
    months = pd.date_range(start=f"{start_month}-{start_year}", end=f"{end_month}-{end_year}", freq='MS').strftime("%m-%Y").tolist()
    
    # Initialize the second dataframe with NaN values
    df2 = pd.DataFrame(np.nan, index=df1.index, columns=months)

    for idx, row in df1.iterrows():
        # Check for NAType or missing values
        if pd.isna(row['create_year']) or pd.isna(row['create_month']) or (row['isArchived'] and (pd.isna(row['archive_year']) or pd.isna(row['archive_month']))):
            print(f"Row {idx} contains missing data. Filling row with False values.")
            df2.loc[idx, months] = False
            continue  # Skip to the next iteration

        # Convert to integers
        create_year = int(row['create_year'])
        create_month = int(row['create_month'])
        create_date = pd.Period(year=create_year, month=create_month, freq='M')
        
        if row['isArchived']:
            archive_year = int(row['archive_year'])
            archive_month = int(row['archive_month'])
            archive_date = pd.Period(year=archive_year, month=archive_month, freq='M')
        else:
            archive_date = pd.Period(year=end_year, month=end_month, freq='M')
        
        start_date = pd.Period(year=start_year, month=start_month, freq='M')

        start_fill = max(create_date, start_date)
        end_fill = archive_date

        for month in months:
            period = pd.Period(month, freq='M')
            if start_fill <= period <= end_fill:
                df2.at[idx, month] = True
            else:
                df2.at[idx, month] = False

    return df2

# Apply the function to each row in the original dataframe
project_information_structured = fill_dataframe(project_information, start_year, start_month, end_year, end_month)
# project_information_structured = project_information_structured.fillna(False)
project_information_structured

  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True


Row 2034 contains missing data. Filling row with False values.
Row 3435 contains missing data. Filling row with False values.
Row 6677 contains missing data. Filling row with False values.
Row 6678 contains missing data. Filling row with False values.
Row 9118 contains missing data. Filling row with False values.
Row 12091 contains missing data. Filling row with False values.
Row 12347 contains missing data. Filling row with False values.
Row 14796 contains missing data. Filling row with False values.
Row 16810 contains missing data. Filling row with False values.
Row 19154 contains missing data. Filling row with False values.
Row 22167 contains missing data. Filling row with False values.
Row 24396 contains missing data. Filling row with False values.
Row 24501 contains missing data. Filling row with False values.
Row 27028 contains missing data. Filling row with False values.
Row 29434 contains missing data. Filling row with False values.
Row 32219 contains missing data. Filling row 

Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,False,False,False,False,False,False,False,False,False
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36682,True,True,True,True,True,True,True,True,True,True,True,True,True,True
36683,True,True,True,True,True,True,True,True,True,True,True,True,True,True
36684,True,True,True,True,True,True,True,True,True,True,True,True,True,True
36685,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [23]:
# Define the start and end dates (we are getting three months before of the starting date, because each month should consider the activities based on the last 90 days)
start_year, start_month = 2023, 1
end_year, end_month = 2023, 12

# Generate the list of months between start and end dates
months = pd.date_range(start=f"{start_month}-{start_year}", end=f"{end_month}-{end_year}", freq='MS').strftime("%m-%Y").tolist()
months

['01-2023',
 '02-2023',
 '03-2023',
 '04-2023',
 '05-2023',
 '06-2023',
 '07-2023',
 '08-2023',
 '09-2023',
 '10-2023',
 '11-2023',
 '12-2023']

In [24]:
pi_activity_score = pd.DataFrame(index=project_information_structured.index, columns=months)
commit_activity_score = pd.DataFrame(index=commit_per_month_structured.index, columns=months)
issue_activity_score = pd.DataFrame(index=issues_structured.index, columns=months)

for i in range(len(pi_activity_score.columns)):
    pi_activity_score.iloc[:, i] = check_all_true(project_information_structured.iloc[:, i:i+3])
    commit_activity_score.iloc[:, i] = calculate_three_month_score(commit_per_month_structured.iloc[:, i:i+3])
    issue_activity_score.iloc[:, i] = calculate_three_month_score(issues_structured.iloc[:, i:i+3])

# commit_activity_score

In [25]:
import math
maintained_score = commit_activity_score + issue_activity_score
t = 4 * 90 / 30
maintained_score = maintained_score.map(lambda x: min(math.floor(10 * x  / t), 10)) 
# maintained_score = maintained_score.map(lambda x: min(x, 10)) 
maintained_score = maintained_score.where(pi_activity_score, 0)
maintained_score = maintained_score.astype(int)
maintained_score

Unnamed: 0,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,10,4,0,0,5,5,5,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
36682,0,0,0,0,0,0,0,0,0,0,0,0
36683,0,0,0,0,0,0,0,0,0,0,0,0
36684,0,0,0,0,0,0,0,0,0,0,0,0
36685,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
maintained_score['09-2023'].value_counts()

09-2023
0     30751
10     2875
1       660
5       512
2       462
3       420
4       318
6       204
7       195
8       161
9       129
Name: count, dtype: int64

In [27]:
df['maintenance_score'] = maintained_score['09-2023']
df.to_parquet('../../01_input/input/procesed_relational_dataset_with_maintained_score.parquet')

In [24]:
df.head()

Unnamed: 0,project_name,github_link,project_url,project_id,metric_results,get_commits_per_month,get_avg_issue_close_time_per_month,get_avg_pull_request_close_time_per_month,get_new_issue_author_count_per_month,get_new_pull_request_author_count_per_month,...,get_closed_issues_per_month,get_closed_pull_requests_per_month,get_commits_count_by_author_descending,get_label_issue_and_pull_request_count,get_issue_author_comment_count,get_discussion_author_comment_count,get_project_information,get_issues_and_issue_comments,09-2023,maintenance_score
0,netto,https://github.com/0-k/netto,0-k%2Fnetto,R_kgDOIU4I5A,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 47, 'month': 11, 'year': 2022}, ...",,,,,...,,,"[{'author_login': '0-k', 'commit_count': 65}]","[{'issue_label_count': 0, 'l.name': 'help want...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",,0,0
1,GameGui,https://github.com/00001h/gamegui,00001h%2Fgamegui,R_kgDOGTAsgw,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 18, 'month': 1, 'year': 2022}, {...",,,,,...,,,"[{'author_login': '00001H', 'commit_count': 53}]","[{'issue_label_count': 0, 'l.name': 'Focus on'...",,,"[{'archivedAt': '2023-03-28T13:54:56+00:00', '...",,0,0
2,apiutils,https://github.com/007gzs/apiutils,007gzs%2Fapiutils,MDEwOlJlcG9zaXRvcnkyMzI3MDY3OTU=,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 5, 'month': 8, 'year': 2020}, {'...",,,,,...,,,"[{'author_login': '007gzs', 'commit_count': 35...","[{'issue_label_count': 0, 'l.name': 'invalid',...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",,0,0
3,avatars,https://github.com/007gzs/avatars,007gzs%2Favatars,MDEwOlJlcG9zaXRvcnkyMjU3NTA3Mjc=,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 11, 'month': 12, 'year': 2019}]",,,,,...,,,"[{'author_login': '007gzs', 'commit_count': 11}]","[{'issue_label_count': 0, 'l.name': 'bug', 'pu...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",,0,0
4,baijiayun,https://github.com/007gzs/baijiayun,007gzs%2Fbaijiayun,MDEwOlJlcG9zaXRvcnkyNjQwODM1NTQ=,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 7, 'month': 5, 'year': 2020}]",,,,,...,,,"[{'author_login': 'default', 'commit_count': 6...","[{'issue_label_count': 0, 'l.name': 'enhanceme...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",,0,0


In [12]:
maintained_score.to_parquet('../../01_input/input/metrics/maintenance_score_experiment.parquet')

In [116]:
maintenance_score = pd.read_parquet('../../01_input/input/metric/maintenance_score.parquet')
maintenance_score['maintenance_score_calculated'] = maintained_score['07-2024']
maintenance_score['match'] = maintenance_score['maintenance_score'] == maintenance_score['maintenance_score_calculated']
maintenance_score

Unnamed: 0,github_link,project_name,project_url,link,maintenance_score,explanation,maintenance_score_calculated,match
0,https://github.com/3dfin/3dfin,3DFin,3dfin%2F3dfin,/3dfin/3dfin,10,20 commit(s) and 5 issue activity found in the...,10,True
1,https://github.com/ababic/django-cogwheels,django-cogwheels,ababic%2Fdjango-cogwheels,/ababic/django-cogwheels,0,0 commit(s) and 0 issue activity found in the ...,0,True
2,https://github.com/acdh-oeaw/arche-assets,arche-assets,acdh-oeaw%2Farche-assets,/acdh-oeaw/arche-assets,7,9 commit(s) and 0 issue activity found in the ...,7,True
3,https://github.com/adamchainz/django-perf-rec,django-perf-rec,adamchainz%2Fdjango-perf-rec,/adamchainz/django-perf-rec,10,19 commit(s) and 1 issue activity found in the...,10,True
4,https://github.com/aimage/flask_rest_multiform...,flask_rest_multiformat_api,aimage%2Fflask_rest_multiformat_api,/aimage/flask_rest_multiformat_api,2,3 commit(s) and 0 issue activity found in the ...,2,True
...,...,...,...,...,...,...,...,...
105,https://github.com/wind-python/windpowerlib,windpowerlib,wind-python%2Fwindpowerlib,/wind-python/windpowerlib,1,0 commit(s) and 2 issue activity found in the ...,2,False
106,https://github.com/xpublish-community/xpublish...,xpublish-intake-provider,xpublish-community%2Fxpublish-intake-provider,/xpublish-community/xpublish-intake-provider,5,7 commit(s) and 0 issue activity found in the ...,5,True
107,https://github.com/xuehaipan/nvitop,nvitop,xuehaipan%2Fnvitop,/xuehaipan/nvitop,10,12 commit(s) and 4 issue activity found in the...,10,True
108,https://github.com/yhat/busby,busby,yhat%2Fbusby,/yhat/busby,0,0 commit(s) and 0 issue activity found in the ...,0,True


In [117]:
maintenance_score['match'].value_counts() / 110

match
True     0.827273
False    0.172727
Name: count, dtype: float64

In [118]:
avg_diff = (maintenance_score['maintenance_score'] - maintenance_score['maintenance_score_calculated']).mean()
avg_diff

-0.3181818181818182

In [119]:
median = (maintenance_score['maintenance_score'] - maintenance_score['maintenance_score_calculated']).median()
median

0.0

In [120]:
new_dataframe = pd.concat([maintenance_score, commit_activity_score['07-2024'], issue_activity_score['07-2024']], axis=1)
new_dataframe

Unnamed: 0,github_link,project_name,project_url,link,maintenance_score,explanation,maintenance_score_calculated,match,07-2024,07-2024.1
0,https://github.com/3dfin/3dfin,3DFin,3dfin%2F3dfin,/3dfin/3dfin,10,20 commit(s) and 5 issue activity found in the...,10,True,20,9
1,https://github.com/ababic/django-cogwheels,django-cogwheels,ababic%2Fdjango-cogwheels,/ababic/django-cogwheels,0,0 commit(s) and 0 issue activity found in the ...,0,True,0,0
2,https://github.com/acdh-oeaw/arche-assets,arche-assets,acdh-oeaw%2Farche-assets,/acdh-oeaw/arche-assets,7,9 commit(s) and 0 issue activity found in the ...,7,True,9,0
3,https://github.com/adamchainz/django-perf-rec,django-perf-rec,adamchainz%2Fdjango-perf-rec,/adamchainz/django-perf-rec,10,19 commit(s) and 1 issue activity found in the...,10,True,19,1
4,https://github.com/aimage/flask_rest_multiform...,flask_rest_multiformat_api,aimage%2Fflask_rest_multiformat_api,/aimage/flask_rest_multiformat_api,2,3 commit(s) and 0 issue activity found in the ...,2,True,3,0
...,...,...,...,...,...,...,...,...,...,...
105,https://github.com/wind-python/windpowerlib,windpowerlib,wind-python%2Fwindpowerlib,/wind-python/windpowerlib,1,0 commit(s) and 2 issue activity found in the ...,2,False,0,3
106,https://github.com/xpublish-community/xpublish...,xpublish-intake-provider,xpublish-community%2Fxpublish-intake-provider,/xpublish-community/xpublish-intake-provider,5,7 commit(s) and 0 issue activity found in the ...,5,True,7,0
107,https://github.com/xuehaipan/nvitop,nvitop,xuehaipan%2Fnvitop,/xuehaipan/nvitop,10,12 commit(s) and 4 issue activity found in the...,10,True,11,6
108,https://github.com/yhat/busby,busby,yhat%2Fbusby,/yhat/busby,0,0 commit(s) and 0 issue activity found in the ...,0,True,0,0


In [3]:
df = pd.read_parquet('../../01_input/input/procesed_relational_dataset.parquet')
df

Unnamed: 0,project_name,github_link,project_url,project_id,metric_results,get_commits_per_month,get_avg_issue_close_time_per_month,get_avg_pull_request_close_time_per_month,get_new_issue_author_count_per_month,get_new_pull_request_author_count_per_month,get_avg_issue_response_time_per_month,get_avg_pull_request_merge_time_per_month,get_closed_issues_per_month,get_closed_pull_requests_per_month,get_commits_count_by_author_descending,get_label_issue_and_pull_request_count,get_issue_author_comment_count,get_discussion_author_comment_count,get_project_information,get_issues_and_issue_comments
0,netto,https://github.com/0-k/netto,0-k%2Fnetto,R_kgDOIU4I5A,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 47, 'month': 11, 'year': 2022}, ...",,,,,,,,,"[{'author_login': '0-k', 'commit_count': 65}]","[{'issue_label_count': 0, 'l.name': 'help want...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",
1,GameGui,https://github.com/00001h/gamegui,00001h%2Fgamegui,R_kgDOGTAsgw,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 18, 'month': 1, 'year': 2022}, {...",,,,,,,,,"[{'author_login': '00001H', 'commit_count': 53}]","[{'issue_label_count': 0, 'l.name': 'Focus on'...",,,"[{'archivedAt': '2023-03-28T13:54:56+00:00', '...",
2,apiutils,https://github.com/007gzs/apiutils,007gzs%2Fapiutils,MDEwOlJlcG9zaXRvcnkyMzI3MDY3OTU=,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 5, 'month': 8, 'year': 2020}, {'...",,,,,,,,,"[{'author_login': '007gzs', 'commit_count': 35...","[{'issue_label_count': 0, 'l.name': 'invalid',...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",
3,avatars,https://github.com/007gzs/avatars,007gzs%2Favatars,MDEwOlJlcG9zaXRvcnkyMjU3NTA3Mjc=,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 11, 'month': 12, 'year': 2019}]",,,,,,,,,"[{'author_login': '007gzs', 'commit_count': 11}]","[{'issue_label_count': 0, 'l.name': 'bug', 'pu...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",
4,baijiayun,https://github.com/007gzs/baijiayun,007gzs%2Fbaijiayun,MDEwOlJlcG9zaXRvcnkyNjQwODM1NTQ=,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 7, 'month': 5, 'year': 2020}]",,,,,,,,,"[{'author_login': 'default', 'commit_count': 6...","[{'issue_label_count': 0, 'l.name': 'enhanceme...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36682,cats,https://github.com/zzzsochi/cats,zzzsochi%2Fcats,MDEwOlJlcG9zaXRvcnk0OTA5NzIxMA==,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 6, 'month': 1, 'year': 2016}]",,,"[{'date_month': '2016-03-01T00:00:00+00:00', '...",,"[{'avg_response_time': [0, 0, 20118, 0], 'mont...",,"[{'closed_issues': 0, 'date_month': '2016-03-0...",,"[{'author_login': 'zzzsochi', 'commit_count': 6}]","[{'issue_label_count': 0, 'l.name': 'bug', 'pu...","[{'comment_count': 2, 'u.login': 'magniff'}, {...",,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",[{'comments': [{'createdAt': '2016-03-20T18:07...
36683,includer,https://github.com/zzzsochi/includer,zzzsochi%2Fincluder,MDEwOlJlcG9zaXRvcnkzNjYwNjI5OA==,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 1, 'month': 11, 'year': 2015}, {...",,,,,,,,,"[{'author_login': 'zzzsochi', 'commit_count': 9}]","[{'issue_label_count': 0, 'l.name': 'duplicate...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",
36684,rpio-server,https://github.com/zzzsochi/rpio-server,zzzsochi%2Frpio-server,MDEwOlJlcG9zaXRvcnk0MTgwMDcxOA==,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 1, 'month': 7, 'year': 2016}, {'...",,,,,,,,,"[{'author_login': 'zzzsochi', 'commit_count': 7}]","[{'issue_label_count': 0, 'l.name': 'help want...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",
36685,zini,https://github.com/zzzsochi/zini,zzzsochi%2Fzini,MDEwOlJlcG9zaXRvcnk0OTk1MTYyOQ==,"{'get_avg_issue_close_time_per_month': [], 'ge...","[{'COUNT(c)': 16, 'month': 1, 'year': 2016}, {...",,"[{'AVG(open_duration)': [0, 0, 538, 0], 'month...",,"[{'date_month': '2017-04-01T00:00:00+00:00', '...",,"[{'avg_merge_duration': [0, 0, 538, 0], 'month...",,"[{'closed_pull_requests': 1, 'date_month': '20...","[{'author_login': 'zzzsochi', 'commit_count': ...","[{'issue_label_count': 0, 'l.name': 'invalid',...",,,"[{'archivedAt': '0001-01-01T01:01:01+00:00', '...",
