# 1. Data Exploration

This notebook is designed to perform data exploration of the retrieved repository from my supervisor Alexandros Tsakpinis.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

## Supporting Functions

Below, one could find the supporting functions:

In [2]:
def implement_months(repository):
    if repository.empty:
        return np.NaN  # or handle the empty case appropriately
    
    if 'date_month' in repository.columns:
        # Convert 'date_month' column to datetime format
        repository['date_month'] = pd.to_datetime(repository['date_month'])

        # Extract year and month from the 'date_month' column
        repository['year'] = repository['date_month'].dt.year
        repository['month'] = repository['date_month'].dt.month

        repository.drop(columns=['date_month'], inplace=True)
    
    repository = repository.sort_values(by=['year', 'month'], ascending=True)
    repository.reset_index(inplace=True, drop=True)
    repository['month'] = repository['month'].astype(str).str.zfill(2)
    repository['date'] = repository['month'].astype(str) + '-' + repository['year'].astype(str)

    # Create a complete date range from the minimum to maximum dates in the original data
    # max and min values likely to be a fixed value for all repositories
    min_year, min_month = repository['year'].iloc[0], repository['month'].iloc[0]
    max_year, max_month = repository['year'].iloc[-1], repository['month'].iloc[-1]
    min_date = f"{min_year}-{min_month}"
    max_date = f"{max_year}-{max_month}"
    date_range = pd.date_range(start=min_date, end=max_date, freq='MS')

    # Create a DataFrame from the date range
    date_df = pd.DataFrame({'date': date_range})

    # Extract year and month from the date range
    date_df['year'] = date_df['date'].dt.year
    date_df['month'] = date_df['date'].dt.month.astype(str).str.zfill(2)

    # Convert the date column to the same format as in your original DataFrame
    date_df['date'] = date_df['date'].dt.strftime('%m-%Y')

    # Merge the original DataFrame with the date DataFrame to fill in missing values
    repository = pd.merge(date_df, repository, on=['year', 'month', 'date'], how='left')

    repository = repository.fillna(0)

    return repository

def array_to_duration(repository, column):
    repository["duration"] = repository[column].apply(lambda x: x[0] * 30 + x[1] + x[2] / (24 * 3600) + x[3] / (24 * 3600 * 10 ** 9) if x is not None else np.inf)
    repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
    repository.drop(columns=[column], inplace=True)
    return repository

# Function to fill the new dataframe with commit counts
def fill_counts(row, row_index, df, information):
    for entry in row:
        year_month = f"{entry['month']:02d}-{entry['year']}"
        if year_month in df.columns:
            df.at[row_index, year_month] = entry[information]

def extract_comments_and_issues(json_data):
    issue_df = json_data[['issue.createdAt', 'issue.creatorRole', 'comments']].copy()
    issue_df['issue.createdAt'] = issue_df['issue.createdAt'].apply(lambda x: pd.to_datetime(x))
    issue_df['month'] = issue_df['issue.createdAt'].dt.month
    issue_df['year'] = issue_df['issue.createdAt'].dt.year
    issue_df['date'] = issue_df['month'].astype(str).str.zfill(2) + '-' + issue_df['year'].astype(str)
    issue_df = issue_df.rename(columns={'issue.creatorRole': 'creatorRole'})
    issue_df = issue_df.drop(columns=['issue.createdAt'])
    issue_df

    comments_list = []
    for comments in issue_df['comments']:
        comments_list.extend(comments)
    issue_df = issue_df.drop(columns=['comments'])

    if comments_list != []:    
        comments_df = pd.json_normalize(comments_list)
        comments_df['createdAt'] = comments_df['createdAt'].apply(lambda x: pd.to_datetime(x))
        comments_df['month'] = comments_df['createdAt'].dt.month
        comments_df['year'] = comments_df['createdAt'].dt.year
        comments_df = comments_df.drop(columns=['createdAt', 'creator'])

        issue_df = pd.concat([issue_df, comments_df]).reset_index(drop=True)

    # Filtering valid roles
    valid_roles = ['COLLABORATOR', 'MEMBER', 'OWNER']
    issue_df = issue_df[issue_df['creatorRole'].isin(valid_roles)]

    grouped_counts = issue_df.groupby(['month', 'year']).size().reset_index(name='sum')
    # grouped_counts['month'] = grouped_counts['month'].astype('Int64')
    # grouped_counts['year'] = grouped_counts['year'].astype('Int64')
    grouped_counts['sum'] = grouped_counts['sum'].astype('Int64')
    grouped_counts = implement_months(grouped_counts)
    return grouped_counts

## Reading the JSON file

The first step involves reading the structured JSON file and turn this into a structured table format, storing inside the dataframe.

In [3]:
# Load the JSON file into a pandas dataframe
df = pd.read_json('../../01_input/json/pypi_metrics_file_36k_updated.json')
# df = pd.read_json('../../01_input/json/pypi_metrics_file.json')

In [4]:
data_df = df.transpose()

data_df.reset_index(inplace=True)
data_df.rename(columns={'index': 'github_link'}, inplace=True)
data_df = data_df.reindex(columns=['project_name', 'github_link', 'project_url', 'project_id', 'metric_results'])

# Extract parameters from metric_results column
df = pd.json_normalize(data_df['metric_results'])

# Merge the two dataframes
data_df = pd.concat([data_df, df], axis=1)

# Assuming your DataFrame is named data_df
data_df = data_df.applymap(lambda x: np.nan if isinstance(x, list) and len(x) == 0 else x)

data_df

  data_df = data_df.applymap(lambda x: np.nan if isinstance(x, list) and len(x) == 0 else x)


Unnamed: 0,project_name,github_link,project_url,project_id,metric_results,get_commits_per_month,get_avg_issue_close_time_per_month,get_avg_pull_request_close_time_per_month,get_new_issue_author_count_per_month,get_new_pull_request_author_count_per_month,get_avg_issue_response_time_per_month,get_avg_pull_request_merge_time_per_month,get_closed_issues_per_month,get_closed_pull_requests_per_month,get_commits_count_by_author_descending,get_label_issue_and_pull_request_count,get_issue_author_comment_count,get_discussion_author_comment_count,get_project_information,get_issues_and_issue_comments
0,netto,https://github.com/0-k/netto,0-k%2Fnetto,R_kgDOIU4I5A,"{'get_commits_per_month': [{'year': 2022, 'mon...","[{'year': 2022, 'month': 11, 'COUNT(c)': 47}, ...",,,,,,,,,"[{'author_login': '0-k', 'commit_count': 65}]","[{'l.name': 'help wanted', 'issue_label_count'...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
1,GameGui,https://github.com/00001h/gamegui,00001h%2Fgamegui,R_kgDOGTAsgw,"{'get_commits_per_month': [{'year': 2022, 'mon...","[{'year': 2022, 'month': 1, 'COUNT(c)': 18}, {...",,,,,,,,,"[{'author_login': '00001H', 'commit_count': 53}]","[{'l.name': 'Focus on', 'issue_label_count': 0...",,,"[{'isArchived': True, 'archivedAt': '2023-03-2...",
2,apiutils,https://github.com/007gzs/apiutils,007gzs%2Fapiutils,MDEwOlJlcG9zaXRvcnkyMzI3MDY3OTU=,"{'get_commits_per_month': [{'year': 2020, 'mon...","[{'year': 2020, 'month': 8, 'COUNT(c)': 5}, {'...",,,,,,,,,"[{'author_login': '007gzs', 'commit_count': 35...","[{'l.name': 'invalid', 'issue_label_count': 0,...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
3,avatars,https://github.com/007gzs/avatars,007gzs%2Favatars,MDEwOlJlcG9zaXRvcnkyMjU3NTA3Mjc=,"{'get_commits_per_month': [{'year': 2019, 'mon...","[{'year': 2019, 'month': 12, 'COUNT(c)': 11}]",,,,,,,,,"[{'author_login': '007gzs', 'commit_count': 11}]","[{'l.name': 'bug', 'issue_label_count': 0, 'pu...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
4,baijiayun,https://github.com/007gzs/baijiayun,007gzs%2Fbaijiayun,MDEwOlJlcG9zaXRvcnkyNjQwODM1NTQ=,"{'get_commits_per_month': [{'year': 2020, 'mon...","[{'year': 2020, 'month': 5, 'COUNT(c)': 7}]",,,,,,,,,"[{'author_login': 'default', 'commit_count': 6...","[{'l.name': 'enhancement', 'issue_label_count'...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36682,cats,https://github.com/zzzsochi/cats,zzzsochi%2Fcats,MDEwOlJlcG9zaXRvcnk0OTA5NzIxMA==,"{'get_commits_per_month': [{'year': 2016, 'mon...","[{'year': 2016, 'month': 1, 'COUNT(c)': 6}]",,,"[{'date_month': '2016-03-01T00:00:00+00:00', '...",,"[{'year': 2016, 'month': 3, 'avg_response_time...",,"[{'date_month': '2016-03-01T00:00:00+00:00', '...",,"[{'author_login': 'zzzsochi', 'commit_count': 6}]","[{'l.name': 'bug', 'issue_label_count': 0, 'pu...","[{'u.login': 'magniff', 'comment_count': 2}, {...",,"[{'isArchived': False, 'archivedAt': '0001-01-...","[{'issue': {'creatorRole': 'NONE', 'title': 'C..."
36683,includer,https://github.com/zzzsochi/includer,zzzsochi%2Fincluder,MDEwOlJlcG9zaXRvcnkzNjYwNjI5OA==,"{'get_commits_per_month': [{'year': 2015, 'mon...","[{'year': 2015, 'month': 11, 'COUNT(c)': 1}, {...",,,,,,,,,"[{'author_login': 'zzzsochi', 'commit_count': 9}]","[{'l.name': 'duplicate', 'issue_label_count': ...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
36684,rpio-server,https://github.com/zzzsochi/rpio-server,zzzsochi%2Frpio-server,MDEwOlJlcG9zaXRvcnk0MTgwMDcxOA==,"{'get_commits_per_month': [{'year': 2016, 'mon...","[{'year': 2016, 'month': 7, 'COUNT(c)': 1}, {'...",,,,,,,,,"[{'author_login': 'zzzsochi', 'commit_count': 7}]","[{'l.name': 'help wanted', 'issue_label_count'...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
36685,zini,https://github.com/zzzsochi/zini,zzzsochi%2Fzini,MDEwOlJlcG9zaXRvcnk0OTk1MTYyOQ==,"{'get_commits_per_month': [{'year': 2016, 'mon...","[{'year': 2016, 'month': 1, 'COUNT(c)': 16}, {...",,"[{'year': 2017, 'month': 4, 'AVG(open_duration...",,"[{'date_month': '2017-04-01T00:00:00+00:00', '...",,"[{'year': 2017, 'month': 4, 'avg_merge_duratio...",,"[{'date_month': '2017-04-01T00:00:00+00:00', '...","[{'author_login': 'zzzsochi', 'commit_count': ...","[{'l.name': 'invalid', 'issue_label_count': 0,...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",


After performing the initial data operation, following values have structurized:
* `project_name`: Name of the project in GitHub.
* `github_link`: The full URL that navigates the repository.
* `project_url`: Extension of GitHub URL.
* `project_id`: Project ID.
* `metric_results`: In addition to the formal data related with the repository, following metric results that is generated by ourselves is generated. This list will be updated on frequent basis to increase the accuracy. (Normally, this metric isn't required because all the metrics inside of it is registered as a new column. However, these metrics will be updated regularly, so not dropping the column would be the best for us).
    * `get_commits_per_month`: The monthly commit information. All information is distinct, it will be very usable.
    * `get_avg_issue_close_time_per_month`: Average issue closing time per month. 62% of the data is missing.
    * `get_avg_pull_request_close_time_per_month`: Average pull request close time per month. 48% of the data is missing.
    * `get_new_issue_author_count_per_month`: New issue author count per month. 53% of the data is missing.
    * `get_new_pull_request_author_count_per_month`: New pull request author count per month. 44% of the data is missing.
    * `get_avg_issue_response_time_per_month`: Average issue response time per month. 53% of the data is missing.
    * `get_avg_pull_request_merge_time_per_month`: Average pull request merge time per month. 51% of the data is missing.
    * `get_closed_issues_per_month`: Closed issues per month. 53% of the data is missing.
    * `get_closed_pull_requests_per_month`: Closed pull requests per month. 44% of the data is missing.
    * `get_commits_count_by_author_descending`: Commits count by author descending. All information is distinct, it will be very usable. 
    * `get_label_issue_and_pull_request_count`: Label issue and pull request count. 2% of the data is missing, most of the data is distinctive.
    * `get_issue_author_comment_count`: Issue author comment count. 62% of the data is missing.
    * `get_discussion_author_comment_count`: Discussion author comment count. 98% of the data is missing, it's not usable at all.

Additional parameter suggestions:
* `repository_opened`: Maybe that could be a good idea to specify an exact datetime variable to display the repository open time.
* `last_update`: The date that the last commit operation is performed.

In [5]:
# Transformed data will be used for Data Processing, which will be the next step
data_df.to_parquet('../../01_input/input/procesed_relational_dataset.parquet', index=False)

## Loading the Dataset

In [6]:
# df = pd.read_parquet('../../01_input/input/procesed_relational_dataset.parquet')
df = data_df.sample(n=200, random_state=42).reset_index(drop=True)
df

Unnamed: 0,project_name,github_link,project_url,project_id,metric_results,get_commits_per_month,get_avg_issue_close_time_per_month,get_avg_pull_request_close_time_per_month,get_new_issue_author_count_per_month,get_new_pull_request_author_count_per_month,get_avg_issue_response_time_per_month,get_avg_pull_request_merge_time_per_month,get_closed_issues_per_month,get_closed_pull_requests_per_month,get_commits_count_by_author_descending,get_label_issue_and_pull_request_count,get_issue_author_comment_count,get_discussion_author_comment_count,get_project_information,get_issues_and_issue_comments
0,bootcamp_unimedbh_ciencia_dados,https://github.com/barbaramir/bootcamp_unimedb...,barbaramir%2Fbootcamp_unimedbh_ciencia_dados,R_kgDOIGMU5A,"{'get_commits_per_month': [{'year': 2022, 'mon...","[{'year': 2022, 'month': 10, 'COUNT(c)': 51}, ...",,,,,,,,,"[{'author_login': 'barbaramir', 'commit_count'...","[{'l.name': 'help wanted', 'issue_label_count'...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
1,SetSolver1,https://github.com/lukaswestholt/setsolver1,lukaswestholt%2Fsetsolver1,R_kgDOGYnlMg,"{'get_commits_per_month': [{'year': 2021, 'mon...","[{'year': 2021, 'month': 12, 'COUNT(c)': 3}, {...","[{'year': 2021, 'month': 11, 'AVG(open_duratio...",,"[{'date_month': '2021-11-01T00:00:00+00:00', '...",,"[{'year': 2021, 'month': 11, 'avg_response_tim...",,"[{'date_month': '2021-11-01T00:00:00+00:00', '...",,"[{'author_login': 'LukasWestholt', 'commit_cou...","[{'l.name': 'bug', 'issue_label_count': 0, 'pu...","[{'u.login': 'LukasWestholt', 'comment_count':...",,"[{'isArchived': False, 'archivedAt': '0001-01-...","[{'issue': {'creatorRole': 'OWNER', 'title': '..."
2,Wifi-Orca,https://github.com/invizabel/wifi-orca,invizabel%2Fwifi-orca,R_kgDOJ5Xxug,"{'get_commits_per_month': [{'year': 2023, 'mon...","[{'year': 2023, 'month': 9, 'COUNT(c)': 1}, {'...",,,,,,,,,"[{'author_login': 'Invizabel', 'commit_count':...","[{'l.name': 'help wanted', 'issue_label_count'...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
3,expdf,https://github.com/bupt-ipcr/expdf,bupt-ipcr%2Fexpdf,MDEwOlJlcG9zaXRvcnkyNTU3OTU5MzI=,"{'get_commits_per_month': [{'year': 2020, 'mon...","[{'year': 2020, 'month': 5, 'COUNT(c)': 91}, {...",,,,"[{'date_month': '2023-05-01T00:00:00+00:00', '...",,,,"[{'date_month': '2023-05-01T00:00:00+00:00', '...","[{'author_login': 'LampV', 'commit_count': 576}]","[{'l.name': 'invalid', 'issue_label_count': 0,...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
4,djangocms-grid,https://github.com/divio/djangocms-grid,divio%2Fdjangocms-grid,MDEwOlJlcG9zaXRvcnkxMDE0MDIwMw==,"{'get_commits_per_month': [{'year': 2014, 'mon...","[{'year': 2014, 'month': 11, 'COUNT(c)': 1}, {...",,"[{'year': 2014, 'month': 11, 'AVG(open_duratio...","[{'date_month': '2013-06-01T00:00:00+00:00', '...","[{'date_month': '2014-11-01T00:00:00+00:00', '...","[{'year': 2013, 'month': 6, 'avg_response_time...","[{'year': 2014, 'month': 11, 'avg_merge_durati...","[{'date_month': '2013-06-01T00:00:00+00:00', '...","[{'date_month': '2014-11-01T00:00:00+00:00', '...","[{'author_login': 'mkoistinen', 'commit_count'...","[{'l.name': 'duplicate', 'issue_label_count': ...","[{'u.login': 'alesdotio', 'comment_count': 4},...",,"[{'isArchived': True, 'archivedAt': '2019-01-1...","[{'issue': {'creatorRole': 'NONE', 'title': 'A..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,python-isal,https://github.com/pycompression/python-isal,pycompression%2Fpython-isal,MDEwOlJlcG9zaXRvcnkyODk4NzYxMjk=,"{'get_commits_per_month': [{'year': 2021, 'mon...","[{'year': 2021, 'month': 12, 'COUNT(c)': 32}, ...","[{'year': 2022, 'month': 1, 'AVG(open_duration...","[{'year': 2022, 'month': 9, 'AVG(open_duration...","[{'date_month': '2022-01-01T00:00:00+00:00', '...","[{'date_month': '2022-09-01T00:00:00+00:00', '...","[{'year': 2022, 'month': 1, 'avg_response_time...","[{'year': 2022, 'month': 9, 'avg_merge_duratio...","[{'date_month': '2022-01-01T00:00:00+00:00', '...","[{'date_month': '2022-09-01T00:00:00+00:00', '...","[{'author_login': 'rhpvorderman', 'commit_coun...","[{'l.name': 'on hold', 'issue_label_count': 1,...","[{'u.login': 'rhpvorderman', 'comment_count': ...",,"[{'isArchived': False, 'archivedAt': '0001-01-...","[{'issue': {'creatorRole': 'NONE', 'title': 'I..."
196,flask-sso-ui,https://github.com/dhanarsantika/flask-sso-ui,dhanarsantika%2Fflask-sso-ui,MDEwOlJlcG9zaXRvcnkyMzA3NjExODI=,"{'get_commits_per_month': [{'year': 2019, 'mon...","[{'year': 2019, 'month': 12, 'COUNT(c)': 5}]",,,,"[{'date_month': '2023-05-01T00:00:00+00:00', '...",,,,"[{'date_month': '2023-05-01T00:00:00+00:00', '...","[{'author_login': 'DhanarSantika', 'commit_cou...","[{'l.name': 'help wanted', 'issue_label_count'...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
197,daynight2geojson,https://github.com/geographicags/daynight2geojson,geographicags%2Fdaynight2geojson,MDEwOlJlcG9zaXRvcnkyOTQyOTc4NA==,"{'get_commits_per_month': [{'year': 2015, 'mon...","[{'year': 2015, 'month': 6, 'COUNT(c)': 12}, {...",,"[{'year': 2015, 'month': 7, 'AVG(open_duration...",,"[{'date_month': '2015-07-01T00:00:00+00:00', '...",,,,"[{'date_month': '2015-07-01T00:00:00+00:00', '...","[{'author_login': 'cayetanobv', 'commit_count'...","[{'l.name': 'question', 'issue_label_count': 0...",,,"[{'isArchived': False, 'archivedAt': '0001-01-...",
198,Stockton-Esports-Bot,https://github.com/dual-exhaust/stockton-espor...,dual-exhaust%2Fstockton-esports-bot,MDEwOlJlcG9zaXRvcnkyMDk4NzIxMjk=,"{'get_commits_per_month': [{'year': 2019, 'mon...","[{'year': 2019, 'month': 11, 'COUNT(c)': 4}, {...","[{'year': 2019, 'month': 10, 'AVG(open_duratio...","[{'year': 2019, 'month': 10, 'AVG(open_duratio...","[{'date_month': '2019-10-01T00:00:00+00:00', '...","[{'date_month': '2021-06-01T00:00:00+00:00', '...","[{'year': 2019, 'month': 10, 'avg_response_tim...","[{'year': 2019, 'month': 10, 'avg_merge_durati...","[{'date_month': '2019-10-01T00:00:00+00:00', '...","[{'date_month': '2021-06-01T00:00:00+00:00', '...","[{'author_login': 'Dual-Exhaust', 'commit_coun...","[{'l.name': 'invalid', 'issue_label_count': 0,...","[{'u.login': 'Dual-Exhaust', 'comment_count': 1}]",,"[{'isArchived': False, 'archivedAt': '0001-01-...","[{'issue': {'creatorRole': 'OWNER', 'title': '..."


In [7]:
df.to_parquet('../../01_input/input/procesed_relational_dataset_500.parquet', index=False)

## Define the data interval

In [8]:
# Define the start and end dates (we are getting three months before of the starting date, because each month should consider the activities based on the last 90 days)
start_year, start_month = 2022, 11
end_year, end_month = 2023, 12

# Generate the list of months between start and end dates
months = pd.date_range(start=f"{start_month}-{start_year}", end=f"{end_month}-{end_year}", freq='MS').strftime("%m-%Y").tolist()

## Get Commits per Month

In [9]:
commit_per_month = df['get_commits_per_month']

# Create a new dataframe with months as columns
commit_per_month_structured = pd.DataFrame(index=commit_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(commit_per_month)):
    fill_counts(commit_per_month.iloc[i], i, commit_per_month_structured, 'COUNT(c)')

commit_per_month_structured.fillna(0, inplace=True)

# commit_per_month_structured.fillna(0, inplace=True)
commit_per_month_structured

  commit_per_month_structured.fillna(0, inplace=True)


Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,17,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,14,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,31,6,0,0,0,8,0,47,153,13,1
196,0,0,0,0,0,0,0,0,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Saving the data

In [10]:
commit_per_month_structured.fillna(0, inplace=True)
commit_per_month_structured.to_parquet('../../01_input/input/metrics/commit_per_month.parquet')

## Average Issue Close Time per Month

In [11]:
avg_issue_close_time_per_month = df['get_avg_issue_close_time_per_month']

# Create a new dataframe with months as columns
avg_issue_close_time_per_month_structured_duration = pd.DataFrame(index=avg_issue_close_time_per_month.index, columns=months)
avg_issue_close_time_per_month_structured_count = pd.DataFrame(index=avg_issue_close_time_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(avg_issue_close_time_per_month)):
    if avg_issue_close_time_per_month.iloc[i] is np.NaN:
        continue
    x = pd.json_normalize(avg_issue_close_time_per_month.iloc[i])
    df_entry = array_to_duration(x, 'AVG(open_duration)')
    df_entry = implement_months(df_entry)
    for j in df_entry['date']:
        if j in avg_issue_close_time_per_month_structured_duration.columns:
            avg_issue_close_time_per_month_structured_duration.at[i, j] = df_entry[df_entry['date'] == j]['duration'].values[0]
            avg_issue_close_time_per_month_structured_count.at[i, j] = df_entry[df_entry['date'] == j]['COUNT(open_duration)'].values[0]

avg_issue_close_time_per_month_structured_duration.replace(0, np.nan, inplace=True)
avg_issue_close_time_per_month_structured_count.replace(0, np.nan, inplace=True)
avg_issue_close_time_per_month_structured_count

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method 

Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,,,,2.0,,1.0,,,1.0,,1.0,7.0,6.0,
196,,,,,,,,,,,,,,
197,,,,,,,,,,,,,,
198,,,,,,,,,,,,,,


### Saving the data

In [12]:
avg_issue_close_time_per_month_structured_count.fillna(0, inplace=True)
avg_issue_close_time_per_month_structured_duration.fillna(0, inplace=True)
avg_issue_close_time_per_month_structured_count.to_parquet('../../01_input/input/metrics/avg_issue_close_time_per_month_count.parquet')
avg_issue_close_time_per_month_structured_duration.to_parquet('../../01_input/input/metrics/avg_issue_close_time_per_month_duration.parquet')

## Average PR Close Time Per Month

In [13]:
avg_pull_request_close_time_per_month = df['get_avg_pull_request_close_time_per_month']
# Create a new dataframe with months as columns
avg_pull_request_close_time_per_month_structured = pd.DataFrame(index=avg_pull_request_close_time_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(avg_pull_request_close_time_per_month)):
    if avg_pull_request_close_time_per_month.iloc[i] is np.NaN:
        continue
    x = pd.json_normalize(avg_pull_request_close_time_per_month.iloc[i])
    df_entry = array_to_duration(x, 'AVG(open_duration)')
    df_entry = implement_months(df_entry)
    for j in df_entry['date']:
        if j in avg_pull_request_close_time_per_month_structured.columns:
            avg_pull_request_close_time_per_month_structured.at[i, j] = df_entry[df_entry['date'] == j]['duration'].values[0]
            

avg_pull_request_close_time_per_month_structured.replace(0, np.nan, inplace=True)
avg_pull_request_close_time_per_month_structured


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method 

Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,,,,2.563766,0.059051,,,,0.031383,,1.57037,3.479285,3.621162,
196,,,,,,,,,,,,,,
197,,,,,,,,,,,,,,
198,,,,,,,,,,,,,,


### Saving the data

In [14]:
avg_pull_request_close_time_per_month_structured.fillna(0, inplace=True)
avg_pull_request_close_time_per_month_structured.to_parquet('../../01_input/input/metrics/avg_pull_request_close_time_per_month.parquet')

## New Issue Author Count per Month

In [15]:
new_issue_author_count_per_month = df['get_new_issue_author_count_per_month']

# Create a new dataframe with months as columns
new_issue_author_count_per_month_structured = pd.DataFrame(index=new_issue_author_count_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(new_issue_author_count_per_month)):
    if new_issue_author_count_per_month.iloc[i] is np.NaN:
        continue
    df_entry = pd.json_normalize(new_issue_author_count_per_month.iloc[i])
    df_entry = implement_months(df_entry)
    for j in df_entry['date']:
        if j in new_issue_author_count_per_month_structured.columns:
            new_issue_author_count_per_month_structured.at[i, j] = df_entry[df_entry['date'] == j]['new_authors_count'].values[0]

new_issue_author_count_per_month_structured

Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
196,,,,,,,,,,,,,,
197,,,,,,,,,,,,,,
198,,,,,,,,,,,,,,


### Saving the data

In [16]:
new_issue_author_count_per_month_structured.fillna(0, inplace=True)
new_issue_author_count_per_month_structured.to_parquet('../../01_input/input/metrics/new_issue_author_count_per_month.parquet')

  new_issue_author_count_per_month_structured.fillna(0, inplace=True)


## New PR Author Count per Month

In [17]:
new_pull_request_author_count_per_month = df['get_new_pull_request_author_count_per_month']

# Create a new dataframe with months as columns
new_pull_request_author_count_per_month_structured = pd.DataFrame(index=new_pull_request_author_count_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(new_pull_request_author_count_per_month)):
    if new_pull_request_author_count_per_month.iloc[i] is np.NaN:
        continue
    df_entry = pd.json_normalize(new_pull_request_author_count_per_month.iloc[i])
    df_entry = implement_months(df_entry)
    for j in df_entry['date']:
        if j in new_pull_request_author_count_per_month_structured.columns:
            new_pull_request_author_count_per_month_structured.at[i, j] = df_entry[df_entry['date'] == j]['new_authors_count'].values[0]

new_pull_request_author_count_per_month_structured

Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,1,,,,,,,
4,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,,,,,,,1,,,,,,,
197,,,,,,,,,,,,,,
198,0.0,0.0,,,,,,,,,,,,


### Saving the data

In [18]:
new_pull_request_author_count_per_month_structured.fillna(0, inplace=True)
new_pull_request_author_count_per_month_structured.to_parquet('../../01_input/input/metrics/new_pull_request_author_count_per_month.parquet')

  new_pull_request_author_count_per_month_structured.fillna(0, inplace=True)


## Average Issue Response Time per Month

In [19]:
avg_issue_response_time_per_month = df['get_avg_issue_response_time_per_month']

# Create a new dataframe with months as columns
avg_issue_response_time_per_month_structured = pd.DataFrame(index=avg_issue_response_time_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(avg_issue_response_time_per_month)):
    if avg_issue_response_time_per_month.iloc[i] is np.NaN:
        continue
    df_entry = pd.json_normalize(avg_issue_response_time_per_month.iloc[i])
    df_entry = array_to_duration(df_entry, 'avg_response_time')
    df_entry = implement_months(df_entry)
    for j in df_entry['date']:
        if j in avg_issue_response_time_per_month_structured.columns:
            avg_issue_response_time_per_month_structured.at[i, j] = df_entry[df_entry['date'] == j]['duration'].values[0]

avg_issue_response_time_per_month_structured

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method 

Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.708715,0.0,0.008681,0.0,0.0,3.930295,0.0,3.930295,2.503646,0.198889,0.0
196,,,,,,,,,,,,,,
197,,,,,,,,,,,,,,
198,,,,,,,,,,,,,,


### Saving the data

In [20]:
avg_issue_response_time_per_month_structured.fillna(0, inplace=True)
avg_issue_response_time_per_month_structured.to_parquet('../../01_input/input/metrics/avg_issue_response_time_per_month.parquet')

  avg_issue_response_time_per_month_structured.fillna(0, inplace=True)


## Average PR Merge Time per Month

In [21]:
avg_pull_request_merge_time_per_month = df['get_avg_pull_request_merge_time_per_month']

# Create a new dataframe with months as columns
avg_pull_request_merge_time_per_month_structured = pd.DataFrame(index=avg_pull_request_merge_time_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(avg_pull_request_merge_time_per_month)):
    if avg_pull_request_merge_time_per_month.iloc[i] is np.NaN:
        continue
    df_entry = pd.json_normalize(avg_pull_request_merge_time_per_month.iloc[i])
    df_entry = array_to_duration(df_entry, 'avg_merge_duration')
    df_entry = implement_months(df_entry)
    for j in df_entry['date']:
        if j in avg_pull_request_merge_time_per_month_structured.columns:
            avg_pull_request_merge_time_per_month_structured.at[i, j] = df_entry[df_entry['date'] == j]['duration'].values[0]

avg_pull_request_merge_time_per_month_structured

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  repository["duration"].replace(np.inf, repository["duration"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method 

Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.057981,0.059051,0.0,0.0,0.0,0.031383,0.0,1.57037,0.118353,0.006192,0.0
196,,,,,,,,,,,,,,
197,,,,,,,,,,,,,,
198,,,,,,,,,,,,,,


### Saving the data

In [22]:
avg_pull_request_merge_time_per_month_structured.fillna(0, inplace=True)
avg_pull_request_merge_time_per_month_structured.to_parquet('../../01_input/input/metrics/avg_pull_request_merge_time_per_month.parquet')

  avg_pull_request_merge_time_per_month_structured.fillna(0, inplace=True)


## Closed Issues per Month

In [23]:
closed_issues_per_month = df['get_closed_issues_per_month']

# Create a new dataframe with months as columns
closed_issues_per_month_opened_issues_structured = pd.DataFrame(index=closed_issues_per_month.index, columns=months)
closed_issues_per_month_closed_issues_structured = pd.DataFrame(index=closed_issues_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(closed_issues_per_month)):
    if closed_issues_per_month.iloc[i] is np.NaN:
        continue
    df_entry = pd.json_normalize(closed_issues_per_month.iloc[i])
    df_entry = implement_months(df_entry)
    for j in df_entry['date']:
        if j in closed_issues_per_month_opened_issues_structured.columns:
            closed_issues_per_month_opened_issues_structured.at[i, j] = df_entry[df_entry['date'] == j]['opened_issues'].values[0]
            closed_issues_per_month_closed_issues_structured.at[i, j] = df_entry[df_entry['date'] == j]['closed_issues'].values[0]

closed_issues_per_month_closed_issues_structured

Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,48.0,0.0,49.0,0.0,0.0,50.0,0.0,50.0,57.0,61.0,0.0
196,,,,,,,,,,,,,,
197,,,,,,,,,,,,,,
198,,,,,,,,,,,,,,


### Saving the data

In [24]:
closed_issues_per_month_closed_issues_structured.fillna(0, inplace=True)
closed_issues_per_month_closed_issues_structured.to_parquet('../../01_input/input/metrics/closed_issues_per_month_closed_issues.parquet')
closed_issues_per_month_closed_issues_structured.fillna(0, inplace=True)
closed_issues_per_month_closed_issues_structured.to_parquet('../../01_input/input/metrics/closed_issues_per_month_closed_issues.parquet')

  closed_issues_per_month_closed_issues_structured.fillna(0, inplace=True)


## Closed PR per Month

In [25]:
closed_pull_requests_per_month = df['get_closed_pull_requests_per_month']

# Create a new dataframe with months as columns
closed_pull_requests_per_month_open_pull_requests_structured = pd.DataFrame(index=closed_pull_requests_per_month.index, columns=months)
closed_pull_requests_per_month_closed_pull_requests_structured = pd.DataFrame(index=closed_pull_requests_per_month.index, columns=months)

# Apply the function to each row
for i in range(len(closed_pull_requests_per_month)):
    if closed_pull_requests_per_month.iloc[i] is np.NaN:
        continue
    df_entry = pd.json_normalize(closed_pull_requests_per_month.iloc[i])
    df_entry = implement_months(df_entry)
    for j in df_entry['date']:
        if j in closed_pull_requests_per_month_open_pull_requests_structured.columns:
            closed_pull_requests_per_month_open_pull_requests_structured.at[i, j] = df_entry[df_entry['date'] == j]['open_pull_requests'].values[0]
            closed_pull_requests_per_month_closed_pull_requests_structured.at[i, j] = df_entry[df_entry['date'] == j]['closed_pull_requests'].values[0]

closed_pull_requests_per_month_closed_pull_requests_structured

Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,0,,,,,,,
4,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,89.0,91.0,0.0,0.0,0.0,93.0,0.0,95.0,109.0,115.0,0.0
196,,,,,,,0,,,,,,,
197,,,,,,,,,,,,,,
198,0.0,7.0,,,,,,,,,,,,


### Saving the data

In [26]:
closed_pull_requests_per_month_open_pull_requests_structured.fillna(0, inplace=True)
closed_pull_requests_per_month_open_pull_requests_structured.to_parquet('../../01_input/input/metrics/closed_pull_requests_per_month_open_pull_requests.parquet')
closed_pull_requests_per_month_closed_pull_requests_structured.fillna(0, inplace=True)
closed_pull_requests_per_month_closed_pull_requests_structured.to_parquet('../../01_input/input/metrics/closed_pull_requests_per_month_closed_pull_requests.parquet')

  closed_pull_requests_per_month_open_pull_requests_structured.fillna(0, inplace=True)
  closed_pull_requests_per_month_closed_pull_requests_structured.fillna(0, inplace=True)


## Get Project Information

In [27]:
project_information = pd.json_normalize(df['get_project_information'].apply(lambda x: x[0] if x is not None else None))
# Convert "archivedAt" and "createdAt" columns to datetime type
project_information["archivedAt"] = project_information["archivedAt"].apply(lambda x: pd.to_datetime(x) if x != "0001-01-01T01:01:01+00:00" else pd.to_datetime("1970-01-01T00:00:00+00:00"))
project_information["createdAt"] = pd.to_datetime(project_information["createdAt"])

# Extract year and month
project_information["create_year"] = project_information["createdAt"].dt.year.astype('Int64')
project_information["create_month"] = project_information["createdAt"].dt.month.astype('Int64')
project_information["archive_year"] = project_information["archivedAt"].dt.year.astype('Int64')
project_information["archive_month"] = project_information["archivedAt"].dt.month.astype('Int64')
isArchived = project_information["isArchived"].astype('Int64')
project_information.drop(columns=['archivedAt', 'createdAt'], inplace=True)

# project_information = project_information.astype(int)
project_information

Unnamed: 0,isArchived,create_year,create_month,archive_year,archive_month
0,False,2022,9,1970,1
1,False,2021,11,1970,1
2,False,2023,7,1970,1
3,False,2020,4,1970,1
4,True,2013,5,2019,1
...,...,...,...,...,...
195,False,2020,8,1970,1
196,False,2019,12,1970,1
197,False,2015,1,1970,1
198,False,2019,9,1970,1


In [28]:
# Create a new dataframe with months as columns
project_information_structured = pd.DataFrame(index=project_information.index, columns=months)

def fill_dataframe(df1, start_year, start_month, end_year, end_month):
    # Generate the months for the second table
    months = pd.date_range(start=f"{start_month}-{start_year}", end=f"{end_month}-{end_year}", freq='MS').strftime("%m-%Y").tolist()
    
    # Initialize the second dataframe with NaN values
    df2 = pd.DataFrame(np.nan, index=df1.index, columns=months)

    for idx, row in df1.iterrows():
        # Check for NAType or missing values
        if pd.isna(row['create_year']) or pd.isna(row['create_month']) or (row['isArchived'] and (pd.isna(row['archive_year']) or pd.isna(row['archive_month']))):
            print(f"Row {idx} contains missing data. Filling row with False values.")
            df2.loc[idx, months] = False
            continue  # Skip to the next iteration

        # Convert to integers
        create_year = int(row['create_year'])
        create_month = int(row['create_month'])
        create_date = pd.Period(year=create_year, month=create_month, freq='M')
        
        if row['isArchived']:
            archive_year = int(row['archive_year'])
            archive_month = int(row['archive_month'])
            archive_date = pd.Period(year=archive_year, month=archive_month, freq='M')
        else:
            archive_date = pd.Period(year=end_year, month=end_month, freq='M')
        
        start_date = pd.Period(year=start_year, month=start_month, freq='M')

        start_fill = max(create_date, start_date)
        end_fill = archive_date

        for month in months:
            period = pd.Period(month, freq='M')
            if start_fill <= period <= end_fill:
                df2.at[idx, month] = True
            else:
                df2.at[idx, month] = False

    return df2

# Apply the function to each row in the original dataframe
project_information_structured = fill_dataframe(project_information, start_year, start_month, end_year, end_month)
# project_information_structured = project_information_structured.fillna(False)
project_information_structured

  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True
  df2.at[idx, month] = True


Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,False,False,False,False,False,False,False,False,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,True,True,True,True,True,True,True,True,True,True,True,True,True,True
196,True,True,True,True,True,True,True,True,True,True,True,True,True,True
197,True,True,True,True,True,True,True,True,True,True,True,True,True,True
198,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [29]:
project_information_structured.to_parquet('../../01_input/input/metrics/project_information.parquet')

## Issues

In [30]:
issues = df["get_issues_and_issue_comments"]
# Create a new dataframe with months as columns
issues_structured = pd.DataFrame(index=issues.index, columns=months)

# # Apply the function to each row
for i in range(len(issues)):
    if issues[i] is np.NaN:
        continue
    inp = pd.json_normalize(issues[i])
    if inp.empty:
        continue
    df_entry = extract_comments_and_issues(inp)
    if df_entry is np.NaN:
        continue
    for j in df_entry['date']:
        if j in issues_structured.columns:
            issues_structured.at[i, j] = df_entry[df_entry['date'] == j]['sum'].values[0]

issues_structured.fillna(0, inplace=True)
issues_structured

  issues_structured.fillna(0, inplace=True)


Unnamed: 0,11-2022,12-2022,01-2023,02-2023,03-2023,04-2023,05-2023,06-2023,07-2023,08-2023,09-2023,10-2023,11-2023,12-2023
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,1,10,0,1,0,0,1,0,1,9,14,0
196,0,0,0,0,0,0,0,0,0,0,0,0,0,0
197,0,0,0,0,0,0,0,0,0,0,0,0,0,0
198,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Saving the data

In [31]:
# Save to Parquet
issues_structured.to_parquet('../../01_input/input/metrics/issues.parquet')