In [1]:
import pandas as pd
import json
from pathlib import Path

In [2]:
def load_json(path):
    with open(path, 'r') as file:
        loaded_data = json.load(file)
    
    data = loaded_data['items']

    return data

### get column names from test json

In [3]:
data = load_json(path='../data/helper/test_multiple_repos.json')
# get keys of loaded_data as list
keys = list(data[0].keys())

### get paths

In [4]:
directory_path = Path("../data/raw_data")
all_files = [file.name for file in directory_path.iterdir() if file.is_file()]

In [5]:
all_files[0]

'22_multiple_github_repos_page_2025-04-11_14-14-43.json'

### open all jsons and load repo metadata into dataframe

In [6]:
# create empty df with keys of loaded_data as columns
df_raw = pd.DataFrame(columns=keys)

In [7]:
for file in all_files:
    data = load_json(path=f'../data/raw_data/{file}')
    #  iterate through subdictionary in data and concatenate the content of the subdictionary to df_repos
    for repo in data:
        # create tmp df_repo for each repo
        df_tmp = pd.DataFrame(data=[repo], columns=keys)
        # concatenate df_repos with df_repo
        df_raw = pd.concat([df_raw, df_tmp], ignore_index=True)

  df_raw = pd.concat([df_raw, df_tmp], ignore_index=True)


In [8]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 81 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           1050 non-null   object 
 1   node_id                      1050 non-null   object 
 2   name                         1050 non-null   object 
 3   full_name                    1050 non-null   object 
 4   private                      1050 non-null   object 
 5   owner                        1050 non-null   object 
 6   html_url                     1050 non-null   object 
 7   description                  1038 non-null   object 
 8   fork                         1050 non-null   object 
 9   url                          1050 non-null   object 
 10  forks_url                    1050 non-null   object 
 11  keys_url                     1050 non-null   object 
 12  collaborators_url            1050 non-null   object 
 13  teams_url         

### clean df_repos and remove unnecessary columns

In [9]:
# columns which are requiered for preprocessing and fruther steps
columns = [
    'id', 
    'name', 
    'full_name', 
    'html_url', 
    'description', 
    'url', 
    'labels_url', 
    'created_at', 
    'updated_at', 
    'pushed_at', 
    'size', 
    'stargazers_count', 
    'watchers_count', 
    'language', 
    'has_issues', 
    'has_projects', 
    'has_downloads', 
    'has_wiki', 
    'has_pages', 
    'has_discussions', 
    'forks_count', 
    'open_issues_count', 
    'license', 
    'allow_forking', 
    'topics', 
    'visibility', 
    'forks', 
    'open_issues', 
    'watchers', 
    'default_branch', 
    'score'
]

In [10]:
df_repos = df_raw

In [11]:
for k in keys:
    if k in columns:
        continue
    else:
        df_repos = df_repos.drop([k], axis=1)

In [12]:
df_repos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1050 non-null   object 
 1   name               1050 non-null   object 
 2   full_name          1050 non-null   object 
 3   html_url           1050 non-null   object 
 4   description        1038 non-null   object 
 5   url                1050 non-null   object 
 6   labels_url         1050 non-null   object 
 7   created_at         1050 non-null   object 
 8   updated_at         1050 non-null   object 
 9   pushed_at          1050 non-null   object 
 10  size               1050 non-null   object 
 11  stargazers_count   1050 non-null   object 
 12  watchers_count     1050 non-null   object 
 13  language           1050 non-null   object 
 14  has_issues         1050 non-null   object 
 15  has_projects       1050 non-null   object 
 16  has_downloads      1050 

In [13]:
df_repos.head(1)

Unnamed: 0,id,name,full_name,html_url,description,url,labels_url,created_at,updated_at,pushed_at,...,open_issues_count,license,allow_forking,topics,visibility,forks,open_issues,watchers,default_branch,score
0,266613704,practical-python,dabeaz-course/practical-python,https://github.com/dabeaz-course/practical-python,Practical Python Programming (course by @dabeaz),https://api.github.com/repos/dabeaz-course/pra...,https://api.github.com/repos/dabeaz-course/pra...,2020-05-24T19:50:08Z,2025-04-10T20:38:51Z,2024-08-10T03:38:37Z,...,16,"{'key': 'cc-by-sa-4.0', 'name': 'Creative Comm...",True,"[python, pythontutorial, tutorial]",public,6748,16,10155,master,1.0


### save df_repos in json file

In [14]:
tmp_json = df_repos.to_json(orient='records', lines=False, force_ascii=False)

In [15]:
with open('../data/helper/df_repos_metadata_stars_range_0_300k'
'.json', 'w') as file:
    file.write(tmp_json)

### load json data for further analysis (test)

In [18]:
with open('../data/helper/df_repos_metadata_stars_range_0_300k.json', 'r') as file:
    loaded_data = json.load(file)

In [19]:
test_df = pd.DataFrame(data=loaded_data)

In [20]:
test_df.head(1)

Unnamed: 0,id,name,full_name,html_url,description,url,labels_url,created_at,updated_at,pushed_at,...,open_issues_count,license,allow_forking,topics,visibility,forks,open_issues,watchers,default_branch,score
0,266613704,practical-python,dabeaz-course/practical-python,https://github.com/dabeaz-course/practical-python,Practical Python Programming (course by @dabeaz),https://api.github.com/repos/dabeaz-course/pra...,https://api.github.com/repos/dabeaz-course/pra...,2020-05-24T19:50:08Z,2025-04-10T20:38:51Z,2024-08-10T03:38:37Z,...,16,"{'key': 'cc-by-sa-4.0', 'name': 'Creative Comm...",True,"[python, pythontutorial, tutorial]",public,6748,16,10155,master,1.0
