**Author:** Lisa Wallner  
**Description:** In this notebook a file with the relevant metadata of multiple GitHub repositories will be created.  
**Depencencies:**  
+ data/raw_data_zip/raw_data_no_range.zip  
+ data/raw_data_zip/raw_data_0_22196.zip

In [26]:
import pandas as pd
import json
from pathlib import Path
from langdetect import detect
import zipfile
import os

In [27]:
def load_json(path):
    with open(path, 'r') as file:
        loaded_data = json.load(file)
    
    return loaded_data

In [28]:
def rezip_files(path, range_type):
    
    original_zip = path # path to the original ZIP file
    if range_type == 'range':
        extracted_dir = '../data/raw_data/range' # directory where the extract contents are saved
    else:
        extracted_dir = '../data/raw_data/no_range'

    with zipfile.ZipFile(original_zip, 'r') as zip_ref: 
        zip_ref.extractall(extracted_dir) # extract all files

In [29]:
# columns which are requiered for preprocessing and further steps
columns = [
    'id', 
    'name', 
    'full_name', 
    'html_url', 
    'description', 
    'url', 
    'labels_url', 
    'created_at', 
    'updated_at', 
    'pushed_at', 
    'size', 
    'stargazers_count', 
    'watchers_count', 
    'language', 
    'has_issues', 
    'has_projects', 
    'has_downloads', 
    'has_wiki', 
    'has_pages', 
    'has_discussions', 
    'forks_count', 
    'open_issues_count', 
    'license', 
    'allow_forking', 
    'topics', 
    'visibility', 
    'forks', 
    'open_issues', 
    'watchers', 
    'default_branch', 
    'score'
]

### Create two paths to save .json

In [30]:
# file with repos up to 22196 stars
file_no_range = '../data/df_repos_metadata_up_to_max_test.json'
# file with repos 0 to 22196 stars
file_range = '../data/df_repos_metadata_0_to_22196_test.json'

### Get column names for metadata file

In [31]:
help_columns = load_json(path='../data/helper/help_columns.json')
keys = list(help_columns[0].keys()) # get keys of loaded_data as list

In [32]:
len(keys)

82

### Get paths of raw data

In [33]:
# loaded zip files of metadata in raw_data
path_no_range = '../data/raw_data_zip/raw_data_no_range.zip'
path_range = '../data/raw_data_zip/raw_data_range_0_22196.zip'
rezip_files(path=path_no_range, range_type='no_range')
rezip_files(path=path_range, range_type='range')

In [34]:
path_range = Path('../data/raw_data/range') 
all_files_range = [file.name for file in path_range.iterdir() if file.is_file()]

path_no_range = Path('../data/raw_data/no_range') 
all_files_no_range = [file.name for file in path_no_range.iterdir() if file.is_file()]

### Open all jsons and load repo metadata into dataframe

In [35]:
# create empty df with keys of loaded_data as columns
df_raw_range = pd.DataFrame(columns=keys)
df_raw_no_range = pd.DataFrame(columns=keys)

In [36]:
for file in all_files_range:
    data = load_json(path=f'../data/raw_data/range/{file}')
    #  iterate through subdictionary in data and concatenate the content of the subdictionary to df_repos
    for repo in data:
        # create tmp df_repo for each repo
        df_tmp = pd.DataFrame(data=[repo], columns=keys)
        # concatenate df_repos with df_repo
        df_raw_range = pd.concat([df_raw_range, df_tmp], ignore_index=True)

  df_raw_range = pd.concat([df_raw_range, df_tmp], ignore_index=True)


# Hier weitermachen!!!! So fixen, dass es gut aussieht. Evtl. nochmal einen Stand aus vorherigen Branches ziehen.

In [37]:
for file in all_files_no_range:
    data = load_json(path=f'../data/raw_data/no_range/{file}')
    #  iterate through subdictionary in data and concatenate the content of the subdictionary to df_repos
    for repo in data:
        # create tmp df_repo for each repo
        df_tmp = pd.DataFrame(data=[repo], columns=keys)
        # concatenate df_repos with df_repo
        df_raw_no_range = pd.concat([df_raw_no_range, df_tmp], ignore_index=True)

ValueError: Shape of passed values is (1, 1), indices imply (1, 82)

### Clean dataframes and remove unnecessary columns

In [38]:
for k in keys:
    if k in columns:
        continue
    else:
        df_raw_range = df_raw_range.drop([k], axis=1)

In [39]:
df_raw_range.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591 entries, 0 to 590
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 591 non-null    object 
 1   name               591 non-null    object 
 2   full_name          591 non-null    object 
 3   html_url           591 non-null    object 
 4   description        579 non-null    object 
 5   url                591 non-null    object 
 6   labels_url         591 non-null    object 
 7   created_at         591 non-null    object 
 8   updated_at         591 non-null    object 
 9   pushed_at          591 non-null    object 
 10  size               591 non-null    object 
 11  stargazers_count   591 non-null    object 
 12  watchers_count     591 non-null    object 
 13  language           591 non-null    object 
 14  has_issues         591 non-null    object 
 15  has_projects       591 non-null    object 
 16  has_downloads      591 non

In [None]:
for k in keys:
    if k in columns:
        continue
    else:
        df_raw_no_range = df_raw_no_range.drop([k], axis=1)

### save df_repos in json file

In [41]:
tmp_json = df_raw_range.to_json(orient='records', lines=False, force_ascii=False)

In [42]:
with open(file_range, 'w') as file:
    file.write(tmp_json)

In [None]:
tmp_json = df_raw_no_range.to_json(orient='records', lines=False, force_ascii=False)

In [None]:
with open(file_no_range, 'w') as file:
    file.write(tmp_json)

### load json data for further analysis (test)

In [None]:
# with open(file_name, 'r') as file:
#     loaded_data = json.load(file)

In [None]:
# test_df = pd.DataFrame(data=loaded_data)

### combine df's

In [None]:
# directory_path = Path("../data")
# all_dfs = [file.name for file in directory_path.iterdir() if file.is_file()]

In [None]:
# all_dfs

['df_repos_metadata_star_up_to_max.json', 'df_repos_metadata_0_to_22196.json']

In [None]:
df_repos = pd.DataFrame()

In [None]:
# with open('../data/df_repos_metadata_star_up_to_max.json', 'r') as file:
#     loaded_data = json.load(file)
# tmp_df = pd.DataFrame(data=loaded_data)

In [None]:
#tmp_df.describe()

Unnamed: 0,id,size,stargazers_count,watchers_count,forks_count,open_issues_count,forks,open_issues,watchers,score
count,1050.0,1050.0,1050.0,1050.0,1050.0,1050.0,1050.0,1050.0,1050.0,1050.0
mean,284212900.0,131639.0,21701.457143,21701.457143,3555.066667,375.71619,3555.066667,375.71619,21701.457143,1.0
std,280021300.0,498665.2,29908.569921,29908.569921,5971.41457,1003.159613,5971.41457,1003.159613,29908.569921,0.0
min,26554.0,7.0,7193.0,7193.0,108.0,0.0,108.0,0.0,7193.0,1.0
25%,48804240.0,5378.5,8829.5,8829.5,989.0,56.0,989.0,56.0,8829.5,1.0
50%,161583700.0,24079.5,12306.5,12306.5,1814.0,139.5,1814.0,139.5,12306.5,1.0
75%,570278500.0,83913.75,21947.25,21947.25,3423.0,331.0,3423.0,331.0,21947.25,1.0
max,954873300.0,10870970.0,335520.0,335520.0,49242.0,16073.0,49242.0,16073.0,335520.0,1.0


In [None]:
# quantile_75 = tmp_df['stargazers_count'].describe().loc['75%']
# quantile_75

np.float64(21947.25)

In [None]:
#tmp_df = tmp_df[tmp_df['stargazers_count'] > quantile_75]

In [None]:
#tmp_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, 150 to 1019
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 263 non-null    int64  
 1   name               263 non-null    object 
 2   full_name          263 non-null    object 
 3   html_url           263 non-null    object 
 4   description        259 non-null    object 
 5   url                263 non-null    object 
 6   labels_url         263 non-null    object 
 7   created_at         263 non-null    object 
 8   updated_at         263 non-null    object 
 9   pushed_at          263 non-null    object 
 10  size               263 non-null    int64  
 11  stargazers_count   263 non-null    int64  
 12  watchers_count     263 non-null    int64  
 13  language           263 non-null    object 
 14  has_issues         263 non-null    bool   
 15  has_projects       263 non-null    bool   
 16  has_downloads      263 non-n

In [None]:
# with open('../data/df_repos_metadata_0_to_22196.json', 'r') as file:
#     loaded_data = json.load(file)
# tmp_df1 = pd.DataFrame(data=loaded_data)

In [None]:
df_repos = pd.concat([df_raw_range, df_raw_no_range])

In [None]:
df_repos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 854 entries, 0 to 1019
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 854 non-null    int64  
 1   name               854 non-null    object 
 2   full_name          854 non-null    object 
 3   html_url           854 non-null    object 
 4   description        838 non-null    object 
 5   url                854 non-null    object 
 6   labels_url         854 non-null    object 
 7   created_at         854 non-null    object 
 8   updated_at         854 non-null    object 
 9   pushed_at          854 non-null    object 
 10  size               854 non-null    int64  
 11  stargazers_count   854 non-null    int64  
 12  watchers_count     854 non-null    int64  
 13  language           854 non-null    object 
 14  has_issues         854 non-null    bool   
 15  has_projects       854 non-null    bool   
 16  has_downloads      854 non-nul

### filter df_repos for language

In [None]:
def detect_language(text):
    try:
        return detect(text)
    except:
        return "error"

In [None]:
df_repos['language_spoken'] = df_repos['description'].apply(detect_language)

In [None]:
len(df_repos[df_repos['language_spoken'] == 'en'])

748

### remove duplicate rows

In [None]:
# the behavior of the api is sometimes confusing
# i got some duplicates in the dataframe --> remove them
df_cleaned = df_repos.drop_duplicates(subset=['full_name'])


### save df_repos as .json for further analysis / work

In [None]:
tmp_json = df_cleaned.to_json(orient='records', lines=False, force_ascii=False)

In [None]:
# with open('../data/df_repos_metadata.json', 'w') as file:
#     file.write(tmp_json)