**Author:** Lisa Wallner  
**Description:** This notebook loads the file 'df_repos_metadata.json' and processes its content to create a sample of GitHub repository metadata for further processing.

In [59]:
import pandas as pd # package for data manipulation
import json # package to work with .json

### Load data

In [None]:
file_path = '../data/df_repos_metadata.json' # filepath of input file
# Decode the file explicitly as UTF-8 instead of relying on the platform's default
# encoding (e.g. cp1252 on Windows), which could garble or reject non-ASCII text.
with open(file_path, 'r', encoding='utf-8') as file: # JSON file is loaded and its contents are saved in a variable
    loaded_data = json.load(file)

df = pd.DataFrame(data=loaded_data) # create dataframe from the previously loaded JSON

In [61]:
df.shape[0] # number of rows in the loaded dataframe

821

### Filter the dataframe by the conditions required for further processing

In [62]:
# keep only the repositories whose spoken language is English;
# .copy() turns the filtered selection into an independent dataframe, so columns can be
# added to it later without pandas raising a SettingWithCopyWarning
df = df[df['language_spoken'] == 'en'].copy()

In [None]:
df.shape[0] # number of rows left in the dataframe

721

In [64]:
# sanity check: 'en' should be the only value listed for the column 'language_spoken'
df.loc[:, 'language_spoken'].value_counts()

language_spoken
en    721
Name: count, dtype: int64

In [65]:
def _license_key(license_info):
    """Return the license identifier held in one entry of the 'license' column.

    Parameters
    ----------
    license_info : object
        Value of the 'license' column for one repository. Presumably the license
        object of the GitHub API (a dict with a 'key' entry such as 'mit') or
        None when no license was detected -- TODO confirm against the input file.

    Returns
    -------
    object or None
        The value stored under 'key' when present, otherwise the first value of
        the dict; None for non-dict input and for an empty dict.
    """
    # anything that is not a non-empty dict carries no license information
    if not isinstance(license_info, dict) or not license_info:
        return None
    # prefer the explicit field over positional access, which depends on key order
    if 'key' in license_info:
        return license_info['key']
    # fallback: first value of the dict, as in the original positional lookup
    return next(iter(license_info.values()))


# add new column with license type of each repository
# (original one-liner generated with Microsoft Copilot);
# assign() returns a new dataframe instead of writing into a possibly filtered view
df = df.assign(license_type=df['license'].apply(_license_key))

In [66]:
df.iloc[:2] # preview the first two rows, including the new column

Unnamed: 0,id,name,full_name,html_url,description,url,labels_url,created_at,updated_at,pushed_at,...,allow_forking,topics,visibility,forks,open_issues,watchers,default_branch,score,language_spoken,license_type
0,619825247,LMFlow,OptimalScale/LMFlow,https://github.com/OptimalScale/LMFlow,An Extensible Toolkit for Finetuning and Infer...,https://api.github.com/repos/OptimalScale/LMFlow,https://api.github.com/repos/OptimalScale/LMFl...,2023-03-27T13:56:29Z,2025-04-14T14:33:55Z,2025-04-13T05:44:14Z,...,True,"[chatgpt, deep-learning, instruction-following...",public,834,83,8400,main,1.0,en,apache-2.0
2,272811260,PathPlanning,zhm-real/PathPlanning,https://github.com/zhm-real/PathPlanning,Common used path planning algorithms with anim...,https://api.github.com/repos/zhm-real/PathPlan...,https://api.github.com/repos/zhm-real/PathPlan...,2020-06-16T21:00:44Z,2025-04-14T09:47:50Z,2023-02-06T07:54:46Z,...,True,"[anytime-dstar, anytime-repairing-astar, astar...",public,1704,30,8484,master,1.0,en,mit


In [None]:
# licenses that allow the content of a repository to be used for further processing
# NOTE(review): 'none' is a string and does not match a license_type of Python None
# (repositories without license information) -- confirm that this is intended
licenses_list = [
    'apache-2.0',
    'mit',
    'isc',
    'bsd-3-clause',
    'bsd-2-clause',
    'none',
]

In [None]:
# keep only the repositories whose license_type is contained in licenses_list;
# Series.isin is the vectorised form of testing `x in licenses_list` row by row
# (missing license types evaluate to False either way), and .copy() detaches the
# result from the unfiltered dataframe
df = df[df['license_type'].isin(licenses_list)].copy()

In [None]:
df.shape[0] # number of rows left in the dataframe

451

In [70]:
# show how many repositories remain per license type
df.loc[:, 'license_type'].value_counts()

license_type
mit             232
apache-2.0      174
bsd-3-clause     36
bsd-2-clause      8
isc               1
Name: count, dtype: int64

### Create sample

In [None]:
# create a sample of 250 elements; the fixed random_state makes the draw reproducible,
# so restarting the kernel and re-running the notebook yields the same sample
sample_df = df.sample(n=250, random_state=42)

In [None]:
sample_df.shape[0] # number of rows in the sample

250

### Save sample in JSON file

In [None]:
# serialise the sample dataframe as one JSON array of records;
# force_ascii=False keeps non-ASCII characters unescaped in the resulting string
tmp_json = sample_df.to_json(orient='records', lines=False, force_ascii=False)
# write the content of tmp_json to a new JSON file; the encoding is set explicitly
# because the string may contain non-ASCII characters that the platform's default
# encoding cannot represent
with open('../data/df_repos_sample_250.json', 'w', encoding='utf-8') as file:
    file.write(tmp_json)