In [1]:
import pandas as pd
from config import REPOSITORIES_FILE, FILTERED_FILE, JUPYTER_REPOS_FILE, PYTHON_REPOS_FILE

## Filtering Repositories Collected from GitHub According to Corpus Criteria

In [2]:
# Load the collected repositories into df.
# keep_default_na=False switches off pandas' default list of NaN markers,
# so values are kept as read instead of being converted to NaN.
df = pd.read_excel(REPOSITORIES_FILE, keep_default_na=False)
print('Total repositories: ', df.shape[0])

Total repositories:  130159


### Removing repositories with no languages

In [3]:
# Set aside the repositories whose `languages` value is 0, then remove those
# rows from df in place so the following cells work on the reduced frame.
has_no_languages = df['languages'] == 0
nolanguages = df[has_no_languages]
df.drop(index=nolanguages.index, inplace=True)
print('Total repositories after filtering repositories with "no languages":', len(df))

Total repositories after filtering repositories with "no languages": 130159


### Removing repositories whose names contain "course", "cours", "curso" or "cursos"

In [4]:
# Set aside the repositories whose name contains "course" (case-insensitive)
# and remove them from df in place.
# na=False treats names that are not strings (e.g. a purely numeric name that
# the Excel reader may load as a number) as "no match". Without it the mask
# would contain NaN for those rows and df.loc would raise
# "Cannot mask with non-boolean array containing NA / NaN values".
is_course = df['name'].str.contains('course', case=False, na=False)
courses = df.loc[is_course]
df.drop(index=courses.index, inplace=True)
print('Total repositories after filtering "courses":', len(df))

Total repositories after filtering "courses": 130159


In [5]:
# Set aside the repositories whose name contains "cours" (case-insensitive)
# and remove them from df in place.
# na=False treats names that are not strings (e.g. a purely numeric name that
# the Excel reader may load as a number) as "no match". Without it the mask
# would contain NaN for those rows and df.loc would raise
# "Cannot mask with non-boolean array containing NA / NaN values".
is_cours = df['name'].str.contains('cours', case=False, na=False)
cours = df.loc[is_cours]
df.drop(index=cours.index, inplace=True)
print('Total repositories after filtering "cours":', len(df))

Total repositories after filtering "cours": 130159


In [6]:
# Set aside the repositories whose name contains "curso" (case-insensitive)
# and remove them from df in place.
# na=False treats names that are not strings (e.g. a purely numeric name that
# the Excel reader may load as a number) as "no match". Without it the mask
# would contain NaN for those rows and df.loc would raise
# "Cannot mask with non-boolean array containing NA / NaN values".
is_curso = df['name'].str.contains('curso', case=False, na=False)
curso = df.loc[is_curso]
df.drop(index=curso.index, inplace=True)
print('Total repositories after filtering "curso":', len(df))

Total repositories after filtering "curso": 130159


In [7]:
# Set aside the repositories whose name contains "cursos" (case-insensitive)
# and remove them from df in place.
# NOTE: every name containing "cursos" also contains "curso", so when the
# "curso" filter has already been applied to df this selection is expected to
# be empty; the cell is kept so that `cursos` stays defined.
# na=False treats names that are not strings (e.g. a purely numeric name that
# the Excel reader may load as a number) as "no match". Without it the mask
# would contain NaN for those rows and df.loc would raise
# "Cannot mask with non-boolean array containing NA / NaN values".
is_cursos = df['name'].str.contains('cursos', case=False, na=False)
cursos = df.loc[is_cursos]
df.drop(index=cursos.index, inplace=True)
print('Total repositories after filtering "cursos":', len(df))

Total repositories after filtering "cursos": 130159


### Removing repositories with no commits or no contributors

In [8]:
# Set aside the repositories whose `commits` field is the empty string,
# then remove those rows from df in place.
has_empty_commits = df['commits'] == ''
emptycommits = df.loc[has_empty_commits]
df.drop(index=emptycommits.index, inplace=True)
print('Total repositories after filtering repositories with no commits:', len(df))

Total repositories after filtering repositories with no commits: 130159


In [9]:
# Set aside the repositories whose `contributors` value is 0,
# then remove those rows from df in place.
has_no_contributors = df['contributors'] == 0
nocontributors = df[has_no_contributors]
df.drop(index=nocontributors.index, inplace=True)
print('Total repositories after filtering repositories with no contributors:', len(df))

Total repositories after filtering repositories with no contributors: 130159


### Saving the remaining repositories to Filtered Repositories

In [10]:
# Parse both timestamp columns and strip their timezone, then write the
# remaining repositories to FILTERED_FILE without the index column.
# NOTE(review): the timezone is presumably removed because Excel output does
# not accept timezone-aware datetimes - confirm.
for date_column in ('createdAt', 'pushedAt'):
    df[date_column] = pd.to_datetime(df[date_column]).dt.tz_localize(None)
df.to_excel(FILTERED_FILE, index=False)

In [11]:
# Summarise the filtered frame: number of rows and the sum of `diskUsage`.
repository_count = len(df)
total_disk_usage = df['diskUsage'].sum()
print(f"Total of Repositories: {repository_count} - Total Disk Usage: {total_disk_usage}")

Total of Repositories: 130159 - Total Disk Usage: 2113687900


## Splitting by Language for further extraction and analysis

### Jupyter Notebooks

In [18]:
# Select the repositories whose primary language is 'Jupyter Notebook'.
jupyter_repos = df.query("primaryLanguage== 'Jupyter Notebook'")
print(f"Total repositories with 'Jupyter Notebook' as Primary Language: {len(jupyter_repos)}")
# Write the whole selection so the saved file agrees with the count printed
# above; slicing with [:10] here would keep only the first 10 repositories.
jupyter_repos.to_excel(JUPYTER_REPOS_FILE, index=False)

Total repositories with 'Jupyter Notebook' as Primary Language: 82628


### Python

In [20]:
# Select the repositories whose primary language is 'Python'.
python_repos = df.query("primaryLanguage== 'Python'")
print(f"Total repositories with 'Python' as Primary Language: {len(python_repos)}")
# Write the whole selection so the saved file agrees with the count printed
# above; slicing with [:10] here would keep only the first 10 repositories.
python_repos.to_excel(PYTHON_REPOS_FILE, index=False)

Total repositories with 'Python' as Primary Language: 18746
