In [15]:
import pandas as pd
from config import REPOSITORIES_FILE, FILTERED_FILE, SELECTED_REPOS_FILE

## Filtering Repositories Collected from GitHub according to Corpus Criteria

In [16]:
# Load the collected repositories. keep_default_na=False stops pandas from
# turning its default NA markers into NaN, so empty cells stay as ''
# (a later cell relies on this when it compares `commits` to '').
# Only the first 100 rows are kept.
df = pd.read_excel(REPOSITORIES_FILE, keep_default_na=False).head(100)
print('Total repositories: ', len(df))

Total repositories:  100


### Removing repositories with no languages

In [17]:
# Keep the rows whose `languages` value equals 0 in `nolanguages`, then remove
# exactly those rows from `df` in place.
nolanguages = df.loc[df['languages'] == 0]
df.drop(index=nolanguages.index, inplace=True)
print('Total repositories after filtering repositories with "no languages":', len(df))

Total repositories after filtering repositories with "no languages": 100


### Removing repositories whose names contain the words "course", "cours", "curso" or "cursos"

In [18]:
# Set aside the repositories whose name contains "course" (case-insensitive)
# in `courses`, then remove those rows from `df` in place.
# na=False: a name cell that is not a string (for example a numeric repository
# name that Excel stored as a number) makes .str.contains return NaN, and a
# mask containing NaN cannot be used for boolean indexing. Such names are
# treated as "no match" instead of raising.
courses = df.loc[df['name'].str.contains('course', case=False, na=False)]
df.drop(index=courses.index, inplace=True)
print('Total repositories after filtering "courses":', len(df))

Total repositories after filtering "courses": 100


In [19]:
# Set aside the repositories whose name contains "cours" (case-insensitive)
# in `cours`, then remove those rows from `df` in place.
# na=False: non-string name cells produce NaN from .str.contains, which would
# make the boolean mask unusable; they are counted as "no match" instead.
cours = df.loc[df['name'].str.contains('cours', case=False, na=False)]
df.drop(index=cours.index, inplace=True)
print('Total repositories after filtering "cours":', len(df))

Total repositories after filtering "cours": 100


In [20]:
# Set aside the repositories whose name contains "curso" (case-insensitive)
# in `curso`, then remove those rows from `df` in place.
# na=False: non-string name cells produce NaN from .str.contains, which would
# make the boolean mask unusable; they are counted as "no match" instead.
curso = df.loc[df['name'].str.contains('curso', case=False, na=False)]
df.drop(index=curso.index, inplace=True)
print('Total repositories after filtering "curso":', len(df))

Total repositories after filtering "curso": 100


In [21]:
# Set aside the repositories whose name contains "cursos" (case-insensitive)
# in `cursos`, then remove those rows from `df` in place.
# NOTE(review): every name containing "cursos" also contains "curso", so when
# the "curso" cell has already run this selection is expected to be empty.
# na=False: non-string name cells produce NaN from .str.contains, which would
# make the boolean mask unusable; they are counted as "no match" instead.
cursos = df.loc[df['name'].str.contains('cursos', case=False, na=False)]
df.drop(index=cursos.index, inplace=True)
print('Total repositories after filtering "cursos":', len(df))

Total repositories after filtering "cursos": 100


### Removing repositories with no commits or no contributors

In [22]:
# Rows whose `commits` cell is the empty string are kept in `emptycommits`
# and then removed from `df` in place.
emptycommits = df[df['commits'] == '']
df.drop(index=emptycommits.index, inplace=True)
print('Total repositories after filtering repositories with no commits:', len(df))

Total repositories after filtering repositories with no commits: 100


In [23]:
# Rows whose `contributors` value equals 0 are kept in `nocontributors`
# and then removed from `df` in place.
nocontributors = df.loc[df['contributors'] == 0]
df.drop(index=nocontributors.index, inplace=True)
print('Total repositories after filtering repositories with no contributors:', len(df))

Total repositories after filtering repositories with no contributors: 100


### Saving the remaining repositories to Filtered Repositories

In [24]:
# Parse both timestamp columns and strip their timezone before exporting
# (pandas' Excel writers reject timezone-aware datetimes).
for date_column in ('createdAt', 'pushedAt'):
    df[date_column] = pd.to_datetime(df[date_column]).dt.tz_localize(None)

# Write the remaining repositories without the DataFrame index.
df.to_excel(FILTERED_FILE, index=False)

In [25]:
# Summary of the filtered set: row count and the sum of the `diskUsage` column.
print(
    f"Total of Repositories: {len(df)} - Total Disk Usage: {df['diskUsage'].sum()}"
)

Total of Repositories: 100 - Total Disk Usage: 7004553


## Selecting Repositories by Language for further extraction and analysis

### Jupyter Notebooks and Python

In [26]:
from src.states import REP_FILTERED

# Keep only the repositories whose primary language is Jupyter Notebook or
# Python. The explicit .copy() makes `filtered_repos` independent of `df`.
filtered_repos = df[df['primaryLanguage'].isin(['Jupyter Notebook', 'Python'])].copy()
print(f"Total repositories with 'Jupyter Notebook' "
      f"or 'Python' as Primary Language: {len(filtered_repos)}")

Total repositories with 'Jupyter Notebook' or 'Python' as Primary Language: 68


In [27]:
# Build the "owner/name" identifier as a new `repository` column and drop the
# two source columns.
# Both parts are cast to str first: a cell that was not read as a string
# (for example a numeric owner or repository name stored as a number in the
# spreadsheet) would otherwise make the `+` concatenation raise a TypeError.
filtered_repos = (
    filtered_repos
    .assign(
        repository=filtered_repos["owner"].astype(str)
        + "/"
        + filtered_repos["name"].astype(str)
    )
    .drop(columns=["owner", "name"])
)

In [60]:
from config import GITHUB
from src.db.database import connect, Repository

# Insert the selected repositories into the database, skipping any that are
# already stored for the same domain and repository name.
with connect() as session:
    # Only the first 10 rows of `filtered_repos` are processed.
    for repo in filtered_repos[:10].itertuples(index=False):
        repository = (
            session.query(Repository)
            .filter(
                Repository.domain == GITHUB,
                Repository.repository == repo.repository,
            )
            .first()
        )

        # Already present: report its ID and move on to the next row.
        if repository is not None:
            print("\t>> Repository already exists: ID={}".format(repository.id))
            continue

        # Not present yet: map the row's fields onto a new Repository record.
        repo_row = Repository(
            state=REP_FILTERED,
            domain=GITHUB,
            repository=repo.repository,
            primary_language=repo.primaryLanguage,
            disk_usage=repo.diskUsage,
            is_mirror=repo.isMirror,
            git_created_at=repo.createdAt,
            git_pushed_at=repo.pushedAt,
            languages=repo.languages,
            contributors=repo.contributors,
            commits=repo.commits,
            pull_requests=repo.pullRequests,
            branches=repo.branches,
            watchers=repo.watchers,
            issues=repo.issues,
            stargazers=repo.stargazers,
            forks=repo.forks,
            description=repo.description,
            tags=repo.tags,
            releases=repo.releases,
        )
        session.add(repo_row)

    # Single commit after the loop, covering every row added above.
    session.commit()