In [11]:
import sys
import os
src = os.path.dirname(os.path.abspath(''))
if src not in sys.path: sys.path.append(src)

In [12]:
from datetime import timedelta
import pandas as pd

from config import GITHUB
from src.db.database import connect, Repository
from src.states import REP_FILTERED

## Filtering Repositories Collected from Github according to Corpus Criteria

In [13]:
with connect() as session:
    queries = pd.read_sql_table("queries", session.connection())
    total = len(queries)
print("  Repositories queried from GitHub   : ", total)

  Repositories queried from GitHub   :  330351


## Filtering out repositories with no Contributors, no Commits and no Languages

In [14]:
filtered = queries[(queries.contributors > 0)]
no_commits = len(queries) - len(filtered)
print('- Repositories with no contributors : ', no_commits )
print("---------------------------------------------")
queries2 = filtered
print("  Remaining filtered repositories   : ", len(queries2))

- Repositories with no contributors :  173
---------------------------------------------
  Remaining filtered repositories   :  330178


In [15]:
filtered = queries2[(queries2.commits > 0)]
no_commits = len(queries2) - len(filtered)
print('- Repositories with no commits     : ', no_commits )
print("---------------------------------------------")
queries3 = filtered
print("  Remaining filtered repositories  : ", len(queries3))

- Repositories with no commits     :  18596
---------------------------------------------
  Remaining filtered repositories  :  311582


In [16]:
filtered = queries3[(queries3.languages > 0) & (queries3.primary_language.notnull())]
no_languages = len(queries3) - len(filtered)
print('- Repositories with no languages   : ', no_languages)
print("---------------------------------------------")
queries4 = filtered
print("  Remaining filtered repositories  : ", len(queries4))

- Repositories with no languages   :  101884
---------------------------------------------
  Remaining filtered repositories  :  209698


##  Filtering out repositories whose name contain the words "course", "curso" or "cours"

In [17]:
queries4 = queries4.copy()

queries4[['user', 'name']] = queries4['repo'].str.split('/', expand=True)
rep_filtered = len(queries4)

filtered = queries4[~queries4['name'].str.contains('course')]
no_course = len(queries4) - len(filtered)
queries5 = filtered

filtered = queries5[~queries5['name'].str.contains('curso')]
no_curso = len(queries5) - len(filtered)
queries6 = filtered

filtered = queries6[~queries6['name'].str.contains('cours')]
no_cours = len(queries6) - len(filtered)
queries7 = filtered

In [18]:
print("            Filtered repositories  : ", rep_filtered)
print("- with course/courses in repo name : ", no_course, "   (English)")
print("- with cursos/cursos in repo name  : ", no_curso, "    (Spanish/Portuguese)")
print("- with cours in repo name          : ", no_cours, "      (French)")
print("---------------------------------------------")
print("  Remaining filtered repositories  : ", len(queries7))

            Filtered repositories  :  209698
- with course/courses in repo name :  6132    (English)
- with cursos/cursos in repo name  :  160     (Spanish/Portuguese)
- with cours in repo name          :  3       (French)
---------------------------------------------
  Remaining filtered repositories  :  203403


## Filtering out repositories that don't have at least one day between commits

In [19]:
filtered = queries7[(queries7.git_pushed_at-queries7.created_at) > timedelta(days=1)]
less_1_day = len(queries7) - len(filtered)
print('- Repositories with no languages   : ', less_1_day)
print("---------------------------------------------")
queries8 = filtered
print("  Remaining filtered repositories  : ", len(queries8))

- Repositories with no languages   :  203403
---------------------------------------------
  Remaining filtered repositories  :  0


## Selecting Repositories by Language for futher extraction and analysis

### Jupyter Notebooks and Python

In [20]:
filtered_repos = queries7.query("primary_language == 'Jupyter Notebook' | primary_language== 'Python'").copy()
print("Total repositories with 'Jupyter Notebook' "
      "or 'Python' as Primary Language: {}"
      .format(len(filtered_repos)))

filtered_repos["disk_usage"] = filtered_repos["disk_usage"].astype(int)
print(
    "Disk Usage for the {} repositories is estimated to be:\n"
    "{} KB - {:.2f} MB - {:.2f} GB - {:.2f} TB"
    .format(
          len(filtered_repos),
          filtered_repos.disk_usage.sum(),
          filtered_repos.disk_usage.sum() / 10**3,
          filtered_repos.disk_usage.sum() / 10**6,
          filtered_repos.disk_usage.sum() / 10**9
    )
)

Total repositories with 'Jupyter Notebook' or 'Python' as Primary Language: 166066
Disk Usage for the 166066 repositories is estimated to be:
2297646864 KB - 2297646.86 MB - 2297.65 GB - 2.30 TB


## Saving filtered repositories

In [21]:
with connect() as session:
    count = 0
    for repo in filtered_repos[:10].itertuples(index=False):
        repository = session.query(Repository).filter(
            Repository.domain == GITHUB,
            Repository.repository == repo.repo,
        ).first()
        if repository is not None:
            print(">> Repository already exists: ID={}".format(repository.id))
        else:
            count = count+1
            repo_row = Repository(
                state=REP_FILTERED, domain=GITHUB,
                repository=repo.repo, primary_language=repo.primary_language,
                disk_usage=repo.disk_usage, is_mirror=repo.is_mirror,
                git_created_at=repo.git_created_at, git_pushed_at=repo.git_pushed_at,
                languages=repo.languages, contributors=repo.contributors, commits=repo.commits,
                pull_requests=repo.pull_requests, branches=repo.branches, watchers=repo.watchers,
                issues=repo.issues, stargazers=repo.stargazers, forks=repo.forks,
                description=repo.description, tags=repo.tags, releases=repo.releases
            )

            session.add(repo_row)
    session.commit()

    print("Filtered {} repository into table Repositories".format(count))

Filtered 10 repository into table Repositories
