<a href="https://colab.research.google.com/github/fzanart/GHDomains/blob/main/GitHub_data_downloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install PyGithub

In [None]:
from github import Github, RateLimitExceededException
from github.GithubException import UnknownObjectException, BadCredentialsException, GithubException
from getpass import getpass
import os
import time
import logging
import logging.config
import json
from datetime import datetime
import pandas as pd
from google.colab import drive
import pickle
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Configure logging from the config file kept on Drive, then fetch the logger
# used for the progress messages emitted while downloading.
logging.config.fileConfig(fname='/content/drive/MyDrive/GHDomains/logging.conf')
logger = logging.getLogger(name='jsonLogger')

In [None]:
# Prompt for the token without echoing it, then build the API client used by
# every request below.
token = getpass("Enter your Github token:")
g = Github(
    token,
    timeout=30,
    per_page=100,
)

Enter your Github token:··········


In [None]:
# with open ('/content/drive/MyDrive/GHDomains/new_popular_list.pickle', 'rb') as fp:
#     repos_ids = pickle.load(fp)

In [None]:
# len(repos_ids)

In [None]:
# Load the repository identifiers from the 'Repo' column of the spreadsheet.
repos_raw = pd.read_excel('/content/drive/MyDrive/GHDomains/Bruno/Spinellis_dataset.xlsx', usecols=['Repo'])
listed_repos = repos_raw['Repo'].to_list()

# dict.fromkeys drops duplicates but, unlike set(), keeps the spreadsheet order,
# so the download order (and the progress counter) is the same on every run.
repos_ids = list(dict.fromkeys(listed_repos))

print('Duplicates: ', len(listed_repos) - len(repos_ids))

Duplicates:  35


In [None]:
# Get stars
def get_starscount(repo):
    """Return the repository's stargazer count, or None if it cannot be read.

    Rate-limit errors are re-raised so the download loop can wait for the
    quota to reset instead of silently storing a missing value.
    """
    try:
        return repo.stargazers_count
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None

# Get Forks
def get_forkscount(repo):
    """Return the number of forks of the repository, or None on failure.

    Rate-limit errors are re-raised so the download loop can wait for the
    quota to reset instead of silently storing a missing value.
    """
    try:
        return repo.get_forks().totalCount
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None

# Get description
def get_description(repo):
    """Return the repository description, or None if it cannot be read.

    Rate-limit errors are re-raised so the download loop can wait for the
    quota to reset instead of silently storing a missing value.
    """
    try:
        return repo.description
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None

# Get Url
def get_url(repo):
    """Return the repository's `url` attribute, or None if it cannot be read.

    Rate-limit errors are re-raised so the download loop can wait for the
    quota to reset instead of silently storing a missing value.
    """
    try:
        return repo.url
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None

# Get decoded readme
def get_readme(repo):
    """Return the README decoded as UTF-8 text, or None on failure.

    None is returned when the README cannot be fetched or its bytes are not
    valid UTF-8. Rate-limit errors are re-raised so the download loop can wait
    for the quota to reset instead of silently storing a missing value.
    """
    try:
        return repo.get_readme().decoded_content.decode('UTF-8')
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None

# Get topic label
def get_topics(repo):
    """Return the repository's topics, or None when there are none.

    None is also returned when the topics cannot be fetched. Rate-limit errors
    are re-raised so the download loop can wait for the quota to reset instead
    of silently storing a missing value.
    """
    try:
        topics = repo.get_topics()
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None
    # An empty result is reported as None, like a failed lookup.
    return topics if topics else None

# Get the programming languages used in the repo. The API reports bytes of code
# per language; only the language names are kept.
# Returns a list of language names, or None if there are none.
def get_languages(repo):
    """Return the names of the languages used in the repository.

    Returns a list of names, or None when the list is empty or the lookup
    fails. Rate-limit errors are re-raised so the download loop can wait for
    the quota to reset instead of silently storing a missing value.
    """
    try:
        # Iterating the result yields the language names only.
        languages_list = list(repo.get_languages())
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None
    return languages_list or None

# Get type of license
def get_license(repo):
    """Return the name of the repository's license, or None on failure.

    Rate-limit errors are re-raised so the download loop can wait for the
    quota to reset instead of silently storing a missing value.
    """
    try:
        return repo.get_license().license.name
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None
    
# Get the names of the labels defined in the repo
def get_labels(repo):
    """Return the names of the repository's labels.

    Returns a list of names, or None when there are no labels or the lookup
    fails. Rate-limit errors are re-raised so the download loop can wait for
    the quota to reset instead of silently storing a missing value.
    """
    try:
        labels_list = [label.name for label in repo.get_labels()]
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None
    return labels_list or None

# Get contributors' login names
def get_contributors(repo):
    """Return the login names of the repository's contributors.

    Returns a list of logins, or None when there are no contributors or the
    lookup fails. Rate-limit errors are re-raised so the download loop can wait
    for the quota to reset instead of silently storing a missing value.
    """
    try:
        contributor_list = [contributor.login for contributor in repo.get_contributors()]
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None
    return contributor_list or None

# Get the count of releases
def get_releases_count(repo):
    """Return the number of releases of the repository, or None on failure.

    Rate-limit errors are re-raised so the download loop can wait for the
    quota to reset instead of silently storing a missing value.
    """
    try:
        return repo.get_releases().totalCount
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None

# Get names for types of events
def get_repoevents(repo):
    """Return the type name of each event of the repository.

    Returns a list with one entry per event (so types may repeat), or None
    when there are no events or the lookup fails. Rate-limit errors are
    re-raised so the download loop can wait for the quota to reset instead of
    silently storing a missing value.
    """
    try:
        events_list = [event.type for event in repo.get_events()]
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None
    return events_list or None

# Get workflow names
def get_workflow(repo):
    """Return the names of the repository's workflows.

    Returns a list of names, or None when there are no workflows or the lookup
    fails. Rate-limit errors are re-raised so the download loop can wait for
    the quota to reset instead of silently storing a missing value.
    """
    try:
        workflow_list = [workflow.name for workflow in repo.get_workflows()]
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None
    return workflow_list or None

# Get the names of the entries in the repo's root directory
def get_contents(repo):
    """Return the names of the entries at the repository's root path.

    Returns a list of names, or None when the listing is empty or the lookup
    fails. Rate-limit errors are re-raised so the download loop can wait for
    the quota to reset instead of silently storing a missing value.
    """
    try:
        # The empty path asks for the top level of the repository.
        contents_list = [content.name for content in repo.get_contents("")]
    except RateLimitExceededException:
        raise
    except Exception:
        # Best effort for any other failure. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        return None
    return contents_list or None

In [None]:
# Set of features

def get_repo(g, repo_id):
  """Fetch one repository and gather its features into a single dict.

  `g` is the API client and `repo_id` the identifier passed to `g.get_repo`.
  The result maps feature names to the values returned by the getter helpers:

  * text data:   Description, README, Labels
  * categorical: Contributors, Languages, Topics, Contents, Licence, Workflows
  * numerical:   Stars, Forks, Releases
  """
  repository = g.get_repo(repo_id)

  # (output key, getter) pairs, applied in this order.
  feature_getters = (
      ('Description',  get_description),
      ('README',       get_readme),
      ('Labels',       get_labels),
      ('Contributors', get_contributors),
      ('Languages',    get_languages),
      ('Topics',       get_topics),
      ('Contents',     get_contents),
      ('Licence',      get_license),
      ('Stars',        get_starscount),
      ('Forks',        get_forkscount),
      ('Releases',     get_releases_count),
      ('Workflows',    get_workflow),
  )

  record = {'Name': repo_id}
  for key, getter in feature_getters:
    record[key] = getter(repository)
  return record



In [None]:
output_directory_path    = '/content/drive/MyDrive/GHDomains/Bruno/Repos'

# Each downloaded repo is stored as '<owner>:<name>.json'. Map the file names
# back to 'owner/name', skipping the aggregated all_repos* files.
# Only the trailing '.json' is removed, so a repo whose own name contains
# '.json' is still recognised as already downloaded.
repos_already_downloaded = [
    file_name[:-len('.json')].replace(':', '/')
    for file_name in os.listdir(output_directory_path)
    if file_name.endswith('.json') and not file_name.startswith('all_repos')
]

# Set lookup keeps the filter linear in the number of repos.
_downloaded_lookup       = set(repos_already_downloaded)
repos_to_download        = [repo_name for repo_name in repos_ids if repo_name not in _downloaded_lookup]

In [None]:
#repos_to_download = ['VGraupera/1on1-questions','dunovank/jupyter-themes','hackjutsu/Lepton','jupeter/clean-code-php']

In [None]:
# Download every pending repo and store it as one JSON file per repo.
# A repo that fails is logged and skipped. Since no file is written for it, it
# is listed again in repos_to_download the next time the notebook is run.
for i, repo_id in enumerate(repos_to_download):
  try:

    repo_data = get_repo(g, repo_id)
    logger.info(f'repo: {repo_id}, message: #{i+1+len(repos_already_downloaded)} out of {len(repos_ids)}')
    # '/' cannot be part of a file name, so 'owner/name' is saved as 'owner:name'.
    file_stem = repo_id.replace('/', ':')
    # The context manager closes (and flushes) the file even if dumping fails.
    with open(output_directory_path + '/' + file_stem + '.json', 'w') as fp:
      json.dump(repo_data, fp)

  except RateLimitExceededException:
    logger.warning(f'repo: {repo_id}, message: RateLimitExceededException')
    reset_at = g.get_rate_limit().core.reset
    # Compare in UTC. NOTE(review): newer PyGithub releases appear to return a
    # timezone-aware reset time and older ones a naive UTC value - both handled.
    if reset_at.tzinfo is not None:
      now = datetime.now(reset_at.tzinfo)
    else:
      now = datetime.utcnow()
    # total_seconds() clamped at 0: `.seconds` of a negative timedelta (reset
    # already in the past) is close to 86400 and would sleep for almost a day.
    time.sleep(max(0.0, (reset_at - now).total_seconds()) + 5)
    print('continue...')
  except UnknownObjectException:
    logger.error(f'repo: {repo_id}, message: Not found')
    continue
  except BadCredentialsException:
    logger.critical(f'repo: {repo_id}, message: BadCredentialsException')
    continue
  except GithubException:
    logger.critical(f'repo: {repo_id}, message: GithubException')
    continue

09/07/2022 06:54:25 AM - INFO - repo: appunite/AUMediaPlayer, message: #2090 out of 17217
09/07/2022 06:54:26 AM - INFO - repo: Froiden/laravel-rest-api, message: #2091 out of 17217
09/07/2022 06:54:27 AM - INFO - repo: mobilejazz/NibWrapper, message: #2092 out of 17217
09/07/2022 06:54:28 AM - INFO - repo: mailjet/mailjet-apiv3-go, message: #2093 out of 17217
09/07/2022 06:54:30 AM - INFO - repo: src-d/minhashcuda, message: #2094 out of 17217
09/07/2022 06:54:31 AM - INFO - repo: Shopify/shopify_django_app, message: #2095 out of 17217
09/07/2022 06:54:32 AM - INFO - repo: kickstarter/cfn-flow, message: #2096 out of 17217
09/07/2022 06:54:32 AM - ERROR - repo: SPICE/virt-viewer, message: Not found
09/07/2022 06:54:33 AM - INFO - repo: Moventes/bruit.io, message: #2098 out of 17217
09/07/2022 06:54:34 AM - INFO - repo: crowdfavorite/wp-tax-post-binding, message: #2099 out of 17217
09/07/2022 06:54:35 AM - INFO - repo: itext/i7j-pdfhtml, message: #2100 out of 17217
09/07/2022 06:54:37 AM

KeyboardInterrupt: ignored

In [None]:
def join_repos(output_directory_path):
    """Load every per-repo JSON file in a directory into one list.

    Args:
        output_directory_path: directory holding the '*.json' files.

    Returns:
        A list with the parsed content of each file, ordered by file name.
        Hidden files (leading '.'), non-JSON files and the aggregated
        'all_repos.json' are skipped.
    """
    repos = []

    # Sorted so the result does not depend on the directory listing order.
    for file_name in sorted(os.listdir(output_directory_path)):
        if not file_name.endswith('.json') or file_name.startswith('.') or file_name == "all_repos.json":
            continue
        # Close each file as soon as it is parsed instead of leaking the handle.
        with open(os.path.join(output_directory_path, file_name), 'r') as fp:
            repos.append(json.load(fp))

    return repos

# Merge the per-repo files, then export the result as JSON and as CSV.
repos = join_repos(output_directory_path)
# Close the file before reading it back so all data is flushed to disk.
with open(output_directory_path +'/all_repos.json', 'w') as fp:
    json.dump(repos, fp)
repos_df = pd.read_json(output_directory_path +'/all_repos.json')
repos_df.to_csv(output_directory_path +'/all_repos.csv', index=False, encoding='utf-8')

In [None]:
# Read the exported CSV back and show it as the cell's result.
df = pd.read_csv(f'{output_directory_path}/all_repos.csv')
df