<a href="https://colab.research.google.com/github/fzanart/GHDomains/blob/main/GitHub_data_downloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install PyGithub

In [2]:
from github import Github, RateLimitExceededException
from github.GithubException import UnknownObjectException
from getpass import getpass
import os
import time
import json
from datetime import datetime
import pandas as pd
from google.colab import drive
import pickle
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
token = getpass(prompt="Enter your Github token:")
g = Github(token, timeout=30, per_page=100)

Enter your Github token:··········


In [4]:
with open ('/content/drive/MyDrive/GHDomains/new_popular_list.pickle', 'rb') as fp:
    repos_ids = pickle.load(fp)

In [5]:
len(repos_ids)

893

In [6]:
# Get stars
def get_starscount(repo):
    try:
        return repo.stargazers_count
    except:
        return None

# Get Forks
def get_forkscount(repo):
    try:
        return repo.get_forks().totalCount
    except:
        return None

# Get description
def get_description(repo):
    try:
        return repo.description
    except:
        return None

# Get Url
def get_url(repo):
    try:
        return repo.url
    except:
        return None

# Get decoded readme
def get_readme(repo):
    try:
        return repo.get_readme().decoded_content.decode('UTF-8')
    except:
        return None

# Get topic label
def get_topics(repo):
    try:
        topics = repo.get_topics()
        if len(topics) != 0:
            return topics
        else:
            return None
    except:
        return None

# Get all programming languages .values retrives bytes of code in each language
# Returns a dictionary with bits of each language
def get_languages(repo):
    languages_list = []
    try:
        languages = repo.get_languages()
        languages_list.extend([language for language in languages])
        if len(languages_list) != 0:
            return languages_list
        else:
            return None
    except:
        return None

# Get type of license
def get_license(repo):
    try:
        return repo.get_license().license.name
    except:
        return None
    
# Get labels from pull requests
def get_labels(repo):
    try:
        labels_list = []
        labels = repo.get_labels()
        labels_list.extend([label.name for label in labels])
        if len(labels_list) != 0:
            return labels_list
        else:
            return None
    except:
        return None

# Get copntributors login names
def get_contributors(repo):

    contributor_list = []
    try:
        contributors = repo.get_contributors()
        contributor_list.extend([contributor.login for contributor in contributors])
        if len(contributor_list) != 0:
            return contributor_list
        else:
            return None
    except:
        return None

# Get the count of releases
def get_releases_count(repo):
    try:
        return repo.get_releases().totalCount
    except:
        return None

# Get names for types of events
def get_repoevents(repo):
    try:
        events = repo.get_events()
        events_list = []
        events_list.extend([event.type for event in events])
        if len(events_list) !=0:
            return events_list
        else:
            return None
    except:
        return None

# Get workflow names
def get_workflow(repo):
    try:
        workflows = repo.get_workflows()
        workflow_list = []
        workflow_list.extend([workflow.name for workflow in workflows])
        if len(workflow_list) !=0:
            return workflow_list
        else:
            return None
    except:
        return None

# Get home contents
def get_contents(repo):
    try:
        contents = repo.get_contents("")
        contents_list = []
        contents_list.extend([content.name for content in contents])
        if len(contents_list) != 0:
            return contents_list
        else:
            return None
    except:
        return None  

In [7]:
# Set of features

def get_repo(g, repo_id):
  
  repo = g.get_repo(repo_id)

  # Text data = description + readme + labels
  # Categorical = Contributors, programming languages, topics, head contents, licence
  # Numerical = Stars, forks, releases

  return {'Name':                   repo_id,
          'Description':            get_description(repo),
          'README':                 get_readme(repo),
          'Labels':                 get_labels(repo),
          'Contributors':           get_contributors(repo),
          'Languages':              get_languages(repo),
          'Topics':                 get_topics(repo),
          'Contents':               get_contents(repo),
          'Licence':                get_license(repo),
          'Stars':                  get_starscount(repo),
          'Forks':                  get_forkscount(repo),
          'Releases':               get_releases_count(repo),
          'Workflows':              get_workflow(repo)}



In [8]:
output_directory_path    = '/content/drive/MyDrive/GHDomains/New_popular_download'
repos_already_downloaded = [file_name.replace('.json', '').replace(':','/') for file_name in os.listdir(output_directory_path) if file_name.endswith('.json') and not file_name.startswith('all_repos')]
repos_to_download        = [repo_name for repo_name in repos_ids if repo_name not in repos_already_downloaded]

In [12]:
repos_to_download = ['VGraupera/1on1-questions','dunovank/jupyter-themes','hackjutsu/Lepton','jupeter/clean-code-php']

In [13]:
for i, repo_id in enumerate(repos_to_download):
  try:
    print('Downloading repo: ' + repo_id + ' ...')
    print(str(time.asctime(time.localtime(time.time()))) + ' ' + ': #' + str(i+1+len(repos_already_downloaded)) + ' out of ' + str(len(repos_ids)) + ' repos')

    commits = get_repo(g, repo_id)
    repo_id = repo_id.replace('/', ':')
    json.dump(commits, open(output_directory_path + '/' + repo_id + '.json', 'w'))

    print('- '*50)
  except RateLimitExceededException:
    print('sleeping...')
    time.sleep((g.get_rate_limit().core.reset - datetime.today()).seconds + 5)
    print('continue...')
  except UnknownObjectException:
    print(repo_id + ' not found')
    continue

Downloading repo: VGraupera/1on1-questions ...
Tue Mar 22 01:58:27 2022 : #442 out of 893 repos
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Downloading repo: dunovank/jupyter-themes ...
Tue Mar 22 01:58:28 2022 : #443 out of 893 repos
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Downloading repo: hackjutsu/Lepton ...
Tue Mar 22 01:58:30 2022 : #444 out of 893 repos
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Downloading repo: jupeter/clean-code-php ...
Tue Mar 22 01:58:31 2022 : #445 out of 893 repos
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 


In [14]:
def join_repos(output_directory_path):

    repos = []

    for file_name in os.listdir(output_directory_path):
        if file_name.endswith('.json') and not file_name.startswith('.') and file_name != "all_repos.json":
            _book = json.load(open(output_directory_path + '/' + file_name, 'r')) #, encoding='utf-8', errors='ignore'))
            repos.append(_book)

    return repos

repos = join_repos(output_directory_path)
json.dump(repos, open(output_directory_path +'/all_repos.json', 'w'))
repos_df = pd.read_json(output_directory_path +'/all_repos.json')
repos_df.to_csv(output_directory_path +'/all_repos.csv', index=False, encoding='utf-8')

In [15]:
df = pd.read_csv(output_directory_path +'/all_repos.csv')
df

Unnamed: 0,Name,Description,README,Labels,Contributors,Languages,Topics,Contents,Licence,Stars,Forks,Releases,Workflows
0,vuejs/devtools,⚙️ Browser devtools extension for debugging Vu...,# vue-devtools\n\n![screenshot](./media/screen...,"['accepted proposition', 'bug', 'cannot reprod...","['Akryum', 'yyx990803', 'posva', 'bartlomieju'...","['TypeScript', 'Vue', 'JavaScript', 'HTML', 'S...",,"['.browserslistrc', '.circleci', '.eslintrc.js...",MIT License,22091,3684,73,['Create Release']
1,futurice/android-best-practices,"Do's and Don'ts for Android development, by Fu...",# Best practices in Android development\n\nAvo...,"['2018', 'answered', 'bug', 'discussion', 'dup...","['staltz', 'peter-tackage', 'minsoopark', 'and...",,"['best-practices', 'android-development', 'and...","['LICENSE', 'README.md', 'translations']",Other,19943,3305,0,
2,microsoft/Web-Dev-For-Beginners,"24 Lessons, 12 Weeks, Get Started as a Web Dev...",[![GitHub license](https://img.shields.io/gith...,"['bug', 'dependencies', 'documentation', 'dupl...","['jlooper', 'ManuSquall', 'San1ay', 'silversky...","['JavaScript', 'HTML', 'CSS', 'Vue']","['javascript', 'curriculum', 'html', 'css', 'e...","['.github', '.gitignore', '.nojekyll', '1-gett...",MIT License,44328,6279,0,"['Azure Static Web Apps CI/CD', 'Lock closed i..."
3,airbnb/react-sketchapp,render React components to Sketch ⚛️💎,"<div align=""center"">\n <img alt=""react-sketch...","['awaiting-review', 'bug', 'chore', 'discuss',...","['jongold', 'mathieudutour', 'macintoshhelper'...","['TypeScript', 'JavaScript']","['react-sketchapp', 'react', 'sketch', 'sketch...","['.bookignore', '.editorconfig', '.github', '....",MIT License,14874,864,28,
4,eugeneyan/applied-ml,📚 Papers & tech blogs by companies sharing the...,"# applied-ml\nCurated papers, articles, and bl...","['bug', 'documentation', 'duplicate', 'enhance...","['eugeneyan', 'shreyansh26', 'chmnsk', 'nilesh...",,"['applied-machine-learning', 'production', 'ap...","['CONTRIBUTING.md', 'LICENSE', 'README.md']",MIT License,19001,2615,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,graphql/dataloader,DataLoader is a generic utility to be used as ...,# DataLoader\n\nDataLoader is a generic utilit...,"['bug', 'dependencies', 'duplicate', 'enhancem...","['leebyron', 'wincent', 'gajus', 'brianwarner'...","['JavaScript', 'Shell']","['nodejs', 'dataloader', 'graphql', 'batch']","['.eslintrc', '.flowconfig', '.github', '.giti...",MIT License,11269,484,6,
889,junyanz/CycleGAN,Software that can generate photos from paintin...,"<img src='imgs/horse2zebra.gif' align=""right"" ...","['bug', 'duplicate', 'enhancement', 'help want...","['junyanz', 'taesungp', 'aferriss', 'asturur',...","['Lua', 'Python', 'Shell', 'TeX']","['gan', 'generative-adversarial-network', 'dee...","['.gitignore', 'LICENSE', 'README.md', 'data',...",Other,11093,1853,0,
890,Tencent/wcdb,WCDB is a cross-platform database framework de...,# WCDB\n\n[![PRs Welcome](https://img.shields....,"['bug', 'duplicate', 'enhancement', 'help want...","['RingoD', 'John-He-928', 'infinnie', 'drakeet...","['C', 'C++', 'Java', 'Swift', 'Objective-C++',...","['database', 'android', 'ios', 'wechat', 'mobi...","['.clang-format', '.github', '.gitignore', '.g...",Other,9331,1242,12,
891,halfrost/Halfrost-Field,✍🏻 这里是写博客的地方 —— Halfrost-Field 冰霜之地,# Halfrost-Field 冰霜之地\n\n<p align='center'>\n<...,"['bug', 'duplicate', 'enhancement', 'help want...","['halfrost', 'somnus-L', 'devSC']","['Go', 'CSS', 'Jupyter Notebook', 'Objective-C...","['ios', 'source-code', 'objective-c', 'swift',...","['.gitattributes', '.github', 'LICENSE', 'READ...",Creative Commons Attribution Share Alike 4.0 I...,10732,1680,0,['Deploy Blog']
