In order to extract data from Github, we are going to leverage the Github REST API v3, that can be found in this link https://developer.github.com/v3/.
In `config.py` file we need to define the following configuration variables, that are going to be accessed by the current notebook:
- `GITHUB_USERNAME`
- `GITHUB_TOKEN`
- `SQL_ALCHEMY_STRING` (only if we want to save our Github results in a relational database)

In [4]:
import json
import requests
from pandas.io.json import json_normalize
from sqlalchemy import create_engine, engine, text, types, MetaData, Table, String
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
from datetime import datetime

In [5]:
import config
import os

In [6]:
GITHUB_USERNAME = "eya-abid"
GITHUB_TOKEN = "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"

In [7]:
# function that converts all object columns to strings, in order to store them efficiently into the database
def objects_to_strings(table):
    measurer = np.vectorize(len)
    df_object = table.select_dtypes(include=[object])
    string_columns = dict(zip(df_object, measurer(
        df_object.values.astype(str)).max(axis=0)))
    string_columns = {key: String(length=value) if value > 0 else String(length=1)
                      for key, value in string_columns.items() }
    return string_columns

In [8]:
github_api = "https://api.github.com"
gh_session = requests.Session()
gh_session.auth = (GITHUB_USERNAME, GITHUB_TOKEN)

## A Specific User's Informations

In [9]:
def user_repos(user,api):
    next = True
    i = 1
    url = api + '/users/{}/repos'.format(user)
    repos_pg = gh_session.get(url = url)
    repos_to_json = repos_pg.json()
    return repos_to_json

In [10]:
repos = user_repos('bkmd100', github_api)
repos

[{'id': 316189498,
  'node_id': 'MDEwOlJlcG9zaXRvcnkzMTYxODk0OTg=',
  'name': 'codeforces-python',
  'full_name': 'Bkmd100/codeforces-python',
  'private': False,
  'owner': {'login': 'Bkmd100',
   'id': 29681960,
   'node_id': 'MDQ6VXNlcjI5NjgxOTYw',
   'avatar_url': 'https://avatars.githubusercontent.com/u/29681960?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/Bkmd100',
   'html_url': 'https://github.com/Bkmd100',
   'followers_url': 'https://api.github.com/users/Bkmd100/followers',
   'following_url': 'https://api.github.com/users/Bkmd100/following{/other_user}',
   'gists_url': 'https://api.github.com/users/Bkmd100/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/Bkmd100/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.github.com/users/Bkmd100/subscriptions',
   'organizations_url': 'https://api.github.com/users/Bkmd100/orgs',
   'repos_url': 'https://api.github.com/users/Bkmd100/repos',
   'events_url': 'https://api.github.com/us

In [37]:
def user_infos(user, api):
    url = api + '/users/{}'.format(user)
    collect_infos = gh_session.get(url = url)
    info_to_json = collect_infos.json()
    return info_to_json

In [38]:
info = user_infos('bkmd100', github_api) 
info

{'login': 'Bkmd100',
 'id': 29681960,
 'node_id': 'MDQ6VXNlcjI5NjgxOTYw',
 'avatar_url': 'https://avatars.githubusercontent.com/u/29681960?v=4',
 'gravatar_id': '',
 'url': 'https://api.github.com/users/Bkmd100',
 'html_url': 'https://github.com/Bkmd100',
 'followers_url': 'https://api.github.com/users/Bkmd100/followers',
 'following_url': 'https://api.github.com/users/Bkmd100/following{/other_user}',
 'gists_url': 'https://api.github.com/users/Bkmd100/gists{/gist_id}',
 'starred_url': 'https://api.github.com/users/Bkmd100/starred{/owner}{/repo}',
 'subscriptions_url': 'https://api.github.com/users/Bkmd100/subscriptions',
 'organizations_url': 'https://api.github.com/users/Bkmd100/orgs',
 'repos_url': 'https://api.github.com/users/Bkmd100/repos',
 'events_url': 'https://api.github.com/users/Bkmd100/events{/privacy}',
 'received_events_url': 'https://api.github.com/users/Bkmd100/received_events',
 'type': 'User',
 'site_admin': False,
 'name': None,
 'company': None,
 'blog': '',
 'loca

## Branches Of A Specific User's Repo

In [8]:
def branches_of_repo(repo, owner, api):
    branches = []
    next = True
    i = 1
    while next == True:
        url = api + '/repos/{}/{}/branches?page={}&per_page=100'.format(owner, repo, i)
        branch_pg = gh_session.get(url = url)
        branch_pg_list = [dict(item, **{'repo_name':'{}'.format(repo)}) for item in branch_pg.json()]    
        branch_pg_list = [dict(item, **{'owner':'{}'.format(owner)}) for item in branch_pg_list]
        branches = branches + branch_pg_list
        if 'Link' in branch_pg.headers:
            if 'rel="next"' not in branch_pg.headers['Link']:
                next = False
        i = i + 1
    return branches

In [9]:
branches = json_normalize(branches_of_repo('spark', 'apache', github_api))

  branches = json_normalize(branches_of_repo('spark', 'apache', github_api))


In [10]:
branches.to_csv('data/branches.csv')

## Commits Of A Specific Repo

In [12]:
def commits_of_repo_github(repo, owner, api):
    commits = []
    next = True
    i = 1
    while next == True:
        url = api + '/repos/{}/{}/commits?page={}&per_page=100'.format(owner, repo, i)
        commit_pg = gh_session.get(url = url)
        commit_pg_list = [dict(item, **{'repo_name':'{}'.format(repo)}) for item in commit_pg.json()]    
        commit_pg_list = [dict(item, **{'owner':'{}'.format(owner)}) for item in commit_pg_list]
        commits = commits + commit_pg_list
        if 'Link' in commit_pg.headers:
            if 'rel="next"' not in commit_pg.headers['Link']:
                next = False
        i = i + 1
    return commits

In [13]:
def create_commits_df(repo, owner, api):
    commits_list = commits_of_repo_github(repo, owner, api)
    return json_normalize(commits_list)

In [15]:
commits = create_commits_df('spark', 'apache', github_api)

  return json_normalize(commits_list)


In [16]:
commits.to_csv('data/commits.csv')

## Pull Requests Of A Repo

In [18]:
def pulls_of_repo(repo, owner, api):
    pulls = []
    next = True
    i = 1
    while next == True:
        url = api + '/repos/{}/{}/pulls?page={}&per_page=100'.format(owner, repo, i)
        pull_pg = gh_session.get(url = url)
        pull_pg_list = [dict(item, **{'repo_name':'{}'.format(repo)}) for item in pull_pg.json()]    
        pull_pg_list = [dict(item, **{'owner':'{}'.format(owner)}) for item in pull_pg_list]
        pulls = pulls + pull_pg_list
        if 'Link' in pull_pg.headers:
            if 'rel="next"' not in pull_pg.headers['Link']:
                next = False
        i = i + 1
    return pulls

In [19]:
pulls = json_normalize(pulls_of_repo('spark', 'apache', github_api))

  pulls = json_normalize(pulls_of_repo('spark', 'apache', github_api))


In [20]:
pulls.to_csv('data/pulls.csv')

## Issues Of A Repo

In [21]:
def issues_of_repo(repo, owner, api):
    issues = []
    next = True
    i = 1
    while next == True:
        url = api + '/repos/{}/{}/issues?page={}&per_page=100'.format(owner, repo, i)
        issue_pg = gh_session.get(url = url)
        issue_pg_list = [dict(item, **{'repo_name':'{}'.format(repo)}) for item in issue_pg.json()]    
        issue_pg_list = [dict(item, **{'owner':'{}'.format(owner)}) for item in issue_pg_list]
        issues = issues + issue_pg_list
        if 'Link' in issue_pg.headers:
            if 'rel="next"' not in issue_pg.headers['Link']:
                next = False
        i = i + 1
    return issues

In [22]:
issues = json_normalize(issues_of_repo('spark', 'apache', github_api))

  issues = json_normalize(issues_of_repo('spark', 'apache', github_api))


In [23]:
issues.to_csv('data/issues.csv')

## Generating All Repo Data

The following function is used for generating all the previously disscussed data in a single operation.

In [24]:
def generate_repo_data(repo, owner, api):
    branches = json_normalize(branches_of_repo(repo, owner, api))
    commits = create_commits_df(repo, owner, api)
    pulls = json_normalize(pulls_of_repo(repo, owner, api))
    issues = json_normalize(issues_of_repo(repo, owner, api))
    branches.to_csv('data/branches.csv')
    commits.to_csv('data/commits.csv')
    pulls.to_csv('data/pulls.csv')
    issues.to_csv('data/issues.csv')

In [26]:
generate_repo_data('spark', 'apache', github_api)

  branches = json_normalize(branches_of_repo(repo, owner, api))
  return json_normalize(commits_list)
  pulls = json_normalize(pulls_of_repo(repo, owner, api))
  issues = json_normalize(issues_of_repo(repo, owner, api))


## Contribution Statistics

In [28]:
def statistics_of_repo(repo, owner, api):
    contributors = []
    next = True
    i = 1
    while next == True:
        url = api + '/repos/{}/{}/stats/contributors?page={}&per_page=100'.format(owner, repo, i)
        contrib_pg = gh_session.get(url = url)
        contrib_pg_list = [dict(item, **{'repo_name':'{}'.format(repo)}) for item in contrib_pg.json()]    
        contrib_pg_list = [dict(item, **{'owner':'{}'.format(owner)}) for item in contrib_pg_list]
        contributors = contributors + contrib_pg_list
        if 'Link' in contrib_pg.headers:
            if 'rel="next"' not in contrib_pg.headers['Link']:
                next = False
        i = i + 1
    return contributors

In [None]:
contribs = statistics_of_repo('spark', 'apache', github_api)

In [None]:
weeks_list = []
for i in (contrib_list):
    for j in i['weeks']:
        j['author'] = i['author']['login']
weeks_list.append(j)
weeks_df = json_normalize(weeks_list)
weeks_df['date'] = pd.to_datetime(weeks_df['w'],unit='s')
weeks_df['week'] = weeks_df['date'].dt.week