# GitHub commit scraping v.2

In [15]:
# %load get_repo_contributions.py
import json
import requests

api_url_base = 'https://api.github.com/'
headers = {'Content-Type': 'application/json',
           'User-Agent': 'request',
           'Accept': 'application/vnd.github.v3+json'}

def get_repos(orgname):
    api_url = '{}orgs/{}/repos'.format(api_url_base, orgname)

    response = requests.get(api_url, headers=headers)

    if response.status_code == 200:
        return (response.json())
    else:
        print('[!] HTTP {0} calling [{1}]'.format(response.status_code, api_url))
        return None


def get_contributors(repo):
    name = repo['name']
    contrib_url = repo['contributors_url']
    
    response = requests.get(contrib_url, headers=headers)

    if response.status_code == 200:
        return (
            # returns `contribution_response`
            {'name': name,
             'data': response.json()}
        )
    else:
        print('[!] HTTP {0} calling repo [{1}]'.format(response.status_code, contrib_url))
        return None


def build_contribution_list(contribution_response):
    all_repo_contributions = list()

    for i in range(0,len(contribution_response['data'])):
        ctr = dict()
        ctr["repo"] = contribution_response['name']
        ctr["username"] = contribution_response['data'][i]['login']
        ctr["contributions"] = contribution_response['data'][i]['contributions']
        all_repo_contributions.append(ctr)
    
    return all_repo_contributions


all_org_contributions = list()
    
def get_all_contributions(org):
    repos = get_repos(org)

    for repo in repos:
        contributors = get_contributors(repo)
        contribution_list = build_contribution_list(contributors)
        all_org_contributions.append(contribution_list)

In [19]:
get_all_contributions('recursecenter')

[!] HTTP 403 calling [https://api.github.com/orgs/recursecenter/repos]


TypeError: 'NoneType' object is not iterable

### Let's modify to use Tor.
https://medium.com/@jasonrigden/using-tor-with-the-python-request-library-79015b2606cb


In [20]:
session = requests.session()
session.proxies = {}

session.proxies['http'] = 'socks5h://localhost:9050'
session.proxies['https'] = 'socks5h://localhost:9050'

In [23]:
r = session.get('http://httpbin.org/ip')
print(r.text)

{
  "origin": "185.220.102.7, 185.220.102.7"
}



In [24]:
api_url_base = 'https://api.github.com/'
headers = {'Content-Type': 'application/json',
           'User-Agent': 'python-requests/2.18.4',
           'Accept': 'application/vnd.github.v3+json'}

def get_repos(orgname):
    api_url = '{}orgs/{}/repos'.format(api_url_base, orgname)

    response = session.get(api_url, headers=headers)

    if response.status_code == 200:
        return (response.json())
    else:
        print('[!] HTTP {0} calling [{1}]'.format(response.status_code, api_url))
        return None


def get_contributors(repo):
    name = repo['name']
    contrib_url = repo['contributors_url']
    
    response = session.get(contrib_url, headers=headers)

    if response.status_code == 200:
        return (
            # returns `contribution_response`
            {'name': name,
             'data': response.json()}
        )
    else:
        print('[!] HTTP {0} calling repo [{1}]'.format(response.status_code, contrib_url))
        return None


def build_contribution_list(contribution_response):
    all_repo_contributions = list()

    for i in range(0,len(contribution_response['data'])):
        ctr = dict()
        ctr["repo"] = contribution_response['name']
        ctr["username"] = contribution_response['data'][i]['login']
        ctr["contributions"] = contribution_response['data'][i]['contributions']
        all_repo_contributions.append(ctr)
    
    return all_repo_contributions


all_org_contributions = list()
    
def get_all_contributions(org):
    repos = get_repos(org)

    for repo in repos:
        contributors = get_contributors(repo)
        contribution_list = build_contribution_list(contributors)
        all_org_contributions.append(contribution_list)

In [26]:
get_all_contributions('recursecenter')

In [28]:
import pandas as pd

In [44]:
# Need to flatten all_org_contributions for it to work
flat_contributions = [item for sublist in all_org_contributions for item in sublist]

In [58]:
df = pd.DataFrame(flat_contributions)

df.head()

Unnamed: 0,contributions,repo,username
0,51,hs-cli,zachallaun
1,2,hs-cli,davidbalbert
2,63,webstack.jl,danielmendel
3,36,webstack.jl,astrieanna
4,19,webstack.jl,zachallaun


In [57]:
df.pivot_table(index='username', values='contributions') \
    .sort_values(by='contributions', ascending=False)

Unnamed: 0_level_0,contributions
username,Unnamed: 1_level_1
josh,801.333333
hone,796.000000
tmm1,433.000000
sursh,278.000000
punchagan,249.000000
zachallaun,151.166667
sstephenson,143.000000
schneems,120.500000
kokeshii,103.000000
raggi,98.000000
