# Make Dataset for GitHub Repository Recommender System

**Goal:** make a dataset (pandas dataframe) of repo ratings 1/0 (star/no star)  
**Dataset Dimensions:** no. repos * no. users

1. get the names of the top 1000 repos (sorted by no. stars) --> list
2. get the names and starred repos of users with 30-49 followers (30 <= no. followers < 50) --> dict of username:list of starred repos
3. make dataframe (repos*users) 
  * initialize with all zeros
  * row indices (i): names of the top 1000 repos
  * col indices (j): names of all users with 30-49 followers
  * if user j has starred repo i: df.loc[i,j]=1
4. make csv file out of dataframe

In [None]:
# imports
# libraries
import requests
import pandas as pd
import numpy as np
import re

import pickle
import time
from tqdm import tqdm

# python file with authentication details
import auth

In [None]:
# url base
# (root of the GitHub REST API; every request below appends an endpoint path to it)
github = 'https://api.github.com'

_note: after this point, it is possible to start executing code from any "# initialization" cell (assuming the necessary pickle files are present)_

## 1. get the names of the top 1000 repos (sorted by no. stars) --> list

In [None]:
# initialization
# query parameters for the repository search: most-starred first, 100 per page
params = dict(
    per_page=100,
    page=1,
    q='stars:>6000',  # 1,195 repos as of Aug. 8 2017
    sort='stars',
)

repoList = []  # filled with repo full names by the fetch loop
lastPage = 10  # number of result pages to request

In [None]:
# get 10*100=1000 top repos sorted by no. stars
# and place these repo names into repoList
# (looping on params['page'] makes the cell resumable: re-running it after a
# failure continues from the page that failed instead of starting over)
while params['page'] <= lastPage:
    print("getting page {}".format(params['page']))
    r = requests.get(github + '/search/repositories', auth=(auth.user, auth.pw), params=params)
    # stop on an error response (bad credentials, rate limit, ...) instead of
    # failing with an opaque KeyError on 'items'
    r.raise_for_status()
    repoList.extend(repo['full_name'] for repo in r.json()['items'])
    params['page'] += 1

In [None]:
# sanity check: lastPage (10) pages * per_page (100) results
len(repoList)  # should be 1000

In [None]:
# save repoList
# (the with-block closes the file, so the pickle is flushed to disk right away)
with open('repoList.pickle', 'wb') as f:
    pickle.dump(repoList, f)

## 2. get the names and starred repos of users with 30-49 followers --> dict of username:list of starred repos

### 2.1 get the names of all users with 30-49 followers --> list of usernames

In [None]:
# initialization
# user-search parameters; 'q' is only a starting value, the query loop overwrites it
params = dict(
    per_page=100,
    page=1,
    q='followers:30',
    sort='repositories',
)

userList = []  # filled with user logins by the query loop

In [None]:
# testing time to process each request (interrupt kernel after a few examples)
# same code below without time.time()

for n in range(30, 50):
    q = 'followers:' + str(n)
    print("query: {}".format(q))
    params['q'] = q

    lastPage = 1
    params['page'] = 1

    while params['page'] <= lastPage:
        start = time.time()
        r = requests.get(github + '/search/users', auth=(auth.user, auth.pw), params=params)
        # stop on an error response instead of failing later with a KeyError on 'items'
        r.raise_for_status()

        if params['page'] == 1:
            # r.links is the parsed Link header; it has no 'last' entry when all
            # results fit on a single page, in which case lastPage stays 1
            last = r.links.get('last')
            if last is not None:
                match = re.search(r'[?&]page=([0-9]+)', last['url'])
                if match:
                    lastPage = int(match.group(1))

        for user in r.json()['items']:
            userList.append(user['login'])

        params['page'] += 1
        print(time.time() - start)

In [None]:
# initialization
# user-search parameters; 'q' is only a starting value, the query loop overwrites it
params = dict(
    per_page=100,
    page=1,
    q='followers:30',
    sort='repositories',
)

userList = []  # filled with user logins by the query loop

In [None]:
# get users with 30 <= no. followers < 50
# and place these usernames into userList

# the search api has a rate limit of 30 requests per minute, or 1 request per 2s
# so adding time.sleep(1.5) to the time to process each request (~0.7s) 
# will make sure that the rate limit is not exceeded

# the search api returns up to 1,000 results for each search
# so 'q': 'followers:30..50' will return only 1000 users 
# when there are 36,254 users that fit this query (as of Aug. 9 2017)
# instead of 'followers:30..50', I will use 'followers:30', 'followers:31', ..., 'followers:49'
# to get the top 1000 users (or all users if no. users < 1000) for each of these queries 
# (users are sorted by no. repos)

for n in range(30, 50):
    # change query
    q = 'followers:' + str(n)
    print("query: {}".format(q))
    params['q'] = q

    # reset params for while loop
    lastPage = 1
    params['page'] = 1

    while params['page'] <= lastPage:
        print("getting page {}".format(params['page']))
        r = requests.get(github + '/search/users', auth=(auth.user, auth.pw), params=params)
        # stop on an error response instead of failing later with a KeyError on 'items'
        r.raise_for_status()

        if params['page'] == 1:
            # r.links is the parsed Link header; it has no 'last' entry when all
            # results fit on a single page, in which case lastPage stays 1
            last = r.links.get('last')
            if last is not None:
                match = re.search(r'[?&]page=([0-9]+)', last['url'])
                if match:
                    lastPage = int(match.group(1))

        for user in r.json()['items']:
            userList.append(user['login'])

        params['page'] += 1
        time.sleep(1.5)

In [None]:
# sanity check: 20 queries (followers:30 .. followers:49) * up to 1,000 results each
len(userList)  # should be 20,000

In [None]:
# save userList
# (the with-block closes the file, so the pickle is flushed to disk right away)
with open('userList.pickle', 'wb') as f:
    pickle.dump(userList, f)

### 2.2 get the 100 most recently starred repos for all users in userList --> dict of username:list of starred repos

In [None]:
# initialization
params = {
    'per_page': 100,
    'page': 1
}

# the with-block closes the pickle file as soon as it has been read
with open('userList.pickle', 'rb') as f:
    userList = pickle.load(f)

# one (initially empty) list of starred repo names per user
userDict = {u: [] for u in userList}

In [None]:
# testing time to process each request (interrupt kernel after a few examples)
# same code below without time.time()

for username in userList:
    start = time.time()
    url = github + '/users/' + username + '/starred'
    r = requests.get(url, auth=(auth.user, auth.pw), params=params)

    # only successful responses contribute repo names
    if r.status_code == requests.codes.ok:
        userDict[username].extend(repo['full_name'] for repo in r.json())

    # seconds spent on this user's request
    print(time.time() - start)

In [None]:
# initialization
params = {
    'per_page': 100,
    'page': 1
}

# the with-block closes the pickle file as soon as it has been read
with open('userList.pickle', 'rb') as f:
    userList = pickle.load(f)

# one (initially empty) list of starred repo names per user
userDict = {u: [] for u in userList}

In [None]:
# get each user's 100 most recently starred repos (or all repos if no. repos < 100)
# and place these repo names into userDict, i.e. userDict[username]=[repo0, repo1, ..., repo99]

# the standard rate limit is 5000 requests per hour, or 1.4 requests per second --> 1 request per 0.7s
# the average time needed to process each request seems to well exceed 0.7s so no time.sleep() is needed
# this loop takes ~6hrs

for username in tqdm(userList):
    r = requests.get(github + '/users/' + username + '/starred', auth=(auth.user, auth.pw), params=params)

    # important: check for successful get request in case e.g. someone deletes their account
    # (this error happened on my first try to make userDict: 11756/20000 requests and 4hrs wait...)
    if r.status_code != requests.codes.ok:
        # report the skip instead of silently leaving an empty list, so that a
        # systematic failure (e.g. rate limiting) is noticed during the long run;
        # tqdm.write prints without breaking the progress bar
        tqdm.write("skipping {} (status {})".format(username, r.status_code))
        continue

    userDict[username].extend(repo['full_name'] for repo in r.json())

In [None]:
# sanity check: userDict has one key per distinct username in userList
len(userDict)  # should be 20,000

In [None]:
# save userDict
# (the with-block closes the file, so the pickle is flushed to disk right away)
with open('userDict.pickle', 'wb') as f:
    pickle.dump(userDict, f)

## 3. Make dataframe (repos*users)
* initialize with all zeros
* row indices (i): names of the top 1000 repos
* col indices (j): names of all users with 30-49 followers
* if user j has starred repo i: df.loc[i,j]=1

In [None]:
# initialization
# each with-block closes its pickle file as soon as it has been read
with open('repoList.pickle', 'rb') as f:
    repoList = pickle.load(f)
with open('userList.pickle', 'rb') as f:
    userList = pickle.load(f)
with open('userDict.pickle', 'rb') as f:
    userDict = pickle.load(f)

# all-zero ratings matrix: one row per repo, one column per user
Y_df = pd.DataFrame(0, index=repoList, columns=userList)

In [None]:
# rows = repos in repoList, columns = users in userList
Y_df.shape  # should be (1000, 20000)

In [None]:
# if user j has starred repo i: df.loc[i,j]=1

# a set gives O(1) membership tests; scanning the whole repoList
# for every starred repo of every user is needlessly slow
repoSet = set(repoList)

for j, starList in tqdm(userDict.items()):
    for i in starList:
        # starred repos outside the top-repo list have no row and are ignored
        if i in repoSet:
            Y_df.loc[i, j] = 1

In [None]:
# cells are only ever 0 or 1, so the total is the number of recorded stars
np.sum(Y_df.values)  # number of 1s in df should be 170,543

In [None]:
# check for empty rows, i.e. repos with no stars
# (a row whose cells sum to 0 has no 1s in it)
for repoName, stars in Y_df.iterrows():
    if stars.values.sum() == 0:
        print(repoName)  # should be nothing printed, no empty rows

In [None]:
# check for empty columns, i.e. users that have not starred any of the top 1000 repos
# (DataFrame.iteritems() was removed in pandas 2.0; DataFrame.items() yields the
# same (column name, column Series) pairs and also exists in older versions)
emptyColNames = [name for name, col in Y_df.items() if np.sum(col.values) == 0]

In [None]:
# number of users whose column contains no 1s
len(emptyColNames)  # should be 2953

In [None]:
# drop empty columns from final df
# (modifies Y_df in place; users without any star in the matrix are removed)
Y_df.drop(columns=emptyColNames, inplace=True)

In [None]:
# 20000 users - 2953 dropped empty columns = 17047 columns
Y_df.shape  # should be (1000, 17047)

In [None]:
# sparsity (here: the fraction of cells that are 1):
# (no. stars where df.loc[i,j]=1)/(no. possible stars)
cells = Y_df.values
cells.sum()/cells.size  # should be ~1%

In [None]:
# save Y_df
# (the with-block closes the file, so the pickle is flushed to disk right away)
with open('Y_df.pickle', 'wb') as f:
    pickle.dump(Y_df, f)

## 4. make csv file out of dataframe

In [None]:
# initialization
# the with-block closes the pickle file as soon as it has been read
with open('Y_df.pickle', 'rb') as f:
    Y_df = pickle.load(f)

In [None]:
# make csv file from Y_df
# (the row index, i.e. the repo names, is written as the first column)
Y_df.to_csv('Y.csv')

In [None]:
# test that the csv file works
# (no index_col is passed, so the saved row names come back as a regular column)
temp = pd.read_csv('Y.csv')

In [None]:
# preview the first rows of the re-loaded csv
temp.head() # row names should be in the first col

In [None]:
# same rows as Y_df, plus one column holding the repo names
temp.shape  # should be (1000, 17047+1) because of the extra row name col