In [1]:
import pymongo
import requests
import json
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
import datetime
from credentials import client_id,client_secret

In [2]:
def cleanLinks(link):
    """Parse an HTTP ``Link`` header value into ``[url, rel]`` pairs.

    Parameters
    ----------
    link : str or None
        Raw header value, e.g.
        ``'<https://host/x?page=2>; rel="next", <https://host/x?page=9>; rel="last"'``.

    Returns
    -------
    list of [str, str]
        One ``[url, rel]`` list per entry, in header order. An empty list is
        returned when ``link`` is ``None`` or empty. Entries that carry no
        ``rel`` parameter are skipped.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    if not link:
        return []

    clean_links = []
    # Match on the angle brackets rather than splitting on "," / ";" so that
    # URLs whose query string contains those characters are kept intact.
    for entry in re.finditer(r'<([^>]*)>([^<]*)', link):
        url = entry.group(1).strip()
        rel_match = re.search(r'\brel\s*=\s*"?([^";,]*)"?', entry.group(2))
        if rel_match is None:
            continue
        clean_links.append([url, rel_match.group(1).strip()])

    return clean_links

In [3]:
def getOrgNRepo(org_name,repo_name,coll_name):
    """Look up the stored ids of a repository and of its owner.

    Queries collection ``coll_name`` of the local ``repo_collector`` MongoDB
    database for a document whose ``full_name`` equals
    ``"<org_name>/<repo_name>"``.

    Parameters
    ----------
    org_name : str
        Owner part of the repository's full name.
    repo_name : str
        Repository part of the full name.
    coll_name : str
        Name of the collection holding the repository documents.

    Returns
    -------
    tuple
        ``(repo_id, org_id)`` taken from the document's ``id`` and
        ``owner.id`` fields, or ``("NA", "NA")`` when no document matches.

    Raises
    ------
    KeyError
        If the matching document lacks ``id``, ``owner`` or ``owner.id``.
    """
    OrgNRepo_client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
    try:
        repo_coll = OrgNRepo_client["repo_collector"][coll_name]
        # Reverse `$natural` order: when several documents match, the last
        # one in natural order is the one that gets used.
        repo_doc = repo_coll.find_one(
            {"full_name": org_name+"/"+repo_name},
            sort=[("$natural", -1)],
        )
    finally:
        # Always release the connection, even if the query fails.
        OrgNRepo_client.close()

    if repo_doc is None:
        return "NA", "NA"
    return repo_doc['id'], repo_doc['owner']['id']
        

In [4]:
def addMISCDetails(repo_id, org_id, list_details, isMember, isContributor):
    """Tag every record in ``list_details`` with repository/organisation ids.

    Each dict is updated in place with ``org_id`` and ``repo_id``. When either
    ``isMember`` or ``isContributor`` is truthy, both flags are written onto
    every record as well; otherwise neither flag is added.

    Returns a new list holding the same (mutated) dict objects, in order.
    """
    extra_fields = {"org_id": org_id, "repo_id": repo_id}
    if isMember or isContributor:
        extra_fields["isMember"] = isMember
        extra_fields["isContributor"] = isContributor

    tagged = []
    for record in list_details:
        record.update(extra_fields)
        tagged.append(record)
    return tagged

In [5]:
def renameCollections(old_name, new_name):
    """Rename a collection in the local ``repo_collector`` database.

    If ``old_name`` exists it is renamed to ``new_name``, replacing any
    collection already stored under ``new_name``. If ``old_name`` does not
    exist, nothing happens.

    Parameters
    ----------
    old_name : str
        Current name of the collection.
    new_name : str
        Name the collection should have afterwards.
    """
    mongo_client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
    try:
        repodb = mongo_client["repo_collector"]
        if old_name in repodb.list_collection_names():
            # dropTarget=True makes the server replace an existing target as
            # part of the rename, so the old target is not lost if the rename
            # itself fails (unlike a separate drop followed by a rename).
            repodb[old_name].rename(new_name, dropTarget=True)
    finally:
        # Always release the connection, even if the rename fails.
        mongo_client.close()

In [6]:
def populateToDB(url, org_name, repo_name, db_name, user):
    """Fetch every page of a GitHub REST listing and store it in MongoDB.

    Starting at ``url``, each page is requested with the notebook's
    ``client_id``/``client_secret`` credentials, its records are tagged via
    ``addMISCDetails`` and inserted into collection ``db_name`` of the local
    ``repo_collector`` database. Pagination follows the ``rel="next"`` entry
    of the response's ``Link`` header until there is none or a page is empty.

    Parameters
    ----------
    url : str
        URL of the first page to fetch.
    org_name, repo_name : str
        Used to look up the repo/org ids in the ``tf_repo`` collection.
    db_name : str
        Name of the target collection (despite the parameter name).
    user : str
        ``"members"`` sets ``isMember`` and ``"contributors"`` sets
        ``isContributor`` on each record; any other value sets neither.

    Raises
    ------
    requests.HTTPError
        If GitHub answers with an error status.
    ValueError
        If a page's JSON payload is not a list of records.
    """
    isMember = 1 if user == "members" else 0
    isContributor = 1 if user == "contributors" else 0

    mongo_client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
    try:
        repo_coll = mongo_client["repo_collector"][db_name]
        repo_ids = None  # (repo_id, org_id); looked up once, on the first non-empty page

        page_url = url
        while page_url:
            response = requests.get(page_url, auth=(client_id, client_secret))
            remaining = response.headers.get('X-RateLimit-Remaining')
            print("GitHub Rest API rate limit remaining: "+str(remaining))
            # Fail with the HTTP error itself instead of trying to store an
            # error payload as if it were data.
            response.raise_for_status()

            raw_list = response.json()
            if not raw_list:
                # insert_many() rejects an empty list; an empty page also
                # means there is nothing further to collect.
                break
            if not isinstance(raw_list, list):
                raise ValueError(
                    "Expected a JSON list from "+str(page_url)+", got "+type(raw_list).__name__
                )

            if repo_ids is None:
                repo_ids = getOrgNRepo(org_name, repo_name, "tf_repo")
            repo_id, org_id = repo_ids

            updt_list = addMISCDetails(repo_id, org_id, raw_list, isMember, isContributor)
            repo_coll.insert_many(updt_list)
            print("Inserted rows to "+db_name)

            # requests parses the Link header; a missing header yields {}.
            page_url = response.links.get("next", {}).get("url")
            if page_url:
                print(page_url)
                # Only wait when another request will actually be made.
                if remaining is not None and int(remaining) <= 2:
                    print("Please wait for 30 minutes..............")
                    time.sleep(60*30)
    finally:
        # Always release the connection, even if a request or insert fails.
        mongo_client.close()
    print("Inserting data into "+db_name+" is done.")

In [9]:
# Organisation and repository names used to build the GitHub API URLs below.
org_name = repo_name = "tensorflow"

In [10]:
# Move the existing "repo_issuesV1.1" collection to "repo_issues" (replacing
# any collection already there), then collect the "stalled" issues afresh.
renameCollections("repo_issuesV1.1", "repo_issues")
issues_url = "https://api.github.com/repos/{}/{}/issues?labels=stalled&per_page=100".format(org_name, repo_name)
populateToDB(issues_url, org_name, repo_name, "repo_issuesV1.1", "NA")

GitHub Rest API rate limit remaining: 4719
Inserted rows to repo_issuesV1.1
Inserting data into repo_issuesV1.1 is done.


In [39]:
# Move the existing "repo_forksV1.1" collection to "repo_forks" (replacing
# any collection already there).
renameCollections("repo_forksV1.1", "repo_forks")
forks_url = "https://api.github.com/repos/{}/{}/forks?per_page=100".format(org_name, repo_name)
# Collection step is disabled; uncomment to fetch the forks.
# populateToDB(forks_url, org_name, repo_name, "repo_forksV1.1", "NA")

In [40]:
# Move the existing "repo_releasesV1.1" collection to "repo_releases"
# (replacing any collection already there).
renameCollections("repo_releasesV1.1", "repo_releases")
releases_url = "https://api.github.com/repos/{}/{}/releases?per_page=100".format(org_name, repo_name)
# Collection step is disabled; uncomment to fetch the releases.
# populateToDB(releases_url, org_name, repo_name, "repo_releasesV1.1", "NA")

In [41]:
# Move the existing "repo_pullsV1.1" collection to "repo_pulls" (replacing
# any collection already there).
renameCollections("repo_pullsV1.1", "repo_pulls")
pulls_url = "https://api.github.com/repos/{}/{}/pulls?per_page=100".format(org_name, repo_name)
# Collection step is disabled; uncomment to fetch the pull requests.
# populateToDB(pulls_url, org_name, repo_name, "repo_pullsV1.1", "NA")

In [42]:
# Move the existing "repo_usersV1.1" collection to "repo_users" (replacing
# any collection already there).
renameCollections("repo_usersV1.1", "repo_users")
members_url = "https://api.github.com/orgs/{}/members?per_page=100".format(org_name)
# Collection step is disabled; uncomment to fetch the organisation members.
# populateToDB(members_url, org_name, repo_name, "repo_usersV1.1", "members")

In [43]:
# Contributors target the same "repo_usersV1.1" collection as the members
# cell, so no collection is renamed here.
contributors_url = "https://api.github.com/repos/{}/{}/contributors?per_page=100&anon=true".format(org_name, repo_name)
# Collection step is disabled; uncomment to fetch the contributors.
# populateToDB(contributors_url, org_name, repo_name, "repo_usersV1.1", "contributors")

In [44]:
# Move the existing "repo_labelsV1.1" collection to "repo_labels" (replacing
# any collection already there).
renameCollections("repo_labelsV1.1", "repo_labels")
labels_url = "https://api.github.com/repos/{}/{}/labels?per_page=100".format(org_name, repo_name)
# Collection step is disabled; uncomment to fetch the labels.
# populateToDB(labels_url, org_name, repo_name, "repo_labelsV1.1", "NA")