In [1]:
import json
import subprocess
import time

import pandas as pd
import requests

In [2]:
# Load the API key from a local JSON file; the notebook reads the token
# from credentials['token'].
# The encoding is given to open() because json.load() no longer accepts an
# `encoding` keyword (it raises TypeError on Python 3.9+), and the `with`
# block makes sure the file handle is closed.
with open('./apikey.json', 'r', encoding='utf-8') as apikey_file:
    credentials = json.load(apikey_file)

In [3]:
# Notebook-wide settings.
#
# GHE docs:
#   curl -H "Authorization: token OAUTH-TOKEN" http(s)://[hostname]/api/v3  <-- endpoint

# hostname of the GitHub Enterprise server -- of course may be different for you
baseurl = 'git.generalassemb.ly'
# organization the repos are read from -- change this as you see fit
source_org = 'Data-science-immersive'
# organization the forks go to -- this one too
target_org = 'DSI-ME-1'

# token-auth header passed to the API requests in this notebook
header = dict(Authorization='token {}'.format(credentials['token']))
# request body naming the organization that receives the forks
data = dict(organization=target_org)
# NOTE(review): not referenced anywhere else in the visible notebook
repo_names = []

Get a list of all the orgs the API key has access to

In [4]:
# Example calls below.
#
# Call the base endpoint:
#   r = requests.get('https://{}/api/v3'.format(baseurl), headers=header)

# Get a listing of orgs the user's API key has access to.
r = requests.get('https://{}/api/v3/user/orgs'.format(baseurl), headers=header)
# Stop here with the HTTP error (e.g. a rejected token) instead of failing
# further down on an error payload that has no 'login' / 'id' entries.
r.raise_for_status()
orgs = r.json()
for org in orgs:
    print('{}: id = {}'.format(org['login'], org['id']))

dsi-unit-3: id = 261
DSI-ATX-3: id = 6931
data-part-time: id = 6944
DSI-EAST-1: id = 7793
DSI-DC-6: id = 7895
DSI-EAST-2: id = 9038
Data-science-immersive: id = 9634
DAT-ME-1: id = 11079
DSI-ME-1: id = 11622


Get a dataframe with all the repos from that org

In [17]:
def get_all_repos(baseurl, source_org, startpage=1, headers=None, session=None):
    """Fetch the repositories of an organization into a DataFrame.

    Requests ``https://<baseurl>/api/v3/orgs/<source_org>/repos`` page by
    page, starting at ``startpage``, and stops at the first page whose JSON
    body is empty.

    Parameters
    ----------
    baseurl : str
        Hostname of the GitHub Enterprise server (without scheme).
    source_org : str
        Name of the organization whose repositories are listed.
    startpage : int, default 1
        First page to request.
    headers : dict, optional
        HTTP headers sent with every request. When omitted, the
        notebook-level ``header`` is used.
    session : object, optional
        Object providing a ``requests``-style
        ``get(url, headers=..., params=...)``, e.g. a ``requests.Session``.
        When omitted, the ``requests`` module itself is used.

    Returns
    -------
    pandas.DataFrame
        One row per repository, indexed by the integer repo ``id``, with the
        columns ``name``, ``full_name``, ``ssh_url``, ``created_at``,
        ``updated_at``, ``pushed_at`` (the three dates parsed with
        ``pd.to_datetime``) and ``page`` (the page the repo was listed on;
        this one does not come from the API). Empty, with the same columns,
        when the first requested page has no data.

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises for an HTTP error
    response; ``KeyError`` if a listed repo lacks one of the copied fields.
    """
    api_fields = ['id', 'name', 'full_name', 'ssh_url',
                  'created_at', 'updated_at', 'pushed_at']
    columns = api_fields + ['page']

    if headers is None:
        headers = header  # notebook-level auth header
    if session is None:
        session = requests

    url = 'https://{}/api/v3/orgs/{}/repos'.format(baseurl, source_org)
    records = []
    page = startpage
    while True:
        r = session.get(url, headers=headers,
                        params={'type': 'all', 'page': page})
        # surface HTTP errors directly instead of tripping over the
        # error payload below
        r.raise_for_status()
        repos = r.json()
        if not repos:
            print('Page {} returned no data, exiting...'.format(page))
            break
        print('Successfully fetched page {}'.format(page))

        for repo in repos:
            record = {field: repo[field] for field in api_fields}
            record['page'] = page
            records.append(record)
        page += 1

    # Build the frame once from all collected rows; passing `columns` keeps
    # the layout stable even when no rows were collected.
    repo_df = pd.DataFrame(records, columns=columns)
    repo_df = repo_df.astype({'id': int, 'page': int})

    # make the date/time cols into datetimes
    for col in ['created_at', 'updated_at', 'pushed_at']:
        repo_df[col] = pd.to_datetime(repo_df[col])

    return repo_df.set_index('id')

In [19]:
# fetch the repo listing of the source org into a DataFrame
data_science_immersive_df = get_all_repos(baseurl=baseurl, source_org=source_org)

Successfully fetched page 1
Successfully fetched page 2
Successfully fetched page 3
Successfully fetched page 4
Successfully fetched page 5
Page 6 returned no data, exiting...


In [20]:
# write the repo listing to a CSV file; the index column is labelled 'id'
data_science_immersive_df.to_csv(
    './data-science-immersive.csv',
    index_label='id',
)

### Code below will clone an entire org's repos to a local folder; use with parseme.ipynb

In [None]:
# Get a listing of all repos in the source org and collect their clone URLs.
# From the GHE docs: GET /orgs/:org/repos
# https://developer.github.com/v3/repos/#parameters-2
#
# The listing is paginated: start at page 1 and keep requesting pages until
# one comes back empty. Requesting page 1 alone would leave every repo on
# the later pages out of `clones`.
repos = []
page = 1
while True:
    r = requests.get('https://{}/api/v3/orgs/{}/repos'.format(baseurl, source_org),
                     headers=header, params={'type': 'all', 'page': page})
    # stop on an HTTP error instead of treating the error payload as repos
    r.raise_for_status()
    page_repos = r.json()
    if not page_repos:
        break
    repos.extend(page_repos)
    page += 1

clones = [repo['ssh_url'] for repo in repos]

# show first 5
for repo in repos[:5]:
    print('{}, {}'.format(repo['name'], repo['ssh_url']))

In [None]:
# number of clone URLs collected in `clones`
len(clones)

In [None]:
%%bash
# This will create a place for all the cloned repos to live.
# Use the parseme file, pointing to this directory, to
# scan all cloned repos for words.
#
# Note: the %%bash cell magic has to be the very first line of the cell,
# so these notes are bash comments placed after it.
if [ ! -d "./course_dump" ]; then
    mkdir ./course_dump
    echo "created directory ./course_dump"
else
    echo "directory already exists: ./course_dump"
fi

In [None]:
# WARNING!!! ===
# this will clone ALL repositories in the organization to ./course_dump!!
# (./course_dump has to exist already)
#
# git is run through subprocess rather than a `!` shell line: a `!` command
# that fails does not raise a Python exception, so a try/except around it
# never fires and the success message would be printed no matter what.
# Here the exit code of every clone is checked. Failed clones are collected
# and reported at the end; they are not retried automatically.
failed_clones = []
for repo in clones:
    # argument list (no shell), so the URL is passed to git verbatim
    result = subprocess.run(['git', 'clone', repo], cwd='./course_dump',
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            universal_newlines=True)
    # echo git's output into the notebook cell
    print(result.stdout, end='')
    if result.returncode == 0:
        # made it sleepy to be a good citizen
        time.sleep(1)
    else:
        failed_clones.append(repo)

print("Finished cloning {} repositories into ./course_dump".format(
    len(clones) - len(failed_clones)))
if failed_clones:
    print("{} repositories could not be cloned:".format(len(failed_clones)))
    for repo in failed_clones:
        print("  {}".format(repo))

### This will fork all repositories from a source org to a target org

In [None]:
# Collect the 'forks_url' of every repo in the source org.
# The repo listing is paginated, so request page after page until an empty
# one is returned; a single request only covers the first page.
repos = []
page = 1
while True:
    r = requests.get('https://{}/api/v3/orgs/{}/repos'.format(baseurl, source_org),
                     headers=header, params={'page': page})
    # stop on an HTTP error instead of treating the error payload as repos
    r.raise_for_status()
    page_repos = r.json()
    if not page_repos:
        break
    repos.extend(page_repos)
    page += 1

fork_urls = [repo['forks_url'] for repo in repos]

In [None]:
# set the target org parameter, where the forks will go
data = { 'organization': target_org }

# Notes from the original author on getting this to work with the requests
# library (it already worked with curl): the calls are made inside one
# session, with a GET to the API root followed by the POSTs that create the
# forks, and the Authorization header is passed with BOTH the GET *and*
# every POST. Sending the header with the POST alone did not work for the
# author, and passing a cookie between two separate sessions did not either.

with requests.Session() as s:
    # this get request authenticates the user
    r1 = s.get('https://{}/api/v3'.format(baseurl), headers=header)
    # no point in attempting any fork if this request was rejected
    r1.raise_for_status()
    # ...and then this post request creates the fork
    for fork_url in fork_urls:
        # '<owner>/<repo>' label, taken from the two path segments that
        # precede the last one in the forks URL
        source_repo = '/'.join(fork_url.split('/')[-3:-1])
        r2 = s.post(fork_url, headers=header, data=json.dumps(data))
        if r2.ok:
            print('{} successfully forked to {}'.format(
                source_repo, r2.json()['full_name']))
        else:
            # report the failure and carry on with the remaining repos
            # instead of dying on a missing 'full_name'
            print('{} could NOT be forked (HTTP {}): {}'.format(
                source_repo, r2.status_code, r2.text))