In [2]:
import json
import subprocess
import time

import pandas as pd
import requests

In [3]:
# load in api key
# ./apikey.json must hold a JSON object with a 'token' entry (read when the
# Authorization header is built). The encoding is passed to open(): json.load()
# rejects an `encoding` keyword on Python 3.9+, and the `with` block closes the
# file handle instead of leaking it.
with open('./apikey.json', 'r', encoding='utf-8') as apikey_file:
    credentials = json.load(apikey_file)

In [4]:
# GHE docs:
# curl -H "Authorization: token OAUTH-TOKEN" http(s)://[hostname]/api/v3  <-- endpoint
''' 
Declare global variables
'''
# hostname of the GitHub Enterprise instance; interpolated into every API url in this notebook
baseurl = 'git.generalassemb.ly' # of course may be different for you
# auth header sent with every request, built from the token in `credentials`
header = {'Authorization': 'token {}'.format(credentials['token'])}
# org that repos are read from
source_org = 'Data-science-immersive' # change this as you see fit
# org that forks are created in
target_org = 'DSI-ME-1' # this one too
# NOTE(review): not referenced again in the visible part of the notebook
repo_names = []
# JSON body for the fork POST requests; later cells rebind this name
data = { 'organization': target_org }

Get a list of all the orgs the API key has access to

dsi-unit-3: id = 261
DSI-ATX-3: id = 6931
data-part-time: id = 6944
DSI-EAST-1: id = 7793
DSI-DC-6: id = 7895
DSI-EAST-2: id = 9038
Data-science-immersive: id = 9634
DAT-ME-1: id = 11079
DSI-ME-1: id = 11622

In [5]:
# Example calls below.
#
# Call the base endpoint:
#   r = requests.get('https://{}/api/v3'.format(baseurl), headers=header)

# get a listing of orgs the user's api key has access to
r = requests.get('https://{}/api/v3/user/orgs'.format(baseurl), headers=header)
# Fail loudly on a bad token or hostname: an error payload is a JSON object, and
# looping over it would otherwise die with a confusing TypeError on org['login'].
r.raise_for_status()
orgs = r.json()
for org in orgs:
    print('{}: id = {}'.format(org['login'], org['id']))

dsi-unit-3: id = 261
DSI-ATX-3: id = 6931
data-part-time: id = 6944
DSI-EAST-1: id = 7793
DSI-DC-6: id = 7895
DSI-EAST-2: id = 9038
Data-science-immersive: id = 9634
DAT-ME-1: id = 11079
DSI-ME-1: id = 11622
DAT-COG-1: id = 16028


Here's an example of fetching a single page of the repo listing for `Data-science-immersive` (the page below holds 30 repos)

In [6]:
# Ask for page 4 of the source org's repo listing, including every repo type.
r = requests.get(
    'https://{}/api/v3/orgs/{}/repos'.format(baseurl, source_org),
    headers=header,
    params={'type': 'all', 'page': 4},
)

In [7]:
len(r.json())

30

Get a dataframe with all the repos from that org

In [8]:
def get_all_repos(baseurl, source_org, startpage=1):
    """Fetch every page of an org's repo listing into one DataFrame.

    Pages are requested from ``GET /api/v3/orgs/<source_org>/repos`` starting
    at ``startpage`` until a page comes back empty. Requests are authenticated
    with the module-level ``header`` dict.

    Parameters
    ----------
    baseurl : str
        Hostname of the GitHub Enterprise instance (no scheme).
    source_org : str
        Login of the organization whose repos are listed.
    startpage : int, default 1
        First page number to request.

    Returns
    -------
    pandas.DataFrame
        One row per repo, indexed by ``id``, with columns ``name``,
        ``full_name``, ``ssh_url``, ``created_at``, ``updated_at``,
        ``pushed_at``, ``forks_url``, ``forks_count`` and ``page`` (the page
        the repo was found on; not an API field). The three ``*_at`` columns
        are converted with ``pd.to_datetime``.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.
    """
    # fields copied verbatim from each repo object in the API response
    api_fields = ['id', 'name', 'full_name', 'ssh_url', 'created_at',
                  'updated_at', 'pushed_at', 'forks_url', 'forks_count']
    url = 'https://{}/api/v3/orgs/{}/repos'.format(baseurl, source_org)

    # Rows are gathered in a plain list and the frame is built once at the end;
    # growing a DataFrame page by page relied on DataFrame.append, which no
    # longer exists in pandas 2.x.
    records = []
    page = startpage
    while True:
        r = requests.get(url, headers=header, params={'type': 'all', 'page': page})
        # an error payload is a dict, not a list of repos -- surface it as an
        # HTTP error rather than an opaque KeyError further down
        r.raise_for_status()
        repos = r.json()
        if not repos:
            print('Page {} returned no data, exiting...'.format(page))
            break
        print('Successfully fetched page {}'.format(page))

        for repo in repos:
            # .get() keeps every row the same width even if a field is absent
            record = {field: repo.get(field) for field in api_fields}
            record['page'] = page
            records.append(record)
        page += 1

    repo_df = pd.DataFrame(records, columns=api_fields + ['page']).set_index('id')

    # make the date/time cols into actual datetimes
    for col in ['created_at', 'updated_at', 'pushed_at']:
        repo_df[col] = pd.to_datetime(repo_df[col])

    return repo_df

Let's get the `DSI-ME-1` org repos

In [9]:
# Fetch every page of the DSI-ME-1 repo listing into a DataFrame indexed by repo id.
misk_df = get_all_repos(baseurl, 'DSI-ME-1')

Successfully fetched page 1
Successfully fetched page 2
Successfully fetched page 3
Successfully fetched page 4
Page 5 returned no data, exiting...


In [10]:
misk_df.head(3)

Unnamed: 0_level_0,name,full_name,ssh_url,created_at,updated_at,pushed_at,forks_url,forks_count,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
134818,templates,DSI-ME-1/templates,git@git.generalassemb.ly:DSI-ME-1/templates.git,2018-05-28 22:55:55,2018-05-28 22:55:57,2018-04-25 20:28:20,https://git.generalassemb.ly/api/v3/repos/DSI-...,0,1
134819,python-101-lesson,DSI-ME-1/python-101-lesson,git@git.generalassemb.ly:DSI-ME-1/python-101-l...,2018-05-28 22:55:56,2018-05-28 22:55:57,2018-08-01 06:17:41,https://git.generalassemb.ly/api/v3/repos/DSI-...,26,1
134820,python-comprehensions-lab,DSI-ME-1/python-comprehensions-lab,git@git.generalassemb.ly:DSI-ME-1/python-compr...,2018-05-28 22:55:56,2018-05-28 22:55:58,2018-04-30 18:09:19,https://git.generalassemb.ly/api/v3/repos/DSI-...,1,1


In [11]:
# Save the listing to ./misk.csv; the index (repo id) is written under the header 'id'.
misk_df.to_csv('./misk.csv', index_label='id')

In [12]:
# ssh urls of only the repos whose created_at is later than '8/1/2018 12:00:00'
clones = misk_df.loc[misk_df['created_at'] > '8/1/2018 12:00:00', 'ssh_url'].tolist()

Now let's get the `DSI-ME-1` org repos

In [39]:
# Fetch every page of the DSI-ME-1 repo listing into a DataFrame indexed by repo id.
dsi_me_1_df = get_all_repos(baseurl, 'DSI-ME-1')

Successfully fetched page 1
Successfully fetched page 2
Page 3 returned no data, exiting...


In [40]:
dsi_me_1_df.head(3)

Unnamed: 0_level_0,name,full_name,ssh_url,created_at,updated_at,pushed_at,forks_url,forks_count,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
134818,templates,DSI-ME-1/templates,git@git.generalassemb.ly:DSI-ME-1/templates.git,2018-05-28 22:55:55,2018-05-28 22:55:57,2018-04-25 20:28:20,https://git.generalassemb.ly/api/v3/repos/DSI-...,0,1
134819,python-101-lesson,DSI-ME-1/python-101-lesson,git@git.generalassemb.ly:DSI-ME-1/python-101-l...,2018-05-28 22:55:56,2018-05-28 22:55:57,2018-08-01 06:17:41,https://git.generalassemb.ly/api/v3/repos/DSI-...,26,1
134820,python-comprehensions-lab,DSI-ME-1/python-comprehensions-lab,git@git.generalassemb.ly:DSI-ME-1/python-compr...,2018-05-28 22:55:56,2018-05-28 22:55:58,2018-04-30 18:09:19,https://git.generalassemb.ly/api/v3/repos/DSI-...,1,1


In [41]:
# Save the listing to ./dsi_me_1.csv; the index (repo id) is written under the header 'id'.
dsi_me_1_df.to_csv('./dsi_me_1.csv', index_label='id')

And now let's get a list of repos that are in `data-science-immersive` but _not_ in `DSI-ME-1`

In [81]:
# `data_science_immersive_df` is not assigned by any other cell in this notebook,
# so on a fresh kernel the merge below raised NameError. Fetch it here.
data_science_immersive_df = get_all_repos(baseurl, 'Data-science-immersive')

# Left-join on repo name: rows from Data-science-immersive with no same-named
# repo in DSI-ME-1 end up with nulls in the `_y` columns.
not_in_misk = data_science_immersive_df.merge(dsi_me_1_df, how='left', on='name')

In [83]:
# Keep the rows that found no partner in the merge (full_name_y is null), carry
# forward only the left-hand columns of interest, and drop their `_x` suffix.
not_in_misk = (
    not_in_misk.loc[
        not_in_misk['full_name_y'].isnull(),
        ['name', 'forks_url_x', 'forks_count_x', 'created_at_x'],
    ]
    .rename(columns={
        'forks_url_x': 'forks_url',
        'forks_count_x': 'forks_count',
        'created_at_x': 'created_at',
    })
)

In [84]:
not_in_misk.head(3)

Unnamed: 0,name,forks_url,forks_count,created_at
6,1.02-lesson-python-control-flow,https://git.generalassemb.ly/api/v3/repos/Data...,2,2018-04-30 18:12:39
14,2.03-lesson-eda,https://git.generalassemb.ly/api/v3/repos/Data...,2,2018-05-10 18:36:37
22,3.06-lesson-cross-validation-train-test-split,https://git.generalassemb.ly/api/v3/repos/Data...,2,2018-05-10 18:47:34


And a list of repos that are in `DSI-ME-1` but _not_ in `data-science-immersive`

In [86]:
# Same left-join as before with the roles swapped: DSI-ME-1 is now the left side.
not_in_dsi = pd.merge(dsi_me_1_df, data_science_immersive_df, how='left', on='name')

In [87]:
# Keep the rows that found no partner in the merge (full_name_y is null), carry
# forward only the left-hand columns of interest, and drop their `_x` suffix.
not_in_dsi = (
    not_in_dsi.loc[
        not_in_dsi['full_name_y'].isnull(),
        ['name', 'forks_url_x', 'forks_count_x', 'created_at_x'],
    ]
    .rename(columns={
        'forks_url_x': 'forks_url',
        'forks_count_x': 'forks_count',
        'created_at_x': 'created_at',
    })
)

In [88]:
not_in_dsi.head(3)

Unnamed: 0,name,forks_url,forks_count,created_at
6,python-iteration_control_functions-lesson,https://git.generalassemb.ly/api/v3/repos/DSI-...,1,2018-05-28 22:55:58
14,2.03-basic_eda_walkthrough-lesson-eda,https://git.generalassemb.ly/api/v3/repos/DSI-...,1,2018-05-28 22:56:02
22,3.06-evaluation-cross_validation_train_test-le...,https://git.generalassemb.ly/api/v3/repos/DSI-...,1,2018-05-28 22:56:05


It looks like some repos were renamed along the way, so a join by name isn't perfectly accurate, and I can't find an ID field that would trace a fork back to its upstream repo. I'll have to match these manually. Let's make a short list.

In [91]:
# order the unmatched repos by creation time, oldest first
not_in_misk = not_in_misk.sort_values(by='created_at')

In [92]:
not_in_misk.head(3)

Unnamed: 0,name,forks_url,forks_count,created_at
6,1.02-lesson-python-control-flow,https://git.generalassemb.ly/api/v3/repos/Data...,2,2018-04-30 18:12:39
14,2.03-lesson-eda,https://git.generalassemb.ly/api/v3/repos/Data...,2,2018-05-10 18:36:37
22,3.06-lesson-cross-validation-train-test-split,https://git.generalassemb.ly/api/v3/repos/Data...,2,2018-05-10 18:47:34


In [94]:
# Save the short list to ./not_in_misk.csv without the index column.
not_in_misk.to_csv('./not_in_misk.csv', index=False)

Let's fork.

In [104]:
# Read ./finalfork.csv and keep the first whitespace-delimited token of each line.
fork_urls = []
with open('./finalfork.csv', 'r') as f:
    for row in f:
        # Named `fields`, not `data`: `data` is the notebook-wide fork request
        # payload, and rebinding it to a list here silently clobbered it.
        fields = row.split()
        if fields:
            fork_urls.append(fields[0])
        else:
            print('You have an empty row')

In [108]:
# Drop the header row. Filtering on the url scheme (instead of slicing off
# element 0) keeps this cell idempotent: `fork_urls = fork_urls[1:]` threw away
# one more real url every time the cell was re-run.
# NOTE(review): assumes every fork url starts with 'https://' and the header does not -- confirm.
fork_urls = [url for url in fork_urls if url.startswith('https://')]
fork_urls[:3]

['https://git.generalassemb.ly/api/v3/repos/Data-science-immersive/6.02-lesson-bagging-rfs/forks',
 'https://git.generalassemb.ly/api/v3/repos/Data-science-immersive/6.05-lesson-gradient_descent/forks',
 'https://git.generalassemb.ly/api/v3/repos/Data-science-immersive/6.01-trees-CARTs-lesson/forks']

In [111]:
# set the target org parameter, where the forks will go
data = { 'organization': 'DSI-ME-1' }
# fork urls whose POST did not succeed, kept so they can be inspected or retried
failed_forks = []

with requests.Session() as s:
    # this get request authenticates the user
    r1 = s.get('https://{}/api/v3'.format(baseurl), headers=header)
    # ...and then this post request creates the fork
    for fork_url in fork_urls:
        r2 = s.post(fork_url, headers=header, data=json.dumps(data))
        # '<owner>/<repo>' taken from .../repos/<owner>/<repo>/forks
        source_repo = '/'.join(fork_url.split('/')[-3:-1])
        # Only claim success when the server accepted the request; an error
        # body has no 'full_name', which used to raise KeyError and abort the
        # rest of the batch.
        if not r2.ok:
            failed_forks.append(fork_url)
            print('{} could not be forked (HTTP {}): {}'.format(
                source_repo, r2.status_code, r2.text))
            continue
        print('{} successfully forked to {}'.format(
            source_repo, json.loads(r2.text)['full_name']))

Data-science-immersive/6.02-lesson-bagging-rfs successfully forked to DSI-ME-1/6.02-lesson-bagging-rfs
Data-science-immersive/6.05-lesson-gradient_descent successfully forked to DSI-ME-1/6.05-lesson-gradient_descent
Data-science-immersive/6.01-trees-CARTs-lesson successfully forked to DSI-ME-1/6.01-trees-CARTs-lesson
Data-science-immersive/6.04-lesson-support_vector_machines-svm successfully forked to DSI-ME-1/6.04-lesson-support_vector_machines-svm
Data-science-immersive/7.03-lesson-regularizing-optimizing-nn successfully forked to DSI-ME-1/7.03-lesson-regularizing-optimizing-nn
Data-science-immersive/7.06-tensorflow successfully forked to DSI-ME-1/7.06-tensorflow
Data-science-immersive/7.05-lesson-cnns_ii successfully forked to DSI-ME-1/7.05-lesson-cnns_ii
Data-science-immersive/project-3 successfully forked to DSI-ME-1/project-3
Data-science-immersive/7.01-lesson-keras successfully forked to DSI-ME-1/7.01-lesson-keras
Data-science-immersive/7.02-lab-neural_network_applied successful

### Code below will clone an entire org's repos to a local folder; use with parseme.ipynb

In [42]:
# Collect every repo in the 'DSI-ME-1' org by walking the paginated listing
# (GET /orgs/:org/repos) from page 1 until a page comes back empty. The earlier
# version of this cell fetched page 1 only, so at most one page of repos was
# ever cloned.
# see docs here:
# https://developer.github.com/v3/repos/#parameters-2
repos = []
page = 1
while True:
    r = requests.get('https://{}/api/v3/orgs/{}/repos'.format(baseurl, 'DSI-ME-1'),
                     headers=header, params={'type': 'all', 'page': page})
    # stop on an HTTP error instead of treating the error body as a repo list
    r.raise_for_status()
    page_repos = r.json()
    if not page_repos:
        break
    repos.extend(page_repos)
    page += 1

# ssh clone url of every repo found
clones = [repo['ssh_url'] for repo in repos]

# show first 5
for repo in repos[:5]:
    print('{}, {}'.format(repo['name'], repo['ssh_url']))

templates, git@git.generalassemb.ly:DSI-ME-1/templates.git
python-101-lesson, git@git.generalassemb.ly:DSI-ME-1/python-101-lesson.git
python-comprehensions-lab, git@git.generalassemb.ly:DSI-ME-1/python-comprehensions-lab.git
python-comprehensions-lesson, git@git.generalassemb.ly:DSI-ME-1/python-comprehensions-lesson.git
python-functions-lesson, git@git.generalassemb.ly:DSI-ME-1/python-functions-lesson.git


In [43]:
repos[0]

{'id': 134818,
 'name': 'templates',
 'full_name': 'DSI-ME-1/templates',
 'owner': {'login': 'DSI-ME-1',
  'id': 11622,
  'avatar_url': 'https://avatars.git.generalassemb.ly/u/11622?',
  'gravatar_id': '',
  'url': 'https://git.generalassemb.ly/api/v3/users/DSI-ME-1',
  'html_url': 'https://git.generalassemb.ly/DSI-ME-1',
  'followers_url': 'https://git.generalassemb.ly/api/v3/users/DSI-ME-1/followers',
  'following_url': 'https://git.generalassemb.ly/api/v3/users/DSI-ME-1/following{/other_user}',
  'gists_url': 'https://git.generalassemb.ly/api/v3/users/DSI-ME-1/gists{/gist_id}',
  'starred_url': 'https://git.generalassemb.ly/api/v3/users/DSI-ME-1/starred{/owner}{/repo}',
  'subscriptions_url': 'https://git.generalassemb.ly/api/v3/users/DSI-ME-1/subscriptions',
  'organizations_url': 'https://git.generalassemb.ly/api/v3/users/DSI-ME-1/orgs',
  'repos_url': 'https://git.generalassemb.ly/api/v3/users/DSI-ME-1/repos',
  'events_url': 'https://git.generalassemb.ly/api/v3/users/DSI-ME-1/ev

In [None]:
len(clones)

Create a directory for the cloned repos to live

In [49]:
'''
This will create a place for all the cloned repos to live.
Use the parseme file, pointing to this directory, to
scan all clones repos for words
'''

'\nThis will create a place for all the cloned repos to live.\nUse the parseme file, pointing to this directory, to\nscan all clones repos for words\n'

In [13]:
%%bash
# Make sure ./course_dump exists and report whether it had to be created.
if [ -d "./course_dump" ]; then
    echo "directory already exists: ./course_dump"
else
    mkdir ./course_dump
    echo "created directory ./course_dump"
fi

created directory ./course_dump


In [14]:
# WARNING!!! ===
# this will clone ALL repositories in the organization to ./course_dump!!
# note - a failed clone is recorded in failed_clones but is not retried
failed_clones = []
for repo in clones:
    print("Cloning {} into ./course_dump...".format(repo))
    # Run git through subprocess and check its exit status. The IPython
    # `!cd ... && git clone $repo` form never raises when git fails, so the old
    # try/except could not catch anything and failed_clones always stayed empty.
    # Passing the url as an argv item also keeps it away from shell interpolation.
    try:
        result = subprocess.run(['git', 'clone', repo], cwd='./course_dump',
                                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                universal_newlines=True)
    except OSError as err:
        # git is not installed, or ./course_dump does not exist
        failed_clones.append(repo)
        print("noooooo!! {}".format(err))
        continue
    # git's output is captured, so echo it into the cell
    print(result.stdout)
    if result.returncode != 0:
        failed_clones.append(repo)
        print("noooooo!!")
        continue
    # made it sleepy to be a good citizen
    time.sleep(1)
print('Cloned {}/{} repositories sucksessfulli'.format((len(clones)-(len(failed_clones))), len(clones)))

Cloning git@git.generalassemb.ly:DSI-ME-1/10.02-lesson-timeseries.git into ./course_dump...
Cloning into '10.02-lesson-timeseries'...
remote: Counting objects: 43, done.[K
remote: Total 43 (delta 0), reused 0 (delta 0), pack-reused 43[K
Receiving objects: 100% (43/43), 924.38 KiB | 50.00 KiB/s, done.
Resolving deltas: 100% (17/17), done.
Cloning git@git.generalassemb.ly:DSI-ME-1/3.04-regression-metrics-lesson.git into ./course_dump...
Cloning into '3.04-regression-metrics-lesson'...
remote: Counting objects: 20, done.[K
remote: Total 20 (delta 0), reused 0 (delta 0), pack-reused 20[K
Receiving objects: 100% (20/20), 16.20 KiB | 281.00 KiB/s, done.
Resolving deltas: 100% (6/6), done.
Cloning git@git.generalassemb.ly:DSI-ME-1/6.02-lesson-bagging-rfs.git into ./course_dump...
Cloning into '6.02-lesson-bagging-rfs'...
remote: Counting objects: 36, done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 36 (delta 11), reused 36 (delta 11), pack-reused 0[K
Receiving o

Resolving deltas: 100% (8/8), done.
Cloning git@git.generalassemb.ly:DSI-ME-1/8.04-lesson-dbscan_hdbscan.git into ./course_dump...
Cloning into '8.04-lesson-dbscan_hdbscan'...
remote: Counting objects: 18, done.[K
remote: Total 18 (delta 0), reused 0 (delta 0), pack-reused 18[K
Receiving objects: 100% (18/18), 13.19 MiB | 378.00 KiB/s, done.
Resolving deltas: 100% (4/4), done.
Cloning git@git.generalassemb.ly:DSI-ME-1/9.03-lesson-bayesian_inference.git into ./course_dump...
Cloning into '9.03-lesson-bayesian_inference'...
remote: Counting objects: 10, done.[K
remote: Total 10 (delta 0), reused 0 (delta 0), pack-reused 10[K
Receiving objects: 100% (10/10), 19.56 MiB | 381.00 KiB/s, done.
Resolving deltas: 100% (1/1), done.
Cloning git@git.generalassemb.ly:DSI-ME-1/9.07-lesson-pymc3-bayes.git into ./course_dump...
Cloning into '9.07-lesson-pymc3-bayes'...
remote: Counting objects: 69, done.[K
remote: Total 69 (delta 0), reused 0 (delta 0), pack-reused 69[K
Receiving objects: 100% (

remote: Total 13 (delta 0), reused 0 (delta 0), pack-reused 13[K
Receiving objects: 100% (13/13), 5.35 MiB | 402.00 KiB/s, done.
Resolving deltas: 100% (1/1), done.
Cloning git@git.generalassemb.ly:DSI-ME-1/11.04-lesson-explainable_models-ethics.git into ./course_dump...
Cloning into '11.04-lesson-explainable_models-ethics'...
remote: Counting objects: 61, done.[K
remote: Total 61 (delta 0), reused 0 (delta 0), pack-reused 61[K
Receiving objects: 100% (61/61), 7.32 MiB | 382.00 KiB/s, done.
Resolving deltas: 100% (22/22), done.
Cloning git@git.generalassemb.ly:DSI-ME-1/10.03-lesson-arma-timeseries.git into ./course_dump...
Cloning into '10.03-lesson-arma-timeseries'...
remote: Counting objects: 31, done.[K
remote: Total 31 (delta 0), reused 0 (delta 0), pack-reused 31[K
Receiving objects: 100% (31/31), 2.72 MiB | 303.00 KiB/s, done.
Resolving deltas: 100% (12/12), done.
Cloning git@git.generalassemb.ly:DSI-ME-1/11.03-lesson-dataframes.git into ./course_dump...
Cloning into '11.03-

### This will fork all repositories from a source org to a target org

In [118]:
# Re-point the notebook-wide org settings for this section:
# forks are read from `source_org` and created in `target_org`.
target_org = 'DAT-COG-1'
source_org = 'data-part-time'

In [119]:
# Gather the forks endpoint of every repo in the source org. The listing is
# paginated, so keep requesting pages until one comes back empty; a single
# request returns only the first page and would leave the rest unforked.
repos = []
page = 1
while True:
    r = requests.get('https://{}/api/v3/orgs/{}/repos'.format(baseurl, source_org),
                     headers=header, params={'page': page})
    # stop on an HTTP error instead of treating the error body as a repo list
    r.raise_for_status()
    page_repos = r.json()
    if not page_repos:
        break
    repos.extend(page_repos)
    page += 1

fork_urls = [repo['forks_url'] for repo in repos]

In [120]:
fork_urls

['https://git.generalassemb.ly/api/v3/repos/data-part-time/your-development-environment/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/what-is-data-science/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/python-foundations/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/unit-1_project/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/statistics-in-python/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/experiments-hypothesis-tests/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/eda-with-pandas/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/data-visualization-in-python/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/unit-2_project/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/linear-regression/forks',
 'https://git.generalassemb.ly/api/v3/repos/data-part-time/train-test-split-and-bias-variance/forks',
 'https://git.ge

In [121]:
# set the target org parameter, where the forks will go
data = { 'organization': target_org }
# fork urls whose POST did not succeed, kept so they can be inspected or retried
failed_forks = []

# Original author's notes: this is done inside one session, with a GET to the
# base endpoint before the POSTs that create the forks. The auth header has to
# be sent with BOTH the GET and the POST requests; sending it with the POST
# alone did not work, and passing a cookie between two separate sessions did
# not work either. (It did work with plain curl.)

with requests.Session() as s:
    # this get request authenticates the user
    r1 = s.get('https://{}/api/v3'.format(baseurl), headers=header)
    # ...and then this post request creates the fork
    for fork_url in fork_urls:
        r2 = s.post(fork_url, headers=header, data=json.dumps(data))
        # '<owner>/<repo>' taken from .../repos/<owner>/<repo>/forks
        source_repo = '/'.join(fork_url.split('/')[-3:-1])
        # Only claim success when the server accepted the request; an error
        # body has no 'full_name', which used to raise KeyError and abort the
        # rest of the batch.
        if not r2.ok:
            failed_forks.append(fork_url)
            print('{} could not be forked (HTTP {}): {}'.format(
                source_repo, r2.status_code, r2.text))
            continue
        print('{} successfully forked to {}'.format(
            source_repo, json.loads(r2.text)['full_name']))

data-part-time/your-development-environment successfully forked to DAT-COG-1/your-development-environment
data-part-time/what-is-data-science successfully forked to DAT-COG-1/what-is-data-science
data-part-time/python-foundations successfully forked to DAT-COG-1/python-foundations
data-part-time/unit-1_project successfully forked to DAT-COG-1/unit-1_project
data-part-time/statistics-in-python successfully forked to DAT-COG-1/statistics-in-python
data-part-time/experiments-hypothesis-tests successfully forked to DAT-COG-1/experiments-hypothesis-tests
data-part-time/eda-with-pandas successfully forked to DAT-COG-1/eda-with-pandas
data-part-time/data-visualization-in-python successfully forked to DAT-COG-1/data-visualization-in-python
data-part-time/unit-2_project successfully forked to DAT-COG-1/unit-2_project
data-part-time/linear-regression successfully forked to DAT-COG-1/linear-regression
data-part-time/train-test-split-and-bias-variance successfully forked to DAT-COG-1/train-test-sp

In [15]:
failed_clones

[]