In [94]:
import ujson
import gzip
import matplotlib
import pandas
import os
import re
import time

In [3]:
# Gather the 'type' field of every JSON line in one archive.
types = []
with gzip.open('2015-10-01-15.json.gz') as archive:
    # Each line of the archive is decoded as one JSON document.
    types.extend(ujson.loads(raw_event)['type'] for raw_event in archive)


In [4]:
# Count events per type, most frequent first.
# Series.order() no longer exists in pandas; sort_values() is its replacement.
pandas.DataFrame({"types": types}).groupby("types").size().sort_values(ascending=False)

types
PushEvent                        18993
CreateEvent                       4944
IssueCommentEvent                 4299
WatchEvent                        2916
PullRequestEvent                  2531
IssuesEvent                       1846
ForkEvent                         1138
PullRequestReviewCommentEvent     1002
DeleteEvent                        924
GollumEvent                        346
MemberEvent                        330
CommitCommentEvent                 227
ReleaseEvent                       134
PublicEvent                         49
dtype: int64

In [55]:
def parse_types(file='2015-10-01-15.json.gz'):
    """Return the 'type' field of every record in a gzipped, line-delimited JSON file.

    Parameters
    ----------
    file : str, optional
        Path of the archive to read. Defaults to the archive that used to be
        hard-coded here, so existing zero-argument calls behave as before.

    Returns
    -------
    list
        One entry per line of the archive, in file order.

    Raises
    ------
    KeyError
        If a decoded record has no 'type' key.
    """
    # The with-block closes the archive on exit; no explicit close() is needed.
    with gzip.open(file) as archive:
        return [ujson.loads(raw_event)['type'] for raw_event in archive]

def parse_watch_events(file):
    """Extract the WatchEvent records from one gzipped, line-delimited JSON archive.

    Returns a dict of four parallel lists with one entry per WatchEvent, in
    file order:

    - "repository": the event's repo name
    - "repo_owner": the part of the repo name before the first '/'
    - "starer":     the login of the event's actor
    - "time":       the event's 'created_at' value
    """
    columns = {"repository": [], "repo_owner": [], "starer": [], "time": []}
    with gzip.open(file) as archive:
        for raw_event in archive:
            event = ujson.loads(raw_event)
            # Every other event type is ignored.
            if event['type'] != 'WatchEvent':
                continue
            full_name = event['repo']['name']
            columns["repository"].append(full_name)
            columns["repo_owner"].append(full_name.split('/')[0])
            columns["starer"].append(event['actor']['login'])
            columns["time"].append(event['created_at'])
    return columns
        

In [93]:
def parse_type(type, num_examples = 5, file = '2015-10-01-15.json.gz'):
    """Collect up to ``num_examples`` matches of one event type from an archive.

    Parameters
    ----------
    type : str
        Value compared against each record's 'type' field.
    num_examples : int, optional
        Maximum number of matches to return. Reading stops as soon as this
        many have been found. Values <= 0 yield an empty list.
    file : str, optional
        Path of the gzipped, line-delimited JSON archive to scan. Defaults to
        the archive that used to be hard-coded here.

    Returns
    -------
    list
        At most ``num_examples`` entries, in file order.
    """
    examples = []
    if num_examples <= 0:
        return examples
    # The with-block closes the archive on every exit path, including the
    # early break below.
    with gzip.open(file) as archive:
        for raw_event in archive:
            event = ujson.loads(raw_event)
            if event['type'] == type:
                # NOTE(review): this stores the type string, not the event
                # itself, so every entry equals `type`. Storing `event` was
                # presumably intended -- confirm before changing the return shape.
                examples.append(event['type'])
                # Stop at exactly num_examples; the previous `<` comparison
                # let one extra match through.
                if len(examples) >= num_examples:
                    break
    return examples

In [79]:
# Archives in the working directory whose names start with 2015-08, 2015-09
# or 2015-10 and end in ".gz". The dot before "gz" is escaped so that only a
# literal ".gz" suffix matches (an unescaped "." accepts any character).
ARCHIVE_PATTERN = re.compile(r'2015-(08|09|10).*\.gz$')
# os.listdir() returns names in arbitrary order; sorting makes the processing
# order (and therefore the row order of anything built from it) reproducible.
files = sorted(f for f in os.listdir('.') if ARCHIVE_PATTERN.match(f))
files

['2015-08-01-1.json.gz',
 '2015-08-01-10.json.gz',
 '2015-08-01-11.json.gz',
 '2015-08-01-12.json.gz',
 '2015-08-01-13.json.gz',
 '2015-08-01-14.json.gz',
 '2015-08-01-15.json.gz',
 '2015-08-01-16.json.gz',
 '2015-08-01-17.json.gz',
 '2015-08-01-18.json.gz',
 '2015-08-01-19.json.gz',
 '2015-08-01-2.json.gz',
 '2015-08-01-20.json.gz',
 '2015-08-01-21.json.gz',
 '2015-08-01-22.json.gz',
 '2015-08-01-23.json.gz',
 '2015-08-01-3.json.gz',
 '2015-08-01-4.json.gz',
 '2015-08-01-5.json.gz',
 '2015-08-01-6.json.gz',
 '2015-08-01-7.json.gz',
 '2015-08-01-8.json.gz',
 '2015-08-01-9.json.gz',
 '2015-08-02-1.json.gz',
 '2015-08-02-10.json.gz',
 '2015-08-02-11.json.gz',
 '2015-08-02-12.json.gz',
 '2015-08-02-13.json.gz',
 '2015-08-02-14.json.gz',
 '2015-08-02-15.json.gz',
 '2015-08-02-16.json.gz',
 '2015-08-02-17.json.gz',
 '2015-08-02-18.json.gz',
 '2015-08-02-19.json.gz',
 '2015-08-02-2.json.gz',
 '2015-08-02-20.json.gz',
 '2015-08-02-21.json.gz',
 '2015-08-02-22.json.gz',
 '2015-08-02-23.json.gz

In [85]:
def parse_files(file_list=None):
    """Run parse_watch_events over several archives and merge the results.

    Parameters
    ----------
    file_list : iterable of str, optional
        Paths to parse, in order. When omitted, the module-level ``files``
        list is used, which is what the zero-argument call has always done.

    Returns
    -------
    dict
        Keys "repository", "repo_owner", "starer" and "time", each mapping to
        one list holding the per-file lists concatenated in processing order.
    """
    if file_list is None:
        file_list = files
    all_data = {"repository": [], "repo_owner": [], "starer": [], "time": []}
    for file in file_list:
        output = parse_watch_events(file)
        # extend() grows each list in place; rebuilding it with `+` on every
        # iteration recopied everything accumulated so far.
        for column, values in all_data.items():
            values.extend(output[column])
    return all_data
#test_parse = parse_watch_events('2015-10-01-15.json.gz')


'Fri Jan 22 13:10:22 2016'

In [87]:
# Time the full parse: print a wall-clock timestamp immediately before and
# immediately after parse_files() runs.
# NOTE(review): the result is named july_to_oct, but the filename filter and
# the exported CSV name in this notebook point at August-October -- confirm
# before relying on the name.
print("start time: " + time.ctime())
july_to_oct = parse_files()
print("end time: " + time.ctime())

start time: Fri Jan 22 13:11:17 2016
end time: Fri Jan 22 13:46:14 2016


In [88]:
# Pin the column order explicitly. Building a frame from a dict sorted the
# keys in older pandas releases and keeps insertion order in newer ones, so
# without `columns` the layout depends on the installed version.
all_df = pandas.DataFrame(
    july_to_oct, columns=["repo_owner", "repository", "starer", "time"]
)
# Preview the first rows instead of rendering the whole frame.
all_df.head(10)

Unnamed: 0,repo_owner,repository,starer,time
0,riot,riot/riot,vclub,2015-08-01T01:00:17Z
1,NativeScript,NativeScript/NativeScript,sayanghosh123,2015-08-01T01:00:20Z
2,borzunov,borzunov/remoteink,neon89,2015-08-01T01:00:27Z
3,shadowsocks,shadowsocks/shadowsocks-android,Lrya,2015-08-01T01:00:28Z
4,ariya,ariya/phantomjs,christopherbalz,2015-08-01T01:00:34Z
5,apache,apache/activemq-artemis,windedge,2015-08-01T01:00:37Z
6,vsouza,vsouza/awesome-ios,py110,2015-08-01T01:00:38Z
7,qiujuer,qiujuer/Genius-Android,artshell,2015-08-01T01:00:45Z
8,Varying-Vagrant-Vagrants,Varying-Vagrant-Vagrants/VVV,synthetiv,2015-08-01T01:00:45Z
9,gawel,gawel/pyquery,lsemel,2015-08-01T01:00:47Z


In [90]:
# Number of watch events per user, most active first.
# Series.order() no longer exists in pandas; sort_values() is its replacement.
all_df.groupby('starer').size().sort_values(ascending=False)

starer
mcanthony          6804
timelyportfolio    2553
pranavlathigara    2390
trietptm           1555
3ll0               1527
hsavit1            1459
mettwillen         1443
nimblemachine      1399
vilmarbfilho       1378
yuxingxin          1346
maoabc1818         1344
beni55             1287
hardikamal         1273
runo280            1235
swhgoon            1157
...
rafakwolf        1
rafal98          1
dhedges01        1
dheatov          1
dheater          1
dhealy05         1
dheailhamdhan    1
dhduvall         1
dhdnzk           1
dhdmaster        1
dhddhdwcg        1
rafal81          1
dhcrain          1
dhconstantino    1
cbourjau         1
Length: 509748, dtype: int64

In [91]:
# Number of watch events per repository, most watched first.
# Series.order() no longer exists in pandas; sort_values() is its replacement.
all_df.groupby('repository').size().sort_values(ascending=False)

repository
minimaxir/big-list-of-naughty-strings    8736
gloomyson/StarCraft                      7453
isocpp/CppCoreGuidelines                 6936
FreeCodeCamp/FreeCodeCamp                6709
chrissimpkins/Hack                       6584
zenorocha/clipboard.js                   5955
HannahMitt/HomeMirror                    5549
yudai/gotty                              5525
sindresorhus/awesome                     5250
getlantern/lantern                       5223
facebook/relay                           4567
Microsoft/WinObjC                        4361
facebook/react-native                    4067
twbs/bootstrap                           3951
Netflix/falcor                           3705
...
knowitall/documentextractor               1
knowitall/ollie                           1
knowledge-cloud/AllStartFromHere          1
knowledge-map/simple-storage              1
knowledgeanyhow/notebooks                 1
knowledgetechnologyuhh/docks              1
knowlet/8Comic-Viewer          

In [92]:
# Persist the combined frame to CSV in the working directory (the row index
# is written as the first, unnamed column).
all_df.to_csv("aug_to_oct.csv")