In [1]:
import sys, os, time
from timeit import default_timer as timer
from humanfriendly import format_timespan

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
%matplotlib inline

In [17]:
start = timer()
d = []
with open('data/', 'r') as f:
    for i, line in enumerate(f):
        if line[0] == "#":
            continue
        line = line.strip().split(' ')
        this_row = {}
        this_row['cl'] = int(line[0].split(':')[0])
        this_row['ef'] = float(line[1])
        this_row['paper_id'] = line[2].strip('"')
        d.append(this_row)

In [18]:
print(format_timespan(timer()-start))

3 minutes and 48.36 seconds


In [19]:
start = timer()
df = pd.DataFrame(d)
print(format_timespan(timer()-start))

2 minutes and 22.8 seconds


In [20]:
df.shape

(77071690, 3)

In [21]:
df.cl.value_counts().head(50)

1395247    70336
17462      67606
47090      64902
8254       62911
37791      59260
3895735    56040
45274      54693
12398      52291
517846     50968
1772       47894
2255739    47543
1043449    44848
3375       44309
159478     43645
184076     42870
731335     42071
89137      41534
923732     40981
80655      40492
93175      39935
87447      39631
1418889    39312
143350     38569
1592944    38355
262031     38345
556759     38265
76943      37691
3240167    37562
222735     37201
45545      36921
5170       36892
1188399    35869
1184175    35830
3786801    35829
418593     35779
52347      34922
6520       34440
200753     34252
1037619    33789
2412369    33786
4109       33133
15856      32196
2117       32184
6011       32167
1671832    31918
7126       31841
3634907    31635
11561      31542
100896     31018
2371095    30927
Name: cl, dtype: int64

In [22]:
vc = df.cl.value_counts()
num_bigger_than_100 = (vc>100).sum()
num_singletons = (vc==1).sum()
num_clusters = df.cl.nunique()
print("num_clusters: {}".format(num_clusters))
print("num_bigger_than_100: {} ({:.1f}%)".format(num_bigger_than_100, float(num_bigger_than_100)/num_clusters*100))
print("num_singletons: {} ({:.1f}%)".format(num_singletons, float(num_singletons)/num_clusters*100))

num_clusters: 3930675
num_bigger_than_100: 66884 (1.7%)
num_singletons: 1948810 (49.6%)


In [23]:
vc.sum()

77071690L

In [24]:
def output_stats(vc, subset):
    total_num_clusters = len(vc)
    subset_num_clusters = len(subset)
    total_num_papers = vc.sum()
    subset_num_papers = subset.sum()
    proportion_clusters = float(subset_num_clusters) / total_num_clusters
    proportion_papers = float(subset_num_papers) / total_num_papers
    print("number of clusters in subset: {} ({:.1f}%)".format(subset_num_clusters, proportion_clusters*100))
    print("number of papers in subset: {} ({:.1f}%)".format(subset_num_papers, proportion_papers*100))

In [33]:
print("singleton clusters (one paper in the cluster)")
print("-"*40)
output_stats(vc, vc[vc==1])
print("")

for i in [10, 20, 50, 100, 500, 1000, 10000, 20000, 50000, 60000, 70000, 80000]:
    print(">={} papers".format(i))
    print("-"*40)
    output_stats(vc, vc[vc>=i])
    print("")

print("-"*40)
print("")

ranges = [10, 100, 200, 300, 400, 500, 1000, 2000, 4000, 5000, 10000, 80000]
for i in range(len(ranges)-1):
    lower = ranges[i]
    upper = ranges[i+1]
    print("between {} and {}".format(lower, upper))
    print("-"*40)
    output_stats(vc, vc[(vc>=lower) & (vc<upper)])
    print("")

singleton clusters (one paper in the cluster)
----------------------------------------
number of clusters in subset: 1948810 (49.6%)
number of papers in subset: 1948810 (2.5%)

>=10 papers
----------------------------------------
number of clusters in subset: 815568 (20.7%)
number of papers in subset: 70760532 (91.8%)

>=20 papers
----------------------------------------
number of clusters in subset: 428241 (10.9%)
number of papers in subset: 65352293 (84.8%)

>=50 papers
----------------------------------------
number of clusters in subset: 131552 (3.3%)
number of papers in subset: 56738069 (73.6%)

>=100 papers
----------------------------------------
number of clusters in subset: 67466 (1.7%)
number of papers in subset: 52379941 (68.0%)

>=500 papers
----------------------------------------
number of clusters in subset: 18409 (0.5%)
number of papers in subset: 42210319 (54.8%)

>=1000 papers
----------------------------------------
number of clusters in subset: 10373 (0.3%)
number o

In [35]:
top_cluster = df.cl.apply(lambda x: x)

KeyboardInterrupt: 

In [36]:
vc.to_csv()

1395247    70336
17462      67606
47090      64902
8254       62911
37791      59260
3895735    56040
45274      54693
12398      52291
517846     50968
1772       47894
2255739    47543
1043449    44848
3375       44309
159478     43645
184076     42870
731335     42071
89137      41534
923732     40981
80655      40492
93175      39935
87447      39631
1418889    39312
143350     38569
1592944    38355
262031     38345
556759     38265
76943      37691
3240167    37562
222735     37201
45545      36921
           ...  
2459920        1
2983952        1
1347376        1
1216497        1
1282001        1
1478513        1
1609393        1
2002545        1
2068049        1
1936913        1
3705520        1
3836656        1
3640144        1
3574640        1
1412880        1
1806032        1
1609392        1
2068048        1
1871408        1
232912         1
36272          1
429424         1
363792         1
560304         1
625808         1
822320         1
3312624        1
3378128       

In [39]:
for i, (cluster_name, cluster_size) in enumerate(vc.iteritems()):
    print(cluster_name)
    break

1395247
