In [1]:
import sys, os, time
import itertools
from timeit import default_timer as timer
from humanfriendly import format_timespan

In [2]:
from collections import Counter

In [3]:
import pandas as pd
import numpy as np

In [4]:
from dotenv import load_dotenv
load_dotenv('admin.env')

True

In [5]:
from db_connect_mag import Session, Paper, PaperAuthorAffiliation

In [6]:
# Session is imported from db_connect_mag; the ORM queries in the cells below
# are issued through this object.
session = Session()

In [7]:
# Starting point of the analysis: a review paper on community detection in graphs.
# Fetch it by ID and display its title to confirm the right record came back.
# NOTE(review): `.get()` is presumably a primary-key lookup (SQLAlchemy-style
# Query API) — confirm against db_connect_mag.
review_paper_id = 2127048411
review_paper = session.query(Paper).get(review_paper_id)
review_paper.title

'community detection in graphs'

In [8]:
# Take the `paper_cited` side of every `paperrefs_citing` record attached to the
# review paper — judging by the attribute names, this is its reference list.
papers = [pr.paper_cited for pr in review_paper.paperrefs_citing]
print(len(papers))

447


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Split the review's papers into 50 "seed" papers (train_papers) and the rest
# (target_papers). Later cells expand from the seeds and check how many of the
# targets are reached. random_state is fixed so the split is repeatable.
train_papers, target_papers = train_test_split(papers, train_size=50, random_state=999)



In [11]:
# Rebind as a set so it can be intersected with the candidate set later.
# NOTE(review): this overwrites the list returned by train_test_split under the
# same name; re-running the cell just copies the set, so it is harmless.
target_papers = set(target_papers)

In [13]:
# Sanity check: number of distinct held-out target papers.
len(target_papers)

397

In [14]:
# One-hop citation expansion around the seed (train) papers.
#
# For every seed paper, gather the papers on the other end of each of its
# `paperrefs_citing` records (`paper_cited`) and each of its `paperrefs_cited`
# records (`paper_citing`). The union of everything reached is `test_papers`;
# `c` counts, per Paper_ID, how many times that paper was reached across all
# seeds and both directions.
#
# NOTE(review): nothing here excludes the seed papers themselves or the review
# paper from `test_papers` — presumably the review paper is reached through
# every seed's `paperrefs_cited`; confirm whether that is intended.
# NOTE(review): the loop variable `paper` stays bound after the loop and is
# read again by a later inspection cell.
start = timer()
test_papers = set()
c = Counter()
cur_papers = list(train_papers)
print("looping through {} papers".format(len(cur_papers)))
for i, paper in enumerate(cur_papers):
    # Papers on the `paper_cited` side of this seed's outgoing reference records.
    for p in [pr.paper_cited for pr in paper.paperrefs_citing]:
        c[p.Paper_ID] += 1
        test_papers.add(p)
    # Papers on the `paper_citing` side of this seed's incoming reference records.
    for p in [pr.paper_citing for pr in paper.paperrefs_cited]:
        c[p.Paper_ID] += 1
        test_papers.add(p)
    # Progress line: running size of the candidate set after each seed.
    print("done with {} papers. len(test_papers)=={}".format(i+1, len(test_papers)))
print(format_timespan(timer()-start))
print(len(test_papers))

looping through 50 papers
done with 1 papers. len(test_papers)==1031
done with 2 papers. len(test_papers)==1100
done with 3 papers. len(test_papers)==1408
done with 4 papers. len(test_papers)==3121
done with 5 papers. len(test_papers)==3172
done with 6 papers. len(test_papers)==3323
done with 7 papers. len(test_papers)==3379
done with 8 papers. len(test_papers)==3401
done with 9 papers. len(test_papers)==6800
done with 10 papers. len(test_papers)==7966
done with 11 papers. len(test_papers)==8034
done with 12 papers. len(test_papers)==8065
done with 13 papers. len(test_papers)==33110
done with 14 papers. len(test_papers)==33198
done with 15 papers. len(test_papers)==33203
done with 16 papers. len(test_papers)==33392
done with 17 papers. len(test_papers)==33434
done with 18 papers. len(test_papers)==52126
done with 19 papers. len(test_papers)==52530
done with 20 papers. len(test_papers)==52632
done with 21 papers. len(test_papers)==57916
done with 22 papers. len(test_papers)==57938
done 

In [15]:
# Number of held-out target papers that the one-hop expansion reached.
len(target_papers.intersection(test_papers))

197

In [19]:
def tree_distance(n1, n2, sep=":"):
    """Return the number of edges separating two nodes of a hierarchy.

    Each node is identified by a path string whose components are joined by
    ``sep`` (for example ``"1:4:2"``). A path with k components is treated as
    sitting k levels below an implicit root.

    The result follows the lowest-common-ancestor identity
    (https://en.wikipedia.org/wiki/Lowest_common_ancestor):

        dist(v, w) = depth(v) + depth(w) - 2 * depth(lca(v, w))

    Args:
        n1: path string of the first node.
        n2: path string of the second node.
        sep: separator between path components (default ``":"``).

    Returns:
        int: the tree distance; 0 when both paths are identical.
    """
    path_v = n1.split(sep)
    path_w = n2.split(sep)

    # The depth of the lowest common ancestor equals the length of the
    # leading run of components that both paths share.
    lca_depth = 0
    for part_v, part_w in zip(path_v, path_w):
        if part_v != part_w:
            break
        lca_depth += 1

    return len(path_v) + len(path_w) - 2 * lca_depth
    

In [20]:
def distances_two_groups(g1, g2):
    """Return the tree distance for every cross pair drawn from two groups.

    Every element of ``g1`` is paired with every element of ``g2``. Pairs whose
    two addresses compare equal are left out, so no zero-distance entries
    from identical addresses appear in the result.

    Args:
        g1: iterable of node path strings.
        g2: iterable of node path strings.

    Returns:
        list: distances in ``g1``-major, ``g2``-minor order.
    """
    return [
        tree_distance(left, right)
        for left in g1
        for right in g2
        if not left == right
    ]

In [21]:
def describe_distances(g1, g2):
    """Summarise the cross-group tree distances between ``g1`` and ``g2``.

    Args:
        g1: iterable of node path strings.
        g2: iterable of node path strings.

    Returns:
        The result of ``pandas.Series.describe()`` on the distances produced
        by ``distances_two_groups(g1, g2)``.
    """
    return pd.Series(distances_two_groups(g1, g2)).describe()

In [23]:
start = timer()
clusters = [p.cl for p in test_papers if p.cl]
print(format_timespan(timer()-start))
print(len(clusters))

4 minutes and 40.75 seconds
107626


In [25]:
def avg_distance(cl, cl_group):
    """Return the mean tree distance from one address to a group of addresses.

    Args:
        cl: node path string to measure from.
        cl_group: iterable of node path strings to measure to.

    Returns:
        The arithmetic mean of ``tree_distance(cl, x)`` over ``cl_group``.

    Raises:
        ZeroDivisionError: if ``cl_group`` yields no elements.
    """
    # Materialise the distances first so that any iterable (not only sized
    # containers) can be passed as cl_group.
    per_member = [tree_distance(cl, member) for member in cl_group]
    return sum(per_member) / len(per_member)

In [28]:
# Build one record per candidate paper, scoring each by its mean tree distance
# to the cluster addresses of the seed (train) papers.
start = timer()
train_cls = [p.cl for p in train_papers if p.cl]
rows = []
for p in test_papers:
    cl = p.cl
    # Candidates without a cluster address cannot be scored.
    avg_distance_to_train = avg_distance(cl, train_cls) if cl else None
    rows.append({
        'Paper_ID': p.Paper_ID,
        'title': p.title,
        'EF': p.EF,
        'cl': cl,
        'avg_distance_to_train': avg_distance_to_train
    })
print("{} rows in {}".format(len(rows), format_timespan(timer()-start)))

107626 rows in 3 minutes and 3.93 seconds


In [37]:
# One DataFrame row per record in `rows`; columns come from the record keys.
df = pd.DataFrame(rows)

In [38]:
# IDs of the held-out target papers, for fast membership tests.
target_pids = {p.Paper_ID for p in target_papers}

In [39]:
# Flag the candidates that belong to the held-out target set.
# Series.isin performs the membership test in one vectorized call instead of
# invoking a Python lambda once per row.
df['target'] = df.Paper_ID.isin(target_pids)

In [40]:
# Rank candidates from smallest to largest mean distance to the seed papers and
# renumber the index so that position equals rank.
# NOTE(review): rows with a missing distance are expected to sort last
# (pandas' default na_position) — confirm for the pandas version in use.
# NOTE(review): this rebinds `df` in place of the unsorted frame.
df = df.sort_values('avg_distance_to_train').reset_index(drop=True)

In [45]:
# Mean distance-to-seeds: target papers first, then everything else.
for flag in (True, False):
    print(df[df.target == flag].avg_distance_to_train.mean())

5.591269035532995
6.790269666477396


In [46]:
# Mean EF: target papers first, then everything else.
for flag in (True, False):
    print(df[df.target == flag].EF.mean())

9.313932740609136e-07
2.2271047048469217e-08


In [60]:
# Report the size of the full candidate table and how many targets it holds.
print(len(df))
print("contains {} target papers".format(df.target.sum()))
print("")
# Use the smallest EF value present as the cut-off: every row whose EF equals
# that minimum is dropped, and only rows strictly above it are kept.
ef_thresh = df.EF.min()
print("removing papers with EF<={}".format(ef_thresh))
subset = df[df.EF>ef_thresh]
# Report the filtered size and check how many targets survive the filter.
print(len(subset))
print("contains {} target papers".format(subset.target.sum()))

107626
contains 197 target papers

removing papers with EF<=5.43474e-09
83008
contains 197 target papers


In [61]:
# Mean distance-to-seeds within the EF-filtered subset: targets, then non-targets.
for flag in (True, False):
    print(subset[subset.target == flag].avg_distance_to_train.mean())

5.591269035532995
6.813137868157614


In [62]:
# Mean EF within the EF-filtered subset: targets, then non-targets.
for flag in (True, False):
    print(subset[subset.target == flag].EF.mean())

9.313932740609136e-07
2.727613341283163e-08


In [72]:
# Ad-hoc inspection of the third `paperrefs_cited` record on `paper`.
# NOTE(review): `paper` is not defined in this cell — it is whatever the last
# `for ... paper in ...` loop left bound, so the result depends on which cells
# ran before this one. Consider removing or pinning to an explicit paper.
paper.paperrefs_cited[2].Paper_ID

18156537

In [31]:
# `cl` value of every paper in `papers`.
# NOTE(review): unlike other cells, missing `cl` values are not filtered out
# here; tree_distance calls `.split` on each address, so a None entry would
# raise. Confirm every paper in `papers` has a `cl`.
cluster_addresses = [p.cl for p in papers]
len(cluster_addresses)

447

In [39]:
# Tree distance for every unordered pair of seed (train) papers.
train_clusters = [p.cl for p in train_papers]
distances_train_pairs = [
    tree_distance(n1, n2)
    for n1, n2 in itertools.combinations(train_clusters, 2)
]

In [42]:
# Wrap the pairwise distances in a Series (rebinding the same name) and
# summarise them.
# NOTE(review): combinations of 50 seed papers yield 1225 pairs; a count of 190
# corresponds to 20 papers, so a stored summary showing 190 would predate the
# current train_size=50 split. Re-run from a fresh kernel to confirm.
distances_train_pairs = pd.Series(distances_train_pairs)
distances_train_pairs.describe()

count    190.000000
mean       6.552632
std        2.154083
min        2.000000
25%        6.000000
50%        7.000000
75%        8.000000
max       10.000000
dtype: float64

In [43]:
# Baseline corpus: every Paper row whose Journal_ID matches the journal below.
# `.all()` returns the full result as a list, so all matching rows are loaded
# at once.
journal_id = 137773608  # Nature
nature_papers = session.query(Paper).filter_by(Journal_ID=journal_id).all()
print(len(nature_papers))

248211


In [55]:
# Wrap the list in a Series (rebinding the same name) to use `.sample`, then
# draw a fixed-seed random sample of 200 papers.
nature_papers = pd.Series(nature_papers)
nature_sample = nature_papers.sample(n=200, random_state=999)
# Keep only the sampled papers that have a truthy `cl`, so the resulting list
# can be shorter than 200.
nature_samples_clusters = [p.cl for p in nature_sample if p.cl]
len(nature_samples_clusters)

137

In [56]:
# Distances among the review's own papers. The group is compared with itself,
# so each unordered pair contributes twice (once in each order); identical
# addresses are skipped by distances_two_groups.
within_review_papers_distances = pd.Series(
    distances_two_groups(cluster_addresses, cluster_addresses)
)
within_review_papers_distances.describe()

count    199362.000000
mean          6.061837
std           2.034215
min           2.000000
25%           5.000000
50%           7.000000
75%           7.000000
max          10.000000
dtype: float64

In [57]:
# Distances from the review's papers to the random Nature sample.
distances_review_to_nature = pd.Series(
    distances_two_groups(cluster_addresses, nature_samples_clusters)
)
distances_review_to_nature.describe()

count    61239.000000
mean         6.764856
std          1.046019
min          4.000000
25%          6.000000
50%          7.000000
75%          7.000000
max         10.000000
dtype: float64

In [60]:
# All paper-author-affiliation records for a single author, selected by a
# hard-coded Author_ID.
# NOTE(review): the ID is unexplained here; the variables built from this
# result are named "grinstaff", so presumably it identifies that author — confirm.
paas = session.query(PaperAuthorAffiliation).filter_by(Author_ID=2151641964).all()

In [64]:
# Resolve each author record to its paper, dropping records with no linked paper.
grinstaff_papers = [paa.paper for paa in paas if paa.paper]
print(len(grinstaff_papers))
# Keep only the `cl` values that are truthy; papers without one are dropped.
grinstaff_clusters = [p.cl for p in grinstaff_papers if p.cl]
print(len(grinstaff_clusters))

289
250


In [65]:
# Summary statistics of tree distances between the review's papers and this
# author's papers.
describe_distances(cluster_addresses, grinstaff_clusters)

count    111750.000000
mean          6.664564
std           0.810777
min           4.000000
25%           6.000000
50%           7.000000
75%           7.000000
max           9.000000
dtype: float64

In [70]:
# Same procedure as for the previous author, for a second hard-coded Author_ID,
# with the query timed.
# NOTE(review): the variable names suggest this ID identifies an author named
# Rosvall — confirm; the ID itself is unexplained.
start = timer()
rosvall_paas = session.query(PaperAuthorAffiliation).filter_by(Author_ID=1999253335).all()
print("{} records in {}".format(len(rosvall_paas), format_timespan(timer()-start)))
# Resolve records to papers, dropping records with no linked paper.
rosvall_papers = [paa.paper for paa in rosvall_paas if paa.paper]
print("{} papers".format(len(rosvall_papers)))
# Keep only truthy `cl` values.
rosvall_clusters = [p.cl for p in rosvall_papers if p.cl]
print("{} clusters".format(len(rosvall_clusters)))

67 records in 0 seconds
67 papers
63 clusters


In [71]:
# Summary statistics of tree distances between the review's papers and the
# second author's papers.
describe_distances(cluster_addresses, rosvall_clusters)

count    28158.000000
mean         5.940585
std          1.965064
min          2.000000
25%          4.000000
50%          7.000000
75%          7.000000
max         10.000000
dtype: float64