In [38]:
import sys, os, time
import itertools
from timeit import default_timer as timer
from humanfriendly import format_timespan

In [32]:
import pandas as pd
import numpy as np

In [2]:
from dotenv import load_dotenv
load_dotenv('admin.env')

True

In [59]:
from db_connect_mag import Session, Paper, PaperAuthorAffiliation

In [5]:
# Open a database session from the project's db_connect_mag module; every
# query in the cells below goes through this one object.
session = Session()

In [9]:
# review paper on community detection in graphs
# Fetch the review's Paper record by id (presumably the primary key, given
# the use of Query.get -- confirm against the Paper model) and display its
# title as a sanity check that the right record was loaded.
review_paper_id = 2127048411
review_paper = session.query(Paper).get(review_paper_id)
review_paper.title

'community detection in graphs'

In [17]:
# Collect the `paper_cited` object from each row of the review's
# `paperrefs_citing` relationship -- presumably the papers in the review's
# reference list (verify against the model definitions).
papers = [pr.paper_cited for pr in review_paper.paperrefs_citing]
print(len(papers))

447


In [23]:
def tree_distance(n1, n2, sep=":"):
    """Return the number of edges separating two nodes of a hierarchy.

    Each node is identified by an address string: the labels along the path
    from the top of the hierarchy down to the node, joined by ``sep``
    (for example ``"1:4:2"``).

    The distance is computed through the lowest common ancestor (LCA),
    https://en.wikipedia.org/wiki/Lowest_common_ancestor :

        distance(v, w) = depth(v) + depth(w) - 2 * depth(LCA(v, w))

    Depth here is the number of labels in the address, so two addresses
    whose first labels already differ are treated as joined through an
    implicit root above the first level (distance = len(v) + len(w)).

    Parameters
    ----------
    n1, n2 : str
        Node addresses to compare.
    sep : str, optional
        Separator between address labels. Defaults to ``":"``.

    Returns
    -------
    int
        Tree distance between the two nodes; 0 when the addresses are equal.

    Raises
    ------
    AttributeError
        If ``n1`` or ``n2`` has no ``split`` method (e.g. is ``None``).
    ValueError
        If ``sep`` is the empty string (raised by ``str.split``).
    """
    # Split on the caller-supplied separator (previously hard-coded to ":",
    # which silently ignored the `sep` argument).
    v = n1.split(sep)
    w = n2.split(sep)

    # The depth of the LCA equals the length of the shared address prefix.
    # zip stops at the shorter address, so no explicit bounds check is needed.
    distance_root_to_lca = 0
    for label_v, label_w in zip(v, w):
        if label_v != label_w:
            break
        distance_root_to_lca += 1

    return len(v) + len(w) - 2 * distance_root_to_lca
    

In [31]:
# `cl` attribute of every cited paper, in the same order as `papers`.
# These values are later passed to tree_distance, which splits them on ":".
# NOTE: no filtering of missing/empty `cl` here, unlike the later cells
# that use `if p.cl`.
cluster_addresses = [p.cl for p in papers]
len(cluster_addresses)

447

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Split the cited papers into 20 "train" papers and the remainder as "test".
# An integer train_size is an absolute count; random_state pins the split.
train_papers, test_papers = train_test_split(papers, train_size=20, random_state=999)



In [39]:
# Tree distance for every unordered pair of cluster addresses among the
# training papers (each pair appears once, in combinations() order).
train_clusters = [paper.cl for paper in train_papers]
distances_train_pairs = [
    tree_distance(left, right)
    for left, right in itertools.combinations(train_clusters, 2)
]

In [42]:
# Rebind the plain list of pairwise distances to a pandas Series so that
# summary statistics can be displayed with .describe().
distances_train_pairs = pd.Series(distances_train_pairs)
distances_train_pairs.describe()

count    190.000000
mean       6.552632
std        2.154083
min        2.000000
25%        6.000000
50%        7.000000
75%        8.000000
max       10.000000
dtype: float64

In [43]:
# Comparison group: every Paper row whose Journal_ID matches the id below.
# .all() materialises the full result list in memory before counting it.
journal_id = 137773608  # Nature
nature_papers = session.query(Paper).filter_by(Journal_ID=journal_id).all()
print(len(nature_papers))

248211


In [55]:
# Wrap the list of Paper objects in a Series to use .sample(); note this
# rebinds `nature_papers` from a list to a Series.
nature_papers = pd.Series(nature_papers)
# Draw 200 papers reproducibly (fixed random_state).
nature_sample = nature_papers.sample(n=200, random_state=999)
# Keep only truthy `cl` values, so sampled papers with a missing/empty
# cluster address are dropped and the result can be shorter than 200.
nature_samples_clusters = [p.cl for p in nature_sample if p.cl]
len(nature_samples_clusters)

137

In [54]:
def distances_two_groups(g1, g2):
    """Return tree distances for all cross pairs drawn from two groups.

    Every element of ``g1`` is paired with every element of ``g2`` (``g1``
    is the outer loop, so results are ordered by ``g1`` first). Pairs whose
    two addresses compare equal are skipped; the comparison is on the
    address values themselves, so distinct items that share an identical
    address are skipped as well.

    Parameters
    ----------
    g1, g2 : iterable of str
        Node addresses accepted by ``tree_distance``.

    Returns
    -------
    list of int
        One distance per retained (g1 element, g2 element) pair.
    """
    return [
        tree_distance(first, second)
        for first in g1
        for second in g2
        if first != second
    ]

In [58]:
def describe_distances(g1, g2):
    """Summarise the cross-group tree distances between ``g1`` and ``g2``.

    Computes the distances with ``distances_two_groups`` and returns the
    pandas ``Series.describe()`` summary of them.

    Parameters
    ----------
    g1, g2 : iterable of str
        Node addresses, as accepted by ``distances_two_groups``.

    Returns
    -------
    pandas.Series
        The ``describe()`` output for the collected distances.
    """
    return pd.Series(distances_two_groups(g1, g2)).describe()

In [56]:
# Distances among the review's cited papers themselves. Both arguments are
# the same list, so each unordered pair is counted twice (once per order);
# pairs with equal addresses are skipped inside distances_two_groups.
within_review_papers_distances = pd.Series(
    distances_two_groups(cluster_addresses, cluster_addresses)
)
within_review_papers_distances.describe()

count    199362.000000
mean          6.061837
std           2.034215
min           2.000000
25%           5.000000
50%           7.000000
75%           7.000000
max          10.000000
dtype: float64

In [57]:
# Distances from each of the review's cited papers to each paper in the
# Nature sample, summarised for comparison with the within-review figures.
distances_review_to_nature = pd.Series(
    distances_two_groups(cluster_addresses, nature_samples_clusters)
)
distances_review_to_nature.describe()

count    61239.000000
mean         6.764856
std          1.046019
min          4.000000
25%          6.000000
50%          7.000000
75%          7.000000
max         10.000000
dtype: float64

In [60]:
# All PaperAuthorAffiliation rows for a single author id (the following
# cell names the derived variables "grinstaff_*").
paas = session.query(PaperAuthorAffiliation).filter_by(Author_ID=2151641964).all()

In [64]:
# Resolve each author row to its Paper, dropping rows whose `paper` is falsy.
grinstaff_papers = [paa.paper for paa in paas if paa.paper]
print(len(grinstaff_papers))
# Keep only papers with a truthy cluster address, so this count can be
# smaller than the paper count printed above.
grinstaff_clusters = [p.cl for p in grinstaff_papers if p.cl]
print(len(grinstaff_clusters))

289
250


In [65]:
# Summary of distances between the review's cited papers and this author's papers.
describe_distances(cluster_addresses, grinstaff_clusters)

count    111750.000000
mean          6.664564
std           0.810777
min           4.000000
25%           6.000000
50%           7.000000
75%           7.000000
max           9.000000
dtype: float64

In [70]:
# Same pipeline as the previous author cells, for a second author id,
# with the query timed.
start = timer()
rosvall_paas = session.query(PaperAuthorAffiliation).filter_by(Author_ID=1999253335).all()
# The elapsed time covers only the query above; format_timespan renders it
# as human-readable text.
print("{} records in {}".format(len(rosvall_paas), format_timespan(timer()-start)))
# Resolve rows to Paper objects, dropping rows whose `paper` is falsy.
rosvall_papers = [paa.paper for paa in rosvall_paas if paa.paper]
print("{} papers".format(len(rosvall_papers)))
# Keep only truthy cluster addresses.
rosvall_clusters = [p.cl for p in rosvall_papers if p.cl]
print("{} clusters".format(len(rosvall_clusters)))

67 records in 0 seconds
67 papers
63 clusters


In [71]:
# Summary of distances between the review's cited papers and this author's
# papers. Pairs with identical cluster addresses are skipped by
# distances_two_groups, so the count can be below len(g1) * len(g2).
describe_distances(cluster_addresses, rosvall_clusters)

count    28158.000000
mean         5.940585
std          1.965064
min          2.000000
25%          4.000000
50%          7.000000
75%          7.000000
max         10.000000
dtype: float64