In [1]:
import sys, os, time
import itertools
from timeit import default_timer as timer
from humanfriendly import format_timespan

In [2]:
from collections import Counter

In [3]:
import pandas as pd
import numpy as np

In [4]:
from dotenv import load_dotenv
# Load variables from the local `admin.env` file into the process environment.
# NOTE(review): presumably these are the DB settings that db_connect_mag reads
# when it is imported in a later cell, so this must run first — confirm.
load_dotenv('admin.env')

True

In [5]:
from db_connect_mag import Session, Paper, PaperAuthorAffiliation, db

In [6]:
# The paper under study: a review paper on community detection in graphs.
review_paper_id = 2127048411

# Fetch every PaperReferences row whose Paper_ID is the review paper.
# `r` keeps the raw rows because a later cell splits them into seed/target sets.
start = timer()
tbl = db.tables['PaperReferences']
is_review_paper = tbl.c.Paper_ID == review_paper_id
sq = tbl.select(is_review_paper)
r = db.engine.execute(sq).fetchall()

# IDs stored in the Paper_reference_ID column of those rows.
reference_ids = [row['Paper_reference_ID'] for row in r]
elapsed = format_timespan(timer() - start)
print("{} references found in {}".format(len(reference_ids), elapsed))

447 references found in 0.01 seconds


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
import numpy as np
import pandas as pd

In [9]:
# Seeded NumPy RandomState; a later cell passes it to train_test_split as `random_state`.
rng = np.random.RandomState(1)

In [10]:
# Split the raw reference rows: 50 become seeds, the remainder are held-out targets.
# NOTE(review): `rng` is a stateful RandomState instance, so re-running this cell
# without re-creating `rng` presumably yields a different split — confirm, or pass
# an integer seed instead.
seed_papers, target_papers = train_test_split(r, train_size=50, random_state=rng)



In [11]:
# Sanity check: reference ID of the first seed row.
seed_papers[0]['Paper_reference_ID']

2157305458

In [12]:
from get_papers_2_degrees_out import get_papers

In [13]:
# Replace the seed rows with whatever get_papers returns for their reference IDs.
# NOTE(review): this rebinds `seed_papers` to a different kind of object, so the
# cell is probably not safe to run twice — consider a separately named result.
seed_papers = get_papers([x.Paper_reference_ID for x in seed_papers])

In [15]:
# Peek at the first get_papers result; items can be indexed by 'title'.
# NOTE(review): this cell's execution count (15) is out of order with the next cell (14).
x = seed_papers[0]
x['title']

'a novel similarity based modularity function for graph partitioning'

In [14]:
# Title of the first seed paper.
seed_papers[0]['title']

'a novel similarity based modularity function for graph partitioning'

In [None]:
# Join the Papers table to the rank table on their Paper_ID columns.
tbl1 = db.tables['Papers']
tbl2 = db.tables['rank']
j = tbl1.join(tbl2, tbl1.c.Paper_ID==tbl2.c.Paper_ID)

In [None]:
# Select from the join `j` the row(s) whose Papers.Paper_ID is the review paper.
result = db.engine.execute(j.select(tbl1.c.Paper_ID==review_paper_id)).fetchall()

In [None]:
# `title` column of the first returned row.
result[0].title

In [None]:
# Debugging check that `rng` is a NumPy RandomState instance.
isinstance(rng, np.random.RandomState)

In [None]:
# Collect the `paper_cited` object of every entry in review_paper.paperrefs_citing
# (presumably the papers the review cites — confirm against the ORM model).
# NOTE(review): `review_paper` is not defined in any cell visible in this notebook;
# it has to be loaded (e.g. by `review_paper_id`) before this cell can run.
papers = [pr.paper_cited for pr in review_paper.paperrefs_citing]
print(len(papers))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep 50 referenced papers as the training (seed) set and hold out the rest as
# targets; the integer seed makes the split repeatable.
# NOTE(review): this rebinds `target_papers`, replacing the split made earlier
# from the raw reference rows.
train_papers, target_papers = train_test_split(papers, train_size=50, random_state=999)

In [None]:
# Convert to a set so it can be intersected with the candidate set later.
target_papers = set(target_papers)

In [None]:
# Number of distinct held-out target papers.
len(target_papers)

In [None]:
# Gather the papers linked to each training paper through its two reference
# relationships. `test_papers` holds the distinct linked papers; `c` counts how
# many times each Paper_ID was reached.
# NOTE(review): presumably `paperrefs_citing` yields the papers this paper cites
# and `paperrefs_cited` the papers citing it — confirm against the ORM model.
start = timer()
c = Counter()
test_papers = set()
cur_papers = list(train_papers)
print("looping through {} papers".format(len(cur_papers)))
for i, paper in enumerate(cur_papers):
    linked = [ref.paper_cited for ref in paper.paperrefs_citing]
    linked.extend(ref.paper_citing for ref in paper.paperrefs_cited)
    for p in linked:
        c[p.Paper_ID] += 1
        test_papers.add(p)
    print("done with {} papers. len(test_papers)=={}".format(i + 1, len(test_papers)))
print(format_timespan(timer() - start))
print(len(test_papers))

In [None]:
# How many held-out target papers appear in the candidate set `test_papers`.
len(target_papers.intersection(test_papers))

In [None]:
def tree_distance(n1, n2, sep=":"):
    """Return the number of tree edges between two hierarchical addresses.

    Each address is a `sep`-delimited path (e.g. "1:4:2"); its depth is the
    number of path components. Following
    https://en.wikipedia.org/wiki/Lowest_common_ancestor, the distance is
    depth(n1) + depth(n2) - 2 * depth(lowest common ancestor), where the
    ancestor depth is the length of the shared leading run of components.

    Args:
        n1: first address string.
        n2: second address string.
        sep: component separator, ":" by default.

    Returns:
        int distance; 0 when both addresses are identical.
    """
    path_v = n1.split(sep)
    path_w = n2.split(sep)

    # Walk both paths in lockstep until the first component that differs.
    shared_depth = 0
    for part_v, part_w in zip(path_v, path_w):
        if part_v != part_w:
            break
        shared_depth += 1

    return len(path_v) + len(path_w) - 2 * shared_depth
    

In [None]:
def distances_two_groups(g1, g2):
    """Return tree distances for every cross pair of two address groups.

    Pairs are visited with `g1` as the outer loop and `g2` as the inner loop.
    A pair whose two addresses compare equal is skipped, so it contributes no
    entry to the result.

    Args:
        g1: iterable of address strings.
        g2: iterable of address strings; it is iterated once per element of g1.

    Returns:
        list of int distances, one per non-identical pair.
    """
    return [
        tree_distance(left, right)
        for left in g1
        for right in g2
        if left != right
    ]

In [None]:
def describe_distances(g1, g2):
    """Summarize the cross-group tree distances between `g1` and `g2`.

    Args:
        g1: iterable of address strings.
        g2: iterable of address strings.

    Returns:
        The pandas `describe()` summary of the distances produced by
        `distances_two_groups(g1, g2)`.
    """
    return pd.Series(distances_two_groups(g1, g2)).describe()

In [None]:
# Collect the `cl` value of every candidate paper, skipping papers whose `cl`
# is empty/None, and report how long the attribute access took.
start = timer()
clusters = [p.cl for p in test_papers if p.cl]
print(format_timespan(timer()-start))
print(len(clusters))

In [None]:
def avg_distance(cl, cl_group):
    """Return the mean tree distance from `cl` to every address in `cl_group`.

    Args:
        cl: address string of the item being scored.
        cl_group: iterable of address strings to compare against.

    Returns:
        The arithmetic mean of `tree_distance(cl, x)` over `cl_group`, or
        None when `cl_group` is empty (there is nothing to average; this used
        to raise ZeroDivisionError).
    """
    distances = [tree_distance(cl, other) for other in cl_group]
    if not distances:
        # Nothing to average: report a missing value, not a division by zero.
        return None
    return sum(distances) / len(distances)

In [None]:
# Build one record per candidate paper, including its mean tree distance to
# the cluster addresses of the training papers.
start = timer()
train_cls = [p.cl for p in train_papers if p.cl]
rows = []
for p in test_papers:
    cl = p.cl
    # A paper without a cluster address has no distance to report.
    avg_distance_to_train = avg_distance(cl, train_cls) if cl else None
    record = {
        'Paper_ID': p.Paper_ID,
        'title': p.title,
        'EF': p.EF,
        'cl': cl,
        'avg_distance_to_train': avg_distance_to_train,
    }
    rows.append(record)
elapsed = format_timespan(timer() - start)
print("{} rows in {}".format(len(rows), elapsed))

In [None]:
# One DataFrame row per candidate-paper record collected in `rows`.
df = pd.DataFrame(rows)

In [None]:
# Paper_IDs of the held-out target papers, as a set for fast membership tests.
target_pids = {paper_obj.Paper_ID for paper_obj in target_papers}

In [None]:
# Flag the candidate rows whose Paper_ID belongs to a held-out target paper.
# Series.isin is the vectorized form of the per-row `x in target_pids` test.
df['target'] = df.Paper_ID.isin(target_pids)

In [None]:
# Order candidates by mean distance to the training clusters (ascending; rows
# with a missing distance go last under pandas' default) and renumber the index.
# NOTE(review): this rebinds `df`, so earlier displays of `df` no longer match.
df = df.sort_values('avg_distance_to_train').reset_index(drop=True)

In [None]:
# Mean distance-to-train: held-out targets first, all other candidates second.
target_dist = df.loc[df.target == True, 'avg_distance_to_train']
other_dist = df.loc[df.target == False, 'avg_distance_to_train']
print(target_dist.mean())
print(other_dist.mean())

In [None]:
# Mean EF: held-out targets first, all other candidates second.
target_ef = df.loc[df.target == True, 'EF']
other_ef = df.loc[df.target == False, 'EF']
print(target_ef.mean())
print(other_ef.mean())

In [None]:
# Size of the full candidate table and how many held-out targets it contains.
print(len(df))
print("contains {} target papers".format(df.target.sum()))
print("")

# Drop every paper sitting at the minimum EF value and report the same counts
# for what remains.
ef_thresh = df.EF.min()
print("removing papers with EF<={}".format(ef_thresh))
above_thresh = df.EF > ef_thresh
subset = df.loc[above_thresh]
n_subset_targets = subset.target.sum()
print(len(subset))
print("contains {} target papers".format(n_subset_targets))

In [None]:
# Mean distance-to-train within the EF-filtered subset: targets first, others second.
subset_target_dist = subset.loc[subset.target == True, 'avg_distance_to_train']
subset_other_dist = subset.loc[subset.target == False, 'avg_distance_to_train']
print(subset_target_dist.mean())
print(subset_other_dist.mean())

In [None]:
# Mean EF within the EF-filtered subset: targets first, others second.
subset_target_ef = subset.loc[subset.target == True, 'EF']
subset_other_ef = subset.loc[subset.target == False, 'EF']
print(subset_target_ef.mean())
print(subset_other_ef.mean())

In [None]:
# NOTE(review): `paper` is the loop variable left behind by the earlier
# `for i, paper in enumerate(cur_papers)` loop (its last value), so this cell
# only works after that loop has run. Looks like leftover exploration.
paper.paperrefs_cited[2].Paper_ID

In [None]:
# Cluster addresses of the papers in `papers`. Papers with an empty/None `cl`
# are skipped, as in the other cluster lists in this notebook, because
# tree_distance calls `.split` on every address and would fail on None.
cluster_addresses = [p.cl for p in papers if p.cl]
len(cluster_addresses)

In [None]:
# Tree distance for every unordered pair of training-paper cluster addresses.
# Papers with an empty/None `cl` are skipped, as in the other cluster lists in
# this notebook, because tree_distance calls `.split` on every address.
train_clusters = [p.cl for p in train_papers if p.cl]
distances_train_pairs = [
    tree_distance(n1, n2)
    for n1, n2 in itertools.combinations(train_clusters, 2)
]

In [None]:
# Wrap the pairwise distances in a Series (rebinding the same name) and show
# its summary statistics.
distances_train_pairs = pd.Series(distances_train_pairs)
distances_train_pairs.describe()

In [None]:
# Load every Paper whose Journal_ID matches the journal below.
# NOTE(review): `session` is never created in the visible cells (only `Session`
# is imported) — presumably `session = Session()` is needed first; confirm.
journal_id = 137773608  # Nature
nature_papers = session.query(Paper).filter_by(Journal_ID=journal_id).all()
print(len(nature_papers))

In [None]:
# Draw a fixed-seed random sample of 200 papers and keep the non-empty cluster
# addresses among them.
# NOTE(review): `nature_papers` is rebound from a list to a Series here.
nature_papers = pd.Series(nature_papers)
nature_sample = nature_papers.sample(n=200, random_state=999)
nature_samples_clusters = [p.cl for p in nature_sample if p.cl]
len(nature_samples_clusters)

In [None]:
# Distances among the review paper's own references. The group is compared with
# itself, so each unordered pair is counted twice and pairs with equal addresses
# are skipped by distances_two_groups.
within_review_papers_distances = pd.Series(
    distances_two_groups(cluster_addresses, cluster_addresses)
)
within_review_papers_distances.describe()

In [None]:
# Distances from the review paper's references to the sampled journal papers,
# as a baseline for comparison.
distances_review_to_nature = pd.Series(
    distances_two_groups(cluster_addresses, nature_samples_clusters)
)
distances_review_to_nature.describe()

In [None]:
# All PaperAuthorAffiliation records for author 2151641964.
# NOTE(review): `session` is never created in the visible cells (only `Session`
# is imported) — presumably `session = Session()` is needed first; confirm.
paas = session.query(PaperAuthorAffiliation).filter_by(Author_ID=2151641964).all()

In [None]:
# Papers linked from the author's records (records without a paper are
# dropped), then the non-empty cluster addresses of those papers.
grinstaff_papers = [paa.paper for paa in paas if paa.paper]
print(len(grinstaff_papers))
grinstaff_clusters = [p.cl for p in grinstaff_papers if p.cl]
print(len(grinstaff_clusters))

In [None]:
# Summary of tree distances between the review paper's references and this author's papers.
describe_distances(cluster_addresses, grinstaff_clusters)

In [None]:
# For author 1999253335: fetch the PaperAuthorAffiliation records (timed), keep
# the linked papers, then collect those papers' non-empty cluster addresses.
# NOTE(review): `session` is never created in the visible cells (only `Session`
# is imported) — presumably `session = Session()` is needed first; confirm.
start = timer()
rosvall_paas = session.query(PaperAuthorAffiliation).filter_by(Author_ID=1999253335).all()
print("{} records in {}".format(len(rosvall_paas), format_timespan(timer()-start)))
rosvall_papers = [paa.paper for paa in rosvall_paas if paa.paper]
print("{} papers".format(len(rosvall_papers)))
rosvall_clusters = [p.cl for p in rosvall_papers if p.cl]
print("{} clusters".format(len(rosvall_clusters)))

In [None]:
# Summary of tree distances between the review paper's references and this author's papers.
describe_distances(cluster_addresses, rosvall_clusters)