In [1]:
import sys, os, time, pickle
from timeit import default_timer as timer
from humanfriendly import format_timespan

In [2]:
import pandas as pd
import numpy as np

In [3]:
from dotenv import load_dotenv
load_dotenv('admin.env')

True

In [4]:
from db_connect_mag import Session, Paper, PaperAuthorAffiliation, db

In [5]:
# review paper about machine learning with imbalanced datasets
start = timer()
test_papers_df = pd.read_pickle('data/collect_haystack_2490420619/test_papers.pickle')
target_papers_df = pd.read_pickle('data/collect_haystack_2490420619/target_papers.pickle')
train_papers_df = pd.read_pickle('data/collect_haystack_2490420619/train_papers.pickle')
print(format_timespan(timer()-start))

0.49 seconds


In [6]:
with open('data/collect_haystack_2490420619/counter.pickle', 'rb') as f:
    c = pickle.load(f)

In [7]:
def get_target_in_test(test, target, id_colname='Paper_ID'):
    return set.intersection(set(test[id_colname]), set(target[id_colname]))
len(get_target_in_test(test_papers_df, target_papers_df))

193

In [8]:
len(target_papers_df)

193

In [9]:
len(test_papers_df)

963851

In [10]:
# remove the train (seed) papers from the test set (haystack)
n_before = len(test_papers_df)
test_papers_df = test_papers_df.drop(train_papers_df.index, errors='ignore')
n_after = len(test_papers_df)
print("removed {} seed papers from the haystack. size of haystack: {}".format(n_before-n_after, n_after))

removed 50 seed papers from the haystack. size of haystack: 963801


In [11]:
start = timer()
target_ids = set(target_papers_df.Paper_ID)
test_papers_df['target'] = test_papers_df.Paper_ID.apply(lambda x: x in target_ids)
print(format_timespan(timer()-start))

0.23 seconds


In [12]:
# def tree_distance(n1, n2, sep=":"):
#     # https://en.wikipedia.org/wiki/Lowest_common_ancestor
#     # the distance from v to w can be computed as 
#     # the distance from the root to v, plus the distance from 
#     # the root to w, minus twice the distance from 
#     # the root to their lowest common ancestor
#     v, w = [n.split(sep) for n in [n1, n2]]
#     distance_root_to_v = len(v)
#     distance_root_to_w = len(w)
    
#     distance_root_to_lca = 0
#     for i in range(min(distance_root_to_v, distance_root_to_w)):
#         if v[i] == w[i]:
#             distance_root_to_lca += 1
#         else:
#             break
#     return distance_root_to_v + distance_root_to_w - (2*distance_root_to_lca)

In [13]:
def tree_distance(n1, n2, sep=":"):
    # since depth is sort of arbitrary, let's try this
    v, w = [n.split(sep) for n in [n1, n2]]
    distance_root_to_v = len(v)
    distance_root_to_w = len(w)
    avg_depth = (distance_root_to_v + distance_root_to_w) * .5
    
    distance_root_to_lca = 0
    for i in range(min(distance_root_to_v, distance_root_to_w)):
        if v[i] == w[i]:
            distance_root_to_lca += 1
        else:
            break
    return (avg_depth - distance_root_to_lca) / avg_depth

In [14]:
def avg_distance(cl, cl_group):
    distances = []
    for x in cl_group:
        distances.append(tree_distance(cl, x))
    return sum(distances) / len(distances)

In [15]:
start = timer()
test_papers_df['avg_distance_to_train'] = test_papers_df.cl.apply(avg_distance, cl_group=train_papers_df.cl.tolist())
print(format_timespan(timer()-start))

1 minute and 30.58 seconds


In [16]:
test_papers_df.sort_values(['avg_distance_to_train', 'EF'], ascending=[True, False]).head(50)

Unnamed: 0,EF,Paper_ID,cl,title,year,target,avg_distance_to_train
895071,2.60908e-07,1993220166,459185:1:5,a study of the behavior of several methods for...,2004.0,False,0.553333
882145,1.01703e-07,2104167780,459185:1:26,exploratory undersampling for class imbalance ...,2009.0,False,0.553333
883800,6.06873e-08,2152325113,459185:1:53,concept learning in the presence of between cl...,2001.0,False,0.553333
892516,5.60104e-08,102369970,459185:1:41,class imbalances versus class overlapping an a...,2004.0,False,0.553333
895104,5.46122e-08,2015452969,459185:1:47,classification of imbalanced data a review,2009.0,False,0.553333
893087,4.98148e-08,1551909886,459185:1:69,applying support vector machines to imbalanced...,2004.0,False,0.553333
884452,4.86167e-08,2164330572,459185:1:56,an insight into classification with imbalanced...,2013.0,False,0.553333
882865,3.8985e-08,2128965734,459185:1:67,cluster based under sampling approaches for im...,2009.0,False,0.553333
882836,3.83066e-08,2122591164,459185:1:68,noisy replication in skewed binary classification,2000.0,False,0.553333
884986,2.74491e-08,2563095622,459185:1:105,boosting prediction accuracy on imbalanced dat...,2006.0,False,0.553333


In [17]:
test_papers_df.groupby('target')['EF', 'avg_distance_to_train'].describe().T

Unnamed: 0,target,False,True
EF,count,963608.0,193.0
EF,mean,4.089817e-08,1.752005e-07
EF,std,4.451369e-07,7.704503e-07
EF,min,5.43474e-09,5.46763e-09
EF,25%,5.62722e-09,8.27584e-09
EF,50%,7.1168e-09,1.7832e-08
EF,75%,1.40492e-08,7.60516e-08
EF,max,0.000171636,8.65959e-06
avg_distance_to_train,count,963608.0,193.0
avg_distance_to_train,mean,0.9983434,0.6711831


In [18]:
import matplotlib.pyplot as plt

In [19]:
%matplotlib inline

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [21]:
X = test_papers_df[['EF', 'avg_distance_to_train']]
y = test_papers_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

In [22]:
start = timer()
model = LogisticRegression()
model.fit(X_train, y_train)
print(format_timespan(timer()-start))

2.97 seconds


In [23]:
# y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred_proba = model.predict_proba(X)[:, 1]
y_pred_proba

array([9.71363365e-05, 9.71363365e-05, 9.71363365e-05, ...,
       9.71363365e-05, 9.71363365e-05, 9.71363365e-05])

In [24]:
y_pred_proba.shape

(963801,)

In [25]:
pred_ranks = pd.Series(y_pred_proba, index=X.index, name='pred_ranks')
test_papers_df.join(pred_ranks).sort_values('pred_ranks', ascending=False).head()

Unnamed: 0,EF,Paper_ID,cl,title,year,target,avg_distance_to_train,pred_ranks
895071,2.60908e-07,1993220166,459185:1:5,a study of the behavior of several methods for...,2004.0,False,0.553333,0.053024
882145,1.01703e-07,2104167780,459185:1:26,exploratory undersampling for class imbalance ...,2009.0,False,0.553333,0.053024
883800,6.06873e-08,2152325113,459185:1:53,concept learning in the presence of between cl...,2001.0,False,0.553333,0.053024
892516,5.60104e-08,102369970,459185:1:41,class imbalances versus class overlapping an a...,2004.0,False,0.553333,0.053024
895104,5.46122e-08,2015452969,459185:1:47,classification of imbalanced data a review,2009.0,False,0.553333,0.053024


In [26]:
len(test_papers_df)

963801

In [27]:
top_predictions = test_papers_df.join(pred_ranks).sort_values('pred_ranks', ascending=False).head(len(target_papers_df))

In [28]:
top_predictions.groupby('target')['Paper_ID'].count()

target
False    136
True      57
Name: Paper_ID, dtype: int64

In [29]:
top_predictions.pred_ranks.min()

0.04845664482993548

In [30]:
start = timer()
toplevels = test_papers_df.cl.apply(lambda x: x.split(":")[0])
print(format_timespan(timer()-start))

0.49 seconds


In [31]:
toplevels.name = 'toplevel'

In [32]:
toplevels_set = set(toplevels)

In [33]:
start = timer()
tbl = db.tables['clusters_meta_tree']
sq = tbl.select(tbl.c.toplevel_in_tree.in_(toplevels_set))
# r = db.engine.execute(sq).fetchall()
cl_meta = db.read_sql(sq)
print(format_timespan(timer()-start))

20.43 seconds


  result = self._query(query)


In [34]:
cl_meta = cl_meta.set_index('id')

In [35]:
train_papers_df['toplevel'] = train_papers_df.cl.apply(lambda x: x.split(":")[0]).astype(int)

In [36]:
meta_map = cl_meta.set_index('toplevel_in_tree').meta_cl

In [37]:
train_papers_df['cl_meta'] = train_papers_df.toplevel.map(meta_map)

In [38]:
test_papers_df['toplevel'] = toplevels.astype(int)
test_papers_df['cl_meta'] = test_papers_df.toplevel.map(meta_map)

In [39]:
start = timer()
test_papers_df['meta_avg_distance_to_train'] = test_papers_df.cl_meta.apply(avg_distance, cl_group=train_papers_df.cl_meta.tolist())
print(format_timespan(timer()-start))

1 minute and 30.75 seconds


In [40]:
# logistic regression including meta cl
X = test_papers_df[['EF', 'avg_distance_to_train', 'meta_avg_distance_to_train']]
y = test_papers_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

start = timer()
model_meta = LogisticRegression()
model_meta.fit(X_train, y_train)
print(format_timespan(timer()-start))

# y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred_proba_meta = model_meta.predict_proba(X)[:, 1]
#y_pred_proba

print(y_pred_proba_meta.shape)

pred_ranks_meta = pd.Series(y_pred_proba_meta, index=X.index, name='pred_ranks')
#test_papers_df.join(_pred_ranks).sort_values('pred_ranks', ascending=False).head()



top_predictions_meta = test_papers_df.join(pred_ranks_meta).sort_values('pred_ranks', ascending=False).head(len(target_papers_df))

top_predictions_meta.groupby('target')['Paper_ID'].count()

1.72 second
(963801,)


target
False    136
True      57
Name: Paper_ID, dtype: int64

In [41]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y, y_pred_proba))
print(roc_auc_score(y, y_pred_proba_meta))
print(roc_auc_score(y, _y_pred_proba))

0.9589185547168299
0.9789305864621148


NameError: name '_y_pred_proba' is not defined