In [1]:
import sys, os, time, pickle
from timeit import default_timer as timer
from humanfriendly import format_timespan

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report

In [5]:
from dotenv import load_dotenv
load_dotenv('admin.env')

True

In [6]:
from db_connect_mag import Session, Paper, PaperAuthorAffiliation, db

In [7]:
# test_papers_df = pd.read_pickle('data/collect_haystack_20180409/test_papers.pickle')
# target_papers_df = pd.read_pickle('data/collect_haystack_20180409/target_papers.pickle')
# train_papers_df = pd.read_pickle('data/collect_haystack_20180409/train_papers.pickle')

In [8]:
# this is the data for the fortunato review on Community Detection in Graphs
start = timer()
test_papers_df = pd.read_pickle('data/collect_haystack_20180409_2/test_papers.pickle')
target_papers_df = pd.read_pickle('data/collect_haystack_20180409_2/target_papers.pickle')
train_papers_df = pd.read_pickle('data/collect_haystack_20180409_2/train_papers.pickle')
print(format_timespan(timer()-start))

3.44 seconds


In [9]:
with open('data/collect_haystack_20180409_2/counter.pickle', 'rb') as f:
    c = pickle.load(f)

In [10]:
def get_target_in_test(test, target, id_colname='Paper_ID'):
    """Return the set of IDs that occur in both `test` and `target`.

    Parameters
    ----------
    test, target : mapping-like (e.g. DataFrame)
        Each must support ``obj[id_colname]`` and yield an iterable of IDs.
    id_colname : str, default 'Paper_ID'
        Key/column holding the IDs to compare.

    Returns
    -------
    set
        IDs present in both inputs.
    """
    test_ids = set(test[id_colname])
    target_ids = set(target[id_colname])
    return test_ids & target_ids
len(get_target_in_test(test_papers_df, target_papers_df))

397

In [11]:
len(target_papers_df)

397

In [12]:
test_subset = test_papers_df.sample(n=100000, random_state=999)

In [13]:
len(get_target_in_test(test_subset, target_papers_df))

18

In [14]:
# remove the train (seed) papers from the test set (haystack)
# Match on Paper_ID rather than on index labels: train_papers_df appears to carry
# its own 0..N-1 index while the haystack keeps the labels of test_papers_df, so
# dropping by train_papers_df.index would remove unrelated rows that merely share
# a label with a seed paper.
n_before = len(test_subset)
is_seed = test_subset.Paper_ID.isin(train_papers_df.Paper_ID)
# .copy() so later column assignments on test_subset do not hit a filtered view
test_subset = test_subset[~is_seed].copy()
n_after = len(test_subset)
print("removed {} seed papers from the haystack. size of haystack: {}".format(n_before-n_after, n_after))

removed 2 seed papers from the haystack. size of haystack: 99998


In [15]:
start = timer()
target_ids = set(target_papers_df.Paper_ID)
test_subset['target'] = test_subset.Paper_ID.apply(lambda x: x in target_ids)
print(format_timespan(timer()-start))

0.03 seconds


In [16]:
# def tree_distance(n1, n2, sep=":"):
#     # https://en.wikipedia.org/wiki/Lowest_common_ancestor
#     # the distance from v to w can be computed as 
#     # the distance from the root to v, plus the distance from 
#     # the root to w, minus twice the distance from 
#     # the root to their lowest common ancestor
#     v, w = [n.split(sep) for n in [n1, n2]]
#     distance_root_to_v = len(v)
#     distance_root_to_w = len(w)
    
#     distance_root_to_lca = 0
#     for i in range(min(distance_root_to_v, distance_root_to_w)):
#         if v[i] == w[i]:
#             distance_root_to_lca += 1
#         else:
#             break
#     return distance_root_to_v + distance_root_to_w - (2*distance_root_to_lca)

In [17]:
def tree_distance(n1, n2, sep=":"):
    """Normalised distance between two nodes given as `sep`-joined paths.

    Each node is split on `sep` into its path components. The result is
    ``(avg_depth - shared) / avg_depth`` where ``avg_depth`` is the mean
    number of components of the two paths and ``shared`` is the length of
    their common leading prefix. Identical paths give 0.0; paths that
    differ in the first component give 1.0.

    Parameters
    ----------
    n1, n2 : str
        Node labels, e.g. ``"3372652:1:1:2"``.
    sep : str, default ":"
        Separator between path components.

    Returns
    -------
    float
    """
    path_v = n1.split(sep)
    path_w = n2.split(sep)
    avg_depth = (len(path_v) + len(path_w)) * .5

    # count matching components from the start until the paths diverge
    shared = 0
    for comp_v, comp_w in zip(path_v, path_w):
        if comp_v != comp_w:
            break
        shared += 1
    return (avg_depth - shared) / avg_depth

In [18]:
def avg_distance(cl, cl_group):
    """Mean `tree_distance` from `cl` to every label in `cl_group`.

    Parameters
    ----------
    cl : str
        Cluster label of the item being scored.
    cl_group : iterable of str
        Cluster labels to compare against. An empty group raises
        ZeroDivisionError.

    Returns
    -------
    float
    """
    distances = [tree_distance(cl, other) for other in cl_group]
    return sum(distances) / len(distances)

In [19]:
n_before = len(test_subset)
test_subset = test_subset.dropna(subset=['title'])
n_after = len(test_subset)
print("dropped {} rows".format(n_before-n_after))

dropped 6757 rows


In [20]:
test_subset = test_subset.reset_index()

In [21]:
test_subset

Unnamed: 0,index,EF,Paper_ID,cl,title,year,target
0,1411908,8.745410e-09,2068632553,672554:2:77,strongly interacting traveling waves and quasi...,1992.0,False
1,683273,2.829400e-07,2150816008,6520:30:12,towards a practical public key cryptosystem,1978.0,False
2,224275,5.434740e-09,2101484911,1469834:13,precise mishandling of the digital image struc...,2011.0,False
3,598357,5.434740e-09,1606508719,1184212:14:253,discovering frequent pattern pairs,2013.0,False
4,646149,2.048950e-08,1964492749,713072:8:48,a rate of convergence result for the largest e...,2006.0,False
5,61283,9.907340e-09,2167749088,1772:5:792,exploiting modularity hierarchy and repetition...,2004.0,False
6,730330,1.297790e-08,2085217561,746809:2:110,an evolutionary approach to multi objective sc...,1999.0,False
7,1429860,6.239990e-09,2010634520,311672:27:1:2,an efficient algorithm for solving coupled sch...,1996.0,False
8,993048,1.551290e-08,2169940797,638376:1:8:126,a coupled model of photosynthesis stomatal con...,2003.0,False
9,1051217,2.730090e-08,2054250120,1037035:28,bootstrap based goodness of fit tests,1993.0,False


In [22]:
# http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html
class ItemSelector(BaseEstimator, TransformerMixin):
    """Pick one entry out of a keyed collection of features.

    The input is indexed by feature first and by sample second, so that

    >> len(data[key]) == n_samples

    This is the reverse of the usual scikit-learn layout, where the first
    index runs over samples.

    Anything that supports ``data[key]`` works: a dict of lists, a 2D numpy
    array, a pandas DataFrame, a numpy record array, and so on.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    Data grouped by sample (e.g. a list of dicts) is not supported; for that
    layout look at something like `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        Key of the entry to return from the collection.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        selected = data_dict[self.key]
        return selected

In [23]:
class ClusterTransformer(BaseEstimator, TransformerMixin):
    """Feature: average tree distance from each row's cluster label to the seed papers.

    The seed labels are read from the module-level ``train_papers_df.cl`` at
    transform time, so that frame must exist before the transformer is used.

    Parameters
    ----------
    colname : str, default 'cl'
        Column of the input DataFrame that holds the cluster label strings.
    """
    def __init__(self, colname='cl'):
        self.colname = colname

    def fit(self, x, y=None):
        """Stateless; returns the transformer itself."""
        return self

    def transform(self, df):
        """Return an (n_rows, 1) array of `avg_distance` values for ``df[colname]``."""
        # build the seed label list once rather than inside the apply call
        seed_labels = train_papers_df.cl.tolist()
        avg_dist = df[self.colname].apply(avg_distance, cl_group=seed_labels)
        # Series.as_matrix() was deprecated and later removed from pandas;
        # .values returns the same ndarray on old and new versions alike.
        return avg_dist.values.reshape(-1, 1)

In [24]:
class DataFrameColumnTransformer(BaseEstimator, TransformerMixin):
    """Expose a single DataFrame column as an (n_rows, 1) feature array.

    Parameters
    ----------
    colname : str
        Name of the column to extract.
    """
    def __init__(self, colname):
        self.colname = colname

    def fit(self, x, y=None):
        """Stateless; returns the transformer itself."""
        return self

    def transform(self, df):
        """Return the values of ``df[colname]`` reshaped to a single column."""
        # Series.as_matrix() was deprecated and later removed from pandas;
        # .values returns the same ndarray on old and new versions alike.
        return df[self.colname].values.reshape(-1, 1)

In [25]:
class PipelineExperiment(object):
    """Wrap a classifier in the two-feature pipeline used in this notebook.

    The pipeline is a FeatureUnion of the cluster-distance feature
    (`ClusterTransformer`) and the 'EF' column (`DataFrameColumnTransformer`),
    followed by the supplied classifier as the final 'clf' step.

    Parameters
    ----------
    clf : estimator
        Classifier placed at the end of the pipeline.
    """

    def __init__(self, clf):
        self.clf = clf
        self.pipeline_init()

    def pipeline_init(self):
        """Build the pipeline, store it on ``self.pipeline`` and return self."""
        cluster_branch = Pipeline([
            ('cl_feat', ClusterTransformer()),
        ])
        ef_branch = Pipeline([
            ('ef_feat', DataFrameColumnTransformer('EF')),
        ])
        features = FeatureUnion(
            transformer_list=[
                ('avg_distance_to_train', cluster_branch),
                ('ef', ef_branch),
            ],
        )
        self.pipeline = Pipeline([
            ('union', features),
            ('clf', self.clf),
        ])
        return self

    def fit(self, X, y):
        """Fit the wrapped pipeline on (X, y) and return self."""
        self.pipeline.fit(X, y)
        return self
    

In [26]:
# X = test_papers_df[['EF', 'avg_distance_to_train']]
X = test_subset[test_subset.title.notnull()]
# Fortunato paper was published in 2010
X = X[X.year<=2010]
X = X.reset_index(drop=True)

# y = test_papers_df['target']
y = X['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

In [27]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [28]:
start = timer()
# pipeline.fit(X_train, y_train)
experiment = PipelineExperiment(GaussianNB())
pipeline = experiment.pipeline
pipeline.fit(X_train, y_train)
print(format_timespan(timer()-start))

4.66 seconds


In [29]:
start = timer()
# y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred_proba = pipeline.predict_proba(X)[:, 1]
print(format_timespan(timer()-start))
y_pred_proba


5.84 seconds


array([1.05270222e-05, 1.18656551e-05, 1.05540610e-05, ...,
       1.05302697e-05, 1.05372845e-05, 1.05302944e-05])

In [30]:
y_pred_proba.shape

(58309,)

In [31]:
pred_ranks = pd.Series(y_pred_proba, index=X.index, name='pred_ranks')
X.join(pred_ranks).sort_values('pred_ranks', ascending=False).head()

Unnamed: 0,index,EF,Paper_ID,cl,title,year,target,pred_ranks
57303,347261,2.00905e-08,2030407863,3372652:1:5:1:23,curvature and temperature of complex networks,2009.0,False,1.0
55686,10998,6.52631e-09,2154408973,3372652:1:1:1881,report for the office of scientific and techni...,2006.0,False,1.0
43188,2589255,5.43474e-09,2076799116,3372652:1:84:41,comment on markets come to bits,2007.0,False,1.0
41307,119371,2.34888e-06,1971421925,3372652:1:1:1,community structure in social and biological n...,2002.0,True,1.0
45518,2433701,5.98092e-09,102198652,3372652:1:728:2,measuring information propagation and retentio...,2006.0,False,1.0


In [32]:
# top_predictions = test_papers_df.join(pred_ranks).sort_values('pred_ranks', ascending=False).head(len(target_papers_df))
top_predictions = X.join(pred_ranks).sort_values('pred_ranks', ascending=False).head(len(target_papers_df))

In [33]:
top_predictions.groupby('target')['Paper_ID'].count()

target
False    388
True       9
Name: Paper_ID, dtype: int64

In [34]:
top_predictions.pred_ranks.min()

0.017822750385069884

In [35]:
start = timer()
y_test_pred = pipeline.predict(X_test)
print(format_timespan(timer()-start))

1.24 second


In [36]:
print(classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

      False       1.00      0.99      1.00     11659
       True       0.02      0.67      0.04         3

avg / total       1.00      0.99      1.00     11662



In [40]:
print(pipeline.steps)

[('union', FeatureUnion(n_jobs=1,
       transformer_list=[('avg_distance_to_train', Pipeline(memory=None, steps=[('cl_feat', ClusterTransformer(colname='cl'))])), ('ef', Pipeline(memory=None,
     steps=[('ef_feat', DataFrameColumnTransformer(colname='EF'))]))],
       transformer_weights=None)), ('clf', GaussianNB(priors=None))]


In [41]:
from pipeline_experiments import PipelineExperiment

In [46]:
experiment = PipelineExperiment(LogisticRegression(), train_papers_df)

In [49]:
print(experiment.pipeline.named_steps)

{'union': FeatureUnion(n_jobs=1,
       transformer_list=[('avg_distance_to_train', Pipeline(memory=None,
     steps=[('cl_feat', ClusterTransformer(colname='cl',
          seed_papers=              EF    Paper_ID               cl  \
0   6.005770e-07  2044881936    2189348:1:5:1
1   2.393100e-08  2069629462  3372652:1:1:170
2   8.128820e-... detection in delay toler...  2007
49  mixture models and exploratory analysis in net...  2007))]))],
       transformer_weights=None), 'clf': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)}


In [59]:
print(experiment.pipeline.named_steps.keys())

dict_keys(['union', 'clf'])


In [58]:
x = experiment.pipeline.named_steps['union']
print([_[0] for _ in x.transformer_list])

['avg_distance_to_train', 'ef', 'avg_title_tfidf_cosine_similarity']


In [71]:
experiment.pipeline._final_estimator.__class__.__name__

'LogisticRegression'

In [66]:
x.transformer_list

[('avg_distance_to_train', Pipeline(memory=None,
       steps=[('cl_feat', ClusterTransformer(colname='cl',
            seed_papers=              EF    Paper_ID               cl  \
  0   6.005770e-07  2044881936    2189348:1:5:1
  1   2.393100e-08  2069629462  3372652:1:1:170
  2   8.128820e-08  2091202730   3372652:1:1:52
  3   7.454170e-07  2171707538      1223566:1:7
  4   1.597520e-08  2024529797  3372652:1:1:2...ty detection in delay toler...  2007
  49  mixture models and exploratory analysis in net...  2007))])),
 ('ef', Pipeline(memory=None,
       steps=[('ef_feat', DataFrameColumnTransformer(colname='EF'))])),
 ('avg_title_tfidf_cosine_similarity', Pipeline(memory=None,
       steps=[('title_feat', AverageTfidfCosSimTransformer(colname='title',
                 seed_papers=              EF    Paper_ID               cl  \
  0   6.005770e-07  2044881936    2189348:1:5:1
  1   2.393100e-08  2069629462  3372652:1:1:170
  2   8.128820e-08  2091202730   3372652:1:1:52
  3   7.45417

In [63]:
print(experiment.seed_papers.head())

             EF    Paper_ID               cl  \
0  6.005770e-07  2044881936    2189348:1:5:1   
1  2.393100e-08  2069629462  3372652:1:1:170   
2  8.128820e-08  2091202730   3372652:1:1:52   
3  7.454170e-07  2171707538      1223566:1:7   
4  1.597520e-08  2024529797  3372652:1:1:262   

                                               title  year  
0  a critical point for random graphs with a give...  1995  
1  comparison and validation of community structu...  2006  
2  detect overlapping and hierarchical community ...  2009  
3      a faster algorithm for betweenness centrality  2001  
4  communicability graph and community structures...  2009  


In [50]:
print(experiment.pipeline.named_steps['union'])

FeatureUnion(n_jobs=1,
       transformer_list=[('avg_distance_to_train', Pipeline(memory=None,
     steps=[('cl_feat', ClusterTransformer(colname='cl',
          seed_papers=              EF    Paper_ID               cl  \
0   6.005770e-07  2044881936    2189348:1:5:1
1   2.393100e-08  2069629462  3372652:1:1:170
2   8.128820e-... detection in delay toler...  2007
49  mixture models and exploratory analysis in net...  2007))]))],
       transformer_weights=None)


In [43]:
experiment.run(X, y, num_target=len(target_papers_df))

07:23:27 __main__.pipeline_experiments.118 INFO : TOP PREDICTIONS: True is count of target papers in the top predicted
07:23:27 __main__.pipeline_experiments.119 INFO : target
False    389
True       8
Name: Paper_ID, dtype: int64


In [44]:
experiment.pipeline._final_estimator

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
SVC()

In [83]:
start = timer()
vect = CountVectorizer()
# Vocabulary is fitted on haystack titles followed by seed titles.
# Series.append was removed in pandas 2.0; pd.concat yields the same sequence.
data = pd.concat([test_subset.title, train_papers_df.title]).tolist()
vect.fit(data)

print(format_timespan(timer()-start))

1.33 second


In [84]:
start = timer()
tf_train = vect.transform(train_papers_df.title.tolist())
print(format_timespan(timer()-start))

0 seconds


In [85]:
start = timer()
tf_test = vect.transform(test_subset.title.tolist())
print(format_timespan(timer()-start))

1.16 second


In [86]:
start = timer()
tf_global = vect.transform(data)
print(format_timespan(timer()-start))

1.66 second


In [87]:
tf_transform = TfidfTransformer()
tf_transform.fit(tf_global)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [88]:
tfidf_train = tf_transform.transform(tf_train)

In [90]:
tfidf_test = tf_transform.transform(tf_test)

In [92]:
tfidf_test.shape

(93395, 50876)

In [94]:
from sklearn.metrics.pairwise import cosine_similarity

In [95]:
# Collapse the seed-paper TF-IDF rows into one mean row and compare every haystack
# title against it. A sparse matrix's .mean() returns np.matrix, which recent
# scikit-learn input validation rejects, so convert it to a plain 2D ndarray first.
seed_centroid = np.asarray(tfidf_train.mean(axis=0))
csims = cosine_similarity(tfidf_test, seed_centroid)

In [97]:
test_subset = test_subset.join(pd.Series(csims.flatten(), name='title_tfidf_cosine_similarity'))

In [99]:
test_subset.sort_values('title_tfidf_cosine_similarity', ascending=False)

Unnamed: 0,index,EF,Paper_ID,cl,title,year,title_tfidf_cosine_similarity
59445,1171939,7.707870e-09,2092102472,3372652:1:1:1281,detection of community structure in networks b...,2012.0,0.583353
3435,107272,1.992670e-06,2095293504,3372652:1:1:2,finding and evaluating community structure in ...,2004.0,0.582090
38142,166307,5.605830e-09,2483140568,1115862:1,community detection in social networks,2015.0,0.528806
89978,1528215,5.434740e-09,2554784184,3372652:1:1:2817,evolutionary community detection in complex an...,2016.0,0.522203
39834,79044,1.703530e-08,2032721088,902576:3:10,o r in the community,1981.0,0.516389
57222,1268234,6.230730e-09,2053229448,3372652:1:1:1613,finding community structure in spatially const...,2015.0,0.513658
86563,123169,1.621630e-08,2026143132,22916:2:2:9,adaptive clustering algorithm for community de...,2008.0,0.513363
50236,1599825,7.191750e-09,2014541072,3372652:1:1:665,detecting the community structure in complex n...,2008.0,0.493491
24901,2541308,5.525310e-09,2616094075,3372652:1:1:2732,adaptive community detection in complex networ...,2017.0,0.480367
30605,144134,6.579060e-08,125376580,3372652:1:1:65,an algorithm to find overlapping community str...,2007.0,0.470975


In [30]:
start = timer()
test_papers_df['avg_distance_to_train'] = test_papers_df.cl.apply(avg_distance, cl_group=train_papers_df.cl.tolist())
print(format_timespan(timer()-start))

4 minutes and 0.95 seconds


In [31]:
test_papers_df.sort_values(['avg_distance_to_train', 'EF'], ascending=[True, False]).head(50)

Unnamed: 0,EF,Paper_ID,cl,title,year,target,avg_distance_to_train
107272,1.99267e-06,2095293504,3372652:1:1:2,finding and evaluating community structure in ...,2004.0,False,0.611111
110154,9.50108e-07,2131681506,3372652:1:1:9,fast unfolding of communities in large networks,2008.0,False,0.611111
109495,8.57968e-07,2120043163,3372652:1:1:7,comparing community structure identification,2005.0,False,0.611111
114759,3.48473e-07,2606584716,3372652:1:1:29,e mail as spectroscopy automated discovery of ...,2005.0,False,0.611111
110902,8.71138e-08,2139818818,3372652:1:1:55,mixture models and exploratory analysis in net...,2007.0,False,0.611111
107228,8.12882e-08,2091202730,3372652:1:1:52,detect overlapping and hierarchical community ...,2009.0,False,0.611111
109443,6.87394e-08,2117526408,3372652:1:1:68,towards real time community detection in large...,2009.0,False,0.611111
118641,2.72823e-08,1967752035,3372652:1:1:148,finding instabilities in the community structu...,2005.0,False,0.611111
123223,2.55093e-08,2033507223,3372652:1:1:128,quantifying and identifying the overlapping co...,2009.0,False,0.611111
106490,2.3931e-08,2069629462,3372652:1:1:170,comparison and validation of community structu...,2006.0,False,0.611111


In [32]:
# Select several columns with a list; the bare-tuple form after groupby is
# deprecated and raises in current pandas.
test_papers_df.groupby('target')[['EF', 'avg_distance_to_train']].describe().T

Unnamed: 0,target,False,True
EF,count,2612894.0,397.0
EF,mean,3.942335e-08,9.342797e-07
EF,std,3.210841e-07,2.630972e-06
EF,min,5.43474e-09,7.33858e-09
EF,25%,5.71776e-09,2.81918e-08
EF,50%,7.59413e-09,9.1914e-08
EF,75%,1.66744e-08,4.85766e-07
EF,max,0.000171636,2.70753e-05
avg_distance_to_train,count,2612894.0,397.0
avg_distance_to_train,mean,0.9971833,0.8228729


In [33]:
import matplotlib.pyplot as plt

In [34]:
%matplotlib inline

In [159]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list = [
            ('avg_distance_to_train', Pipeline([
#                 ('selector', ItemSelector(key='avg_distance_to_train')),
#                 ('vect', DictVectorizer(X.avg_distance_to_train.to_dict))
                ('cl_feat', ClusterTransformer()),
            ])),
            ('ef', Pipeline([
#                 ('selector', ItemSelector(key='avg_distance_to_train')),
#                 ('vect', DictVectorizer(X.avg_distance_to_train.to_dict))
                ('ef_feat', DataFrameColumnTransformer('EF')),
            ])),
            
            # NOTE: this is just to test.
            # we probably want features that relate the titles to the seed papers. not just straight features in test set.
#             ('title_bow', Pipeline([
#                 ('selector', ItemSelector(key='title')),
#                 ('tfidf', TfidfVectorizer(min_df=10)),
#             ]))
        ],
    )),
    
    ('logreg', LogisticRegression())
])

In [160]:
# X = test_papers_df[['EF', 'avg_distance_to_train']]
X = test_papers_df[test_papers_df.title.notnull()]
# Fortunato paper was published in 2010
X = X[X.year<=2010]

# y = test_papers_df['target']
y = X['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

In [161]:
start = timer()
pipeline.fit(X_train, y_train)
print(format_timespan(timer()-start))

1 minute and 59.51 seconds


In [162]:
start = timer()
# y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred_proba = pipeline.predict_proba(X)[:, 1]
print(format_timespan(timer()-start))
y_pred_proba


2 minutes and 21.45 seconds


array([0.00016925, 0.00016925, 0.00016925, ..., 0.00016925, 0.00016925,
       0.00016925])

In [163]:
y_pred_proba.shape

(1521097,)

In [164]:
pred_ranks = pd.Series(y_pred_proba, index=X.index, name='pred_ranks')
test_papers_df.join(pred_ranks).sort_values('pred_ranks', ascending=False).head()

Unnamed: 0,EF,Paper_ID,cl,title,year,target,avg_distance_to_train,pred_ranks
107272,1.99267e-06,2095293504,3372652:1:1:2,finding and evaluating community structure in ...,2004.0,False,0.611111,0.062686
110154,9.50108e-07,2131681506,3372652:1:1:9,fast unfolding of communities in large networks,2008.0,False,0.611111,0.062686
109495,8.57968e-07,2120043163,3372652:1:1:7,comparing community structure identification,2005.0,False,0.611111,0.062686
114759,3.48473e-07,2606584716,3372652:1:1:29,e mail as spectroscopy automated discovery of ...,2005.0,False,0.611111,0.062686
110902,8.71138e-08,2139818818,3372652:1:1:55,mixture models and exploratory analysis in net...,2007.0,False,0.611111,0.062686


In [165]:
len(test_papers_df)

2613291

In [166]:
len(X)

1521097

In [167]:
# top_predictions = test_papers_df.join(pred_ranks).sort_values('pred_ranks', ascending=False).head(len(target_papers_df))
top_predictions = X.join(pred_ranks).sort_values('pred_ranks', ascending=False).head(len(target_papers_df))

In [168]:
top_predictions.groupby('target')['Paper_ID'].count()

target
False    270
True     127
Name: Paper_ID, dtype: int64

In [169]:
top_predictions.pred_ranks.min()

0.058318146270451454

In [170]:
start = timer()
y_test_pred = pipeline.predict(X_test)
print(format_timespan(timer()-start))

28.94 seconds


In [171]:
print(classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

      False       1.00      1.00      1.00    304138
       True       0.00      0.00      0.00        82

avg / total       1.00      1.00      1.00    304220



  'precision', 'predicted', average, warn_for)


In [32]:
# what if we only use pagerank?
X = test_papers_df[['EF']]
y = test_papers_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

start = timer()
_model = LogisticRegression()
_model.fit(X_train, y_train)
print(format_timespan(timer()-start))

# score every row of X (train and test portions alike) to rank the whole haystack
_y_pred_proba = _model.predict_proba(X)[:, 1]

# report the shape of this cell's own predictions, not a variable left over
# from another cell
print(_y_pred_proba.shape)

_pred_ranks = pd.Series(_y_pred_proba, index=X.index, name='pred_ranks')
#test_papers_df.join(_pred_ranks).sort_values('pred_ranks', ascending=False).head()



# keep as many top-ranked rows as there are target papers
_top_predictions = test_papers_df.join(_pred_ranks).sort_values('pred_ranks', ascending=False).head(len(target_papers_df))

_top_predictions.groupby('target')['Paper_ID'].count()

4.23 seconds
(2613291,)


target
False    388
True       9
Name: Paper_ID, dtype: int64

In [33]:
# what if we only use avg distance?
X = test_papers_df[['avg_distance_to_train']]
y = test_papers_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

start = timer()
_model = LogisticRegression()
_model.fit(X_train, y_train)
print(format_timespan(timer()-start))

# score every row of X (train and test portions alike) to rank the whole haystack
_y_pred_proba = _model.predict_proba(X)[:, 1]

# report the shape of this cell's own predictions, not a variable left over
# from another cell
print(_y_pred_proba.shape)

_pred_ranks = pd.Series(_y_pred_proba, index=X.index, name='pred_ranks')
#test_papers_df.join(_pred_ranks).sort_values('pred_ranks', ascending=False).head()



# keep as many top-ranked rows as there are target papers
_top_predictions = test_papers_df.join(_pred_ranks).sort_values('pred_ranks', ascending=False).head(len(target_papers_df))

_top_predictions.groupby('target')['Paper_ID'].count()

4.6 seconds
(2613291,)


target
False    377
True      20
Name: Paper_ID, dtype: int64

In [34]:
start = timer()
toplevels = test_papers_df.cl.apply(lambda x: x.split(":")[0])
print(format_timespan(timer()-start))

2.39 seconds


In [55]:
toplevels.name = 'toplevel'

In [37]:
toplevels_set = set(toplevels)

In [46]:
start = timer()
tbl = db.tables['clusters_meta_tree']
sq = tbl.select(tbl.c.toplevel_in_tree.in_(toplevels_set))
# r = db.engine.execute(sq).fetchall()
cl_meta = db.read_sql(sq)
print(format_timespan(timer()-start))

  result = self._query(query)


19.19 seconds


In [50]:
cl_meta = cl_meta.set_index('id')

In [82]:
train_papers_df['toplevel'] = train_papers_df.cl.apply(lambda x: x.split(":")[0]).astype(int)

In [83]:
meta_map = cl_meta.set_index('toplevel_in_tree').meta_cl

In [84]:
train_papers_df['cl_meta'] = train_papers_df.toplevel.map(meta_map)

In [87]:
test_papers_df['toplevel'] = toplevels.astype(int)
test_papers_df['cl_meta'] = test_papers_df.toplevel.map(meta_map)

In [89]:
start = timer()
test_papers_df['meta_avg_distance_to_train'] = test_papers_df.cl_meta.apply(avg_distance, cl_group=train_papers_df.cl_meta.tolist())
print(format_timespan(timer()-start))

4 minutes and 10.75 seconds


In [94]:
# logistic regression including meta cl
X = test_papers_df[['EF', 'avg_distance_to_train', 'meta_avg_distance_to_train']]
y = test_papers_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

start = timer()
model_meta = LogisticRegression()
model_meta.fit(X_train, y_train)
print(format_timespan(timer()-start))

# y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred_proba_meta = model_meta.predict_proba(X)[:, 1]
#y_pred_proba

print(y_pred_proba_meta.shape)

pred_ranks_meta = pd.Series(y_pred_proba_meta, index=X.index, name='pred_ranks')
#test_papers_df.join(_pred_ranks).sort_values('pred_ranks', ascending=False).head()



top_predictions_meta = test_papers_df.join(pred_ranks_meta).sort_values('pred_ranks', ascending=False).head(len(target_papers_df))

top_predictions_meta.groupby('target')['Paper_ID'].count()

6.54 seconds
(2613291,)


target
False    289
True     108
Name: Paper_ID, dtype: int64

In [105]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y, y_pred_proba))
print(roc_auc_score(y, y_pred_proba_meta))
print(roc_auc_score(y, _y_pred_proba))

0.9553407108497369
0.8686914172329787
0.7952530679672806
