# Journal Recommendation for Preprints

The goal of this notebook is to help users decide which journal would be most appropriate for their preprint. The central idea is to use the Euclidean distance between document vectors to find similar published works and see which journals those works were sent to.

In [1]:
%load_ext autoreload
%autoreload 2

from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier

from tqdm import tqdm_notebook

# Load bioRxiv Document Vectors

In [2]:
# Collapse the preprint -> published-article map to one row per bioRxiv DOI.
# "document" and "doi" use the "last" aggregation within each DOI group;
# every other column uses "first".
biorxiv_journal_df = (
    pd.read_csv("output/mapped_published_doi.tsv", sep="\t")
    .groupby("doi")
    .agg(
        dict(
            document="last",
            category="first",
            journal="first",
            doi="last",
            published_doi="first",
            pmcid="first",
        )
    )
    .reset_index(drop=True)
)
biorxiv_journal_df.head()

Unnamed: 0,document,category,journal,doi,published_doi,pmcid
0,000026_v1.xml,genetics,PLoS Genetics,10.1101/000026,10.1371/journal.pgen.1004412,PMC4125079
1,000042_v2.xml,genomics,Nature Reviews Genetics,10.1101/000042,10.1038/nrg3723,
2,000067_v1.xml,genomics,Nature,10.1101/000067,10.1038/nature12904,PMC4285441
3,000091_v1.xml,synthetic biology,,10.1101/000091,,
4,000109_v1.xml,evolutionary biology,PLoS Genetics,10.1101/000109,10.1371/journal.pgen.1004410,PMC4072524


In [3]:
# Report how many rows carry a (non-NaN) pmcid.
# NOTE: the last line prints a fraction (two decimals), not a percentage.
total_entries = biorxiv_journal_df.shape[0]
pmcid_entries = biorxiv_journal_df.pmcid.count()
print(f"Number of Non-NaN entries: {pmcid_entries}")
print(f"Total number of entries: {total_entries}")
print(f"Percent Covered: {pmcid_entries/total_entries:.2f}")

Number of Non-NaN entries: 17120
Total number of entries: 71118
Percent Covered: 0.24


In [4]:
# Golden set: the rows of the bioRxiv map that have a non-null pmcid
golden_set_df = biorxiv_journal_df.loc[lambda preprint_df: preprint_df.pmcid.notnull()]
golden_set_df.head()

Unnamed: 0,document,category,journal,doi,published_doi,pmcid
0,000026_v1.xml,genetics,PLoS Genetics,10.1101/000026,10.1371/journal.pgen.1004412,PMC4125079
2,000067_v1.xml,genomics,Nature,10.1101/000067,10.1038/nature12904,PMC4285441
4,000109_v1.xml,evolutionary biology,PLoS Genetics,10.1101/000109,10.1371/journal.pgen.1004410,PMC4072524
6,000141_v1.xml,cancer biology,PLoS Computational Biology,10.1101/000141,10.1371/journal.pcbi.1003433,PMC3894166
7,000158_v1.xml,bioinformatics,BMC Genomics,10.1101/000158,10.1186/1471-2164-15-398,PMC4041996


# Load Pubmed Central Document Vectors

In [6]:
# PubMed Central journal/paper map, restricted to rows typed as research articles
pmc_articles_df = pd.read_csv(
    "../../pmc/exploratory_data_analysis/output/pubmed_central_journal_paper_map.tsv.xz",
    sep="\t",
).loc[lambda paper_df: paper_df.article_type == "research-article"]
pmc_articles_df.head()

Unnamed: 0,journal,article_type,doi,pmcid
0,Environ_Health,research-article,10.1186/1476-069X-5-22,PMC1552054
1,Environ_Health,research-article,10.1186/1476-069X-4-12,PMC1226148
3,Environ_Health,research-article,10.1186/s12940-017-0316-3,PMC5635510
4,Environ_Health,research-article,10.1186/1476-069X-10-46,PMC3125232
5,Environ_Health,research-article,10.1186/1476-069X-11-91,PMC3533997


In [7]:
# Research-article count per journal; the shape gives the number of distinct journals
journals = pmc_articles_df.journal.value_counts()
print(journals.shape)
journals

(9112,)


PLoS_One                                     218509
Sci_Rep                                      101554
Nat_Commun                                    23812
Acta_Crystallogr_Sect_E_Struct_Rep_Online     23537
J_Exp_Med                                     22687
                                              ...  
Wildl                                             1
Psychoanal_Hist                                   1
Trends_Neurosci                                   1
Int_J_Electrochem_Sci                             1
Int_J_Anesthesiol_Pain_Med                        1
Name: journal, Length: 9112, dtype: int64

In [8]:
# Filter out low count journals: keep only journals with more than 100 research articles.
# A boolean isin mask selects the same rows as the previous string-built query, without
# round-tripping thousands of journal names through repr() and the query parser.
frequent_journals = journals[journals > 100].index
pmc_articles_df = pmc_articles_df[pmc_articles_df.journal.isin(frequent_journals)]
print(pmc_articles_df.shape)
pmc_articles_df.head()

(1865604, 4)


Unnamed: 0,journal,article_type,doi,pmcid
0,Environ_Health,research-article,10.1186/1476-069X-5-22,PMC1552054
1,Environ_Health,research-article,10.1186/1476-069X-4-12,PMC1226148
3,Environ_Health,research-article,10.1186/s12940-017-0316-3,PMC5635510
4,Environ_Health,research-article,10.1186/1476-069X-10-46,PMC3125232
5,Environ_Health,research-article,10.1186/1476-069X-11-91,PMC3533997


In [9]:
# Document vectors for PubMed Central papers: a "document" id column plus feat_* columns
pmc_embedding_df = pd.read_csv(
    filepath_or_buffer="../../pmc/word_vector_experiment/output/pmc_document_vectors.tsv.xz",
    sep="\t",
)
pmc_embedding_df.head()

Unnamed: 0,document,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,PMC1552054,-0.428596,0.004477,-0.153677,0.059131,-0.394598,-0.066383,0.358798,-0.245809,0.431974,...,-0.067994,0.033121,-0.077121,-0.433617,-1.111247,-0.40526,0.293903,0.613155,-0.500204,-0.214057
1,PMC1226148,-0.434013,-0.185516,-0.070654,0.488759,-0.280275,0.042681,0.116685,0.037062,0.530896,...,0.318905,-0.065244,0.159258,-0.380412,-0.665559,-0.485417,0.180982,0.523053,-0.485158,-0.251244
2,PMC5635510,-0.248388,0.11866,-0.103008,0.267721,0.278939,-0.220584,0.089593,-0.159638,0.86619,...,0.081127,-0.110091,-0.031617,-0.188741,-0.936947,-0.461118,0.618781,0.647195,-0.27024,-0.409643
3,PMC3125232,-0.507224,0.024099,-0.039696,-0.171992,0.371326,-0.137797,-0.202616,-0.156994,0.695519,...,-0.218274,-0.022597,-0.028885,-0.25056,-1.14073,-0.568136,0.482088,0.899122,-0.250971,-0.592976
4,PMC3533997,-0.424435,0.175519,-0.048797,0.178023,-0.042913,-0.282377,0.317917,-0.007002,0.586774,...,-0.111805,-0.26262,-0.187817,-0.463804,-1.308874,-0.352892,0.251982,0.550853,-0.659678,-0.614042


# Train Recommendation System

In [10]:
def cross_validation(dataset, cv=10, n_neighbors=10, random_state=100, centroid=False):
    """Estimate how often the true journal shows up among the nearest neighbours.

    The rows of ``dataset`` are split with a shuffled ``KFold``. For every fold a
    ``KNeighborsClassifier`` is fit on a reference set and, for each held-out
    paper, the ``n_neighbors`` closest reference points are retrieved. A held-out
    paper counts as a hit when its own journal is among the journals of those
    neighbours. The number of hits per fold and the mean hit rate over all folds
    are printed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per paper. Must contain a ``journal`` column; every other
        column is treated as a numeric feature and cast to ``float32``.
    cv : int
        Number of folds passed to ``KFold``.
    n_neighbors : int
        Number of neighbours retrieved for each held-out paper.
    random_state : int
        Seed for the shuffled ``KFold`` split.
    centroid : bool
        If False, the reference set is the training papers themselves.
        If True, the reference set is one mean vector per journal, computed
        from the training papers of the fold only.

    Returns
    -------
    None
        Results are reported through ``print`` only.
    """
    folds = KFold(n_splits=cv, random_state=random_state, shuffle=True)
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    cv_fold_accs = []

    # Features and labels do not depend on the fold, so build them once
    # instead of re-materialising the full matrix on every iteration.
    X = dataset.drop("journal", axis=1).values.astype('float32')
    Y = dataset.journal.values

    for train, val in folds.split(dataset):

        if centroid:
            # One averaged vector per journal, using training rows only so the
            # held-out papers never contribute to their own journal's centroid.
            journal_centroid_df = (
                dataset
                .iloc[train]
                .groupby("journal")
                .agg("mean")
                .reset_index()
            )
            reference_X = (
                journal_centroid_df
                .drop("journal", axis=1)
                .values
                .astype('float32')
            )
            reference_Y = journal_centroid_df.journal.values
        else:
            # Index the training labels once per fold; doing it per validation
            # row copies the whole training label array every time.
            reference_X, reference_Y = X[train], Y[train]

        knn_model.fit(reference_X, reference_Y)

        # Only the neighbour indices are needed; distances are not used.
        neighbors = knn_model.kneighbors(X[val], return_distance=False)

        # hits[i] is True when the journal of held-out paper i equals the
        # journal of at least one of its retrieved neighbours.
        hits = (reference_Y[neighbors] == Y[val][:, np.newaxis]).any(axis=1)

        cv_fold_accs.append(np.sum(hits)/len(hits))
        print(f"{np.sum(hits)} out of {len(hits)}")
    print(np.mean(cv_fold_accs))
        

## KNearestNeighbors Paper by Paper Comparison

The first idea for a classifier is to compare papers directly against other papers. Because PLOS One papers heavily outnumber those of other journals, I sub-sampled each journal to at most 100 papers. I then trained a KNearestNeighbors model to determine how often the correct journal appears among the top ten neighbors as well as the top twenty neighbors.

In [11]:
# Build the training set: drop papers that belong to the golden set, attach the
# document vectors, and sample at most 100 papers per journal with a fixed seed.
training_dataset = (
    pmc_articles_df
    # isin mask instead of interpolating every golden-set PMCID into a query string
    .loc[lambda paper_df: ~paper_df.pmcid.isin(golden_set_df.pmcid)]
    [["pmcid", "journal"]]
    .merge(pmc_embedding_df, left_on="pmcid", right_on="document")
    .drop("pmcid", axis=1)
    .groupby("journal", group_keys=False)
    .apply(lambda x: x.sample(min(len(x), 100), random_state=100))
    .set_index("document")
)
print(training_dataset.shape)
training_dataset.head()

(202318, 301)


Unnamed: 0_level_0,journal,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PMC4624147,3_Biotech,-0.07148,-0.387928,-0.035851,0.33771,-0.15622,-0.194954,-0.145836,0.344392,0.471456,...,0.689164,-0.498426,0.267979,-0.636604,-0.020882,-0.564956,0.030512,0.538904,-0.569104,-0.08537
PMC5388654,3_Biotech,-0.296794,-0.227548,-0.245893,0.078555,0.006864,-0.34917,-0.143056,0.185239,0.482329,...,0.846457,-0.349902,0.209968,-0.480532,-0.448197,-0.610338,0.128367,0.48634,-0.442132,-0.215916
PMC4235884,3_Biotech,-0.36513,-0.085597,-0.274732,0.007035,-0.159408,-0.212104,-0.008792,-0.082695,0.528267,...,0.758521,-0.142343,0.003153,0.01485,-0.675041,-0.549012,0.190555,0.924604,-0.037728,-0.596335
PMC3781262,3_Biotech,-0.071894,-0.25223,-0.281491,0.206118,-0.116529,-0.232623,-0.186404,0.420554,0.639731,...,0.842077,-0.441535,0.345108,-0.549495,-0.245084,-0.780866,0.007046,0.477205,-0.535834,0.280595
PMC4746198,3_Biotech,-0.147786,-0.105705,-0.021799,0.331766,-0.252517,-0.353953,-0.124228,0.13573,0.409271,...,0.516315,-0.23398,0.113518,-0.369316,-0.421376,-0.544923,0.107955,0.433599,-0.456963,-0.23302


In [12]:
# Paper-by-paper comparison: is the true journal among the 10 nearest papers?
cross_validation(
    dataset=training_dataset,
    cv=10,
    n_neighbors=10,
    random_state=100,
)

8142 out of 20232
8090 out of 20232
8115 out of 20232
8019 out of 20232
8075 out of 20232
8023 out of 20232
8155 out of 20232
7993 out of 20232
8164 out of 20231
8115 out of 20231
0.39982109853806175


In [13]:
# Paper-by-paper comparison: is the true journal among the 20 nearest papers?
cross_validation(
    dataset=training_dataset,
    cv=10,
    n_neighbors=20,
    random_state=100,
)

10086 out of 20232
10067 out of 20232
10079 out of 20232
9984 out of 20232
10121 out of 20232
10024 out of 20232
10119 out of 20232
10015 out of 20232
10049 out of 20231
10030 out of 20231
0.4971085037899255


## KNearestNeighbors Centroid analysis

Following up on the original idea, I thought a helpful experiment would be to perform a centroid analysis (i.e. take the average of all papers within each journal). Similar to above, I fit a KNearestNeighbors classifier, this time on the journal centroids, to see whether the correct journal appears among the top 10/20 nearest centroids.

In [11]:
# Build the training set: drop papers that belong to the golden set, attach the
# document vectors, and sample at most 100 papers per journal with a fixed seed.
# NOTE(review): this repeats the dataset construction from the paper-by-paper
# section with the same seed; consider reusing that result instead of rebuilding.
training_dataset = (
    pmc_articles_df
    # isin mask instead of interpolating every golden-set PMCID into a query string
    .loc[lambda paper_df: ~paper_df.pmcid.isin(golden_set_df.pmcid)]
    [["pmcid", "journal"]]
    .merge(pmc_embedding_df, left_on="pmcid", right_on="document")
    .drop("pmcid", axis=1)
    .groupby("journal", group_keys=False)
    .apply(lambda x: x.sample(min(len(x), 100), random_state=100))
    .set_index("document")
)
print(training_dataset.shape)
training_dataset.head()

(202318, 301)


Unnamed: 0_level_0,journal,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PMC4624147,3_Biotech,-0.07148,-0.387928,-0.035851,0.33771,-0.15622,-0.194954,-0.145836,0.344392,0.471456,...,0.689164,-0.498426,0.267979,-0.636604,-0.020882,-0.564956,0.030512,0.538904,-0.569104,-0.08537
PMC5388654,3_Biotech,-0.296794,-0.227548,-0.245893,0.078555,0.006864,-0.34917,-0.143056,0.185239,0.482329,...,0.846457,-0.349902,0.209968,-0.480532,-0.448197,-0.610338,0.128367,0.48634,-0.442132,-0.215916
PMC4235884,3_Biotech,-0.36513,-0.085597,-0.274732,0.007035,-0.159408,-0.212104,-0.008792,-0.082695,0.528267,...,0.758521,-0.142343,0.003153,0.01485,-0.675041,-0.549012,0.190555,0.924604,-0.037728,-0.596335
PMC3781262,3_Biotech,-0.071894,-0.25223,-0.281491,0.206118,-0.116529,-0.232623,-0.186404,0.420554,0.639731,...,0.842077,-0.441535,0.345108,-0.549495,-0.245084,-0.780866,0.007046,0.477205,-0.535834,0.280595
PMC4746198,3_Biotech,-0.147786,-0.105705,-0.021799,0.331766,-0.252517,-0.353953,-0.124228,0.13573,0.409271,...,0.516315,-0.23398,0.113518,-0.369316,-0.421376,-0.544923,0.107955,0.433599,-0.456963,-0.23302


In [12]:
# Centroid comparison: is the true journal among the 10 nearest journal centroids?
cross_validation(
    dataset=training_dataset,
    cv=10,
    n_neighbors=10,
    random_state=100,
    centroid=True,
)

7277 out of 20232
7238 out of 20232
7327 out of 20232
7349 out of 20232
7376 out of 20232
7195 out of 20232
7340 out of 20232
7286 out of 20232
7400 out of 20231
7298 out of 20231
0.3612432112931153


In [13]:
# Centroid comparison: is the true journal among the 20 nearest journal centroids?
cross_validation(
    dataset=training_dataset,
    cv=10,
    n_neighbors=20,
    random_state=100,
    centroid=True,
)

9205 out of 20232
9130 out of 20232
9250 out of 20232
9294 out of 20232
9291 out of 20232
9125 out of 20232
9256 out of 20232
9210 out of 20232
9273 out of 20231
9115 out of 20231
0.4554661373180102


# Golden Set Analysis

In [None]:
#biorxiv_journal_embedding_df = pd.read_csv(
#    "../word_vector_experiment/output/word2vec_output/biorxiv_all_articles_300.tsv.xz", 
#    sep="\t"
#)
#biorxiv_journal_embedding_df.head()

In [None]:
#golden_set_df

Conclusions for this notebook:
1. Prediction accuracy is low when it comes to journal predictions on PubMed Central data.
2. Centroid analysis performs a bit worse than the paper-by-paper comparison.
3. 300 dimensions might not be the correct number of dimensions when predicting journals. A parameter sweep on embeddings might be needed.