# Journal Recommendation for Preprints

The goal of this notebook is to help users know which journal would be most appropriate for their preprint. The central idea is to use euclidean distance between documents to gauge which journal similar works have been sent.

In [1]:
%load_ext autoreload
%autoreload 2

from collections import Counter, defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier

from tqdm import tqdm_notebook

In [2]:
cluster = not True

# Load bioRxiv Document Vectors

In [3]:
if cluster:
    biorxiv_journal_df = (
        pd.read_csv("mapped_published_doi.tsv", sep="\t")
        .groupby("doi")
        .agg({
            "document":"last",
            "category":"first",
            "journal":"first",
            "doi":"last",
            "published_doi":"first",  
            "pmcid":"first", 
        })
        .reset_index(drop=True)
    )
else:
    biorxiv_journal_df = (
        pd.read_csv("output/mapped_published_doi.tsv", sep="\t")
        .groupby("doi")
        .agg({
            "document":"last",
            "category":"first",
            "journal":"first",
            "doi":"last",
            "published_doi":"first",  
            "pmcid":"first", 
        })
        .reset_index(drop=True)
    )
biorxiv_journal_df.head()

Unnamed: 0,document,category,journal,doi,published_doi,pmcid
0,000026_v1.xml,genetics,PLoS Genetics,10.1101/000026,10.1371/journal.pgen.1004412,PMC4125079
1,000042_v2.xml,genomics,Nature Reviews Genetics,10.1101/000042,10.1038/nrg3723,
2,000067_v1.xml,genomics,Nature,10.1101/000067,10.1038/nature12904,PMC4285441
3,000091_v1.xml,synthetic biology,,10.1101/000091,,
4,000109_v1.xml,evolutionary biology,PLoS Genetics,10.1101/000109,10.1371/journal.pgen.1004410,PMC4072524


In [4]:
# Count number of Non-NaN elements
print(f"Number of Non-NaN entries: {biorxiv_journal_df.pmcid.count()}")
print(f"Total number of entries: {biorxiv_journal_df.shape[0]}")
print(f"Percent Covered: {biorxiv_journal_df.pmcid.count()/biorxiv_journal_df.shape[0]:.2f}")

Number of Non-NaN entries: 17120
Total number of entries: 71118
Percent Covered: 0.24


In [5]:
golden_set_df = biorxiv_journal_df.query("pmcid.notnull()")
golden_set_df.head()

Unnamed: 0,document,category,journal,doi,published_doi,pmcid
0,000026_v1.xml,genetics,PLoS Genetics,10.1101/000026,10.1371/journal.pgen.1004412,PMC4125079
2,000067_v1.xml,genomics,Nature,10.1101/000067,10.1038/nature12904,PMC4285441
4,000109_v1.xml,evolutionary biology,PLoS Genetics,10.1101/000109,10.1371/journal.pgen.1004410,PMC4072524
6,000141_v1.xml,cancer biology,PLoS Computational Biology,10.1101/000141,10.1371/journal.pcbi.1003433,PMC3894166
7,000158_v1.xml,bioinformatics,BMC Genomics,10.1101/000158,10.1186/1471-2164-15-398,PMC4041996


# Load Pubmed Central Document Vectors

In [6]:
if cluster:
    pmc_articles_df = (
        pd.read_csv(
            "pubmed_central_journal_paper_map.tsv.xz", 
            sep="\t"
        )
        .query("article_type=='research-article'")
    )
else:
    pmc_articles_df = (
        pd.read_csv(
            "../../pmc/exploratory_data_analysis/output/pubmed_central_journal_paper_map.tsv.xz", 
            sep="\t"
        )
        .query("article_type=='research-article'")
    )
print(pmc_articles_df.head())

          journal      article_type                        doi       pmcid
0  Environ_Health  research-article     10.1186/1476-069X-5-22  PMC1552054
1  Environ_Health  research-article     10.1186/1476-069X-4-12  PMC1226148
3  Environ_Health  research-article  10.1186/s12940-017-0316-3  PMC5635510
4  Environ_Health  research-article    10.1186/1476-069X-10-46  PMC3125232
5  Environ_Health  research-article    10.1186/1476-069X-11-91  PMC3533997


In [7]:
print(pmc_articles_df.journal.value_counts().shape)
journals = pmc_articles_df.journal.value_counts()
journals

(9112,)


PLoS_One                                     218509
Sci_Rep                                      101554
Nat_Commun                                    23812
Acta_Crystallogr_Sect_E_Struct_Rep_Online     23537
J_Exp_Med                                     22687
                                              ...  
Restor_Ecol                                       1
J_Micro_Nanolithogr_MEMS_MOEMS                    1
J_Neurosci_Neuropharmacol                         1
Flavour_Fragr_J                                   1
J_Aggress_Maltreat_Trauma                         1
Name: journal, Length: 9112, dtype: int64

In [8]:
# Filter out low count journals
pmc_articles_df = pmc_articles_df.query(f"journal in {journals[journals > 100].index.tolist()}")
print(pmc_articles_df.shape)
pmc_articles_df.head()

(1865604, 4)


Unnamed: 0,journal,article_type,doi,pmcid
0,Environ_Health,research-article,10.1186/1476-069X-5-22,PMC1552054
1,Environ_Health,research-article,10.1186/1476-069X-4-12,PMC1226148
3,Environ_Health,research-article,10.1186/s12940-017-0316-3,PMC5635510
4,Environ_Health,research-article,10.1186/1476-069X-10-46,PMC3125232
5,Environ_Health,research-article,10.1186/1476-069X-11-91,PMC3533997


In [9]:
if cluster:
    pmc_embedding_dict = {
        int(path.stem[-7:-4]):pd.read_csv(
            str(path), 
            sep="\t"
        )
        for path in Path("pmc_vectors/").rglob("*tsv.xz")
    }
else:
    pmc_embedding_dict = {
        int(path.stem[-7:-4]):pd.read_csv(
            str(path), 
            sep="\t"
        )
        for path in Path("../../pmc/word_vector_experiment/output/").rglob("*tsv.xz")
    }
pmc_embedding_dict[300].head()

Unnamed: 0,document,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,PMC1552054,-0.428596,0.004477,-0.153677,0.059131,-0.394598,-0.066383,0.358798,-0.245809,0.431974,...,-0.067994,0.033121,-0.077121,-0.433617,-1.111247,-0.40526,0.293903,0.613155,-0.500204,-0.214057
1,PMC1226148,-0.434013,-0.185516,-0.070654,0.488759,-0.280275,0.042681,0.116685,0.037062,0.530896,...,0.318905,-0.065244,0.159258,-0.380412,-0.665559,-0.485417,0.180982,0.523053,-0.485158,-0.251244
2,PMC5635510,-0.248388,0.11866,-0.103008,0.267721,0.278939,-0.220584,0.089593,-0.159638,0.86619,...,0.081127,-0.110091,-0.031617,-0.188741,-0.936947,-0.461118,0.618781,0.647195,-0.27024,-0.409643
3,PMC3125232,-0.507224,0.024099,-0.039696,-0.171992,0.371326,-0.137797,-0.202616,-0.156994,0.695519,...,-0.218274,-0.022597,-0.028885,-0.25056,-1.14073,-0.568136,0.482088,0.899122,-0.250971,-0.592976
4,PMC3533997,-0.424435,0.175519,-0.048797,0.178023,-0.042913,-0.282377,0.317917,-0.007002,0.586774,...,-0.111805,-0.26262,-0.187817,-0.463804,-1.308874,-0.352892,0.251982,0.550853,-0.659678,-0.614042


In [10]:
full_training_dataset = {
    dim : (
        pmc_articles_df
        .query(f"pmcid not in {golden_set_df.pmcid.tolist()}")
        [["pmcid", "journal"]]
        .merge(pmc_embedding_dict[dim], left_on="pmcid", right_on="document")
        .drop("pmcid", axis=1)
        .set_index("document")
    )
    for dim in pmc_embedding_dict
}

In [11]:
subsampled_training_dataset = {
    dim : (
        pmc_articles_df
        .query(f"pmcid not in {golden_set_df.pmcid.tolist()}")
        [["pmcid", "journal"]]
        .merge(pmc_embedding_dict[dim], left_on="pmcid", right_on="document")
        .drop("pmcid", axis=1)
        .groupby("journal", group_keys=False)
        .apply(lambda x: x.sample(min(len(x), 100), random_state=100))
        .set_index("document")
    )
    for dim in pmc_embedding_dict
}

# Train Recommendation System

In [12]:
def cross_validation(model, dataset, evaluate, cv=10, random_state=100, **kwargs):
    
    folds = KFold(n_splits=cv, random_state = random_state, shuffle=True)
    cv_fold_accs = []
    
    fold_predictions = []
    for train, val in folds.split(dataset):
        
        prediction, true_labels = evaluate(
            model, dataset.iloc[train], 
            dataset.iloc[val], **kwargs
        )

        accs = [
                 (
                     1 if true_labels[data_idx] in prediction_row 
                     else 0 
                 )
                 for data_idx, prediction_row in enumerate(prediction)
        ]
        
        cv_fold_accs.append(np.sum(accs)/len(accs))
        print(f"{np.sum(accs)} out of {len(accs)}")
        
        fold_predictions.append(prediction)
        
    print(f"Total Accuracy: {np.mean(cv_fold_accs)*100:.3f}%")
    return fold_predictions

## Random Journal Prediction

The central idea here is to answer the question what is the accuracy when journals are recommended at random?

In [13]:
def dummy_evaluate(model, training_data, validation_data, **kwargs):
    top_X = kwargs.get("top_predictions", 10)
    random_states = kwargs.get("dummy_seed", [100,200,300,400,500,600,700,800,900,1000])
    
    X_train = (
        training_data
        .drop("journal", axis=1)
        .values
        .astype('float32')
    )
    
    Y_train = (
        training_data
        .journal
        .values
    )
    
    X_val = (
        validation_data
        .drop("journal", axis=1)
        .values
        .astype('float32')
    )
    Y_val = (
        validation_data
        .journal
        .values
    )
    
    predictions = []
    for i, seed in zip(range(top_X), random_states):
        model.random_state = seed
        model.fit(X_train, Y_train)
        predictions.append(model.predict(X_val))

    return np.stack(predictions).transpose(), Y_val

In [16]:
model = DummyClassifier(strategy='uniform')

In [17]:
_ = cross_validation(
    model, subsampled_training_dataset[300], 
    dummy_evaluate, cv=10, 
    random_state=100, top_predictions=10
)

114 out of 20232
112 out of 20232
116 out of 20232
121 out of 20232
94 out of 20232
83 out of 20232
91 out of 20232
98 out of 20232
110 out of 20231
106 out of 20231
Total Accuracy: 0.517%


## KNearestNeighbors Paper by Paper Comparison - Full Dataset

Assuming I didn't take mega-journal influence into account, what would the initial recommendation accuracy be?

In [14]:
def knn_evaluate(model, training_data, validation_data, **kwargs):
    
    X_train = (
        training_data
        .drop("journal", axis=1)
        .values
        .astype('float32')
    )
    
    Y_train = (
        training_data
        .journal
        .values
    )
    
    X_val = (
        validation_data
        .drop("journal", axis=1)
        .values
        .astype('float32')
    )
    
    Y_val = (
        validation_data
        .journal
        .values
    )
    
    model.fit(X_train, Y_train)
    distance, neighbors = model.kneighbors(X_val)
    
    predictions = [
        Y_train[neighbor_predict]
        for neighbor_predict in neighbors 
    ]

    return np.stack(predictions), Y_val

In [18]:
knn_model = KNeighborsClassifier(n_neighbors=10)

In [None]:
_ = cross_validation(
    knn_model, full_training_dataset[300], 
    knn_evaluate, cv=10,
    random_state=100
)

106786 out of 172616
106776 out of 172616


## KNearestNeighbors Paper by Paper Comparison Subsampled

The first idea for a classifier is to compare which papers are similar to other papers. Due to the overflow of PLOS One papers I sub-sampled each journal to have only 100 papers for representation. Then trained a KNearestNeighbors to determine how often does the correct journal appear in the top ten neighbors as well as top twenty neighbors.

In [16]:
knn_model = KNeighborsClassifier(n_neighbors=10)

In [17]:
result_dict = {}
for dim in subsampled_training_dataset:
    print(dim)
    
    fold_predictions = cross_validation(
        knn_model, subsampled_training_dataset[dim], 
        knn_evaluate, cv=10, 
        random_state=100
    )
    
    print()
    
    result_dict[dim] = fold_predictions

250
8116 out of 20232
8033 out of 20232
8049 out of 20232
7963 out of 20232
8045 out of 20232
7970 out of 20232
8127 out of 20232
7990 out of 20232
8145 out of 20231
8081 out of 20231
Total Accuracy: 39.798%

300
8142 out of 20232
8090 out of 20232
8115 out of 20232
8019 out of 20232
8075 out of 20232
8023 out of 20232
8155 out of 20232
7993 out of 20232
8164 out of 20231
8115 out of 20231
Total Accuracy: 39.982%

150
7952 out of 20232
7840 out of 20232
7889 out of 20232
7883 out of 20232
7945 out of 20232
7825 out of 20232
7954 out of 20232
7906 out of 20232
8037 out of 20231
7936 out of 20231
Total Accuracy: 39.130%



## KNearestNeighbors Centroid analysis

Following up on the original idea, I thought a helpful experiment would be to perform a centroid analysis (i.e. take the average of all papers within each journal). Similar to above I trained a KNearestNeighbors classifier to see if the correct journal will appear in the top 10 neighbors.

In [15]:
def knn_centroid_evaluate(model, training_data, validation_data, **kwargs):
    
    train_centroid_df = (
        training_data
        .groupby("journal")
        .agg("mean")
        .reset_index()
    )
            
    X_train_centroid = (
        train_centroid_df
        .drop("journal", axis=1)
        .values
        .astype('float32')
    )

    Y_train_centroid = (
        train_centroid_df
        .journal
        .values
    )
    
    
    X_val = (
        validation_data
        .drop("journal", axis=1)
        .values
        .astype('float32')
    )
    
    Y_val = (
        validation_data
        .journal
        .values
    )
    
    knn_model.fit(X_train_centroid, Y_train_centroid)
    distance, neighbors = knn_model.kneighbors(X_val)
    
    predictions = [
        Y_train_centroid[neighbor_predict]
        for neighbor_predict in neighbors 
    ]

    return np.stack(predictions), Y_val

In [18]:
knn_model = KNeighborsClassifier(n_neighbors=10)

In [19]:
_ = cross_validation(
    knn_model, subsampled_training_dataset[300], 
    knn_centroid_evaluate, cv=10, 
    random_state=100
)

7277 out of 20232
7238 out of 20232
7327 out of 20232
7349 out of 20232
7376 out of 20232
7195 out of 20232
7340 out of 20232
7286 out of 20232
7400 out of 20231
7298 out of 20231
Total Accuracy: 36.124%


## KNearestNeighbors Centroid Analysis - Full dataset

This section I'm using the entire dataset to calculate journal centroids and then evaluate performance on the sub-sampled dataset.

In [20]:
def knn_centroid_full_evaluate(model, training_data, validation_data, **kwargs):
    
    train_centroid_df = (
        kwargs.get("full_dataset")
        .groupby("journal")
        .agg("mean")
        .reset_index()
    )
            
    X_train_centroid = (
        train_centroid_df
        .drop("journal", axis=1)
        .values
        .astype('float32')
    )

    Y_train_centroid = (
        train_centroid_df
        .journal
        .values
    )
    
    
    X_val = (
        validation_data
        .drop("journal", axis=1)
        .values
        .astype('float32')
    )
    
    Y_val = (
        validation_data
        .journal
        .values
    )
    
    knn_model.fit(X_train_centroid, Y_train_centroid)
    distance, neighbors = knn_model.kneighbors(X_val)
    
    predictions = [
        Y_train_centroid[neighbor_predict]
        for neighbor_predict in neighbors 
    ]

    return np.stack(predictions), Y_val

In [21]:
knn_model = KNeighborsClassifier(n_neighbors=10)

In [23]:
_ = cross_validation(
    knn_model, subsampled_training_dataset[300], 
    knn_centroid_full_evaluate, cv=10, 
    random_state=100, full_dataset=full_training_dataset[300]
)

7518 out of 20232
7502 out of 20232
7546 out of 20232
7568 out of 20232
7617 out of 20232
7487 out of 20232
7612 out of 20232
7536 out of 20232
7665 out of 20231
7496 out of 20231
Total Accuracy: 37.341%


# Golden Set Analysis

In [25]:
biorxiv_embeddings_df = pd.read_csv(
    Path("../word_vector_experiment/output/word2vec_output/biorxiv_all_articles_300.tsv.xz")
    .resolve(),
    sep="\t"
)

biorxiv_embeddings_df.head()

Unnamed: 0,document,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,440735_v1.xml,-0.185549,-0.028703,-0.148555,-0.133172,0.353365,-0.114964,-0.028663,-0.215159,0.551382,...,-0.286513,0.082141,-0.197929,-0.163326,-0.867377,-0.383934,0.684275,0.673811,0.332528,-0.252338
1,775270_v1.xml,-0.53204,-0.065809,-0.313654,-0.117575,0.372866,-0.318956,-0.36801,0.32585,0.808698,...,-0.199805,-0.083095,0.184185,-0.157145,-0.895367,-0.611417,0.224433,0.857584,-0.44224,-0.229249
2,242404_v1.xml,-0.511104,-0.20379,-0.171574,0.244052,0.235567,-0.269997,-0.222966,-0.247174,0.385497,...,-0.35189,-0.155088,-0.222041,-0.258185,-0.866121,-0.568663,0.553375,0.890001,0.025051,-0.347382
3,872994_v1.xml,-0.595936,-0.14779,-0.277296,0.522433,0.574653,-0.046103,-0.586993,-0.312505,0.382142,...,0.07424,0.042758,0.174565,0.155595,-0.557984,-0.445246,0.386596,0.512007,-0.510197,-0.295844
4,080853_v2.xml,-0.076522,0.039275,-0.928818,0.226103,0.035823,-0.07558,-0.189354,0.243075,0.240977,...,0.524121,0.560223,0.246179,0.242929,-0.810809,-0.118951,0.052647,0.184235,-0.154029,0.469902


In [35]:
golden_dataset = (
    golden_set_df[["document", "pmcid"]]
    .merge(pmc_articles_df[["journal", "pmcid"]], on="pmcid")
    .merge(biorxiv_embeddings_df, on="document")
)
golden_dataset.head()

Unnamed: 0,document,pmcid,journal,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,000026_v1.xml,PMC4125079,PLoS_Genet,-0.354588,-0.039688,-0.264044,0.061171,0.445177,-0.09428,-0.207811,...,0.077113,0.009006,-0.12814,0.107098,-0.896259,-0.591504,0.451696,1.010107,0.155138,-0.14159
1,000067_v1.xml,PMC4285441,Nature,-0.223833,0.104426,-0.187421,-0.09792,0.118555,-0.162342,-0.065943,...,0.11093,0.144949,0.003651,-0.035206,-0.887362,-0.582829,0.407179,0.400775,0.030982,0.238798
2,000109_v1.xml,PMC4072524,PLoS_Genet,-0.35977,-0.034423,-0.211397,0.142895,0.25764,0.214825,0.053788,...,0.131452,0.034245,0.119621,-0.07523,-0.786596,-0.557854,0.163636,0.867464,0.000889,0.181491
3,000141_v1.xml,PMC3894166,PLoS_Comput_Biol,-0.489331,0.029247,-0.766819,0.12692,0.228847,-0.231903,-0.362439,...,0.390265,0.303159,0.268525,0.173529,-0.685519,-0.613437,0.188164,0.561036,0.02427,-0.074251
4,000158_v1.xml,PMC4041996,BMC_Genomics,-0.184118,0.065338,-0.151212,0.281209,0.096208,-0.315909,-0.313868,...,-0.04081,-0.05694,-0.258915,-0.034394,-0.636722,-0.097607,0.717791,0.791196,0.07498,-0.112038


## Centroid Analysis

In [37]:
train_centroid_df = (
    full_training_dataset[300]
    .groupby("journal")
    .agg("mean")
    .reset_index()
)

X_train_centroid = (
    train_centroid_df
    .drop("journal", axis=1)
    .values
    .astype('float32')
)

Y_train_centroid = (
    train_centroid_df
    .journal
    .values
)
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(X_train_centroid, Y_train_centroid)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [38]:
distance, neighbors = knn_model.kneighbors(
    golden_dataset
    .drop(["document", "pmcid", "journal"], axis=1)
    .values
)

In [40]:
accs = [
    (
         1 if golden_dataset.journal[data_idx] in prediction_row 
         else 0 
     )
     for data_idx, prediction_row in enumerate(
         [
             Y_train_centroid[neighbor_predict]
             for neighbor_predict in neighbors 
         ]
     )
]

In [43]:
print(f"{np.sum(accs)} out of {len(accs)}")
print(f"{np.mean(accs)}% correct")

3445 out of 15922
0.21636729054138928% correct


## Subsampled Paper Analysis

In [44]:
X_train = (
    subsampled_training_dataset[300]
    .drop("journal", axis=1)
    .values
    .astype('float32')
)
    
Y_train = (
    subsampled_training_dataset[300]
    .journal
    .values
)
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [45]:
distance, neighbors = knn_model.kneighbors(
    golden_dataset
    .drop(["document", "pmcid", "journal"], axis=1)
    .values
)

In [46]:
accs = [
    (
         1 if golden_dataset.journal[data_idx] in prediction_row 
         else 0 
     )
     for data_idx, prediction_row in enumerate(
         [
             Y_train[neighbor_predict]
             for neighbor_predict in neighbors 
         ]
     )
]

In [47]:
print(f"{np.sum(accs)} out of {len(accs)}")
print(f"{np.mean(accs)}% correct")

3286 out of 15922
0.20638110790101746% correct


Conclusions for this notebook:
1. Mega-journals cover a wide range of research topics.
2. The correct journal only appears in the top ten about 37-39 percent of the time.
3. 300 dimensions gives the best performance compared to the other dimensions.
4. Reporting a combination of centroid analysis and individual paper predictions will be needed to go forward.