# Re-Run KNearestNeighbors for Journal Recommendation

This notebook predicts journals using an updated version of the document vectors. Previously, documents were tokenized by simply splitting on spaces; the vectors are now generated with spaCy tokenization and lemmatization. To avoid re-running the entire recommendation notebook, I only use the 300-dimensional vectors to train a KNN model and compare its performance against a random baseline.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotnine as p9
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm_notebook

from annorxiver_modules.journal_rec_helper import (
    cross_validation,
    dummy_evaluate,
    knn_evaluate,
    knn_centroid_evaluate,
)

# Load bioRxiv Papers

In [2]:
# Location of the preprint -> published-article mapping produced by the
# bioRxiv journal tracker.
mapped_doi_path = Path("../..").joinpath(
    "biorxiv", "journal_tracker", "output/mapped_published_doi.tsv"
)

# How each column is collapsed when a preprint DOI appears on several rows.
# NOTE(review): "last" for `document` presumably selects the most recent
# version of the preprint -- this relies on the row order of the file; confirm.
per_preprint_agg = {
    "document": "last",
    "category": "first",
    "preprint_doi": "last",
    "published_doi": "first",
    "pmcid": "first",
}

# One row per preprint DOI; the group key is dropped from the index because
# `preprint_doi` is already kept as a regular column by the aggregation.
biorxiv_journal_df = (
    pd.read_csv(mapped_doi_path, sep="\t")
    .groupby("preprint_doi")
    .agg(per_preprint_agg)
    .reset_index(drop=True)
)
biorxiv_journal_df.head()

Unnamed: 0,document,category,preprint_doi,published_doi,pmcid
0,000026_v1.xml,genetics,10.1101/000026,10.1371/journal.pgen.1004412,PMC4125079
1,000042_v2.xml,genomics,10.1101/000042,10.1038/nrg3723,PMC4151119
2,000067_v1.xml,genomics,10.1101/000067,10.1038/nature12904,PMC4285441
3,000091_v1.xml,synthetic biology,10.1101/000091,,
4,000109_v1.xml,evolutionary biology,10.1101/000109,10.1371/journal.pgen.1004410,PMC4072524


In [3]:
# Report how many preprints have a PubMed Central id (non-NaN `pmcid`)
n_with_pmcid = biorxiv_journal_df["pmcid"].count()  # count() skips NaN
n_preprints = biorxiv_journal_df.shape[0]

print(f"Number of Non-NaN entries: {n_with_pmcid}")
print(f"Total number of entries: {n_preprints}")
print(f"Percent Covered: {(n_with_pmcid/n_preprints)*100:.2f}%")

Number of Non-NaN entries: 30927
Total number of entries: 71115
Percent Covered: 43.49%


In [4]:
# Gold set: preprints whose published version has a PubMed Central id.
# A boolean mask is used instead of `query("pmcid.notnull()")` because calling
# a method inside a query string depends on the query engine in use; plain
# boolean indexing selects the same rows without that dependency.
golden_set_df = biorxiv_journal_df[biorxiv_journal_df["pmcid"].notnull()]
golden_set_df.head()

Unnamed: 0,document,category,preprint_doi,published_doi,pmcid
0,000026_v1.xml,genetics,10.1101/000026,10.1371/journal.pgen.1004412,PMC4125079
1,000042_v2.xml,genomics,10.1101/000042,10.1038/nrg3723,PMC4151119
2,000067_v1.xml,genomics,10.1101/000067,10.1038/nature12904,PMC4285441
4,000109_v1.xml,evolutionary biology,10.1101/000109,10.1371/journal.pgen.1004410,PMC4072524
5,000125_v1.xml,cancer biology,10.1101/000125,10.1016/j.ejca.2014.08.019,PMC4258103


# Load PubMed Central Papers

In [5]:
# Journal <-> paper map for PubMed Central
pmc_journal_map_path = Path("../exploratory_data_analysis").joinpath(
    "output", "pubmed_central_journal_paper_map.tsv.xz"
)

# Only rows tagged as research articles are kept.
pmc_articles_df = pd.read_csv(pmc_journal_map_path, sep="\t").query(
    "article_type=='research-article'"
)
print(pmc_articles_df.shape)
pmc_articles_df.head()

(1977651, 4)


Unnamed: 0,journal,article_type,doi,pmcid
0,Environ_Health,research-article,10.1186/1476-069X-5-22,PMC1552054
1,Environ_Health,research-article,10.1186/1476-069X-4-12,PMC1226148
3,Environ_Health,research-article,10.1186/s12940-017-0316-3,PMC5635510
4,Environ_Health,research-article,10.1186/1476-069X-10-46,PMC3125232
5,Environ_Health,research-article,10.1186/1476-069X-11-91,PMC3533997


In [6]:
# Number of research articles per journal (value_counts sorts descending)
journals = pmc_articles_df["journal"].value_counts()
print(journals.shape)
journals

(9112,)


PLoS_One                                     218509
Sci_Rep                                      101554
Nat_Commun                                    23812
Acta_Crystallogr_Sect_E_Struct_Rep_Online     23537
J_Exp_Med                                     22687
                                              ...  
Arthroscopy                                       1
J_Media_Bus_Stud                                  1
Open_Med_J                                        1
Curr_Cancer_Rep                                   1
Genom_Discov                                      1
Name: journal, Length: 9112, dtype: int64

In [7]:
# Filter out low count journals: keep only journals with more than 100
# research articles in `journals`.
# `isin` is used rather than interpolating the list of journal names into a
# query string, which forces pandas to parse a literal with thousands of
# entries and ties correctness to how each name is rendered by repr().
frequent_journals = journals[journals > 100].index
pmc_articles_df = pmc_articles_df[pmc_articles_df["journal"].isin(frequent_journals)]
print(pmc_articles_df.shape)
pmc_articles_df.head()

(1865604, 4)


Unnamed: 0,journal,article_type,doi,pmcid
0,Environ_Health,research-article,10.1186/1476-069X-5-22,PMC1552054
1,Environ_Health,research-article,10.1186/1476-069X-4-12,PMC1226148
3,Environ_Health,research-article,10.1186/s12940-017-0316-3,PMC5635510
4,Environ_Health,research-article,10.1186/1476-069X-10-46,PMC3125232
5,Environ_Health,research-article,10.1186/1476-069X-11-91,PMC3533997


In [8]:
# 300-dimensional document vectors for the PMC papers
pmc_vectors_path = (
    "../word_vector_experiment/output/pmc_document_vectors_300_replace.tsv.xz"
)
pmc_embedding_df = pd.read_csv(pmc_vectors_path, sep="\t")
pmc_embedding_df.head()

Unnamed: 0,journal,document,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,Environ_Health,PMC1552054,-0.168362,0.044431,-0.068592,-0.02525,-0.638518,-0.302347,0.303967,-0.122796,...,0.012046,-0.156143,0.010454,-0.615891,-0.952055,-0.504918,0.113741,0.763464,-0.789372,0.009379
1,Environ_Health,PMC1226148,-0.239919,-0.060203,-0.133328,0.360897,-0.517385,-0.249687,0.056513,0.052408,...,0.321736,-0.251242,0.261475,-0.577342,-0.544784,-0.484138,0.06536,0.530282,-0.82881,-0.012625
2,Environ_Health,PMC5635510,0.048672,0.170688,-0.070395,0.236424,-0.039087,-0.381143,0.051673,-0.148415,...,0.097997,-0.170264,0.064343,-0.447883,-0.75861,-0.459414,0.433691,0.676709,-0.573157,-0.104958
3,Environ_Health,PMC3533997,-0.064178,0.173903,-0.051677,0.159768,-0.305409,-0.493326,0.267089,-0.005127,...,-0.075961,-0.40406,-0.012403,-0.567062,-1.054321,-0.372887,0.075409,0.681162,-0.849728,-0.233986
4,Environ_Health,PMC3125232,-0.186567,0.081031,-0.073595,-0.137754,0.068354,-0.40618,-0.219826,-0.108129,...,-0.090546,-0.173788,0.071811,-0.444737,-0.949252,-0.587158,0.341312,0.888989,-0.47696,-0.284114


In [9]:
# Embeddings for every retained PMC paper that is NOT the published version of
# a gold-set preprint, indexed by PMC document id.
# The exclusion uses `~isin(...)` instead of formatting ~30k PMCIDs into a
# query string; it selects the same rows without building and parsing a huge
# expression.
full_dataset_df = (
    pmc_articles_df.loc[
        ~pmc_articles_df["pmcid"].isin(golden_set_df["pmcid"]), ["pmcid"]
    ]
    .merge(pmc_embedding_df, left_on="pmcid", right_on="document")
    .drop("pmcid", axis=1)  # `document` carries the same id after the merge
    .set_index("document")
)
full_dataset_df.head()

Unnamed: 0_level_0,journal,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PMC1552054,Environ_Health,-0.168362,0.044431,-0.068592,-0.02525,-0.638518,-0.302347,0.303967,-0.122796,0.476222,...,0.012046,-0.156143,0.010454,-0.615891,-0.952055,-0.504918,0.113741,0.763464,-0.789372,0.009379
PMC1226148,Environ_Health,-0.239919,-0.060203,-0.133328,0.360897,-0.517385,-0.249687,0.056513,0.052408,0.568391,...,0.321736,-0.251242,0.261475,-0.577342,-0.544784,-0.484138,0.06536,0.530282,-0.82881,-0.012625
PMC5635510,Environ_Health,0.048672,0.170688,-0.070395,0.236424,-0.039087,-0.381143,0.051673,-0.148415,0.819124,...,0.097997,-0.170264,0.064343,-0.447883,-0.75861,-0.459414,0.433691,0.676709,-0.573157,-0.104958
PMC3125232,Environ_Health,-0.186567,0.081031,-0.073595,-0.137754,0.068354,-0.40618,-0.219826,-0.108129,0.715562,...,-0.090546,-0.173788,0.071811,-0.444737,-0.949252,-0.587158,0.341312,0.888989,-0.47696,-0.284114
PMC3533997,Environ_Health,-0.064178,0.173903,-0.051677,0.159768,-0.305409,-0.493326,0.267089,-0.005127,0.590815,...,-0.075961,-0.40406,-0.012403,-0.567062,-1.054321,-0.372887,0.075409,0.681162,-0.849728,-0.233986


In [10]:
# Training set: at most 100 randomly chosen papers per journal, excluding any
# paper that is the published version of a gold-set preprint.
# The exclusion uses `~isin(...)` instead of formatting ~30k PMCIDs into a
# query string; it selects the same rows without building and parsing a huge
# expression.
subsampled_df = (
    pmc_articles_df.loc[
        ~pmc_articles_df["pmcid"].isin(golden_set_df["pmcid"]), ["pmcid"]
    ]
    .merge(pmc_embedding_df, left_on="pmcid", right_on="document")
    .drop("pmcid", axis=1)  # `document` carries the same id after the merge
    .groupby("journal", group_keys=False)
    # min(...) guards journals that have fewer than 100 papers left after the
    # merge; the fixed random_state keeps the sample reproducible.
    .apply(lambda x: x.sample(min(len(x), 100), random_state=100))
    .set_index("document")
)
subsampled_df.head()

Unnamed: 0_level_0,journal,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PMC4522720,3_Biotech,0.13333,-0.011659,-0.148201,0.056901,-0.68035,-0.541069,0.182681,0.274385,0.489536,...,0.5371,-0.087045,0.17281,-0.665449,-0.62558,-0.777601,0.042719,0.787549,-0.691591,0.108887
PMC3339597,3_Biotech,0.261378,-0.054963,-0.452646,0.231724,-0.423759,-0.285253,-0.111656,0.200244,0.339868,...,0.719345,-0.38481,0.323023,-0.689859,-0.172516,-0.484886,-0.011187,0.457489,-0.90749,0.550568
PMC4701708,3_Biotech,0.180422,0.044826,-0.319085,0.562185,-0.358127,-0.660756,-0.243341,0.226726,0.542061,...,0.481343,-0.416037,0.348759,-0.603166,-0.258857,-0.701226,-0.013169,0.743965,-0.916708,0.166841
PMC5385177,3_Biotech,0.264529,-0.031155,-0.344421,0.154589,-0.457107,-0.289754,-0.112262,0.144894,0.309427,...,0.464157,-0.327478,0.401458,-0.423051,-0.250931,-0.142234,0.022269,0.389214,-0.712894,0.448589
PMC4522724,3_Biotech,-0.039815,-0.030217,-0.056945,-0.075414,-0.48835,-0.418544,0.079159,0.231567,0.193747,...,0.175017,-0.088419,0.147828,-0.068933,-0.626343,-0.393025,0.365557,0.80831,-0.331347,0.086806


# Train Similarity Search System

In [11]:
# k-nearest-neighbour classifier (k=10). This same estimator object is passed
# to both the centroid-based and the paper-by-paper evaluations below.
knn_model = KNeighborsClassifier(n_neighbors=10)

## Random Journal Prediction

In [12]:
# Random baseline: predicts classes uniformly at random.
# random_state is fixed so the baseline numbers are reproducible on
# Restart & Run All (100 matches the seed used elsewhere in this notebook).
model = DummyClassifier(strategy="uniform", random_state=100)

In [13]:
# Cross-validate the random baseline on the subsampled PMC papers.
# NOTE(review): `top_predictions=10` presumably means a hit is counted when the
# true journal is among 10 guesses -- confirm in journal_rec_helper.
_ = cross_validation(
    model,
    subsampled_df,
    dummy_evaluate,
    cv=10,
    random_state=100,
    top_predictions=10,
)

95 out of 20321
91 out of 20321
113 out of 20321
105 out of 20321
98 out of 20321
91 out of 20321
106 out of 20321
100 out of 20321
102 out of 20321
119 out of 20320
Total Accuracy: 0.502%


## Centroid Prediction

In [14]:
# Cross-validate the centroid-based KNN evaluation on the subsampled papers.
# The return value is discarded; the helper prints the per-fold hit counts.
_ = cross_validation(
    knn_model,
    subsampled_df,
    knn_centroid_evaluate,
    cv=10,
    random_state=100,
)

7274 out of 20321
7364 out of 20321
7335 out of 20321
7237 out of 20321
7311 out of 20321
7275 out of 20321
7362 out of 20321
7277 out of 20321
7410 out of 20321
7273 out of 20320
Total Accuracy: 35.982%


## Paper by Paper prediction

In [15]:
# Cross-validate the paper-by-paper KNN evaluation on the subsampled papers.
# The return value is discarded; the helper prints the per-fold hit counts.
_ = cross_validation(
    knn_model,
    subsampled_df,
    knn_evaluate,
    cv=10,
    random_state=100,
)

7983 out of 20321
8051 out of 20321
8025 out of 20321
7900 out of 20321
8031 out of 20321
8003 out of 20321
8065 out of 20321
7884 out of 20321
8148 out of 20321
8091 out of 20320
Total Accuracy: 39.457%


# Gold Set Analysis

In [16]:
# 300-dimensional document vectors for all bioRxiv preprints
biorxiv_vectors_path = Path(
    "../../biorxiv/word_vector_experiment/output/word2vec_output/biorxiv_all_articles_300.tsv.xz"
).resolve()
biorxiv_embeddings_df = pd.read_csv(biorxiv_vectors_path, sep="\t")
biorxiv_embeddings_df.head()

Unnamed: 0,document,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,440735_v1.xml,-0.185549,-0.028703,-0.148555,-0.133172,0.353365,-0.114964,-0.028663,-0.215159,0.551382,...,-0.286513,0.082141,-0.197929,-0.163326,-0.867377,-0.383934,0.684275,0.673811,0.332528,-0.252338
1,775270_v1.xml,-0.53204,-0.065809,-0.313654,-0.117575,0.372866,-0.318956,-0.36801,0.32585,0.808698,...,-0.199805,-0.083095,0.184185,-0.157145,-0.895367,-0.611417,0.224433,0.857584,-0.44224,-0.229249
2,242404_v1.xml,-0.511104,-0.20379,-0.171574,0.244052,0.235567,-0.269997,-0.222966,-0.247174,0.385497,...,-0.35189,-0.155088,-0.222041,-0.258185,-0.866121,-0.568663,0.553375,0.890001,0.025051,-0.347382
3,872994_v1.xml,-0.595936,-0.14779,-0.277296,0.522433,0.574653,-0.046103,-0.586993,-0.312505,0.382142,...,0.07424,0.042758,0.174565,0.155595,-0.557984,-0.445246,0.386596,0.512007,-0.510197,-0.295844
4,080853_v2.xml,-0.076522,0.039275,-0.928818,0.226103,0.035823,-0.07558,-0.189354,0.243075,0.240977,...,0.524121,0.560223,0.246179,0.242929,-0.810809,-0.118951,0.052647,0.184235,-0.154029,0.469902


In [17]:
# Step 1: attach the publishing journal to each gold-set preprint via its PMCID.
# Both merges are inner joins (the pandas default), so preprints without a
# matching PMC article or without an embedding are dropped.
golden_journal_df = golden_set_df[["document", "pmcid"]].merge(
    pmc_articles_df[["journal", "pmcid"]], on="pmcid"
)

# Step 2: attach the preprint's own (bioRxiv) document vector.
golden_dataset = golden_journal_df.merge(biorxiv_embeddings_df, on="document")
golden_dataset.head()

Unnamed: 0,document,pmcid,journal,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,...,feat_290,feat_291,feat_292,feat_293,feat_294,feat_295,feat_296,feat_297,feat_298,feat_299
0,000026_v1.xml,PMC4125079,PLoS_Genet,-0.354588,-0.039688,-0.264044,0.061171,0.445177,-0.09428,-0.207811,...,0.077113,0.009006,-0.12814,0.107098,-0.896259,-0.591504,0.451696,1.010107,0.155138,-0.14159
1,000067_v1.xml,PMC4285441,Nature,-0.223833,0.104426,-0.187421,-0.09792,0.118555,-0.162342,-0.065943,...,0.11093,0.144949,0.003651,-0.035206,-0.887362,-0.582829,0.407179,0.400775,0.030982,0.238798
2,000109_v1.xml,PMC4072524,PLoS_Genet,-0.35977,-0.034423,-0.211397,0.142895,0.25764,0.214825,0.053788,...,0.131452,0.034245,0.119621,-0.07523,-0.786596,-0.557854,0.163636,0.867464,0.000889,0.181491
3,000141_v1.xml,PMC3894166,PLoS_Comput_Biol,-0.489331,0.029247,-0.766819,0.12692,0.228847,-0.231903,-0.362439,...,0.390265,0.303159,0.268525,0.173529,-0.685519,-0.613437,0.188164,0.561036,0.02427,-0.074251
4,000158_v1.xml,PMC4041996,BMC_Genomics,-0.184118,0.065338,-0.151212,0.281209,0.096208,-0.315909,-0.313868,...,-0.04081,-0.05694,-0.258915,-0.034394,-0.636722,-0.097607,0.717791,0.791196,0.07498,-0.112038


In [18]:
# Random baseline for the gold set: predicts classes uniformly at random.
# random_state is fixed so the baseline numbers are reproducible on
# Restart & Run All (100 matches the seed used elsewhere in this notebook).
model = DummyClassifier(strategy="uniform", random_state=100)

In [19]:
# Cross-validate the random baseline on the gold set. The identifier columns
# are removed so only `journal` and the feature columns are passed on.
golden_features_df = golden_dataset.drop(columns=["pmcid", "document"])
_ = cross_validation(
    model,
    golden_features_df,
    dummy_evaluate,
    cv=10,
    random_state=100,
    top_predictions=10,
)

28 out of 1700
27 out of 1700
26 out of 1700
20 out of 1700
26 out of 1699
22 out of 1699
26 out of 1699
28 out of 1699
28 out of 1699
25 out of 1699
Total Accuracy: 1.506%


## Centroid Analysis

In [20]:
# Centroid-based evaluation: the subsampled PMC papers are the second argument
# and the gold-set preprints (identifier columns removed) the third.
# NOTE(review): presumably argument 2 is the training set and argument 3 the
# test set -- confirm against knn_centroid_evaluate.
predictions, true_labels = knn_centroid_evaluate(
    knn_model,
    subsampled_df,
    golden_dataset.drop(columns=["pmcid", "document"]),
)

In [21]:
# 1 when the true journal appears among the predictions for that preprint,
# otherwise 0. Labels are looked up by position number, as before.
accs = []
for data_idx, prediction_row in enumerate(predictions):
    accs.append(int(true_labels[data_idx] in prediction_row))

In [22]:
# Summarise the hit list: number of hits and hit rate in percent
n_hits = np.sum(accs)
hit_rate = np.mean(accs) * 100
print(f"{n_hits} out of {len(accs)}")
print(f"{hit_rate}% correct")

2948 out of 16994
17.347299046722373% correct


## Paper by Paper analysis

In [23]:
# Paper-by-paper evaluation: the subsampled PMC papers are the second argument
# and the gold-set preprints (identifier columns removed) the third.
# NOTE(review): presumably argument 2 is the training set and argument 3 the
# test set -- confirm against knn_evaluate.
predictions, true_labels = knn_evaluate(
    knn_model,
    subsampled_df,
    golden_dataset.drop(columns=["pmcid", "document"]),
)

In [24]:
# 1 when the true journal appears among the predictions for that preprint,
# otherwise 0. Labels are looked up by position number, as before.
accs = []
for data_idx, prediction_row in enumerate(predictions):
    accs.append(int(true_labels[data_idx] in prediction_row))

In [25]:
# Summarise the hit list: number of hits and hit rate in percent
n_hits = np.sum(accs)
hit_rate = np.mean(accs) * 100
print(f"{n_hits} out of {len(accs)}")
print(f"{hit_rate}% correct")

2603 out of 16994
15.31717076615276% correct


# Save Entire Dataset

In [26]:
# Write the embeddings of every retained PMC paper to disk.
# Unlike the training frames above, gold-set papers are NOT excluded here.
# The output directory is created first so the cell also works on a fresh
# checkout; to_csv does not create missing parent directories.
Path("output/paper_dataset").mkdir(parents=True, exist_ok=True)

(
    pmc_articles_df[["pmcid"]]
    .merge(pmc_embedding_df, left_on="pmcid", right_on="document")
    .drop("pmcid", axis=1)  # `document` carries the same id after the merge
    .to_csv(
        "output/paper_dataset/paper_dataset_full.tsv.xz",
        sep="\t",
        compression="xz",
        index=False,
    )
)

In [27]:
# Per-journal centroid: the mean of every feature column across the journal's
# papers. `document` is aggregated with "first", so the saved column holds the
# id of one member paper of the journal, not an id for the centroid itself.
cols = dict(document="first")
cols.update({col: "mean" for col in pmc_embedding_df if "feat" in col})

# The output directory is created first so the cell also works on a fresh
# checkout; to_csv does not create missing parent directories.
Path("output/paper_dataset").mkdir(parents=True, exist_ok=True)

(
    pmc_articles_df[["pmcid"]]
    .merge(pmc_embedding_df, left_on="pmcid", right_on="document")
    .drop(["pmcid"], axis=1)
    .groupby("journal")
    .agg(cols)
    .reset_index()  # bring `journal` back as a regular column
    .to_csv("output/paper_dataset/centroid.tsv", sep="\t", index=False)
)