# Create Document Vectors

This notebook trains a Doc2Vec model on the tokenized contents of the research papers and uses it to create a document vector for each paper.

In [1]:
from cord.core import JSON_CATALOGS, BIORXIV_MEDRXIV, COMM_USE_SUBSET, CUSTOM_LICENSE, NONCOMM_USE_SUBSET, cord_support_dir
from cord.jsonpaper import load_json_cache
from cord import ResearchPapers
import pandas as pd
from pathlib import Path, PurePath
import numpy as np

In [2]:
# Dimensionality passed to Doc2Vec as `vector_size`; also embedded in the
# file names of the saved model and the saved document vectors.
VECTOR_SIZE = 20

## 1. Load the Cached JSON Index Tokens

Load the precached JSON index tokens for every catalog and combine them into a single DataFrame.

In [3]:
from IPython.display import display

# Load the cached token frame of every catalog, then stack them into one
# table whose index is renumbered from 0.
dfs = [load_json_cache(catalog) for catalog in JSON_CATALOGS]
json_tokens = pd.concat(dfs, ignore_index=True)

Loading json cache files for comm_use_subset
Loaded comm_use_subset json cache in 39 seconds
Loading json cache files for biorxiv_medrxiv
Loaded biorxiv_medrxiv json cache in 1 seconds
Loading json cache files for noncomm_use_subset
Loaded noncomm_use_subset json cache in 8 seconds
Loading json cache files for custom_license
Loaded custom_license json cache in 61 seconds


In [4]:
# Guard: fail fast unless at least one paper carries a pmcid.
assert json_tokens['pmcid'].notnull().any(), 'The pmcid column is all null'

## 2. Train a Gensim Doc2Vec Model

In [5]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Each paper's token list is tagged with its position in `json_tokens`, so a
# tag can be mapped back to the row it came from.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(json_tokens.index_tokens)
]

# Training happens inside the constructor because the corpus is supplied.
model = Doc2Vec(
    documents=documents,
    vector_size=VECTOR_SIZE,
    window=2,
    min_count=1,
    workers=8,
)

## 3. Save the Doc2Vec Model

In [6]:
# The model file name carries the vector size; the path is relative, so the
# file is written to the current working directory.
model_filename = f'Doc2Vec_{VECTOR_SIZE}.model'
model.save(model_filename)

## 4. Create Document Vectors

In [7]:
def get_vector(tokens):
    """Infer a document vector for one token list with the trained model.

    Args:
        tokens: The token sequence handed to ``model.infer_vector``.

    Returns:
        The value returned by ``model.infer_vector(tokens)``.
    """
    vector = model.infer_vector(tokens)
    return vector

# Benchmark a single inference on the first row to gauge the per-document cost.
%timeit get_vector(json_tokens.loc[0].index_tokens)

1.01 ms ± 9.37 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [8]:
# Infer one vector per paper and store it in a new `document_vector` column
# (this mutates `json_tokens` in place).
# NOTE(review): rows whose `index_tokens` is empty (e.g. 52094 and 52095 in the
# saved output) end up with identical vectors; consider excluding them.
%time json_tokens['document_vector'] = json_tokens.index_tokens.apply(model.infer_vector)

Wall time: 17min 56s


In [11]:
# NOTE(review): the saved output of this cell lists `document_vector_1d` and
# `cluster_id` columns that no visible cell creates, and the execution counts
# jump from 8 to 11 — presumably cells were removed; confirm with a fresh
# Restart & Run All.
json_tokens

Unnamed: 0,sha,pmcid,index_tokens,document_vector,document_vector_1d,cluster_id
0,000b7d1517ceebb34e1e3e817695b6de03e2fa78,,"[s1, phylogeny, sequences, belonging, umrv, ph...","[-0.0001454174, -0.020008974, -0.15157239, -0....",-1.439926,4
1,00142f93c18b07350be89e96372d240372437ed9,,"[human, beings, constantly, exposed, myriad, p...","[-1.5019524, 2.4714665, -1.1794524, 1.2314736,...",21.845858,1
2,0022796bb2112abd2e6423ba2d57751db06049fb,,"[pathogens, vectors, transported, rapidly, aro...","[-1.3127675, 1.0502137, -0.6803469, -1.2441112...",-62.724194,0
3,0031e47b76374e05a18c266bd1a1140e5eacb54f,,"[a1111111111, a1111111111, a1111111111, a11111...","[-3.5530822, 1.9237256, -1.4036909, -2.3870523...",-15.544662,3
4,00326efcca0852dc6e39dc6b7786267e1bc4f194,,"[addition, preventative, care, nutritional, su...","[0.042658817, 2.5792394, 1.4552583, -2.7162018...",-75.814964,0
...,...,...,...,...,...,...
52092,,PMC7105754,"[inactivated, virus, vaccines, inactivated, wk...","[-1.7658923, 0.7377934, 0.80591667, 2.2260578,...",18.975584,1
52093,,PMC7105755,"[types, protein, microarrays, currently, types...","[-2.226667, 2.3260539, 0.6691074, 1.1899889, -...",9.838189,5
52094,,PMC7105756,[],"[0.0024406752, 0.010759468, 0.0051381686, 0.00...",86.231491,4
52095,,PMC7106065,[],"[0.0024406752, 0.010759468, 0.0051381686, 0.00...",86.231491,4


## 5. Save Document Vectors

In [13]:
# Destination of the parquet file; the vector size is part of the file name.
docvector_savepath = Path(cord_support_dir()) / f'DocumentVectors_{VECTOR_SIZE}.pq'

# Keep only the two identifier columns and the inferred vectors.
json_vectors = json_tokens[['sha', 'pmcid', 'document_vector']]

# Write to the path computed above instead of rebuilding the same expression,
# so the file that is written is exactly the one read back afterwards.
json_vectors.to_parquet(docvector_savepath)

In [14]:
# Read the saved file back and display it to check the round trip.
pd.read_parquet(docvector_savepath)

Unnamed: 0,sha,pmcid,document_vector
0,000b7d1517ceebb34e1e3e817695b6de03e2fa78,,"[-0.0001454174, -0.020008974, -0.15157239, -0...."
1,00142f93c18b07350be89e96372d240372437ed9,,"[-1.5019524, 2.4714665, -1.1794524, 1.2314736,..."
2,0022796bb2112abd2e6423ba2d57751db06049fb,,"[-1.3127675, 1.0502137, -0.6803469, -1.2441112..."
3,0031e47b76374e05a18c266bd1a1140e5eacb54f,,"[-3.5530822, 1.9237256, -1.4036909, -2.3870523..."
4,00326efcca0852dc6e39dc6b7786267e1bc4f194,,"[0.042658817, 2.5792394, 1.4552583, -2.7162018..."
...,...,...,...
52092,,PMC7105754,"[-1.7658923, 0.7377934, 0.80591667, 2.2260578,..."
52093,,PMC7105755,"[-2.226667, 2.3260539, 0.6691074, 1.1899889, -..."
52094,,PMC7105756,"[0.0024406752, 0.010759468, 0.0051381686, 0.00..."
52095,,PMC7106065,"[0.0024406752, 0.010759468, 0.0051381686, 0.00..."
