# Downsample Document Vectors



In [26]:
from cord.core import DOCUMENT_VECTOR_PATH, DOCUMENT_VECTOR_LENGTH, JSON_CATALOGS, cord_support_dir
from cord.jsonpaper import load_json_cache
from cord import ResearchPapers
import pandas as pd
from pathlib import Path, PurePath
import numpy as np

## 1. Load Document Vectors

In [3]:
# Read only the columns this notebook needs. Selecting them at read time
# (instead of slicing the loaded frame) avoids deserialising unused columns
# and yields a frame that is not a slice of another one, so the columns
# added in later cells do not raise SettingWithCopyWarning.
document_vectors = pd.read_parquet(
    DOCUMENT_VECTOR_PATH,
    columns=['sha', 'pmcid', 'document_vector'],
)
document_vectors

Unnamed: 0,sha,pmcid,document_vector
0,000b7d1517ceebb34e1e3e817695b6de03e2fa78,,"[0.56928027, -0.3666296, -0.20493843, 0.431051..."
1,00142f93c18b07350be89e96372d240372437ed9,,"[3.3664315, -2.1276946, 1.6158434, 0.9182286, ..."
2,0022796bb2112abd2e6423ba2d57751db06049fb,,"[0.27237293, 0.15642405, 2.5503929, 1.1305904,..."
3,0031e47b76374e05a18c266bd1a1140e5eacb54f,,"[0.5768533, -3.854187, 0.072966725, 0.8637349,..."
4,00326efcca0852dc6e39dc6b7786267e1bc4f194,,"[1.3273811, 0.4609563, 3.068578, -0.50986123, ..."
...,...,...,...
52092,,,"[1.418982, -0.4864095, 0.19998026, 0.23552166,..."
52093,,,"[-0.5313242, -1.7436063, 1.3774712, 0.01626098..."
52094,,,"[0.0024406752, 0.010759468, 0.0051381686, 0.00..."
52095,,,"[0.0024406752, 0.010759468, 0.0051381686, 0.00..."


## 2. Create Downsampled Vectors

In [7]:
# Seed used by the clustering and embedding helpers defined in this cell.
RANDOM_STATE = 42

# Stack the per-document vectors into one array, one row per document.
docvector_arr = np.stack(document_vectors['document_vector'].to_numpy())

def kmean_labels(docvectors, n_clusters=6, random_state=RANDOM_STATE):
    """Cluster the document vectors with k-means and return the row labels.

    Args:
        docvectors: vectors to cluster; passed unchanged to ``KMeans.fit``.
        n_clusters: number of clusters to fit.
        random_state: seed forwarded to ``KMeans``.

    Returns:
        The fitted estimator's ``labels_`` attribute.
    """
    print('Setting cluster labels')
    # Imported inside the function, so scikit-learn is loaded on first call.
    from sklearn.cluster import KMeans

    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    model.fit(docvectors)
    return model.labels_


def tsne_embeddings(docvectors, dimensions=2, random_state=RANDOM_STATE,
                    perplexity=15, early_exaggeration=24, learning_rate=600,
                    n_jobs=8):
    """Project the document vectors into a low-dimensional t-SNE embedding.

    The tuning values that used to be fixed inside the body are exposed as
    keyword arguments whose defaults equal the previous constants, so
    existing calls behave exactly as before. ``random_state`` mirrors the
    parameter of the same name on ``kmean_labels``.

    Args:
        docvectors: vectors to embed; passed unchanged to
            ``TSNE.fit_transform``.
        dimensions: number of output components (``n_components``).
        random_state: seed forwarded to ``TSNE``.
        perplexity: forwarded to ``TSNE``.
        early_exaggeration: forwarded to ``TSNE``.
        learning_rate: forwarded to ``TSNE``.
        n_jobs: forwarded to ``TSNE``.

    Returns:
        The array returned by ``TSNE.fit_transform``.
    """
    print(f'Creating {dimensions}D  embeddings')
    # Imported inside the function, so scikit-learn is loaded on first call.
    from sklearn.manifold import TSNE
    tsne = TSNE(verbose=1,
                perplexity=perplexity,
                early_exaggeration=early_exaggeration,
                n_components=dimensions,
                n_jobs=n_jobs,
                random_state=random_state,
                learning_rate=learning_rate)
    return tsne.fit_transform(docvectors)

%time document_vector_2d = tsne_embeddings(docvector_arr, 2)
%time document_vector_1d = tsne_embeddings(docvector_arr, 1)
%time cluster_id = kmean_labels(docvector_arr, 7)

Creating 2D  embeddings
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 52097 samples in 0.285s...
[t-SNE] Computed neighbors for 52097 samples in 25.602s...
[t-SNE] Computed conditional probabilities for sample 1000 / 52097
[t-SNE] Computed conditional probabilities for sample 2000 / 52097
[t-SNE] Computed conditional probabilities for sample 3000 / 52097
[t-SNE] Computed conditional probabilities for sample 4000 / 52097
[t-SNE] Computed conditional probabilities for sample 5000 / 52097
[t-SNE] Computed conditional probabilities for sample 6000 / 52097
[t-SNE] Computed conditional probabilities for sample 7000 / 52097
[t-SNE] Computed conditional probabilities for sample 8000 / 52097
[t-SNE] Computed conditional probabilities for sample 9000 / 52097
[t-SNE] Computed conditional probabilities for sample 10000 / 52097
[t-SNE] Computed conditional probabilities for sample 11000 / 52097
[t-SNE] Computed conditional probabilities for sample 12000 / 52097
[t-SNE] Computed conditio

In [21]:
# Build a new frame with ``assign`` instead of mutating the loaded one in
# place: this avoids SettingWithCopyWarning on the column-sliced frame.
document_vectors = document_vectors.assign(
    # One [x, y] list per row so the 2D coordinates fit in a single column.
    document_vector_2d=document_vector_2d.tolist(),
    # Flatten explicitly so the column holds one scalar per row.
    # NOTE(review): the 1-component embedding is presumably shaped (n, 1) - confirm.
    document_vector_1d=np.asarray(document_vector_1d).ravel(),
    cluster_id=cluster_id,
)

## 3. Save Document Vectors

In [27]:
# Write the frame into the cord support directory; the file name carries
# the document vector length.
docvector_filename = f'DocumentVectors_{DOCUMENT_VECTOR_LENGTH}.pq'
docvector_savepath = Path(cord_support_dir()).joinpath(docvector_filename)
document_vectors.to_parquet(docvector_savepath)

In [29]:
# Read back the file that was just written (rather than the library-level
# input path) so this check verifies exactly what the save cell produced.
pd.read_parquet(docvector_savepath)

Unnamed: 0,sha,pmcid,document_vector,document_vector_2d,document_vector_1d,cluster_id
0,000b7d1517ceebb34e1e3e817695b6de03e2fa78,,"[0.56928027, -0.3666296, -0.20493843, 0.431051...","[15.698081970214844, 17.25160789489746]",-20.126677,1
1,00142f93c18b07350be89e96372d240372437ed9,,"[3.3664315, -2.1276946, 1.6158434, 0.9182286, ...","[31.03999137878418, -16.235151290893555]",28.878820,5
2,0022796bb2112abd2e6423ba2d57751db06049fb,,"[0.27237293, 0.15642405, 2.5503929, 1.1305904,...","[-50.606563568115234, 47.40485763549805]",-54.790283,0
3,0031e47b76374e05a18c266bd1a1140e5eacb54f,,"[0.5768533, -3.854187, 0.072966725, 0.8637349,...","[-69.33251190185547, 3.0156607627868652]",-7.735852,2
4,00326efcca0852dc6e39dc6b7786267e1bc4f194,,"[1.3273811, 0.4609563, 3.068578, -0.50986123, ...","[-18.875192642211914, 65.8654556274414]",-64.668060,0
...,...,...,...,...,...,...
52092,,,"[1.418982, -0.4864095, 0.19998026, 0.23552166,...","[8.962655067443848, -2.460934638977051]",27.742414,5
52093,,,"[-0.5313242, -1.7436063, 1.3774712, 0.01626098...","[62.28213882446289, -22.328174591064453]",64.295235,4
52094,,,"[0.0024406752, 0.010759468, 0.0051381686, 0.00...","[9.226234436035156, 101.4948959350586]",-89.850319,1
52095,,,"[0.0024406752, 0.010759468, 0.0051381686, 0.00...","[6.759056568145752, 103.68380737304688]",-89.850319,1
