# Downsample Document Vectors



In [1]:
from cord.core import DOCUMENT_VECTOR_PATH, DOCUMENT_VECTOR_LENGTH, JSON_CATALOGS, cord_support_dir
from cord.jsonpaper import load_json_cache
from cord import ResearchPapers
import pandas as pd
from pathlib import Path, PurePath
import numpy as np

## 1. Load Document Vectors

In [2]:
# Read the stored vectors and keep only the two identifier columns plus the vector itself.
document_vectors = pd.read_parquet(DOCUMENT_VECTOR_PATH)[['sha', 'pmcid', 'document_vector']]
document_vectors

Unnamed: 0,sha,pmcid,document_vector
0,000b7d1517ceebb34e1e3e817695b6de03e2fa78,,"[-0.0001454174, -0.020008974, -0.15157239, -0...."
1,00142f93c18b07350be89e96372d240372437ed9,,"[-1.5019524, 2.4714665, -1.1794524, 1.2314736,..."
2,0022796bb2112abd2e6423ba2d57751db06049fb,,"[-1.3127675, 1.0502137, -0.6803469, -1.2441112..."
3,0031e47b76374e05a18c266bd1a1140e5eacb54f,,"[-3.5530822, 1.9237256, -1.4036909, -2.3870523..."
4,00326efcca0852dc6e39dc6b7786267e1bc4f194,,"[0.042658817, 2.5792394, 1.4552583, -2.7162018..."
...,...,...,...
52092,,PMC7105754,"[-1.7658923, 0.7377934, 0.80591667, 2.2260578,..."
52093,,PMC7105755,"[-2.226667, 2.3260539, 0.6691074, 1.1899889, -..."
52094,,PMC7105756,"[0.0024406752, 0.010759468, 0.0051381686, 0.00..."
52095,,PMC7106065,"[0.0024406752, 0.010759468, 0.0051381686, 0.00..."


## 2. Create Downsampled Vectors

In [5]:
# Seed used by the clustering and embedding helpers defined in this cell.
RANDOM_STATE = 42

# Stack the per-row document vectors into a single array for scikit-learn.
docvector_arr = np.stack(document_vectors['document_vector'].to_numpy())

def kmean_labels(docvectors, n_clusters=6, random_state=RANDOM_STATE):
    """Cluster the document vectors with k-means.

    Parameters
    ----------
    docvectors : array-like
        Data handed to ``KMeans.fit``.
    n_clusters : int
        Number of clusters to form.
    random_state : int
        Seed forwarded to ``KMeans``.

    Returns
    -------
    tuple
        ``(labels, estimator)``: the fitted estimator's ``labels_`` and the
        fitted ``KMeans`` instance itself.
    """
    print('Setting cluster labels')
    from sklearn.cluster import KMeans
    estimator = KMeans(n_clusters=n_clusters, random_state=random_state)
    estimator.fit(docvectors)
    return estimator.labels_, estimator


def tsne_embeddings(docvectors, dimensions=2, perplexity=15,
                    early_exaggeration=24, learning_rate=600,
                    n_jobs=8, random_state=None):
    """Project the document vectors into a low-dimensional space with t-SNE.

    Parameters
    ----------
    docvectors : array-like
        Data handed to ``TSNE.fit_transform``.
    dimensions : int
        Number of output components (``n_components``).
    perplexity, early_exaggeration, learning_rate : number
        Forwarded unchanged to ``TSNE``; the defaults are the values this
        notebook has always used.
    n_jobs : int
        Number of parallel jobs forwarded to ``TSNE``.
    random_state : int or None
        Seed forwarded to ``TSNE``. ``None`` (the default) means "use the
        notebook-wide ``RANDOM_STATE``", looked up at call time.

    Returns
    -------
    tuple
        ``(embeddings, tsne)``: the result of ``fit_transform`` and the fitted
        ``TSNE`` instance.
    """
    print(f'Creating {dimensions}D  embeddings')
    from sklearn.manifold import TSNE
    if random_state is None:
        # Fall back to the shared seed so existing two-argument calls stay reproducible.
        random_state = RANDOM_STATE
    tsne = TSNE(verbose=1,
                perplexity=perplexity,
                early_exaggeration=early_exaggeration,
                n_components=dimensions,
                n_jobs=n_jobs,
                random_state=random_state,
                learning_rate=learning_rate)
    embeddings = tsne.fit_transform(docvectors)
    return embeddings, tsne

# Time each fit; both the result array and the fitted estimator are kept for the cells below.
%time document_vector_2d, tsne2d = tsne_embeddings(docvector_arr, 2)
%time document_vector_1d, tsne1d = tsne_embeddings(docvector_arr, 1)
# Note: 7 clusters are requested here, overriding kmean_labels' default of 6.
%time cluster_id, kmeans = kmean_labels(docvector_arr, 7)

Creating 2D  embeddings
[t-SNE] Computing 46 nearest neighbors...
[t-SNE] Indexed 52097 samples in 0.347s...
[t-SNE] Computed neighbors for 52097 samples in 33.091s...
[t-SNE] Computed conditional probabilities for sample 1000 / 52097
[t-SNE] Computed conditional probabilities for sample 2000 / 52097
[t-SNE] Computed conditional probabilities for sample 3000 / 52097
[t-SNE] Computed conditional probabilities for sample 4000 / 52097
[t-SNE] Computed conditional probabilities for sample 5000 / 52097
[t-SNE] Computed conditional probabilities for sample 6000 / 52097
[t-SNE] Computed conditional probabilities for sample 7000 / 52097
[t-SNE] Computed conditional probabilities for sample 8000 / 52097
[t-SNE] Computed conditional probabilities for sample 9000 / 52097
[t-SNE] Computed conditional probabilities for sample 10000 / 52097
[t-SNE] Computed conditional probabilities for sample 11000 / 52097
[t-SNE] Computed conditional probabilities for sample 12000 / 52097
[t-SNE] Computed conditio

In [6]:
# Store each 2-D point as a list inside a single column.
document_vectors['document_vector_2d'] = document_vector_2d.tolist()
# The one-component embedding is returned as a column matrix; flatten it explicitly
# so the column holds plain scalars rather than relying on pandas to squeeze it.
document_vectors['document_vector_1d'] = np.ravel(document_vector_1d)
document_vectors['cluster_id'] = cluster_id

## 3. Save Document Vectors

In [8]:
# Write the augmented frame into the support directory; the file name carries the vector length.
docvector_savepath = Path(cord_support_dir()).joinpath(f'DocumentVectors_{DOCUMENT_VECTOR_LENGTH}.pq')
document_vectors.to_parquet(docvector_savepath)

## 4. Save TSNE and KMeans Models

In [9]:
import pickle


def _pickle_to_support_dir(obj, filename):
    """Pickle ``obj`` to ``filename`` inside the cord support directory.

    The path is built as ``Path(cord_support_dir()) / filename``, the same way
    the parquet save path is built, so it works whether the helper returns a
    ``str`` or a ``Path``.
    """
    target = Path(cord_support_dir()) / filename
    with target.open('wb') as f:
        pickle.dump(obj, f)


# Persist the fitted estimators next to the document vectors.
_pickle_to_support_dir(tsne2d, 'TSNE2d.pickle')
_pickle_to_support_dir(tsne1d, 'TSNE1d.pickle')
_pickle_to_support_dir(kmeans, 'KMeans.pickle')

In [10]:
# Read back the file that was just written to confirm the new columns were saved.
pd.read_parquet(docvector_savepath)

Unnamed: 0,sha,pmcid,document_vector,document_vector_2d,document_vector_1d,cluster_id
0,000b7d1517ceebb34e1e3e817695b6de03e2fa78,,"[-0.0001454174, -0.020008974, -0.15157239, -0....","[-14.980960845947266, -10.291929244995117]",-2.289008,4
1,00142f93c18b07350be89e96372d240372437ed9,,"[-1.5019524, 2.4714665, -1.1794524, 1.2314736,...","[61.95029830932617, -67.07720947265625]",-19.983704,1
2,0022796bb2112abd2e6423ba2d57751db06049fb,,"[-1.3127675, 1.0502137, -0.6803469, -1.2441112...","[-53.65645217895508, 25.589277267456055]",61.410961,0
3,0031e47b76374e05a18c266bd1a1140e5eacb54f,,"[-3.5530822, 1.9237256, -1.4036909, -2.3870523...","[12.35268783569336, -39.1517219543457]",13.684706,3
4,00326efcca0852dc6e39dc6b7786267e1bc4f194,,"[0.042658817, 2.5792394, 1.4552583, -2.7162018...","[-72.797119140625, -11.658207893371582]",71.808960,0
...,...,...,...,...,...,...
52092,,PMC7105754,"[-1.7658923, 0.7377934, 0.80591667, 2.2260578,...","[6.685484409332275, -8.654548645019531]",-22.822292,1
52093,,PMC7105755,"[-2.226667, 2.3260539, 0.6691074, 1.1899889, -...","[19.905824661254883, 47.55586624145508]",-11.599771,5
52094,,PMC7105756,"[0.0024406752, 0.010759468, 0.0051381686, 0.00...","[-70.5265884399414, -58.488616943359375]",-90.872078,4
52095,,PMC7106065,"[0.0024406752, 0.010759468, 0.0051381686, 0.00...","[-65.7143325805664, -62.280982971191406]",-89.859550,4
