Infer embedding vectors for each task's sequences using each docvec model, and pickle the results.

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
import gensim

import embedding_tools

# Fail fast unless gensim, numpy and pandas are exactly the pinned releases.
# NOTE(review): presumably these are the versions the saved models and
# pickles were produced with -- confirm before relaxing the pins.
assert (gensim.__version__, np.__version__, pd.__version__) == (
    '1.0.1',
    '1.13.1',
    '0.20.3',
)

In [None]:
def infer_vectors(data, model, k, dest_file, overlap=False, method=None):
    """Embed one dataset's sequences with one docvec model and pickle them.

    Args:
        data: Path of the CSV file, read with ``pd.read_csv``.
        model: Path of the docvec model, passed unchanged to
            ``embedding_tools.get_embeddings_new``. The text after its last
            ``/`` is used to name the output file.
        k: Forwarded as ``k`` to ``embedding_tools.get_embeddings_new``
            (presumably the k-mer size the model was trained with --
            TODO confirm).
        dest_file: Prefix of the output path; ``'X_'`` plus the model's
            file name is appended to it directly, so a directory must end
            with ``/``.
        overlap: Forwarded as ``overlap`` to
            ``embedding_tools.get_embeddings_new``.
        method: When not ``None``, the sequences are first passed through
            ``embedding_tools.randomize_seqs(seqs, method=method)``.

    Returns:
        None. The tuple ``(embeds, terms)`` is pickled to the output path,
        where ``embeds`` is a DataFrame sharing the CSV frame's index and
        ``terms`` is the list of its column positions.
    """
    table = pd.read_csv(data)

    sequences = embedding_tools.get_seqs(table)
    if method is not None:
        sequences = embedding_tools.randomize_seqs(sequences, method=method)

    vectors = embedding_tools.get_embeddings_new(
        model, sequences, k=k, overlap=overlap)
    embeds = pd.DataFrame(vectors, index=table.index)
    terms = list(range(embeds.shape[1]))

    # Name the output after the model file (everything past the last '/').
    out_path = dest_file + 'X_' + model.rsplit('/', 1)[-1]
    with open(out_path, 'wb') as handle:
        pickle.dump((embeds, terms), handle)

In [None]:
# File names in the docvec model directory whose names end in 'pkl'.
models = [m for m in os.listdir('../outputs/docvec_models/')
          if m.endswith('pkl')]

In [None]:
# Task names; each one is used below to build the input path
# '../inputs/<name>.txt' and the output directory
# '../outputs/<name>_embeddings/'.
datasets = [
    'localization',
    'T50',
    'enantioselectivity',
    'absorption',
]

In [None]:
# For every dataset, infer embeddings with every docvec model that does not
# already have an output pickle in that dataset's embeddings directory.
for s in datasets:
    dest = '../outputs/' + s + '_embeddings/'
    # List the same directory infer_vectors writes to. The previous
    # "'/'.join(s.split('/')[:-1]) + dest" prefix was empty for plain names
    # and produced a wrong path for any name containing '/'.
    Xs = os.listdir(dest)
    for model in models:
        # infer_vectors names its output 'X_' + <model file name>; skip
        # models whose output already exists so reruns resume.
        if 'X_' + model in Xs:
            continue
        print('Model ' + model + ' for dataset ' + s + ':')
        # NOTE(review): k is read from the single character seven places
        # from the end of the file name, so it assumes a fixed naming
        # scheme and a one-digit k -- confirm against the model file names.
        k = int(model[-7])
        print('Inferring...')
        infer_vectors('../inputs/' + s + '.txt', '../outputs/docvec_models/' + model, k, dest)
