In [None]:
import multiprocessing
import os
import pickle
import random

import gensim
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from embeddings_reproduction import embedding_tools

In [None]:
# assert gensim.__version__ == '1.0.1'
# assert np.__version__ == '1.13.1'
# assert pd.__version__ == '0.20.3'

In [None]:
# Load every sequence table up front, keyed by the short name that is later
# used to pick a dataset and to build the saved model's file name.
# Only the 'original' file is read with comment='#', so lines starting with
# '#' in that file are skipped by the parser.
sequence_dict = {
    'small': pd.read_csv('../inputs/small_uniprot.txt'),
    'original': pd.read_csv(
        '../inputs/uniprot-reviewed-lim_sequences.txt',
        comment='#',
    ),
    'scrambled': pd.read_csv('../inputs/r1.txt'),
    'random': pd.read_csv('../inputs/r2.txt'),
    'uniform': pd.read_csv('../inputs/r3.txt'),
}

In [None]:
# Register four more input tables (the keys ending in '_data') in the
# existing sequence_dict; sequence_dict must already be defined.
sequence_dict.update({
    'ChR_data': pd.read_csv('../inputs/localization.txt'),
    'P450_data': pd.read_csv('../inputs/T50.txt'),
    'absorption_data': pd.read_csv('../inputs/absorption.txt'),
    'enan_data': pd.read_csv('../inputs/enantioselectivity.txt'),
})

In [None]:
def train(X, k, window, output_dir='../outputs/docvec_models/'):
    """Train and save one Doc2Vec model for a dataset / k / window combination.

    The model is saved as ``<output_dir>/<X>_<k>_<window>.pkl``. If that file
    already exists the function returns immediately without training, so the
    surrounding sweep can be re-run and will only fill in missing models.

    Parameters
    ----------
    X : str
        Key into the notebook-level ``sequence_dict`` selecting the table of
        sequences to embed; also the first component of the output file name.
    k : int
        Passed to ``embedding_tools.Corpus`` as the ``'k'`` hyperparameter.
    window : int
        Passed to ``Doc2Vec`` as its ``window`` hyperparameter.
    output_dir : str, optional
        Directory the model is written to. Defaults to the location this
        notebook has always used; it is created if it does not exist.

    Returns
    -------
    None
    """
    name_list = [X, str(k), str(window)]
    # Build the target path once so the cache check and the save cannot drift.
    model_path = os.path.join(output_dir, '_'.join(name_list) + '.pkl')
    if os.path.isfile(model_path):
        return

    # Make sure the destination exists *before* the expensive training run;
    # otherwise a missing directory is only discovered when saving.
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    print('X\t\tk\twindow')
    print(name_list[0] + '\t\t' + '\t'.join(name_list[1:]))

    # Options forwarded verbatim to embedding_tools.Corpus.
    kmer_hypers = {'k': k,
                   'overlap': False,
                   'merge': False}
    # Keyword arguments forwarded verbatim to the Doc2Vec constructor.
    model_hypers = {'size': 64,
                    'min_count': 0,
                    'iter': 25,
                    'window': window,
                    'workers': 4}

    documents = embedding_tools.Corpus(sequence_dict[X], kmer_hypers)
    model = Doc2Vec(**model_hypers)
    model.build_vocab(documents)
    # NOTE(review): calling train() with only the corpus matches the
    # gensim 1.0.1 named in the commented-out version asserts; newer gensim
    # releases appear to require explicit total_examples/epochs (and rename
    # 'size'/'iter') -- confirm before upgrading.
    model.train(documents)
    model.save(model_path)

In [None]:
# Sweep every dataset against k = 1..5 and window = 1..7, with window varying
# fastest. train() skips combinations whose model file already exists, so this
# cell can be re-run to resume an interrupted sweep.
for X, k, window in (
    (dataset, kmer_size, window_size)
    for dataset in sequence_dict.keys()
    for kmer_size in range(1, 6)
    for window_size in range(1, 8)
):
    train(X, k, window)