In [1]:
from gensim.models import word2vec
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import re
import os
import tqdm
import pickle
import biovec
from mypackages.smilesvec import *
from mypackages.deepchem import *
from mypackages.purple_teletubbies import *
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)



# Data

In [None]:
# Load the two candidate-compound tables. Both are expected to carry a SMILES
# column, which the following cells rely on.
# Fixed: the drugcentral path previously pointed at 'data/dtba_predction/'
# (missing "i"), unlike every other path in this notebook.
drugbank = pd.read_csv('data/dtba_prediction/drugbank.csv')
drugcentral = pd.read_csv('data/dtba_prediction/drugcentral.csv')

In [None]:
# Discard, in place, every compound that has no SMILES string; the same
# filter is applied to both source tables.
for compound_table in (drugbank, drugcentral):
    compound_table.dropna(subset=['SMILES'], inplace=True)

In [None]:
# Target protein as a one-letter amino-acid sequence; it is embedded with
# protvec to form the protein part of every feature row.
# NOTE(review): presumably the SARS-CoV-2 main protease (3CLpro) — confirm the
# sequence source and record it here.
main_protease = 'SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNYEDLLIRKSNHNFLVQAGNVQLRVIGHSMQNCVLKLKVDTANPKTPKYKFVRIQPGQTFSVLACYNGSPSGVYQCAMRPNFTIKGSFLNGSCGSVGFNIDYDCVSFCYMHHMELPTGVHAGTDLEGNFYGPFVDRQTAQAAGTDTTITVNVLAWLYAAVINGDRWFLNRFTTTLNDFNLVAMKYNYEPLTQDHVDILGPLSAQTGIAVLDMCASLKELLQNGMNGRTILGSALLEDEFTPFDVVRQCSGVTFQ'

In [None]:
# protvec is only loaded further down, in the "Embedding" section, so on a
# fresh kernel this cell used to fail with a NameError. Load it here so the
# notebook runs top to bottom; the later load simply rebinds the same model.
protvec = word2vec.Word2Vec.load('model/protvec.model')

# Collapse whatever to_vecs returns along axis 0 into a single protein vector.
# NOTE(review): presumably this yields 100 values to match pv_col_name — confirm.
pv = np.sum(protvec.to_vecs(main_protease), axis=0)

# Embedding

In [None]:
# Restore the saved protein and SMILES embedding models from disk
# (protein model first, then the SMILES model).
protvec, smilesvec = (
    word2vec.Word2Vec.load('model/protvec.model'),
    word2vec.Word2Vec.load('model/smilesvec.model'),
)

In [None]:
# Column labels for each feature family, numbered from 1:
# 19 chain columns, 100 protein-vector columns, 100 SMILES-vector columns
# and 111 descriptor columns.
chain_col_name = [f'chain_{idx}' for idx in range(1, 20)]
pv_col_name = [f'pv_{idx}' for idx in range(1, 101)]
sv_col_name = [f'sv_{idx}' for idx in range(1, 101)]
dc_col_name = [f'dc_{idx}' for idx in range(1, 112)]

# dtype mapping for DataFrame.astype: molecular weight plus the pv/sv/dc
# feature columns are all cast to float64.
types = dict.fromkeys(
    ['molwt', *pv_col_name, *sv_col_name, *dc_col_name],
    np.float64,
)

In [None]:
# Distinct SMILES from either source that the SMILES embedding model does not
# know yet and that RDKit can parse (MolFromSmiles returns a truthy molecule).
missed_smiles = [
    candidate
    for candidate in set(drugcentral.SMILES) | set(drugbank.SMILES)
    if not smilesvec.has_vocab(candidate) and Chem.MolFromSmiles(candidate)
]

In [None]:
# Feed the SMILES collected in missed_smiles to the embedding model's
# online_train helper, presumably so that later has_vocab/to_vec lookups for
# them succeed — verify against the smilesvec implementation.
# NOTE(review): behaviour when missed_smiles is empty is not visible here — confirm.
smilesvec.online_train(missed_smiles)

In [None]:
def generate_features(data):
    """Build the per-compound feature table used for prediction.

    For every row of ``data`` whose ``SMILES`` string RDKit can parse, the
    original row values are extended with, in this order: the molecular
    weight, the module-level protein vector ``pv``, the SMILES embedding from
    ``smilesvec`` and the first row returned by
    ``RDKitDescriptors().featurize``. Rows whose SMILES cannot be parsed are
    skipped. SMILES unknown to ``smilesvec`` are printed and trained on the
    fly before being embedded.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``SMILES`` column; all columns are carried through.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by ``molwt``, ``pv_*``, ``sv_*`` and
        ``dc_*`` (without ``dc_10``–``dc_13`` and ``dc_29``), with the feature
        columns cast according to the module-level ``types`` mapping. The
        index is a fresh range index. If no row has a parsable SMILES, an
        empty frame with the same columns is returned.

    Notes
    -----
    Relies on the module-level names ``pv``, ``smilesvec``, ``types``,
    ``pv_col_name``, ``sv_col_name`` and ``dc_col_name``. The lengths of the
    concatenated vectors must match the corresponding column-name lists.
    """
    new_cols = list(data.columns) + ['molwt'] + pv_col_name + sv_col_name + dc_col_name
    # NOTE(review): presumably removed to match the feature set the scaler and
    # model were fitted on — confirm.
    dropped_cols = ['dc_10', 'dc_11', 'dc_12', 'dc_13', 'dc_29']

    feat = RDKitDescriptors()
    rows = []
    # total= gives tqdm a known length, since iterrows() is a plain generator.
    for _, row in tqdm.tqdm(data.iterrows(), total=len(data)):
        smiles = row['SMILES']
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable SMILES: no features can be computed for this row.
            continue

        if not smilesvec.has_vocab(smiles):
            print(smiles)
            smilesvec.online_train([smiles])
        sv = smilesvec.to_vec(smiles)

        dc = feat.featurize([mol])

        molwt = Chem.Descriptors.MolWt(mol)
        rows.append(np.concatenate([row.to_numpy(), [molwt], pv, sv, dc[0]]))

    # np.vstack raises on an empty list, so fall back to a 0-row array that
    # still has one slot per expected column.
    arr = np.vstack(rows) if rows else np.empty((0, len(new_cols)), dtype=object)

    # Fixed: the column labels must be attached here, otherwise astype(types)
    # and the drop below cannot find the named feature columns.
    full_df = pd.DataFrame(arr, columns=new_cols)
    full_df = full_df.astype(types)
    # Fixed: this call was previously misspelled as `frop`.
    full_df.drop(dropped_cols, axis=1, inplace=True)
    return full_df

In [None]:
# Run the feature builder over both compound tables, drugcentral first and
# drugbank second; unpacking consumes the lazy map in that order.
featured_drugcentral, featured_drugbank = map(
    generate_features, (drugcentral, drugbank)
)

In [None]:
# Strip the identifier/text columns from each featured table and wrap the
# remaining values in a torch tensor.
dc_features = torch.from_numpy(
    featured_drugcentral.drop(columns=['ID', 'SMILES']).values
)
db_features = torch.from_numpy(
    featured_drugbank.drop(columns=['DrugBank ID', 'Name', 'SMILES']).values
)

# Prediction

In [None]:
# Load the previously fitted scaler. The context manager closes the file
# handle, which the bare open() call used to leave open.
# NOTE: pickle.load can execute arbitrary code; only load a scaler file that
# comes from a trusted source.
with open('model/standardscaler', 'rb') as scaler_file:
    standardscaler = pickle.load(scaler_file)

# Apply the same fitted scaling to both feature sets.
dc_features = standardscaler.transform(dc_features)
# Fixed: this call was previously misspelled as `transfrom`.
db_features = standardscaler.transform(db_features)

In [None]:
# Rebuild the network and restore its trained weights.
model = purple_teletubbies()
model.load_state_dict(torch.load('model/purple_teletubbies.model'))
# Switch to inference mode so layers such as dropout / batch norm (if the
# model has any) behave deterministically during prediction.
model.eval()

# The scaler output is float64; cast to float32 for the forward pass.
dc_features_tensors = torch.from_numpy(dc_features).float()
db_features_tensors = torch.from_numpy(db_features).float()

# No gradients are needed for prediction. Convert the flattened outputs to
# NumPy arrays so pandas stores plain floats rather than tensor objects.
with torch.no_grad():
    featured_drugcentral['prediction'] = model(dc_features_tensors).reshape(-1).cpu().numpy()
    featured_drugbank['prediction'] = model(db_features_tensors).reshape(-1).cpu().numpy()

In [None]:
# Write each featured table, now including its prediction column, to CSV.
for predicted_table, csv_path in (
    (featured_drugcentral, 'data/dtba_prediction/drugcentral_prediction.csv'),
    (featured_drugbank, 'data/dtba_prediction/drugbank_prediction.csv'),
):
    predicted_table.to_csv(csv_path)