In [None]:
import gc
import os
import pickle
import re

import biovec
import deepsmiles
import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from gensim.models import word2vec

from mypackages.smilesvec import *
from mypackages.deepchem import *
# Show up to 100 columns when a DataFrame is displayed (the frames here are wide).
pd.options.display.max_columns = 100

# Data
Loading the data, plus some more cleaning.

In [None]:
# Load the preprocessed CSV and discard the 'Unnamed: 0' column it carries.
df = pd.read_csv('data/preprocessing/clean_bindingdb.csv').drop(columns='Unnamed: 0')

In [None]:
# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

In [None]:
# Replace the CSV headers with short names: the fixed leading columns,
# then chain_1 .. chain_19, then the molecular weight column.
col = (
    ['id', 'ligand_smiles', 'target_name', 'Ki', 'Ki_r', 'IC50', 'IC50_r',
     'Kd_r', 'Kd', 'EC50', 'EC50_r', 'pH', 'temp', 'n_protein_chains']
    + ['chain_' + str(n) for n in range(1, 20)]
    + ['molwt']
)
df.columns = col

In [None]:
# Fill missing values, then shrink dtypes.
#
# The fills are assigned back explicitly: `df[[...]].fillna(..., inplace=True)`
# only fills a temporary copy, which would leave NaNs in the *_r columns and
# make the int8 cast below fail (integers cannot hold NaN).
int8_cols = ['Ki_r', 'IC50_r', 'Kd_r', 'EC50_r']
df[int8_cols] = df[int8_cols].fillna(0)
df['pH'] = df['pH'].fillna(-1)  # -1 marks a missing pH
# Strip the 'C' from temperature strings; -1 marks a missing temperature.
df['temp'] = pd.to_numeric(df['temp'].str.replace('C', '').fillna(-1))
# NOTE(review): float16 keeps only ~3 significant digits -- confirm that is
# acceptable for 'molwt' and 'temp'.
types = {'id': np.int32, 'Ki_r': np.int8, 'IC50_r': np.int8, 'Kd_r': np.int8, 'EC50_r': np.int8, 'pH': np.float16, 'temp': np.float16, 'molwt': np.float16}
df = df.astype(types)

In [None]:
# Normalise every chain sequence column (chain_1 .. chain_19) to upper case.
for chain_column in ('chain_' + str(n) for n in range(1, 20)):
    df[chain_column] = df[chain_column].str.upper()

In [None]:
# Keep only entries that have at least one target chain. n_protein_chains is
# not reliable in this dataset, so the chain columns themselves are inspected.
chain_columns = ['chain_' + str(n) for n in range(1, 20)]
has_target = df[chain_columns].notnull().any(axis=1)

# Also require a ligand SMILES string.
df = df[has_target].dropna(subset=['ligand_smiles'])

In [None]:
def _numbered_names(prefix, count):
    """Return [prefix + '1', ..., prefix + str(count)]."""
    return [prefix + str(n) for n in range(1, count + 1)]


# Column names for the chain sequences and for the three feature groups
# appended later (prefixes pv_, sv_ and dc_).
chain_col_name = _numbered_names('chain_', 19)
pv_col_name = _numbered_names('pv_', 100)
sv_col_name = _numbered_names('sv_', 100)
dc_col_name = _numbered_names('dc_', 111)

In [None]:
# Mark absent chains with the string placeholder '-1'.
df = df.fillna(dict.fromkeys(chain_col_name, '-1'))

In [None]:
# Drop rows whose first two chains contain unexpected characters:
# digits in chain_1 or chain_2, or non-word characters in chain_2.
# The '-1' placeholder for an absent chain matches these patterns itself,
# so it is excluded explicitly.
# Raw strings keep \d and \W as regex escapes; in a plain string literal they
# are invalid escape sequences and trigger a warning.
df = df[~((df.chain_1.str.contains(r'\d', na=False)) & (df.chain_1 != '-1'))]
df = df[~((df.chain_2.str.contains(r'\d', na=False)) & (df.chain_2 != '-1'))]
df = df[~((df.chain_2.str.contains(r'\W', na=False)) & (df.chain_2 != '-1'))]

# Embedding

In [None]:
# Load the two saved embedding models from disk.
# NOTE(review): later cells call to_vecs / has_vocab / online_train / to_vec on
# these objects -- presumably provided by the classes the models were saved
# with; confirm.
_load_model = word2vec.Word2Vec.load
protvec = _load_model('model/protvec.model')
smilesvec = _load_model('model/smilesvec.model')

In [None]:
# Column names of the final table: the current columns followed by the
# pv_*, sv_* and dc_* feature columns, in that order.
new_cols = list(df.columns.values) + pv_col_name + sv_col_name + dc_col_name

In [None]:
def protvec_has_vocab(protvec, chain):
    """Return True if every 3-gram of ``chain`` is in the model's vocabulary.

    ``chain`` is split with ``biovec.utils.split_ngrams(chain, 3)`` and each
    resulting n-gram is looked up in ``protvec.wv.vocab``.
    """
    known = protvec.wv.vocab
    return all(
        ngram in known
        for ngram_group in biovec.utils.split_ngrams(chain, 3)
        for ngram in ngram_group
    )

def protvec_online_train(protvec, chain):
    """Extend the model's vocabulary with ``chain``'s 3-grams and train on them.

    The n-gram lists from ``biovec.utils.split_ngrams(chain, 3)`` are used as
    the corpus. The vocabulary is updated in place (``update=True``) and the
    model is then trained for ``protvec.epochs`` epochs on that corpus.
    """
    ngram_sentences = biovec.utils.split_ngrams(chain, 3)
    protvec.build_vocab(ngram_sentences, update=True)
    protvec.train(
        ngram_sentences,
        total_examples=protvec.corpus_count,
        epochs=protvec.epochs,
    )

In [None]:
# Build one flat feature row per record with a parseable SMILES string:
# original columns + protein-chain embedding (pv) + SMILES embedding (sv)
# + descriptor vector (dc). Rows are collected in `ll` and stacked later.
ll = []
feat = RDKitDescriptors()
# iterrows() is a generator without a length, so the row count is passed
# explicitly to let tqdm show a percentage and ETA.
for _idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    smiles = row['ligand_smiles']
    mol = Chem.MolFromSmiles(smiles)  # checking validity of SMILES string
    if not mol:
        continue

    # Sum the embeddings of every chain that is present ('-1' marks an absent chain).
    pv = np.zeros(100)
    for name in chain_col_name:
        chain = row[name]
        if chain != '-1':
            # Teach the model any n-grams it has not seen before embedding.
            if not protvec_has_vocab(protvec, chain):
                protvec_online_train(protvec, chain)
            pv += np.sum(protvec.to_vecs(chain), axis=0)

    # Same idea for the ligand: extend the SMILES model when needed, then embed.
    if not smilesvec.has_vocab(smiles):
        smilesvec.online_train([smiles])
    sv = smilesvec.to_vec(smiles)

    # featurize takes a list of molecules; only one is passed, so take dc[0].
    dc = feat.featurize([mol])

    ll.append(np.concatenate([row, pv, sv, dc[0]]))

In [None]:
# Every row needed downstream has been copied into `ll`, so release the source
# frame before the large stacked array is built.
del df
gc.collect()

In [None]:
# Stack the per-row feature arrays into one 2-D array, then drop the list so
# the rows are not held in memory twice.
arr = np.vstack(ll)
del ll
gc.collect()

In [None]:
# Wrap the stacked array in a DataFrame (column names and dtypes are set in
# later cells), then release the array.
full_df = pd.DataFrame(arr)
del arr
gc.collect()

In [None]:
# Name the columns: original columns followed by the pv_/sv_/dc_ features
# (the order built in new_cols).
full_df.columns = new_cols

In [None]:
# The raw chain sequences and the chain count are not kept in the final
# feature table.
to_drop = ['n_protein_chains'] + chain_col_name
full_df = full_df.drop(columns=to_drop)

In [None]:
# Target dtypes for the final table: int32 id, float64 measurements with int8
# *_r companions, float16 conditions/weight, float32 for all feature columns.
types = {'id': np.int32, 'pH': np.float16, 'temp': np.float16, 'molwt': np.float16}
types.update({measure: np.float64 for measure in ('Ki', 'IC50', 'Kd', 'EC50')})
types.update({measure + '_r': np.int8 for measure in ('Ki', 'IC50', 'Kd', 'EC50')})
types.update(dict.fromkeys(pv_col_name + sv_col_name + dc_col_name, np.float32))

full_df = full_df.astype(types)

In [None]:
# Persist the featurised table. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('data/dtba_prediction/featured_bindingdb', 'wb') as out_file:
    pickle.dump(full_df, out_file)