In [None]:
import deepchem as dc
import numpy as np
import os
from my_tokenizers2 import SmilesTokenizer
# Tokenizer built from the vocabulary file (presumably Mol2vec substructure
# identifiers, going by the file name -- TODO confirm).
tokenizer = SmilesTokenizer(vocab_file="mol2vec_vocabs.txt")

from transformers import ElectraModel, ElectraConfig, ElectraForPreTraining, ElectraForMaskedLM

# Embedding width and the directory holding the pretrained checkpoint.
emb_dim = 256
output_dir = "./fingerprints_smile_output256"

# Load the masked-LM ELECTRA weights from that directory.
electra_model = ElectraForMaskedLM.from_pretrained(output_dir)

In [None]:
from deepchem.splits.splitters import ScaffoldSplitter
splitter = ScaffoldSplitter()

# load_hiv returns a 3-tuple; element [1] is the (train, valid, test) triple.
# reload=False asks MoleculeNet to rebuild the dataset instead of reusing a cache.
_hiv_splits = dc.molnet.load_hiv(split='scaffold', reload=False)[1]
train_dataset, valid_dataset, test_dataset = _hiv_splits

# Report split sizes. len(dataset) is used for all three so that the feature
# matrix (`.X`) does not have to be materialised just to count rows.
print(len(train_dataset))
print(len(valid_dataset))
print(len(test_dataset))

In [None]:
# Featurize each split: ELECTRA token embeddings as X, one-hot labels as Y
from tqdm import tqdm
# Per-split accumulators: one list of feature matrices and one list of labels.
train_x, train_y = [], []
valid_x, valid_y = [], []
test_x, test_y = [], []

# Shape of the zero-padded matrix produced by trans(): smilen rows by
# hidden_dim columns.
smilen = 256
hidden_dim = 256

# Token sequences longer than this are truncated before the model call.
MAX_SMI_LEN = 256
def trans(smi):
    """Embed one molecule as a zero-padded ``(smilen, hidden_dim)`` matrix.

    The molecule is turned into a Mol2vec sentence of radius 1
    (``mol2alt_sentence(mol, 1)``), tokenized with the module-level
    ``tokenizer`` and run through ``electra_model``. The first entry of the
    returned hidden states (the embedding-layer output, per the Transformers
    documentation) is taken, the first and last token positions are dropped,
    and the result is copied into the top-left corner of a zero matrix.

    Args:
        smi: A SMILES string, or an already-built molecule object accepted by
            ``mol2alt_sentence`` (presumably an RDKit ``Mol``).

    Returns:
        numpy.ndarray of shape ``(smilen, hidden_dim)`` and dtype float64
        (the ``np.zeros`` default). Rows beyond the molecule's token count
        are zero; rows/columns beyond the target shape are clipped.

    Raises:
        ValueError: if ``smi`` is a string that RDKit cannot parse.
    """
    import torch
    from rdkit import Chem
    from mol2vec.features import mol2alt_sentence

    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else smi
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of deep inside mol2alt_sentence.
        raise ValueError("could not parse SMILES: %r" % (smi,))

    sentence = ' '.join(map(str, mol2alt_sentence(mol, 1)))
    input_ids = tokenizer(sentence, return_tensors="pt")["input_ids"]

    if input_ids.shape[1] > MAX_SMI_LEN:
        input_ids = input_ids[:, :MAX_SMI_LEN]
        # Re-terminate the truncated sequence. 3 is presumably the id of the
        # end-of-sequence special token in this vocabulary -- TODO confirm.
        input_ids[:, -1] = 3

    # Inference only: skip autograd bookkeeping. Hidden states are requested
    # explicitly so that index 2 of the output (loss, logits, hidden_states)
    # does not depend on how the checkpoint's config was saved.
    with torch.no_grad():
        outputs = electra_model(
            input_ids, labels=input_ids, output_hidden_states=True
        )

    # First hidden state, single batch item, without the first/last position.
    embX = outputs[2][0][0, 1:-1].numpy()

    # NOTE: float64 is kept for compatibility with previously saved arrays.
    newX = np.zeros((smilen, hidden_dim))
    n_rows = min(embX.shape[0], smilen)
    n_cols = min(embX.shape[1], hidden_dim)  # feature width, not sequence length
    newX[:n_rows, :n_cols] = embX[:n_rows, :n_cols]
    return newX

def trans_Y(label):
    """One-hot encode a binary label.

    Args:
        label: A value comparing equal to 0 or 1 (a Python/NumPy scalar or a
            single-element array).

    Returns:
        ``[1, 0]`` when ``label == 1``, ``[0, 1]`` when ``label == 0``.

    Raises:
        ValueError: if ``label`` is neither 0 nor 1 (previously this surfaced
            as an UnboundLocalError on ``target``).
    """
    if label == 1:
        return [1, 0]
    if label == 0:
        return [0, 1]
    raise ValueError("expected a binary label (0 or 1), got %r" % (label,))

def _featurize_split(dataset, desc):
    """Featurize every sample of one split.

    Args:
        dataset: Object exposing ``ids`` (the inputs passed to ``trans``) and
            ``y`` (the labels passed to ``trans_Y``), index-aligned.
        desc: Label shown on the progress bar.

    Returns:
        ``(features, labels)``: two lists with one entry per sample, in
        dataset order.
    """
    # Read the properties once. On DeepChem's disk-backed datasets each access
    # to `.ids` / `.y` re-assembles the full array, so indexing them inside the
    # loop makes the pass accidentally quadratic.
    smiles, labels = dataset.ids, dataset.y

    features, targets = [], []
    for smi, label in tqdm(zip(smiles, labels), total=len(smiles), desc=desc):
        features.append(trans(smi))
        targets.append(trans_Y(label))
    return features, targets


train_x, train_y = _featurize_split(train_dataset, "train")
valid_x, valid_y = _featurize_split(valid_dataset, "valid")
test_x, test_y = _featurize_split(test_dataset, "test")


In [None]:
# Stack the per-sample results into arrays (names are kept for later cells).
train_x = np.array(train_x)
train_y = np.array(train_y)
valid_x = np.array(valid_x)
valid_y = np.array(valid_y)
test_x = np.array(test_x)
test_y = np.array(test_y)

# np.save does not create missing directories; make sure the target exists so
# the featurisation work is not lost to a FileNotFoundError.
os.makedirs("dataset_new", exist_ok=True)

# np.save appends ".npy" to each path.
for _stem, _array in (
    ("train_x", train_x),
    ("train_y", train_y),
    ("valid_x", valid_x),
    ("valid_y", valid_y),
    ("test_x", test_x),
    ("test_y", test_y),
):
    np.save("dataset_new/HIV_" + _stem + "_full", _array)
