In [8]:
import numpy as np
import torch
from torch.utils.data import DataLoader,Dataset,random_split,ConcatDataset
from torchvision.transforms import transforms
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import pandas as pd
import pickle
import sqlite3
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
import torcheval.metrics.functional as tce
import os

In [9]:
class VectorizerApply(BaseEstimator,TransformerMixin):
    def __init__(self,vectorizer_type) -> None:
        super().__init__()
        self.vectoryzer_type = vectorizer_type
        self.Vectorizers_ = None
        self.dataset_columns_ = None
        self.id_col_ = None
        self.name_col_ = None
        self.index_col_ = None

    def fit(self,X:pd.DataFrame,y=None,**fit_params):
        self.Vectorizers_ = []
        self.id_col_ = fit_params['id']
        self.name_col_ = fit_params['name']
        self.index_col_ = fit_params['index']        
        X = X.drop([fit_params['id'],fit_params['index'],fit_params['name']],axis=1)
        self.dataset_columns_ = X.columns
        for column in self.dataset_columns_:
            X[column + 'ـseparate'] = X[column].apply(lambda x: x.replace('|',' '))
            vectorizer = self.vectoryzer_type()
            vectorizer.fit(X[column + 'ـseparate']).toarray()
            self.Vectorizers_.append(vectorizer)
        return self
    
    def transform(self,X:pd.DataFrame) -> pd.DataFrame:
        result_columns = [self.id_col_,self.name_col_]
        result_data = X[[self.id_col_,self.name_col_]].values
        X = X.drop([self.id_col_,self.index_col_,self.name_col_],axis=1)
        for i,column in enumerate(self.dataset_columns_):
            X[column + 'ـseparate'] = X[column].apply(lambda x: x.replace('|',' '))
            vectorizer = self.Vectorizers_[i]
            data_matrix = vectorizer.transform(X[column + 'ـseparate']).toarray()
            result_columns = result_columns + list(vectorizer.get_feature_names_out())
            data_matrix[data_matrix > 1] = 1
            result_data = np.concatenate((result_data,data_matrix),axis=1)
        result_df = pd.DataFrame(data=result_data,columns=result_columns)
        return result_df

    def fit_transform(self, X:pd.DataFrame, y=None, **fit_params) -> pd.DataFrame:
        self.Vectorizers_ = []
        self.id_col_ = fit_params['id']
        self.name_col_ = fit_params['name']
        self.index_col_ = fit_params['index']        
        result_columns = [fit_params['id'],fit_params['name']]
        result_data = X[[fit_params['id'],fit_params['name']]].values
        X = X.drop([fit_params['id'],fit_params['index'],fit_params['name']],axis=1)
        self.dataset_columns_ = X.columns
        for column in self.dataset_columns_:
            X[column + 'ـseparate'] = X[column].apply(lambda x: x.replace('|',' '))
            vectorizer = self.vectoryzer_type()
            data_matrix = vectorizer.fit_transform(X[column + 'ـseparate']).toarray()
            result_columns = result_columns + list(vectorizer.get_feature_names_out())
            data_matrix[data_matrix > 1] = 1
            result_data = np.concatenate((result_data,data_matrix),axis=1)
            self.Vectorizers_.append(vectorizer)
        result_df = pd.DataFrame(data=result_data,columns=result_columns)
        return result_df   

In [10]:
def evaluate_model(model, dataloader, device,num_classes):
    model.eval()
    y_true, y_pred = [], None
    
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            if y_pred is None:
                y_pred = outputs
            else:
                y_pred = torch.vstack((y_pred,outputs))
            y_true.extend(targets.cpu().numpy())
    y_pred = y_pred.cpu().numpy()
    y_pred = np.argmax(y_pred, axis=1)
    y_pred_final = []
    y_true_final = []
    for i in range(len(y_true)):
      y_true_final.append(y_true[i] + 1)
    for i in range(len(y_pred)):
      y_pred_final.append(y_pred[i] + 1)

    # Calculate accuracy
    acc = accuracy_score(y_true_final, y_pred_final)
    
    # calculate f1_score
    f1_score_result = f1_score(y_true_final, y_pred_final, average='macro')

    # Calculate precision
    precision = precision_score(y_true_final, y_pred_final, average='macro', zero_division = 0)

    # calculate recall
    recall = recall_score(y_true_final, y_pred_final, average='macro')
    
    return acc, f1_score_result, precision,recall

In [11]:
class Label_creator(BaseEstimator,TransformerMixin):
    def __init__(self) -> None:
        super().__init__()
        self.interaction_list_ = None
        self.index_col_ = None
        self.mechanism_col_ = None
        self.action_col_ = None

    def fit(self,X:pd.DataFrame,y=None,**fit_params):
        self.index_col_ = fit_params['index']
        self.mechanism_col_= fit_params['mechanism']
        self.action_col_ = fit_params['action']
        self.interaction_list_ = list(set(X[self.mechanism_col_] + ' '+ X[self.action_col_]))

    def transform(self,X:pd.DataFrame) -> pd.DataFrame:
        X = X.drop([self.index_col_],axis=1)
        X['interaction'] = X[self.mechanism_col_] + ' ' + X[self.action_col_]
        X['interaction_numaber'] = list(map(lambda x : self.interaction_list_.index(x),X[self.mechanism_col_] + ' ' + X[self.action_col_]))
        X = X.drop([self.mechanism_col_,self.action_col_],axis=1)
        return X

    def fit_transform(self, X:pd.DataFrame, y=None, **fit_params) -> pd.DataFrame:
        self.index_col_ = fit_params['index']
        self.mechanism_col_= fit_params['mechanism']
        self.action_col_ = fit_params['action']
        self.interaction_list_ = list(set(X[self.mechanism_col_] + ' '+ X[self.action_col_]))
        X = X.drop([self.index_col_],axis=1)
        X['interaction'] = X[self.mechanism_col_] + ' ' + X[self.action_col_]
        X['interaction_numaber'] = list(map(lambda x : self.interaction_list_.index(x),X[self.mechanism_col_] + ' ' + X[self.action_col_]))
        X = X.drop([self.mechanism_col_,self.action_col_],axis=1)
        return X
        
        
        

In [12]:
class DrugDataset(Dataset):

    def __init__(self,X,y,**kwargs) -> None:
        super().__init__(**kwargs)
        self.pair_drugs = []
        self.classes = [item[1] for item in sorted(set(zip(y['interaction_numaber'],y['interaction'])))]
        self.class_to_idx = dict(set(zip(y['interaction'],y['interaction_numaber'])))
        self.data = None
        self.targets = None
        final_X = []
        final_y = []
        for i in range(len(y)):
            first_drug = y.loc[i,'drugA']
            second_drug = y.loc[i,'drugB']
            first_drug_vec = X[X['name']==first_drug].values[0]
            second_drug_vec = X[X['name']== second_drug].values[0]
            final_X.append(np.int32(np.vstack((first_drug_vec[2:],second_drug_vec[2:]))))
            final_y.append(y.loc[i,'interaction_numaber'])
            self.pair_drugs.append((first_drug,second_drug))
            if os.name == 'nt':
                if i%1000 == 0:
                    _ = os.system('cls')
                    print(f'Process: {(i/len(y))*100} %')
            else:
                if i%1000 == 0:
                    _ = os.system('clear')
                    print(f'Process: {(i/len(y))*100} %')
        self.data = np.array(final_X)
        self.data = torch.Tensor(self.data).to(dtype=torch.float32)
        self.targets = torch.Tensor(final_y).to(dtype=torch.int32)

    def __len__(self) -> int:
        return len(self.targets)
    
    def __getitem__(self, index):
        return self.data[index],self.targets[index]

In [13]:
class DatasetWithTransform(Dataset):
    def __init__(self, dataset, transform):
        self.dataset = dataset
        self.transform = transform

    def __getitem__(self, index):
        x, y = self.dataset[index]
        return self.transform(x), y

    def __len__(self):
        return len(self.dataset)

In [14]:
def augment_and_concat(dataset):
    transform = transforms.RandomVerticalFlip(p=1)
    augmented_dataset = DatasetWithTransform(dataset=dataset,transform=transform)
    return ConcatDataset(datasets=[dataset,augmented_dataset])

In [15]:
# Deep learning module

class CPSP(torch.nn.Module):

    def __init__(self,feature_size,in_channels,conv_mid_channel,out_dim,*args, **kwargs) -> None:
        super(CPSP,self).__init__(*args, **kwargs)
        self.conv = torch.nn.Conv2d(in_channels=in_channels, out_channels=conv_mid_channel,kernel_size=(2,3),stride=1)
        self.bn1 = torch.nn.BatchNorm2d(conv_mid_channel)
        self.conv1_1 = torch.nn.Conv2d(in_channels=conv_mid_channel,out_channels=1, kernel_size=1)
        self.bn2 = torch.nn.BatchNorm2d(1)
        self.fc1 = torch.nn.Linear((feature_size-3)//1 + 1, 2048)
        self.fc2 = torch.nn.Linear(2048, 1024)
        self.fc3 = torch.nn.Linear(1024, out_dim)
        self.dropout1 = torch.nn.Dropout(p=0.5)
        self.dropout2 = torch.nn.Dropout(p=0.3)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.conv(x)
        x = torch.nn.functional.selu(x)
        x = self.bn1(x)
        x = self.conv1_1(x)
        x = torch.nn.functional.selu(x)
        x = self.bn2(x)
        x = x.view(-1,int(x.nelement() / x.shape[0]))
        x = self.dropout1(x)
        x = self.fc1(x)
        x = torch.nn.functional.selu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        x = torch.nn.functional.selu(x)
        x = self.fc3(x)
        return x

# dataset 1

In [21]:
with open('./DS1/df.pkl','rb') as f:
    dataset = pickle.load(file=f)
dataset

Unnamed: 0,index,id,side,target,enzyme,pathway,smile,name
0,0,DB01296,C1096328|C0162830|C1611725|C0541767|C0242973|C...,P14780|Q00653|P01375|P01579|P33673,P33261|P05181,hsa:4318|hsa:4791|hsa:7124|hsa:3458,9|10|14|18|19|20|178|181|283|284|285|286|299|3...,Glucosamine
1,1,DB09230,C0015371|C0949040|C0856054|C0231926|C1608969|C...,Q02641,P08684,hsa:782,9|10|11|12|13|14|15|16|18|19|20|129|131|132|17...,Azelnidipine
2,2,DB05812,C0341697|C0035232|C0855476|C0162119|C1142166|C...,P05093,P08684|Q06520|P10635|P10632|P05177|P33261|P11712,hsa:1586,9|10|11|12|14|18|143|147|178|179|182|183|184|1...,Abiraterone
3,3,DB01195,C1737214|C0015376|C0576091|C1536116|C0679254|C...,Q14524|P35499|Q12809,P10635|P11712,hsa:6331|hsa:6329|hsa:3757,9|10|11|12|14|15|18|19|23|24|25|178|180|181|18...,Flecainide
4,4,DB00201,C0423602|C0239557|C0031924|C0947912|C0600125|C...,P30542|P29274|Q07343|P21817|BE0004922|P78527|O...,P20815|P05177|P24462|P08684|P05181|P10632|P117...,hsa:134|hsa:135|hsa:5142|hsa:6261|hsa:5591|hsa...,9|10|11|14|15|16|18|19|143|148|149|178|183|184...,Caffeine
...,...,...,...,...,...,...,...,...
567,567,DB01587,C0155867|C0341697|C0231341|C0853557|C0159060|C...,P30536|P14867|P18505|Q8N1C3|O14764|P78334,P08684,hsa:706|hsa:2554|hsa:2560|hsa:2565|hsa:2563|hs...,9|10|11|12|14|15|18|19|37|178|182|183|184|185|...,Ketazolam
568,568,DB00448,C0241148|C0040440|C1096403|C0011253|C0001416|C...,P20648|P10636,P33261|P11712|P08684|P04798|P05177|Q16678|P332...,hsa:495|hsa:4137,9|10|11|12|14|15|18|19|23|24|33|143|148|149|17...,Lansoprazole
569,569,DB00559,C0030283|C0856054|C0853557|C0426597|C0026636|C...,P25101|P24530,P08684|P11712,hsa:1909|hsa:1910,9|10|11|12|14|15|16|18|19|20|33|178|182|183|18...,Bosentan
570,570,DB04953,C0238097|C1095952|C0558401|C0341217|C0154446|C...,O43526|O43525|P56696|Q9NR82,P22309|P35503|P22310|O60656|P11509|P11245,hsa:3785|hsa:3786|hsa:9132|hsa:56479,9|10|11|12|14|15|18|19|23|178|182|185|189|283|...,Ezogabine


In [22]:
vec = VectorizerApply(CountVectorizer)
X = vec.fit_transform(dataset,id='id',index='index',name='name')
X

Unnamed: 0,id,name,c0000727,c0000729,c0000733,c0000734,c0000735,c0000772,c0000809,c0000810,...,840,842,845,847,860,861,863,866,93,95
0,DB01296,Glucosamine,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,DB09230,Azelnidipine,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,DB05812,Abiraterone,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,DB01195,Flecainide,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,DB00201,Caffeine,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,DB01587,Ketazolam,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
568,DB00448,Lansoprazole,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
569,DB00559,Bosentan,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
570,DB04953,Ezogabine,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
connection = sqlite3.connect('./DS1/event.db')
extraction = pd.read_sql('select * from extraction;', connection)
extraction

Unnamed: 0,index,mechanism,action,drugA,drugB
0,0,The risk or severity of adverse effects,increase,Abemaciclib,Amiodarone
1,1,The serum concentration,decrease,Abemaciclib,Apalutamide
2,2,The serum concentration,increase,Abemaciclib,Aprepitant
3,3,The metabolism,decrease,Abemaciclib,Atomoxetine
4,4,The metabolism,decrease,Abemaciclib,Bortezomib
...,...,...,...,...,...
37259,37259,The serum concentration,increase,Nefazodone,Netupitant
37260,37260,The metabolism,decrease,Nefazodone,Nicardipine
37261,37261,The serum concentration,increase,Neratinib,Netupitant
37262,37262,The serum concentration,increase,Netupitant,Nicardipine


In [24]:
label_creator = Label_creator()
y = label_creator.fit_transform(extraction,index='index',mechanism='mechanism',action='action')
y

Unnamed: 0,drugA,drugB,interaction,interaction_numaber
0,Abemaciclib,Amiodarone,The risk or severity of adverse effects increase,38
1,Abemaciclib,Apalutamide,The serum concentration decrease,18
2,Abemaciclib,Aprepitant,The serum concentration increase,10
3,Abemaciclib,Atomoxetine,The metabolism decrease,13
4,Abemaciclib,Bortezomib,The metabolism decrease,13
...,...,...,...,...
37259,Nefazodone,Netupitant,The serum concentration increase,10
37260,Nefazodone,Nicardipine,The metabolism decrease,13
37261,Neratinib,Netupitant,The serum concentration increase,10
37262,Netupitant,Nicardipine,The serum concentration increase,10


In [25]:
drug_dataset = DrugDataset(X,y)

Process: 0.0 %
Process: 2.6835551738943755 %
Process: 5.367110347788751 %
Process: 8.050665521683126 %
Process: 10.734220695577502 %
Process: 13.417775869471877 %
Process: 16.10133104336625 %
Process: 18.784886217260627 %
Process: 21.468441391155004 %
Process: 24.151996565049377 %
Process: 26.835551738943753 %
Process: 29.51910691283813 %
Process: 32.2026620867325 %
Process: 34.88621726062688 %
Process: 37.569772434521255 %
Process: 40.25332760841563 %
Process: 42.93688278231001 %
Process: 45.62043795620438 %
Process: 48.30399313009875 %
Process: 50.98754830399314 %
Process: 53.671103477887506 %
Process: 56.35465865178189 %
Process: 59.03821382567626 %
Process: 61.721768999570635 %
Process: 64.405324173465 %
Process: 67.08887934735938 %
Process: 69.77243452125376 %
Process: 72.45598969514813 %
Process: 75.13954486904251 %
Process: 77.82310004293689 %
Process: 80.50665521683126 %
Process: 83.19021039072564 %
Process: 85.87376556462002 %
Process: 88.55732073851439 %
Process: 91.240875912

In [26]:
# save dataset
if not os.path.exists('./saved_dataset/'):
    os.makedirs('./saved_dataset/')

with open('./saved_dataset/drug_dataset.pickle','wb') as f:
    pickle.dump(drug_dataset,f)

In [27]:
# split dataset to train and validation section
train_size = int(0.8 * len(drug_dataset))
val_size = len(drug_dataset) - train_size

train_dataset , val_dataset = random_split(drug_dataset,lengths=[train_size,val_size])


In [28]:
train_dataset = augment_and_concat(train_dataset)

In [29]:
len(train_dataset)

59622

In [30]:
# create data loader for train loader and val loader

train_loader = DataLoader(train_dataset,batch_size=16,shuffle=True)

val_loader = DataLoader(val_dataset,batch_size=16)

In [41]:
# create model and train loop
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = CPSP(12897,1,3,65).to(device=device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.0001,momentum=0.9)

N_epochs = 100
patience = 5
save_path = './saved_dataset/model1_f3.pt'
best_val_los = np.inf

for epoch in range(N_epochs):
    # Training
    train_loss = 0.0
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.type(torch.LongTensor).to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    
    # Validation
    val_loss = 0.0
    model.eval()
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.type(torch.LongTensor).to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        val_loss += loss.item()
    if val_loss >= best_val_los:
        print("Epoch: {} Train Loss: {} Val Loss: {}".format(epoch,
                                                         train_loss/len(train_loader),
                                                         val_loss/len(val_loader)))
        patience -= 1
        if patience <= 0:
            break
    else:
        patience = 5
        best_val_los = val_loss
        print("Epoch: {} Train Loss: {} Val Loss: {}".format(epoch,
                                                         train_loss/len(train_loader),
                                                         val_loss/len(val_loader)))
        print(f'saving the best model in: {save_path} ...')
        torch.save(model.state_dict(), save_path)
    
    
    

Epoch: 0 Train Loss: 1.390258842230099 Val Loss: 0.863194686692416
saving the best model in: ./saved_dataset/model1_f3.pt ...
Epoch: 1 Train Loss: 0.8642917642900123 Val Loss: 0.7117033369615354
saving the best model in: ./saved_dataset/model1_f3.pt ...
Epoch: 2 Train Loss: 0.7376351102902737 Val Loss: 0.6333759513247934
saving the best model in: ./saved_dataset/model1_f3.pt ...
Epoch: 3 Train Loss: 0.6644144560956814 Val Loss: 0.584032648414885
saving the best model in: ./saved_dataset/model1_f3.pt ...
Epoch: 4 Train Loss: 0.6105734994132717 Val Loss: 0.5484155963290914
saving the best model in: ./saved_dataset/model1_f3.pt ...
Epoch: 5 Train Loss: 0.5664535027196237 Val Loss: 0.5037418200030859
saving the best model in: ./saved_dataset/model1_f3.pt ...
Epoch: 6 Train Loss: 0.5235290823637767 Val Loss: 0.48228439516008154
saving the best model in: ./saved_dataset/model1_f3.pt ...
Epoch: 7 Train Loss: 0.48050168516658603 Val Loss: 0.4453102996484916
saving the best model in: ./saved_da

In [46]:
# load best model
model_dict = torch.load(save_path)
model = CPSP(12897,1,3,65).to(device=device)
model.load_state_dict(model_dict)

<All keys matched successfully>

In [47]:
# calculate performance mesure for train data
acc, f_score, precision,recall = evaluate_model(model,train_loader,device,65)
print("train acc: {} \ntrain f_score: {} \ntrain precision: {} \ntrain recall: {}".format(acc, f_score, precision,recall))

train acc: 0.9991613833819731 
train f_score: 0.9986538397350785 
train precision: 0.9977454465192896 
train recall: 0.9996218329089509


In [48]:
# calculate performance mesure for validation data
acc, f_score, precision,recall = evaluate_model(model,val_loader,device,65)
print("val acc: {} \nval f_score: {} \nval precision: {} \nval recall: {}".format(acc, f_score, precision,recall))

val acc: 0.92539916812022 
val f_score: 0.8160281499407529 
val precision: 0.8176468898019431 
val recall: 0.8228101951409763


  _warn_prf(average, modifier, msg_start, len(result))


In [166]:
# save train_loader and val_loader
with open('./saved_dataset/train_loader1.pickle','wb') as f:
    pickle.dump(train_loader,f)
with open('./saved_dataset/val_loader1.pickle','wb') as f:
    pickle.dump(val_loader,f)

# dataset 2

In [49]:
dataset = pd.read_csv('./DS2/drug_information_1258.csv')
dataset

Unnamed: 0,index,id,target,enzyme,smile,name
0,0,DB00006,P00734,P05164,1|41|79|80|108|117|140|143|173|193|197|242|269...,Bivalirudin
1,1,DB00035,P30518|P37288|P47901,P23219|P35354,1|53|80|115|117|140|143|173|193|197|242|253|30...,Desmopressin
2,2,DB00091,P49069|Q96LZ3|P62937|P30405,P20815|P08684|P33261|P10635,1|5|19|38|47|80|101|115|126|132|186|208|219|22...,Cyclosporine
3,3,DB00115,Q99707|P22033|Q9UBK8|Q8IVH4|Q9Y4U1|P42898,Q96EY8|Q05599,1|35|41|45|49|75|80|84|106|140|188|192|194|197...,Cyanocobalamin
4,4,DB00118,Q14749|P17707|P31153|P35520|Q00266|P21964|Q8N1...,P05181|P19623,1|75|80|194|209|348|362|378|454|457|489|577|61...,Ademetionine
...,...,...,...,...,...,...
1253,1253,DB15444,P13569,P08684|P20815|P05177|P20813|P10632|P11712|P332...,40|45|59|67|73|80|92|114|119|130|202|295|307|3...,Elexacaftor
1254,1254,DB15488,Q92847|P10275,P21964,13|25|31|80|104|170|176|193|222|310|315|489|55...,Echinacoside
1255,1255,DB15566,P04150,P08684,51|80|84|147|182|240|314|315|373|408|493|504|5...,Prednisoloneacetate
1256,1256,DB15598,P05106|P49281,P19224,166|253|271|314|650|656|677|715|751|787|835|84...,Ferricmaltol


In [50]:
vec = VectorizerApply(CountVectorizer)
X = vec.fit_transform(dataset,id='id',index='index',name='name')
X

Unnamed: 0,id,name,a0a024r8i1,a0a0e1r3h3,a0a0h2xj39,a0a0t9az62,a0a143zzk9,a0a144a2g5,a2qlk4,a5x5y0,...,99,990,991,992,993,994,996,997,998,999
0,DB00006,Bivalirudin,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,DB00035,Desmopressin,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,DB00091,Cyclosporine,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,DB00115,Cyanocobalamin,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,DB00118,Ademetionine,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253,DB15444,Elexacaftor,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1254,DB15488,Echinacoside,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1255,DB15566,Prednisoloneacetate,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1256,DB15598,Ferricmaltol,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
extraction = pd.read_csv('./DS2/drug_interaction.csv')
extraction = extraction.drop('Unnamed: 0',axis=1)
extraction

Unnamed: 0,index,mechanism,action,drugA,drugB
0,0,the anticoagulant activities,increase,Apixaban,Bivalirudin
1,1,the anticoagulant activities,increase,Dabigatranetexilate,Bivalirudin
2,2,The risk or severity of bleeding and hemorrhage,increase,Dasatinib,Bivalirudin
3,4,the anticoagulant activities,increase,Bivalirudin,Rivaroxaban
4,6,The risk or severity of bleeding and hemorrhage,increase,Tipranavir,Bivalirudin
...,...,...,...,...,...
161765,323530,The metabolism,decrease,Zanubrutinib,Curcuminsulfate
161766,323531,The metabolism,increase,Elexacaftor,Betamethasonephosphate
161767,323535,The serum concentration,increase,Ubrogepant,Ripretinib
161768,323536,The metabolism,decrease,Avapritinib,Voxelotor


In [52]:
label_creator = Label_creator()
y = label_creator.fit_transform(extraction,index='index',mechanism='mechanism',action='action')
y

Unnamed: 0,drugA,drugB,interaction,interaction_numaber
0,Apixaban,Bivalirudin,the anticoagulant activities increase,92
1,Dabigatranetexilate,Bivalirudin,the anticoagulant activities increase,92
2,Dasatinib,Bivalirudin,The risk or severity of bleeding and hemorrhag...,68
3,Bivalirudin,Rivaroxaban,the anticoagulant activities increase,92
4,Tipranavir,Bivalirudin,The risk or severity of bleeding and hemorrhag...,68
...,...,...,...,...
161765,Zanubrutinib,Curcuminsulfate,The metabolism decrease,3
161766,Elexacaftor,Betamethasonephosphate,The metabolism increase,47
161767,Ubrogepant,Ripretinib,The serum concentration increase,2
161768,Avapritinib,Voxelotor,The metabolism decrease,3


In [171]:
drug_dataset = DrugDataset(X,y)

Process: 0.0 %
Process: 0.6181615874389566 %
Process: 1.236323174877913 %
Process: 1.8544847623168697 %
Process: 2.472646349755826 %
Process: 3.090807937194783 %
Process: 3.7089695246337393 %
Process: 4.327131112072696 %
Process: 4.945292699511652 %
Process: 5.563454286950609 %
Process: 6.181615874389566 %
Process: 6.799777461828523 %
Process: 7.417939049267479 %
Process: 8.036100636706434 %
Process: 8.654262224145391 %
Process: 9.272423811584348 %
Process: 9.890585399023305 %
Process: 10.508746986462262 %
Process: 11.126908573901218 %
Process: 11.745070161340173 %
Process: 12.363231748779132 %
Process: 12.981393336218087 %
Process: 13.599554923657045 %
Process: 14.217716511096 %
Process: 14.835878098534957 %
Process: 15.454039685973914 %
Process: 16.07220127341287 %
Process: 16.690362860851828 %
Process: 17.308524448290783 %
Process: 17.926686035729738 %
Process: 18.544847623168696 %
Process: 19.163009210607655 %
Process: 19.78117079804661 %
Process: 20.399332385485565 %
Process: 21.0

In [172]:
# save dataset
if not os.path.exists('./saved_dataset/'):
    os.makedirs('./saved_dataset/')

with open('./saved_dataset/drug_dataset2.pickle','wb') as f:
    pickle.dump(drug_dataset,f)

In [54]:
# split dataset to train and validation section
train_size = int(0.8 * len(drug_dataset))
val_size = len(drug_dataset) - train_size

train_dataset , val_dataset = random_split(drug_dataset,lengths=[train_size,val_size])

In [55]:
train_dataset = augment_and_concat(train_dataset)

In [56]:
len(train_dataset)

258832

In [57]:
# create data loader for train loader and val loader

train_loader = DataLoader(train_dataset,batch_size=16,shuffle=True)

val_loader = DataLoader(val_dataset,batch_size=16)

In [91]:
# create model and train loop
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = CPSP(3997,1,3,100).to(device=device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.0001,momentum=0.9)

N_epochs = 100
patience = 7
save_path = './saved_dataset/model2_f.pt'
best_val_los = np.inf

for epoch in range(N_epochs):
    # Training
    train_loss = 0.0
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.type(torch.LongTensor).to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    
    # Validation
    val_loss = 0.0
    model.eval()
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.type(torch.LongTensor).to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        val_loss += loss.item()
    if val_loss >= best_val_los:
        print("Epoch: {} Train Loss: {} Val Loss: {}".format(epoch,
                                                         train_loss/len(train_loader),
                                                         val_loss/len(val_loader)))
        patience -= 1
        if patience <= 0:
            break
    else:
        patience = 7
        best_val_los = val_loss
        print("Epoch: {} Train Loss: {} Val Loss: {}".format(epoch,
                                                         train_loss/len(train_loader),
                                                         val_loss/len(val_loader)))
        print(f'saving the best model in: {save_path} ...')
        torch.save(model.state_dict(), save_path)
    
    
    

Epoch: 0 Train Loss: 0.23859053163446609 Val Loss: 0.444650853078631
Epoch: 1 Train Loss: 0.23144505278178518 Val Loss: 0.47736695995553174
Epoch: 2 Train Loss: 0.22572299693871437 Val Loss: 0.4603695822769184
Epoch: 3 Train Loss: 0.2193316174973876 Val Loss: 0.45163489135766693
Epoch: 4 Train Loss: 0.2151270854184582 Val Loss: 0.46552097648326946
Epoch: 5 Train Loss: 0.20661479761026252 Val Loss: 0.4534956604590823
Epoch: 6 Train Loss: 0.20350696123962542 Val Loss: 0.45233616900030205


In [96]:
# load best model
model_dict = torch.load(save_path)
model = CPSP(3997,1,3,100).to(device=device)
model.load_state_dict(model_dict)

<All keys matched successfully>

In [97]:
# calculate performance mesure for train data
acc, f_score, precision,recall = evaluate_model(model,train_loader,device,100)
print("train acc: {} \ntrain f_score: {} \ntrain precision: {} \ntrain recall: {}".format(acc, f_score, precision,recall))

train acc: 0.9796161216542004 
train f_score: 0.9843218446108337 
train precision: 0.9739868137305093 
train recall: 0.995499106184536


In [98]:
# calculate performance mesure for validation data
acc, f_score, precision,recall = evaluate_model(model,val_loader,device,100)
print("val acc: {} \nval f_score: {} \nval precision: {} \nval recall: {}".format(acc, f_score, precision,recall))

val acc: 0.8687024788279656 
val f_score: 0.828722719277672 
val precision: 0.828177835600361 
val recall: 0.8506997108877142


In [23]:
# save train_loader and val_loader
with open('./saved_dataset/train_loader2.pickle','wb') as f:
    pickle.dump(train_loader,f)
with open('./saved_dataset/val_loader2.pickle','wb') as f:
    pickle.dump(val_loader,f)

# with adam and max pooling 

# Deep learning module

class CPSP1(torch.nn.Module):

    def __init__(self,feature_size,in_channels,conv_mid_channel,out_dim,*args, **kwargs) -> None:
        super(CPSP,self).__init__(*args, **kwargs)
        self.conv = torch.nn.Conv2d(in_channels=in_channels, out_channels=conv_mid_channel,kernel_size=(2,3),stride=1)
        self.bn1 = torch.nn.BatchNorm2d(conv_mid_channel)
        self.conv1_1 = torch.nn.Conv2d(in_channels=conv_mid_channel,out_channels=1, kernel_size=1)
        self.bn2 = torch.nn.BatchNorm2d(1)
        self.fc1 = torch.nn.Linear((feature_size-3)//1 + 1, 2048)
        self.fc2 = torch.nn.Linear(2048, 1024)
        self.fc3 = torch.nn.Linear(1024, out_dim)
        self.dropout1 = torch.nn.Dropout(p=0.5)
        self.dropout2 = torch.nn.Dropout(p=0.3)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.conv(x)
        x = torch.nn.functional.selu(x)
        x = self.bn1(x)
        x = self.conv1_1(x)
        x = torch.nn.functional.selu(x)
        x = torch.nn.functional.
        x = self.bn2(x)
        x = x.view(-1,int(x.nelement() / x.shape[0]))
        x = self.dropout1(x)
        x = self.fc1(x)
        x = torch.nn.functional.selu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        x = torch.nn.functional.selu(x)
        x = self.fc3(x)
        return x