In [1]:
!pip install torch torchvision --quiet
!pip install transformers  --quiet
!pip install pandas  --quiet
!pip install numpy  --quiet
!pip install sentencepiece  --quiet
!pip install sentence-splitter  --quiet
!pip install shap --quiet
!pip install optuna --quiet

In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW,AutoModelForQuestionAnswering, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DebertaTokenizer, DebertaModel, BartTokenizer
import math

# **Model loading**

In [4]:
# Use a GPU if you have one available (Runtime -> Change runtime type -> GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Set seeds for reproducibility
random.seed(26)
np.random.seed(26)
torch.manual_seed(26)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


cuda


In [5]:
mapping = {"positive" : 0, "negative" : 1, "relate" : 2, "NA" : 3}


In [6]:
reverse = {v: k for k, v in mapping.items()}


In [7]:
def get_optimizer_grouped_parameters(
    model, model_type, 
    learning_rate, weight_decay, 
    layerwise_learning_rate_decay
):
    no_decay = ["bias", "LayerNorm.weight"]
    # initialize lr for task specific layer
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if "classifier" in n or "pooler" in n],
            "weight_decay": 0.0,
            "lr": learning_rate,
        },
    ]
    # initialize lrs for every layer
    num_layers = model.config.num_hidden_layers
    layers = [getattr(model, model_type).embeddings] + list(getattr(model, model_type).encoder.layer)
    layers.reverse()
    lr = learning_rate
    for layer in layers:
        lr *= layerwise_learning_rate_decay
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": weight_decay,
                "lr": lr,
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
    return optimizer_grouped_parameters

In [8]:
def encode_data(tokenizer, passages,questions, max_length):
    """Encode the question/passage pairs into features than can be fed to the model."""
    input_ids = []
    attention_masks = []

    for passage,question in zip(passages,questions):
        encoded_data = tokenizer.encode_plus(passage,question, max_length=max_length, pad_to_max_length=True, truncation='longest_first')
        encoded_pair = encoded_data["input_ids"]
        attention_mask = encoded_data["attention_mask"]

        input_ids.append(encoded_pair)
        attention_masks.append(attention_mask)

    return np.array(input_ids), np.array(attention_masks)

In [9]:
def predict(passage,question):
  sequence = tokenizer.encode_plus(passage,question, return_tensors="pt")['input_ids'].to(device)
  
  logits = model(sequence)[0]
  probabilities = torch.softmax(logits, dim=1).detach().cpu().tolist()[0]
  index = probabilities.index(max(probabilities))
  return reverse[index]

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



# Train and evaluate the accuracy of neural network with the addition of pruning mechanism
def train_and_evaluate(model,train_data_df,dev_data_df):
    passages_train = train_data_df.EVIDENCE.values
    questions_train = train_data_df.QUESTIONS.values
    answers_train = train_data_df.RELATION.values.astype(int)

    passages_dev = dev_data_df.EVIDENCE.values
    questions_dev = dev_data_df.QUESTIONS.values
    answers_dev = dev_data_df.RELATION.values.astype(int)

    # Encoding data
    max_seq_length = 512
    input_ids_train, attention_masks_train = encode_data(tokenizer, passages_train,questions_train, max_seq_length)
    input_ids_dev, attention_masks_dev = encode_data(tokenizer, passages_dev,questions_dev, max_seq_length)

    train_features = (input_ids_train, attention_masks_train, answers_train)
    dev_features = (input_ids_dev, attention_masks_dev, answers_dev)

    batch_size = 2
    train_features_tensors = [torch.tensor(feature, dtype=torch.long) for feature in train_features]
    dev_features_tensors = [torch.tensor(feature, dtype=torch.long) for feature in dev_features]

    train_dataset = TensorDataset(*train_features_tensors)
    dev_dataset = TensorDataset(*dev_features_tensors)

    train_sampler = RandomSampler(train_dataset)
    dev_sampler = SequentialSampler(dev_dataset)

    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
    dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=batch_size)

    learning_rate = 5e-5
    layerwise_learning_rate_decay = 0.9
    weight_decay = 0.01
    adam_epsilon = 1e-6
    use_bertadam = False
    # scheduler params
    num_epochs = 7
    num_warmup_steps = 0
    _model_type = 'bert'

    grouped_optimizer_params = get_optimizer_grouped_parameters(
        model, _model_type, 
        learning_rate, weight_decay, 
        layerwise_learning_rate_decay
    )
    optimizer = AdamW(
        grouped_optimizer_params,
        lr=learning_rate,
        eps=adam_epsilon,
        correct_bias=not use_bertadam
    )
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_epochs
    )

    (learning_rates1, learning_rates2, learning_rates3, learning_rates4,
    learning_rates5, learning_rates6, learning_rates7, learning_rates8,
    learning_rates9, learning_rates10, learning_rates11, learning_rates12, 
    learning_rates13, learning_rates14) = [[] for i in range(14)]

    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
    learning_rates3.append(optimizer.param_groups[4]["lr"])
    learning_rates4.append(optimizer.param_groups[6]["lr"])
    learning_rates5.append(optimizer.param_groups[8]["lr"])
    learning_rates6.append(optimizer.param_groups[10]["lr"])
    learning_rates7.append(optimizer.param_groups[12]["lr"])
    learning_rates8.append(optimizer.param_groups[14]["lr"])
    learning_rates9.append(optimizer.param_groups[16]["lr"])
    learning_rates10.append(optimizer.param_groups[18]["lr"])
    learning_rates11.append(optimizer.param_groups[20]["lr"])
    learning_rates12.append(optimizer.param_groups[22]["lr"])
    learning_rates13.append(optimizer.param_groups[24]["lr"])
    learning_rates14.append(optimizer.param_groups[26]["lr"])

    for epoch in range(num_epochs):
        optimizer.step()
        scheduler.step()
        learning_rates1.append(optimizer.param_groups[0]["lr"])
        learning_rates2.append(optimizer.param_groups[2]["lr"])
        learning_rates3.append(optimizer.param_groups[4]["lr"])
        learning_rates4.append(optimizer.param_groups[6]["lr"])
        learning_rates5.append(optimizer.param_groups[8]["lr"])
        learning_rates6.append(optimizer.param_groups[10]["lr"])
        learning_rates7.append(optimizer.param_groups[12]["lr"])
        learning_rates8.append(optimizer.param_groups[14]["lr"])
        learning_rates9.append(optimizer.param_groups[16]["lr"])
        learning_rates10.append(optimizer.param_groups[18]["lr"])
        learning_rates11.append(optimizer.param_groups[20]["lr"])
        learning_rates12.append(optimizer.param_groups[22]["lr"])
        learning_rates13.append(optimizer.param_groups[24]["lr"])
        learning_rates14.append(optimizer.param_groups[26]["lr"])

    print("Done setting up optimizer\n")
    train_loss_values = []
    dev_acc_values = []
    state=[]

    for i in tqdm(range(num_epochs), desc="Epoch"):

      # Training
      print("In epoch ", i, "\n")
      epoch_train_loss = 0 # Cumulative loss
      model.train()
      #model.zero_grad()

      for step, batch in enumerate(train_dataloader):

          input_ids = batch[0].to(device)
          attention_masks = batch[1].to(device)
          labels = batch[2].to(device)     
          model.zero_grad() 
          outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels)

          loss = outputs[0]
          #loss = loss / grad_acc_steps
          epoch_train_loss += loss.item()

          loss.backward()
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          optimizer.step()
          scheduler.step()

      epoch_train_loss = epoch_train_loss / len(train_dataloader)          
      train_loss_values.append(epoch_train_loss)
      print("Epoch loss is", epoch_train_loss)
      
      # Evaluation
      epoch_dev_accuracy = 0 # Cumulative accuracy
      model.eval()

      for batch in dev_dataloader:
        
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2]
                    
        with torch.no_grad():        
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
                        
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        
        predictions = np.argmax(logits, axis=1).flatten()
        labels = labels.numpy().flatten()
        
        epoch_dev_accuracy += np.sum(predictions == labels) / len(labels)

      epoch_dev_accuracy = epoch_dev_accuracy / len(dev_dataloader)
      print("Epoch accuracy is",epoch_dev_accuracy )
      dev_acc_values.append(epoch_dev_accuracy)            
      temp=model
      state.append(temp)

    index = dev_acc_values.index(max(dev_acc_values))
    best = state[index] 
    return best
  


In [12]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import pandas as pd
import gc 
df=pd.read_csv('../data/gut_microbiota-disease1.csv')
df = df[['MICROBE', 'DISEASE', 'EVIDENCE','RELATION','QUESTIONS']]
df.RELATION = df.RELATION.fillna('NA')
for i in range(len(df)):
  df['RELATION'][i] = str(df['RELATION'][i])
  df['RELATION'][i] = mapping[df['RELATION'][i]]

In [13]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)
CV_accuracy_array=[]
CV_macro_avg_array=[]
CV_weighted_avg_array=[]
CV_precision_macro_array = []
CV_recall_macro_array = []
CV_precision_weighted_array = []
CV_recall_weighted_array = []

for train_index, test_index in kf.split(df):
    print("TRAIN:", train_index) 
    print("TEST:", test_index)
    train_data_df, dev_data_df = df.loc[train_index], df.loc[test_index]
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 4)
    model.to(device)
    model=train_and_evaluate(model,train_data_df,dev_data_df)
    filter=list(dev_data_df.index)
    preds=[]
    index=[]
    for i in filter:
        try:
          passage=dev_data_df.EVIDENCE[i]
          question =dev_data_df.QUESTIONS[i]
          answer=predict(passage,question)
          preds.append(answer)
          index.append(i)
        except Exception as e:
          print(e)
          continue
    
    for i in index:
        dev_data_df['RELATION'][i] = reverse[dev_data_df['RELATION'][i]]
    
    dev=dev_data_df[dev_data_df.index.isin(index)]
    true_results=dev['RELATION'].tolist()
    print(classification_report(true_results, preds))
    results = classification_report(true_results, preds,output_dict=True,)

    CV_accuracy_array.append(results['accuracy'])
    CV_macro_avg_array.append(results['macro avg']['f1-score'])
    CV_precision_macro_array.append(results['macro avg']['precision'])
    CV_recall_macro_array.append(results['macro avg']['recall'])
    CV_weighted_avg_array.append(results['weighted avg']['f1-score'])
    CV_precision_weighted_array.append(results['weighted avg']['precision'])
    CV_recall_weighted_array.append(results['weighted avg']['recall'])
    #model.save_pretrained(f'bertBaseModel_iter1')
    del model
    gc.collect() 
    torch.cuda.empty_cache()
    



TRAIN: [   0    1    4    5    6    7    8    9   11   12   13   14   15   16
   17   18   19   20   21   22   24   26   27   28   29   32   33   34
   35   36   37   38   40   41   42   43   45   46   47   48   49   50
   51   52   53   56   57   58   61   62   64   65   68   69   71   73
   74   75   77   78   79   80   81   82   83   84   85   87   89   90
   91   93   94   95   97   98   99  102  103  104  105  108  110  111
  112  114  115  116  117  118  119  121  122  123  124  125  126  127
  128  129  130  131  132  133  134  135  137  138  140  142  143  144
  145  146  147  148  149  150  151  152  153  154  155  156  157  159
  160  161  162  163  164  165  166  167  169  170  171  172  173  175
  176  177  178  179  180  181  182  183  185  186  187  188  189  190
  191  193  194  195  196  197  200  201  202  203  204  205  206  207
  211  212  214  215  216  217  218  219  220  221  222  223  224  225
  226  227  228  229  230  232  233  234  235  236  238  239  240  241

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 0.9572103346626346
Epoch accuracy is 0.5096153846153846
In epoch  1 

Epoch loss is 0.9117970027929494
Epoch accuracy is 0.6490384615384616
In epoch  2 

Epoch loss is 0.796957797179997
Epoch accuracy is 0.6538461538461539
In epoch  3 

Epoch loss is 0.6499252703940258
Epoch accuracy is 0.7451923076923077
In epoch  4 

Epoch loss is 0.5515222771363579
Epoch accuracy is 0.7067307692307693
In epoch  5 

Epoch loss is 0.363192930869935
Epoch accuracy is 0.7596153846153846
In epoch  6 

Epoch loss is 0.3785398607820193
Epoch accuracy is 0.8076923076923077


Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors


The expanded size of the tensor (624) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 624].  Tensor sizes: [1, 512]
The expanded size of the tensor (558) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 558].  Tensor sizes: [1, 512]
The expanded size of the tensor (818) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 818].  Tensor sizes: [1, 512]
The expanded size of the tensor (823) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 823].  Tensor sizes: [1, 512]
The expanded size of the tensor (824) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 824].  Tensor sizes: [1, 512]
              precision    recall  f1-score   support

          NA       1.00      1.00      1.00         3
    negative       0.80      0.81      0.81        95
    positive       0.80      0.80      0.80        90
      relate       0.93

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 0.9605283469964678
Epoch accuracy is 0.6826923076923077
In epoch  1 

Epoch loss is 0.8884854905481916
Epoch accuracy is 0.7019230769230769
In epoch  2 

Epoch loss is 0.8226078496176902
Epoch accuracy is 0.7451923076923077
In epoch  3 

Epoch loss is 0.7381789143529693
Epoch accuracy is 0.7355769230769231
In epoch  4 

Epoch loss is 0.6658625897721672
Epoch accuracy is 0.7692307692307693
In epoch  5 

Epoch loss is 0.594644839908002
Epoch accuracy is 0.7403846153846154
In epoch  6 

Epoch loss is 0.5011066633635962
Epoch accuracy is 0.8028846153846154
The expanded size of the tensor (623) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 623].  Tensor sizes: [1, 512]
The expanded size of the tensor (624) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 624].  Tensor sizes: [1, 512]
The expanded size of the tensor (559) must match the existing size (512) at non-singleton dimension 1.  Targe

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 0.9929504599064015
Epoch accuracy is 0.5961538461538461
In epoch  1 

Epoch loss is 0.8387781669846915
Epoch accuracy is 0.7163461538461539
In epoch  2 

Epoch loss is 0.7718295209398582
Epoch accuracy is 0.6971153846153846
In epoch  3 

Epoch loss is 0.6084330398891604
Epoch accuracy is 0.7355769230769231
In epoch  4 

Epoch loss is 0.5382046064136935
Epoch accuracy is 0.7596153846153846
In epoch  5 

Epoch loss is 0.431065125636171
Epoch accuracy is 0.7788461538461539
In epoch  6 

Epoch loss is 0.30535378229902005
Epoch accuracy is 0.7884615384615384
              precision    recall  f1-score   support

          NA       1.00      1.00      1.00         1
    negative       0.74      0.86      0.79        94
    positive       0.84      0.72      0.78       105
      relate       0.86      0.75      0.80         8

    accuracy                           0.79       208
   macro avg       0.86      0.83      0.84       208
weighted avg       0.80      0.7

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 0.9779646263027993
Epoch accuracy is 0.6778846153846154
In epoch  1 

Epoch loss is 0.8199133214679583
Epoch accuracy is 0.7307692307692307
In epoch  2 

Epoch loss is 0.7220855858059412
Epoch accuracy is 0.7644230769230769
In epoch  3 

Epoch loss is 0.5566045689191434
Epoch accuracy is 0.7692307692307693
In epoch  4 

Epoch loss is 0.49726030875521804
Epoch accuracy is 0.7836538461538461
In epoch  5 

Epoch loss is 0.3349928941413684
Epoch accuracy is 0.8028846153846154
In epoch  6 

Epoch loss is 0.2831442648867568
Epoch accuracy is 0.7932692307692307
The expanded size of the tensor (623) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 623].  Tensor sizes: [1, 512]
              precision    recall  f1-score   support

          NA       1.00      1.00      1.00         2
    negative       0.80      0.77      0.78        91
    positive       0.78      0.82      0.80       103
      relate       0.89      0.73      0.8

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 0.9481553157361654
Epoch accuracy is 0.5961538461538461
In epoch  1 

Epoch loss is 0.9086024938117659
Epoch accuracy is 0.7163461538461539
In epoch  2 

Epoch loss is 0.7398231750015
Epoch accuracy is 0.7451923076923077
In epoch  3 

Epoch loss is 0.617860942651952
Epoch accuracy is 0.7451923076923077
In epoch  4 

Epoch loss is 0.5115918550765435
Epoch accuracy is 0.7596153846153846
In epoch  5 

Epoch loss is 0.42036744865408066
Epoch accuracy is 0.7451923076923077
In epoch  6 

Epoch loss is 0.3245554398337425
Epoch accuracy is 0.7836538461538461
The expanded size of the tensor (558) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 558].  Tensor sizes: [1, 512]
The expanded size of the tensor (559) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 559].  Tensor sizes: [1, 512]
The expanded size of the tensor (821) must match the existing size (512) at non-singleton dimension 1.  Target 

In [14]:
!nvidia-smi

Thu Mar 21 17:38:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.45                 Driver Version: 531.68       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   47C    P8              14W /  N/A |   2855MiB /  6144MiB |     11%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [15]:
print("The mean accuracy score is", np.mean(CV_accuracy_array))

The mean accuracy score is 0.7953746569938474


In [16]:
print("The standard deviation for accuracy is", np.std(CV_accuracy_array))

The standard deviation for accuracy is 0.009969694099366462


In [17]:
print("The mean macro avg score is", np.mean(CV_macro_avg_array))

The mean macro avg score is 0.8451106020713676


In [18]:
print("The standard deviation for macro avg score is", np.std(CV_macro_avg_array))

The standard deviation for macro avg score is 0.03256576949738966


In [19]:
print("The mean weighted avg score is", np.mean(CV_weighted_avg_array))

The mean weighted avg score is 0.7953628161633084


In [20]:
print("The standard deviation for weighted avg score is", np.std(CV_weighted_avg_array))

The standard deviation for weighted avg score is 0.010133338185835377


In [21]:
print("The mean macro precision score is", np.mean(CV_precision_macro_array))

The mean macro precision score is 0.8509537721439318


In [22]:
print("The standard deviation for macro precision score is", np.std(CV_precision_macro_array))

The standard deviation for macro precision score is 0.04022186115504636


In [23]:
print("The mean weighted precision score is", np.mean(CV_precision_weighted_array)) 

The mean weighted precision score is 0.797869997548735


In [24]:
print("The standard deviation for weighted precision score is", np.std(CV_precision_weighted_array))

The standard deviation for weighted precision score is 0.009131692583974527


In [25]:
print("The mean macro recall score is", np.mean(CV_recall_macro_array))

The mean macro recall score is 0.8423041817906396


In [26]:
print("The standard deviation for macro recall score is", np.std(CV_recall_macro_array))

The standard deviation for macro recall score is 0.026801963218236746


In [27]:
print("The mean weighted recall score is", np.mean(CV_recall_weighted_array))

The mean weighted recall score is 0.7953746569938474


In [28]:
print("The standard deviation for weighted recall score is", np.std(CV_recall_weighted_array))

The standard deviation for weighted recall score is 0.009969694099366462
