In [None]:
!pip install torch torchvision --quiet
!pip install transformers  --quiet
!pip install pandas  --quiet
!pip install numpy  --quiet
!pip install sentencepiece  --quiet
!pip install sentence-splitter  --quiet
!pip install shap --quiet
!pip install optuna --quiet




[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.9/547.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Imports**

In [None]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW,AutoModelForQuestionAnswering, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DebertaTokenizer, DebertaModel, BartTokenizer
import math

In [None]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

Using device: cuda:1



In [None]:
torch.cuda.device_count()


1

# **Model loading**

In [None]:
# Use a GPU if you have one available (Runtime -> Change runtime type -> GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seeds for reproducibility
random.seed(26)
np.random.seed(26)
torch.manual_seed(26)

tokenizer = AutoTokenizer.from_pretrained("EMBO/BioMegatron345mUncased", do_lower_case=True)


In [None]:
def get_optimizer_grouped_parameters(
    model, model_type,
    learning_rate, weight_decay,
    layerwise_learning_rate_decay
):
    no_decay = ["bias", "LayerNorm.weight"]
    # initialize lr for task specific layer
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if "classifier" in n or "pooler" in n],
            "weight_decay": 0.0,
            "lr": learning_rate,
        },
    ]
    # initialize lrs for every layer
    num_layers = model.config.num_hidden_layers
    layers = [getattr(model, model_type).embeddings] + list(getattr(model, model_type).encoder.layer)
    layers.reverse()
    lr = learning_rate
    for layer in layers:
        lr *= layerwise_learning_rate_decay
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": weight_decay,
                "lr": lr,
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
    return optimizer_grouped_parameters

In [None]:
def encode_data(tokenizer, passages,questions, max_length):
    """Encode the question/passage pairs into features than can be fed to the model."""
    input_ids = []
    attention_masks = []

    for passage,question in zip(passages,questions):
        encoded_data = tokenizer.encode_plus(passage,question, max_length=max_length, pad_to_max_length=True, truncation='longest_first')
        encoded_pair = encoded_data["input_ids"]
        attention_mask = encoded_data["attention_mask"]

        input_ids.append(encoded_pair)
        attention_masks.append(attention_mask)

    return np.array(input_ids), np.array(attention_masks)

In [None]:
def predict(passage,question):
  sequence = tokenizer.encode_plus(passage,question,max_length=512, pad_to_max_length=True, truncation='longest_first', return_tensors="pt")['input_ids'].to(device)

  logits = model(sequence)[0]
  probabilities = torch.softmax(logits, dim=1).detach().cpu().tolist()[0]
  proba_yes = round(probabilities[1], 2)
  proba_no = round(probabilities[0], 2)

  #print(f"Question: {question}, Yes: {proba_yes}, No: {proba_no}")

  if (proba_yes >= proba_no):
    return True
  else:
    return False






In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



# Train and evaluate the accuracy of neural network with the addition of pruning mechanism
def train_and_evaluate(model,train_data_df,dev_data_df):
    passages_train = train_data_df.Abstract.values
    questions_train = train_data_df.questions.values
    answers_train = train_data_df.AMES.values.astype(int)

    passages_dev = dev_data_df.Abstract.values
    questions_dev = dev_data_df.questions.values
    answers_dev = dev_data_df.AMES.values.astype(int)

    # Encoding data
    max_seq_length = 512
    input_ids_train, attention_masks_train = encode_data(tokenizer, passages_train,questions_train, max_seq_length)
    input_ids_dev, attention_masks_dev = encode_data(tokenizer, passages_dev,questions_dev, max_seq_length)

    train_features = (input_ids_train, attention_masks_train, answers_train)
    dev_features = (input_ids_dev, attention_masks_dev, answers_dev)

    batch_size = 2
    train_features_tensors = [torch.tensor(feature, dtype=torch.long) for feature in train_features]
    dev_features_tensors = [torch.tensor(feature, dtype=torch.long) for feature in dev_features]

    train_dataset = TensorDataset(*train_features_tensors)
    dev_dataset = TensorDataset(*dev_features_tensors)

    train_sampler = RandomSampler(train_dataset)
    dev_sampler = SequentialSampler(dev_dataset)

    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
    dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=batch_size)

    learning_rate = 5e-5
    layerwise_learning_rate_decay = 0.9
    adam_epsilon = 1e-6
    use_bertadam = False

    # scheduler params
    num_epochs = 5
    num_warmup_steps = 0

    optimizer = AdamW(model.parameters(),
        lr=learning_rate,
        eps=adam_epsilon,
        correct_bias=not use_bertadam
    )
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_epochs
    )

    print("Done setting up optimizer\n")
    train_loss_values = []
    dev_acc_values = []
    state=[]

    for i in tqdm(range(num_epochs), desc="Epoch"):

      # Training
      print("In epoch ", i, "\n")
      epoch_train_loss = 0 # Cumulative loss
      model.train()
      #model.zero_grad()

      for step, batch in enumerate(train_dataloader):

          input_ids = batch[0].to(device)
          attention_masks = batch[1].to(device)
          labels = batch[2].to(device)
          model.zero_grad()
          outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels)

          loss = outputs[0]
          #loss = loss / grad_acc_steps
          epoch_train_loss += loss.item()

          loss.backward()
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          optimizer.step()
          scheduler.step()

      epoch_train_loss = epoch_train_loss / len(train_dataloader)
      train_loss_values.append(epoch_train_loss)
      print("Epoch loss is", epoch_train_loss)

      # Evaluation
      epoch_dev_accuracy = 0 # Cumulative accuracy
      model.eval()

      for batch in dev_dataloader:

        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2]

        with torch.no_grad():
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()

        predictions = np.argmax(logits, axis=1).flatten()
        labels = labels.numpy().flatten()

        epoch_dev_accuracy += np.sum(predictions == labels) / len(labels)

      epoch_dev_accuracy = epoch_dev_accuracy / len(dev_dataloader)
      print("Epoch accuracy is",epoch_dev_accuracy )
      dev_acc_values.append(epoch_dev_accuracy)
      temp=model
      state.append(temp)

    index = dev_acc_values.index(max(dev_acc_values))
    best = state[index]
    return best



In [None]:
import pandas as pd

df=pd.read_csv('./new_data_mutagenicity.csv')

# Apply the function to the column
df['label'] = df['label'].apply(lambda x: x.startswith("['True"))
#df['label'] = df['label'].apply(lambda x: x[0] == "['False - AMES non Mutagenic']")

new_column_names = {'text': 'Abstract', 'label': 'AMES'}
df.rename(columns=new_column_names, inplace=True)


In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import pandas as pd
import gc
#df=pd.read_csv('./data.csv')

kf = KFold(n_splits=5, random_state=42, shuffle=True)
CV_accuracy_array=[]
CV_macro_avg_array=[]
CV_weighted_avg_array=[]
for train_index, test_index in kf.split(df):
    print("TRAIN:", train_index)
    print("TEST:", test_index)
    train_data_df, dev_data_df = df.loc[train_index], df.loc[test_index]
    model = AutoModelForSequenceClassification.from_pretrained("EMBO/BioMegatron345mUncased")
    model.to(device)
    model=train_and_evaluate(model,train_data_df,dev_data_df)
    filter=list(dev_data_df.index)
    preds=[]
    index=[]
    for i in filter:
        try:
          passage=dev_data_df.Abstract[i]
          question =dev_data_df.questions[i]
          answer=predict(passage,question)
          preds.append(answer)
          index.append(i)
        except Exception as e:
          print(e)
          continue
    dev=dev_data_df[dev_data_df.index.isin(index)]
    true_results=dev['AMES'].tolist()
    print(classification_report(true_results, preds))
    results = classification_report(true_results, preds,output_dict=True,)
    CV_accuracy_array.append(results['accuracy'])
    CV_macro_avg_array.append(results['macro avg']['f1-score'])
    CV_weighted_avg_array.append(results['weighted avg']['f1-score'])
    del model
    gc.collect()
    torch.cuda.empty_cache()



TRAIN: [   0    1    2 ... 1643 1644 1645]
TEST: [  15   23   29   30   32   43   44   49   51   56   59   63   65   67
   69   70   73   76   78   99  100  101  107  109  115  123  124  128
  135  141  148  162  163  168  170  173  175  184  185  192  198  199
  203  212  220  226  231  237  239  240  244  247  251  259  261  266
  270  271  274  275  289  297  298  300  303  306  309  316  324  331
  332  339  342  344  350  351  352  353  366  367  371  374  383  394
  398  405  408  411  413  414  415  416  420  422  425  426  433  438
  450  451  464  471  479  481  482  483  486  490  493  494  497  526
  527  529  534  538  543  551  552  554  560  561  567  575  582  584
  585  588  589  590  591  597  610  613  614  617  619  620  621  629
  651  654  668  669  674  679  680  682  694  706  707  720  724  727
  730  741  744  752  754  764  765  767  772  780  792  798  802  803
  809  810  813  816  818  838  842  844  845  847  855  861  865  867
  873  874  879  887  888  8

Downloading pytorch_model.bin:   0%|          | 0.00/668M [00:00<?, ?B/s]

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at EMBO/BioMegatron345mUncased and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 2.6587714486313656
Epoch accuracy is 0.4636363636363636
In epoch  1 

Epoch loss is 1.4059426015907384
Epoch accuracy is 0.8242424242424242
In epoch  2 

Epoch loss is 0.9128536704776028
Epoch accuracy is 0.803030303030303
In epoch  3 

Epoch loss is 0.7140075684509888
Epoch accuracy is 0.8515151515151516
In epoch  4 

Epoch loss is 0.6441674539242137
Epoch accuracy is 0.8393939393939394
              precision    recall  f1-score   support

       False       0.81      0.83      0.82       150
        True       0.85      0.83      0.84       180

    accuracy                           0.83       330
   macro avg       0.83      0.83      0.83       330
weighted avg       0.83      0.83      0.83       330

TRAIN: [   0    1    2 ... 1643 1644 1645]
TEST: [  10   18   31   41   48   54   58   81   83   86   88   96  111  113
  126  129  131  140  142  147  155  156  158  164  174  178  179  181
  188  195  196  208  209  210  214  218  221  233  236  243  2

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at EMBO/BioMegatron345mUncased and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 2.464200147843205
Epoch accuracy is 0.6787878787878788
In epoch  1 

Epoch loss is 1.4829911712512343
Epoch accuracy is 0.8393939393939394
In epoch  2 

Epoch loss is 0.8969762703373163
Epoch accuracy is 0.8666666666666667
In epoch  3 

Epoch loss is 0.5962146474570353
Epoch accuracy is 0.8636363636363636
In epoch  4 

Epoch loss is 0.4123983518002107
Epoch accuracy is 0.8848484848484849
              precision    recall  f1-score   support

       False       0.89      0.80      0.84       137
        True       0.86      0.93      0.89       192

    accuracy                           0.87       329
   macro avg       0.88      0.86      0.87       329
weighted avg       0.87      0.87      0.87       329

TRAIN: [   1    4    7 ... 1642 1643 1645]
TEST: [   0    2    3    5    6    9   12   24   25   27   33   39   42   45
   47   52   55   60   62   66   68   71   72   74   77   80   82   84
   85   92   94   97  102  104  105  106  110  117  118  120  1

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at EMBO/BioMegatron345mUncased and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 2.134457865528899
Epoch accuracy is 0.8363636363636363
In epoch  1 

Epoch loss is 1.0158954132844857
Epoch accuracy is 0.9
In epoch  2 

Epoch loss is 0.5006309637811284
Epoch accuracy is 0.8484848484848485
In epoch  3 

Epoch loss is 0.41675355583970286
Epoch accuracy is 0.9030303030303031
In epoch  4 

Epoch loss is 0.36348012267729674
Epoch accuracy is 0.9121212121212121
              precision    recall  f1-score   support

       False       0.92      0.86      0.89       142
        True       0.90      0.94      0.92       187

    accuracy                           0.91       329
   macro avg       0.91      0.90      0.90       329
weighted avg       0.91      0.91      0.91       329

TRAIN: [   0    1    2 ... 1641 1642 1644]
TEST: [   4    7   11   16   17   19   22   28   35   36   38   46   50   57
   61   75   79   89   90   93  108  114  116  119  127  133  144  149
  153  154  157  159  169  172  176  177  180  190  191  217  234  245
  255

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at EMBO/BioMegatron345mUncased and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 2.0669092969520775
Epoch accuracy is 0.8333333333333334
In epoch  1 

Epoch loss is 1.0540673612926224
Epoch accuracy is 0.8484848484848485
In epoch  2 

Epoch loss is 0.6819018712260122
Epoch accuracy is 0.896969696969697
In epoch  3 

Epoch loss is 0.39503988409270846
Epoch accuracy is 0.8818181818181818
In epoch  4 

Epoch loss is 0.3679472292261931
Epoch accuracy is 0.8787878787878788
              precision    recall  f1-score   support

       False       0.95      0.72      0.82       145
        True       0.82      0.97      0.89       184

    accuracy                           0.86       329
   macro avg       0.88      0.85      0.85       329
weighted avg       0.87      0.86      0.86       329

TRAIN: [   0    2    3 ... 1643 1644 1645]
TEST: [   1    8   13   14   20   21   26   34   37   40   53   64   87   91
   95   98  103  112  121  122  130  134  143  146  150  151  152  160
  161  166  186  187  189  197  200  201  202  205  206  207  

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at EMBO/BioMegatron345mUncased and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Done setting up optimizer





Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

In epoch  0 

Epoch loss is 2.2956707312669318
Epoch accuracy is 0.7787878787878788
In epoch  1 

Epoch loss is 0.9532628518415086
Epoch accuracy is 0.896969696969697
In epoch  2 

Epoch loss is 0.6728477489552847
Epoch accuracy is 0.8666666666666667
In epoch  3 

Epoch loss is 0.5384182960370889
Epoch accuracy is 0.8939393939393939
In epoch  4 

Epoch loss is 0.357879923584379
Epoch accuracy is 0.8575757575757575
              precision    recall  f1-score   support

       False       0.79      0.88      0.83       156
        True       0.88      0.79      0.83       173

    accuracy                           0.83       329
   macro avg       0.84      0.84      0.83       329
weighted avg       0.84      0.83      0.83       329



In [None]:
print("The mean accuracy score is", np.mean(CV_accuracy_array))

The mean accuracy score is 0.8602855300727642


In [None]:
print("The standard deviation for accuracy is", np.std(CV_accuracy_array))

The standard deviation for accuracy is 0.02781210151944958


In [None]:
print("The mean macro avg score is", np.mean(CV_macro_avg_array))

The mean macro avg score is 0.8569278561315417


In [None]:
print("The standard deviation for macro avg score is", np.std(CV_macro_avg_array))

The standard deviation for macro avg score is 0.026804814779586853


In [None]:
print("The mean weighted avg score is", np.mean(CV_weighted_avg_array))

The mean weighted avg score is 0.8593066906595499


In [None]:
print("The standard deviation for weighted avg score is", np.std(CV_weighted_avg_array))

The standard deviation for weighted avg score is 0.027574033466025667
