In [1]:
# Mount Google Drive at /content/drive so files stored there (e.g. the
# fine-tuned model loaded later in this notebook) are reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the notebook's Python dependencies with pip; --quiet reduces pip's output.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones that produced the saved outputs -- consider pinning.
!pip install torch torchvision --quiet
!pip install transformers  --quiet
!pip install pandas  --quiet
!pip install numpy  --quiet
!pip install sentencepiece  --quiet
!pip install sentence-splitter  --quiet
!pip install shap --quiet
!pip install nlp --quiet



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.9/547.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# **Imports**

In [3]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW,AutoModelForQuestionAnswering, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DebertaTokenizer, DebertaModel, BartTokenizer
import math

# **Model loading**

In [5]:
# Use a GPU if you have one available (Runtime -> Change runtime type -> GPU),
# otherwise fall back to the CPU. "tpu" is not a device type torch.device
# accepts, so the previous fallback raised as soon as CUDA was unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seeds for reproducibility
random.seed(26)
np.random.seed(26)
torch.manual_seed(26)

# Tokenizer comes from the public BioLinkBERT-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("michiyasunaga/BioLinkBERT-base", do_lower_case=True)

# Change the model file path accordingly here; the classifier weights are read
# from the mounted Google Drive folder.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/public/mutagenicity")
model.to(device) # Send the model to the GPU if we have one


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28895, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw question/passage table from the working directory.
df = pd.read_csv('./new_data_mutagenicity.csv')

# Turn the stringified label into a boolean: True when it starts with "['True".
# NOTE(review): negative labels presumably look like
# "['False - AMES non Mutagenic']" -- confirm against the CSV.
df['label'] = df['label'].apply(lambda raw_label: raw_label.startswith("['True"))

# Switch to the column names the remaining cells use.
df = df.rename(columns={'text': 'Abstract', 'label': 'AMES'})

# 80/20 train/dev split with a fixed random_state so the split is repeatable.
train_data_df, dev_data_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
# Pull the training columns out as parallel NumPy arrays (same row order).
passages_train = train_data_df["Abstract"].values
questions_train = train_data_df["questions"].values
# AMES holds booleans after preprocessing; store them as 0/1 integers.
answers_train = train_data_df["AMES"].values.astype(int)

In [8]:
import nltk
# Download the Punkt tokenizer data; the cells below split passages into
# sentences with nltk's sent_tokenize.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
def predict(passage,question):
  """Classify a (passage, question) pair with the fine-tuned sequence classifier.

  Uses the notebook-level `tokenizer`, `model` and `device`.

  Args:
    passage: Passage text, encoded as the first segment.
    question: Question text, encoded as the second segment.

  Returns:
    True when class index 1 is at least as probable as class index 0,
    otherwise False.
  """
  # Truncate only the passage so the question is always kept, and cap the
  # length at the model's position-embedding size; longer inputs would
  # otherwise fail inside the embedding lookup.
  encoded = tokenizer.encode_plus(
      passage,
      question,
      truncation="only_first",
      max_length=model.config.max_position_embeddings,
      return_tensors="pt",
  )
  sequence = encoded['input_ids'].to(device)

  # Inference only: skip autograd bookkeeping.
  with torch.no_grad():
    logits = model(sequence)[0]
  probabilities = torch.softmax(logits, dim=1).cpu().tolist()[0]

  # Compare the unrounded probabilities. Rounding to two decimals first could
  # turn a narrow win for class 0 (e.g. 0.496 vs 0.504) into a tie that was
  # then reported as True.
  return probabilities[1] >= probabilities[0]






In [10]:
import shap
import scipy as sp

def predictor(x):
    """Return the log-odds of class index 1 for a single text.

    Uses the notebook-level `tokenizer`, `model` and `device`. The text is
    encoded on its own (no question segment).

    Args:
        x: One input string.

    Returns:
        NumPy array with one log-odds value per row of the model output
        (a single value for a single text).
    """
    # Cap the length at the model's position-embedding size so long texts do
    # not fail inside the embedding lookup.
    sequence = tokenizer.encode_plus(
        x,
        truncation=True,
        max_length=model.config.max_position_embeddings,
        return_tensors="pt",
    )['input_ids'].to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(sequence)[0]

    # logit(softmax(z)[1]) == z[1] - logsumexp(z[j != 1]). Working on the raw
    # logits gives the same quantity but stays finite when the softmax
    # saturates to exactly 0 or 1, where logit() would return -inf/+inf.
    other_logits = torch.cat([logits[:, :1], logits[:, 2:]], dim=1)
    val = logits[:, 1] - torch.logsumexp(other_logits, dim=1)
    return val.cpu().numpy()

def f_batch(x):
    """Score every item of `x` with `predictor` and return one flat array.

    Args:
        x: Iterable of inputs, each passed to `predictor` individually.

    Returns:
        1-D NumPy array holding the flattened `predictor` outputs in input
        order; an empty float array when `x` is empty.
    """
    # Seed with the same empty float array the accumulation starts from, then
    # flatten each result and join everything in a single concatenate call.
    pieces = [np.array([])]
    pieces.extend(np.ravel(predictor(item)) for item in x)
    return np.concatenate(pieces)




In [25]:
from nltk.tokenize import sent_tokenize

# Split training passage 51 into sentences and keep only the first eight
# (presumably to keep the SHAP run short -- confirm).
sentences = sent_tokenize(passages_train[51])
# Join with a space: the sentence list carries no separators, so an empty
# joiner would glue the end of one sentence onto the start of the next.
text = ' '.join(sentences[:8])
# Show the label and question for this sample.
answers_train[51], questions_train[51]

(1, 'Is p-Rosaniline mutagenic for TA98 or TA100 or TA1535 or TA1537 strain?')

In [26]:
# Run the classifier on the full passage/question pair for sample 51 and print
# the boolean prediction.
answer=predict(passages_train[51],questions_train[51])
print(answer)

True


In [27]:
# Build a SHAP explainer around f_batch, passing the tokenizer as the second
# argument and a fixed seed.
# Note: f_batch scores the text on its own (no question segment), unlike predict().
explainer_bert = shap.Explainer(f_batch, tokenizer, seed = 26)
# Single-example input: `text` is the shortened passage; the label entry
# matches the ground-truth value shown for this sample (1).
test = {'label': [1], 'text':[text]}

# Compute the attributions and render them as a text plot.
shap_values = explainer_bert(test)
shap.plots.text(shap_values, num_starting_labels=0)

  0%|          | 0/498 [00:00<?, ?it/s]

Partition explainer: 100%|██████████| 2/2 [00:19<00:00, 19.71s/it]


In [24]:
# Peek at the labels of training samples 10-19.
answers_train[10:20]

array([0, 1, 0, 0, 1, 0, 1, 1, 0, 0])

In [51]:
from nltk.tokenize import sent_tokenize

# Split training passage 12 into sentences and keep only the first eight
# (presumably to keep the SHAP run short -- confirm).
sentences = sent_tokenize(passages_train[12])
# Join with a space: the sentence list carries no separators, so an empty
# joiner would glue the end of one sentence onto the start of the next.
text = ' '.join(sentences[:8])
# Show the label and question for this sample.
answers_train[12], questions_train[12]

(0, 'Is Retinol acetate mutagenic for TA98 or TA100 strain?')

In [52]:
# Run the classifier on the full passage/question pair for sample 12 and print
# the boolean prediction.
answer=predict(passages_train[12],questions_train[12])
print(answer)

False


In [53]:
# Build a SHAP explainer around f_batch, passing the tokenizer as the second
# argument and a fixed seed.
# Note: f_batch scores the text on its own (no question segment), unlike predict().
explainer_bert = shap.Explainer(f_batch, tokenizer, seed = 26)
# Single-example input: `text` is the shortened passage; the label entry
# matches the ground-truth value shown for this sample (0).
test = {'label': [0], 'text':[text]}

# Compute the attributions and render them as a text plot.
shap_values = explainer_bert(test)
shap.plots.text(shap_values, num_starting_labels=0)

  0%|          | 0/498 [00:00<?, ?it/s]

Partition explainer: 100%|██████████| 2/2 [00:12<00:00, 12.01s/it]
