In [None]:
!pip install torch torchvision --quiet
!pip install transformers  --quiet
!pip install pandas  --quiet
!pip install numpy  --quiet
!pip install sentencepiece  --quiet
!pip install sentence-splitter  --quiet
!pip install shap --quiet





[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.9/547.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import drive

# Mount Google Drive; later cells read the model and the CSV from under /content/drive.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# **Imports**

In [None]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW,AutoModelForQuestionAnswering, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DebertaTokenizer, DebertaModel, BartTokenizer
import math

# **Model loading**

In [None]:
# Use a GPU if you have one available (Runtime -> Change runtime type -> GPU),
# otherwise fall back to the CPU. The fallback must be "cpu": "tpu" is not a
# device type torch.device() accepts, so the old fallback raised on any
# runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seeds for reproducibility
random.seed(26)
np.random.seed(26)
torch.manual_seed(26)

# Tokenizer comes from the public BioLinkBERT-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("michiyasunaga/BioLinkBERT-base", do_lower_case=True)

# Sequence-classification weights are loaded from Drive;
# change the model file path accordingly here.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/public/mutagenicity")
model.to(device) # Send the model to the GPU if we have one


Downloading (…)okenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/447k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28895, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# **Data Loading**

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the labelled records from Drive.
df = pd.read_csv('/content/drive/MyDrive/public/data.csv')

# Turn the raw label strings into booleans: True when the string begins with "['True".
df['label'] = df['label'].map(lambda raw_label: raw_label.startswith("['True"))

# Rename the columns to the names the rest of the notebook uses.
new_column_names = dict(text='Abstract', label='AMES')
df = df.rename(columns=new_column_names)

# 80/20 split with a fixed random_state so the dev set is the same on every run.
train_data_df, dev_data_df = train_test_split(df, test_size=0.2, random_state=9)

# **Prediction**

In [None]:
def predict(passage,question):
  """Classify a text pair with the globally loaded `model` and `tokenizer`.

  The two texts are encoded as one pair, in the order (passage, question),
  and only the resulting `input_ids` are fed to the model. The logits are
  turned into probabilities with a softmax; index 1 is treated as "Yes" and
  index 0 as "No".

  Args:
    passage: first text of the encoded pair.
    question: second text of the encoded pair; also echoed in the printed line.

  Returns:
    True when the "Yes" probability is at least the "No" probability,
    otherwise False.

  Side effects:
    Prints the `question` argument together with both probabilities rounded
    to two decimals.

  NOTE(review): the evaluation loop in this notebook calls
  `predict(question, passage)` positionally, so the text printed after
  "Question:" is whatever was passed second. Confirm the intended order
  against how the model was fine-tuned before changing it.
  """
  sequence = tokenizer.encode_plus(passage, question, return_tensors="pt")['input_ids'].to(device)

  # Inference only: no_grad avoids building an autograd graph for every call.
  with torch.no_grad():
    logits = model(sequence)[0]
  probabilities = torch.softmax(logits, dim=1).detach().cpu().tolist()[0]
  proba_no, proba_yes = probabilities[0], probabilities[1]

  # Rounding is for display only.
  print(f"Question: {question}, Yes: {round(proba_yes, 2)}, No: {round(proba_no, 2)}")

  # Decide on the unrounded probabilities: comparing the rounded values turned
  # near-ties (e.g. 0.497 vs 0.503 -> 0.5 vs 0.5) into a "Yes".
  return proba_yes >= proba_no






In [None]:
# Row labels of the dev split, in dev-set order; the evaluation loop iterates over these.
# (The name shadows the builtin `filter`; it is kept because a later cell reads it.)
filter = dev_data_df.index.tolist()

In [None]:
# Score every dev row and remember which rows were actually scored.
preds=[]    # predicted booleans, one per successfully scored row
index=[]    # row labels matching `preds`, in the same order
skipped=[]  # (row label, error repr) for rows whose prediction raised
for i in filter:
    try:
        passage=dev_data_df.loc[i, 'Abstract']
        question=dev_data_df.loc[i, 'questions']
        # Arguments are passed positionally as (question, passage); predict's
        # first parameter is named `passage`, so the question text is encoded first.
        # NOTE(review): confirm this order matches how the model was fine-tuned.
        answer=predict(question, passage)
    except Exception as e:
        # Best-effort evaluation: keep going, but record the failure so the
        # number of rows missing from the metrics is visible.
        skipped.append((i, repr(e)))
        continue
    preds.append(answer)
    index.append(i)

if skipped:
    print(f"Skipped {len(skipped)} of {len(filter)} rows; first error: {skipped[0][1]}")



Question: The mechanism by which vitamin A prevents or delays in chemical carcinogenesis remains unclear. In the present study, we assess the suggestive role of vitamin A in the initiation phase of carcinogenesis. We have conducted a dose-effect relationship between vitamin A dietary intake and aflatoxin B1 (AFB1) genotoxicity measured both in vitro and in vivo. Thus AFB1-induced mutagenesis in Salmonella typhimurium TA98 was investigated and compared to AFB1-induced single-strand breaks (SSBs) in DNA of rat hepatocytes. Rats were fed ad libitum with diet containing 0, 5, 50 or 500 IU of retinyl palmitate for 8 weeks. The AFB1-treated rats were injected i.p. with 1 mg/kg body weight. In the Ames test conditions TA98 back-reversion was negatively correlated with the log of vitamin A concentration in liver S9 fractions from experimental groups. However, the activities of metabolizing enzymes which specifically activate or deactivate AFB1 were found to be significantly decreased in vitami

In [None]:
# Restrict the dev split to the rows whose labels were collected in `index`,
# then pull out their AMES labels as a plain list.
scored_rows = dev_data_df.index.isin(index)
dev = dev_data_df.loc[scored_rows]
true_results = dev.AMES.to_list()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the AMES labels.
print(classification_report(y_true=true_results, y_pred=preds))

              precision    recall  f1-score   support

       False       0.90      0.85      0.88       136
        True       0.88      0.92      0.90       159

    accuracy                           0.89       295
   macro avg       0.89      0.89      0.89       295
weighted avg       0.89      0.89      0.89       295

