# Assessment of LLMs for drug side effect identification

Load the basic libraries and connect to Google Drive.


In [1]:
# essentials
import sys
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# access to files in drive
# NOTE: `google.colab` is provided by the Colab runtime, so this cell is
# expected to run inside Google Colab.
from google.colab import drive
# Mount Google Drive at /content/drive; the data and results paths used later
# in the notebook are built from a folder under this mount.
drive.mount('/content/drive')
# progress bar for the long-running evaluation loop
from tqdm import tqdm

Mounted at /content/drive


## Step 1: Install libraries

In [2]:
# Install the exact library versions this notebook was run with. Every package
# is pinned so that a fresh runtime resolves the same dependency set.
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12

Collecting transformers==4.33.0
  Downloading transformers-4.33.0-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.22.0
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops==0.6.1
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain==0.0.300
  Downloading langchain-0.0.300-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers==0.0.21
  Downloading xformers-0.0.21-cp310-cp310-manylinux2014_x86_64.whl (167.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
#import chromadb
#from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

## Step 2: Define the root directory and log in to Hugging Face

In [16]:
# Project folder on the mounted Google Drive; the data/ and results/ paths
# used later in the notebook are built by appending to this string.
root_dir = 'drive/MyDrive/1. Research 2021/Kwaai AI for Life Sciences/test'

import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never store an access token in the notebook itself. A token that
# was previously committed here must be considered leaked and should be revoked
# at https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable when it is set;
# otherwise it is requested interactively without being echoed or saved.
mytoken = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(mytoken)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Step 3: Define the model and the device

In [6]:
# Checkpoints to evaluate. The notebook handles one model per run:
# change `model_selected` and re-run to evaluate the next one.
models2test = ["meta-llama/Meta-Llama-3-8B-Instruct", "ShadNygren/FineTuneTest-DrugAdverseEffects-SIDER-Diego1-50epochs", "ShadNygren/FineTuneTest-DrugAdverseEffects-SIDER-Diego2-10epochs"]
model_selected = 0
model_id = models2test[model_selected] # we need to run this for each model
# Current CUDA device when a GPU is available, otherwise the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Time the download/load of the config, weights and tokenizer.
time_1 = time()
model_config = transformers.AutoConfig.from_pretrained(
    model_id
)
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to be executed; keep it only for repositories you trust.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# The generation config attribute is `pad_token_id` (singular); the previous
# `pad_token_ids` assignment created an unused attribute and had no effect.
# When the tokenizer defines no pad token, fall back to the EOS token so that
# generation has a valid padding id.
model.generation_config.pad_token_id = (
    tokenizer.pad_token_id
    if tokenizer.pad_token_id is not None
    else tokenizer.eos_token_id
)
time_2 = time()
print(f"Prepare model, tokenizer: {round(time_2-time_1, 3)} sec.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Prepare model, tokenizer: 263.153 sec.


## Step 4: Define the query pipeline

In [7]:
# Build the Hugging Face text-generation pipeline around the already loaded
# model and tokenizer, and report how long the construction takes.
pipeline_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=10,        # short answers only: the output is scored as Yes / not-Yes
    temperature=0.01,
    pad_token_id=tokenizer.eos_token_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

time_1 = time()
query_pipeline = transformers.pipeline("text-generation", **pipeline_kwargs)
time_2 = time()

elapsed_sec = round(time_2 - time_1, 3)
print(f"Prepare pipeline: {elapsed_sec} sec.")

Prepare pipeline: 1.257 sec.


Create an instance of the LLM

In [8]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can
# be called directly with a prompt and plugged into the RetrievalQA chains.
llm = HuggingFacePipeline(pipeline=query_pipeline)

## Step 5: Load data Format A and B for RAG

In [14]:
# Sanity check: display the root directory currently held by the kernel.
# NOTE(review): the saved output of this cell shows a different path than the
# one assigned in Step 2, which suggests it was produced by an earlier kernel
# state; re-run the notebook top to bottom to confirm the value.
root_dir

'drive/MyDrive/Kwaai AI for Life Sciences/test/'

In [18]:
# we read the drug side effect associations generated for 200 drugs
query_data = pd.read_excel(root_dir + '/data/drug_side_effects200drugs.xlsx')

# Load the datasets for RAG
loader = TextLoader(root_dir + '/data/data format A.txt',
                    encoding="utf8")
documents = loader.load()

loader2 = TextLoader(root_dir + '/data/data format B.txt',
                    encoding="utf8")
documents2 = loader2.load()

Split the datasets format A and B and create the embeddings

In [19]:
# Split both corpora into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)
all_splits2 = text_splitter.split_documents(documents2)

# Sentence-embedding model used for retrieval; run it on the GPU when one is
# available and fall back to the CPU otherwise instead of failing.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda" if cuda.is_available() else "cpu"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# vector DBs
# Each corpus gets its own collection. Previously both stores used the same
# persist directory with the default collection name, so format A and format B
# chunks were written to one shared collection and both retrievers searched
# the mixed data, making the "RAG A" and "RAG B" methods indistinguishable.
# NOTE(review): re-running this cell against an existing "chroma_db" directory
# presumably adds the chunks again; delete the directory before a re-run if
# duplicate retrieval results show up.
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    collection_name="data_format_a",
    persist_directory="chroma_db",
)
vectordb2 = Chroma.from_documents(
    documents=all_splits2,
    embedding=embeddings,
    collection_name="data_format_b",
    persist_directory="chroma_db",
)

# retrievers
retriever = vectordb.as_retriever()
retriever2 = vectordb2.as_retriever()

# Retrieval-augmented QA chain over format A.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=False
)

# Retrieval-augmented QA chain over format B.
qb = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever2,
    verbose=False
)

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [20]:
def rag_query(q, query):
  """Run `query` through the chain `q` and return whatever `q.run` returns.

  q     -- any object exposing a `run(query)` method (here: a RetrievalQA chain)
  query -- the question text passed to the chain
  """
  answer = q.run(query)
  return answer

def binary_answer(text):
  """Map a model response to a binary label.

  Returns 1 when the exact, case-sensitive substring 'Yes' occurs anywhere in
  `text`, and 0 otherwise.
  """
  return 1 if 'Yes' in text else 0

In [21]:
# Prompt templates. The placeholders [SE] and [DRUG] are replaced with the
# lower-cased side effect and drug name of each row in the evaluation loop.
questions= ['Is [SE] an adverse effect of [DRUG]?']

In [24]:
import os

# Only the first MAX_RESPONSE_CHARS characters of each raw response are kept
# before it is converted to a binary answer.
MAX_RESPONSE_CHARS = 50

# NOTE(review): `result_rag` is not used in this cell; it is kept only in case
# a later cell expects the name to exist.
result_rag = pd.DataFrame()
# (row position, prompt, error text) for every query that raised, so failures
# are visible after the run instead of being silently dropped.
failed_queries = []

for i in tqdm(range(query_data.shape[0])):
  row = query_data.iloc[i]
  # Write results back by index label so reads (positional) and writes (label)
  # always address the same row, even if the frame has a non-default index.
  row_label = query_data.index[i]
  se = row['side effect'].lower()
  drug_name = row['drug name']
  label = row['label']
  for c,j in enumerate(questions):
    q = j.replace('[SE]', se)
    q = q.replace('[DRUG]', drug_name.lower())
    try:
      # method 1: llama-3 on its own
      response_llm = llm(prompt=q)[0:MAX_RESPONSE_CHARS]
      # method 2: llama-3 + RAG on text input A
      response_rag_A = rag_query(qa, q)[0:MAX_RESPONSE_CHARS]
      # method 3: llama-3 + RAG on text input B
      response_rag_B = rag_query(qb, q)[0:MAX_RESPONSE_CHARS]
      # method 4: llama-3 + graph RAG
      # The ground-truth association is stated in the prompt ahead of the question.
      # NOTE(review): "does not causes" is grammatically wrong, but the text is
      # kept unchanged so results stay comparable with earlier runs.
      if label == 1:
        temp = 'Answer the following question based on this information: The drug ' + drug_name + ' causes the adverse effect ' + se + '. '
      else:
        temp = 'Answer the following question based on this information: The drug ' + drug_name + ' does not causes the adverse effect ' + se + '. '

      response_graph_rag = llm(prompt=temp + q)[0:MAX_RESPONSE_CHARS]
    except Exception as exc:
      # Best effort: skip this query but record why. Catching Exception rather
      # than using a bare `except` lets KeyboardInterrupt stop the loop.
      failed_queries.append((i, q, repr(exc)))
      tqdm.write(f"row {i}: query failed ({exc!r}); skipping")
      continue

    suffix = str(c + 1)
    query_data.loc[row_label, 'prompt' + suffix] = q

    query_data.loc[row_label, 'output_llm' + suffix] = binary_answer(response_llm)
    query_data.loc[row_label, 'output_ragA' + suffix] = binary_answer(response_rag_A)
    query_data.loc[row_label, 'output_ragB' + suffix] = binary_answer(response_rag_B)
    query_data.loc[row_label, 'output_graphrag' + suffix] = binary_answer(response_graph_rag)

if failed_queries:
  print(f"{len(failed_queries)} queries failed and were skipped; see `failed_queries`.")

# save the results
# Create the output folder first so the long run is not lost on a missing directory.
results_dir = root_dir + '/results'
os.makedirs(results_dir, exist_ok=True)
query_data.to_excel(results_dir + '/results_model_' + str(model_selected)+'_200drugs.xlsx')

  0%|          | 0/1966 [00:15<?, ?it/s]
