## Notebook Setup

In [None]:
# Mount Google Drive at /content/drive; the data and output paths used in later
# cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
import json
import os
import random
from random import sample

import pandas as pd

In [None]:
# Quantization / fine-tuning stack.
# NOTE(review): transformers, peft and accelerate are installed from the tip of
# their GitHub repositories without a pinned revision, so a rerun may resolve to
# different code than the run captured in this notebook.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece

# torch is pinned to 2.2.2; the companion packages are upgraded without a pin.
!pip install -q torch==2.2.2
!pip install -q -U torchaudio torchtext torchvision

# NOTE(review): transformers was already installed from git above and datasets
# was already installed unpinned above — presumably only the datasets==2.16.0
# pin has an effect here; verify.
!pip install transformers
!pip install datasets==2.16.0 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━

In [None]:
import torch
import torchaudio
import torchtext
import torchvision

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb, platform, gradio, warnings

from datasets import load_dataset

from trl import SFTTrainer
from huggingface_hub import notebook_login

import wandb

# Report the version of each PyTorch-family package that was imported above,
# one "<module name>: <version>" line per package.
for _pkg in (torch, torchaudio, torchtext, torchvision):
    print(f"{_pkg.__name__}: {_pkg.__version__}")

torch: 2.2.2+cu121
torchaudio: 2.2.2+cu121
torchtext: 0.17.2+cpu
torchvision: 0.17.2+cu121


## Data Preprocessing

In [None]:
# Load the dataset: one JSON object per line (JSONL), collected into `data`.
# The path is absolute (under the /content/drive mount point) so the cell does
# not depend on the kernel's current working directory.
output_jsonl_file_path = '/content/drive/MyDrive/266_project/json_data/main_data.jsonl'

data = []

# Explicit UTF-8 keeps decoding independent of the platform default encoding.
with open(output_jsonl_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # json.loads would otherwise reject with a JSONDecodeError.
        if line.strip():
            data.append(json.loads(line))

len(data)

49877

## Setup Mistral 7b

In [None]:
# Hugging Face Hub id of the checkpoint used for both the model and the tokenizer.
# NOTE(review): the id has no "Instruct" suffix while later cells wrap inputs in
# [INST] ... [/INST] tags — confirm this pairing is intended.
base_model = "mistralai/Mistral-7B-v0.1"

In [None]:
# bitsandbytes settings: 4-bit loading with the NF4 quantization type, bfloat16
# as the compute dtype, and double quantization turned off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the checkpoint named by `base_model` with the quantization config above.
# The device_map entry {"": 0} assigns the whole model to device 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
)

# NOTE(review): disabling the KV cache and enabling gradient checkpointing look
# like fine-tuning preparation, yet the visible cells only run generation —
# confirm these settings are wanted for inference.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Tokenizer for the same checkpoint as the model.
# NOTE(review): trust_remote_code=True permits executing code shipped with the
# checkpoint repository — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): with add_eos_token enabled, prompts are presumably encoded with a
# trailing EOS before generation — verify this is intended for inference.
tokenizer.add_eos_token = True
# Bare expression: shown as the cell output to confirm both flags.
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [None]:
# LoRA adapter configuration: rank 16, alpha 16, dropout 0.05, no bias terms,
# applied to the four attention projections plus gate_proj.
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]
)

# Prepare the quantized model for k-bit training, then wrap it with the adapters.
# NOTE(review): no adapter weights are trained or loaded in the visible cells
# before the model is used for generation — confirm the wrap is needed here.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

## Model Inference


In [None]:
# Few-shot instruction block placed in front of every input text.
# Example 4 previously listed "reduction in joint pain" and "swelling" under ADE.
# Those describe the therapeutic benefit of the dose change, not an adverse drug
# event, so they contradicted the ADE definition given in the label list and
# taught the model the wrong label. The ADE list for that example is now empty.
prompt_inst = """
Extract the entities for the specified labels from the given medical text and provide the results in JSON format
- Entities must be extracted precisely as they appear in the text.
- Return each entity under its label without creating new labels.
- Provide a list of entities for each label. If no entities are found for a label, return an empty list.
- Prioritize accuracy and relevance in the identification of entities.

Here are the entity labels and their descriptions:
1. Drug: Extract any mentioned medications or drugs.
2. Duration: Extract the duration of treatment or medication usage.
3. Dosage: Extract dosages related to medications, including units.
4. Frequency: Extract how often the medication or treatment is to be taken or administered.
5. Strength: Extract the concentration or potency of the medication.
6. Form: Extract the form in which the medication is to be used.
7. Route: Extract the method of administration for a medication.
8. Reason: Extract the reason or condition the medication is prescribed for.
9. ADE: Extract adverse drug events or side effects mentioned.

Make sure to go through the text carefully and extract all entities mentioned above if they are present. Do not create fictitious data.

#### START EXAMPLES
============================

----- Example Input 1 -----

"Amoxicillin 500 mg capsule Sig: Two capsules PO BID for 7 days for acute otitis media."

----- Example Output 1 -----

{
  "Drug": ["Amoxicillin"],
  "Duration": ["7 days"],
  "Dosage": ["500 mg"],
  "Frequency": ["BID"],
  "Strength": ["500 mg"],
  "Form": ["capsule"],
  "Route": ["PO"],
  "Reason": ["acute otitis media"],
  "ADE": []
}

============================

----- Example Input 2 -----

"Patient reported severe itchiness and rash within hours after taking penicillin."

----- Example Output 2 -----

{
  "Drug": ["penicillin"],
  "Duration": [],
  "Dosage": [],
  "Frequency": [],
  "Strength": [],
  "Form": [],
  "Route": [],
  "Reason": [],
  "ADE": ["severe itchiness", "rash"]
}

============================

----- Example Input 3 -----

"Allergies: Penicillin / Aspirin / Codeine"

----- Example Output 3 -----

{
  "Drug": ["Penicillin", "Aspirin", "Codeine"],
  "Duration": [],
  "Dosage": [],
  "Frequency": [],
  "Strength": [],
  "Form": [],
  "Route": [],
  "Reason": [],
  "ADE": []
}

============================

----- Example Input 4 -----

"Due to an exacerbation of rheumatoid arthritis, methotrexate dose was increased to 20 mg per week, leading to noticeable reduction in joint pain and swelling."

----- Example Output 4 -----

{
  "Drug": ["methotrexate"],
  "Duration": [],
  "Dosage": ["20 mg"],
  "Frequency": ["per week"],
  "Strength": ["20 mg"],
  "Form": [],
  "Route": [],
  "Reason": ["exacerbation of rheumatoid arthritis"],
  "ADE": []
}

============================

----- Example Input 5 -----

"Discontinued Metformin due to gastrointestinal upset."

----- Example Output 5 -----

{
  "Drug": ["Metformin"],
  "Duration": [],
  "Dosage": [],
  "Frequency": [],
  "Strength": [],
  "Form": [],
  "Route": [],
  "Reason": [],
  "ADE": ["gastrointestinal upset"]
}

============================
#### END EXAMPLES

Do not produce any text after providing the JSON object with the extracted entities.
"""

In [None]:
# Smoke-test prompt: the few-shot instructions followed by one input text wrapped
# in [INST] ... [/INST], with a newline between the text and the closing tag.
B_INST, E_INST = "[INST]", "[/INST]"
system_prompt = prompt_inst
user_prompt = "125 mg of Tylenol daily"
prompt = "".join([system_prompt, B_INST, user_prompt.strip(), "\n", E_INST])

In [None]:
prompt



In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# The captured output shows a warning that 'PeftModelForCausalLM' is not in the
# pipeline's list of supported model classes.
# NOTE(review): `model` is an instantiated, already-placed model, so torch_dtype
# and device_map presumably have no effect here — verify against the
# transformers pipeline documentation.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer = tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MusicgenMelodyForCausalLM', 'MvpForCausalLM', 'OpenLlam

In [None]:
# Decoding settings for the smoke-test generation. Sampling is enabled, so the
# printed text can differ from run to run.
generation_kwargs = dict(
    do_sample=True,
    max_new_tokens=200,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
)
sequences = pipe(prompt, **generation_kwargs)

# Only one sequence is requested; print its text.
first_sequence = sequences[0]
print(first_sequence['generated_text'])


Extract the entities for the specified labels from the given medical text and provide the results in JSON format
- Entities must be extracted precisely as they appear in the text.
- Return each entity under its label without creating new labels.
- Provide a list of entities for each label. If no entities are found for a label, return an empty list.
- Prioritize accuracy and relevance in the identification of entities.

Here are the entity labels and their descriptions:
1. Drug: Extract any mentioned medications or drugs.
2. Duration: Extract the duration of treatment or medication usage.
3. Dosage: Extract dosages related to medications, including units.
4. Frequency: Extract how often the medication or treatment is to be taken or administered.
5. Strength: Extract the concentration or potency of the medication.
6. Form: Extract the form in which the medication is to be used.
7. Route: Extract the method of administration for a medication.
8. Reason: Extract the reason or condition th

## Create Test Set

In [None]:
# Candidate pool: records after the first 1000 whose 'entities' dict has at least
# one non-empty value.
# NOTE(review): presumably the first 1000 records are reserved for another
# split — confirm where that offset comes from.
data_with_entities = [obj for obj in data[1000:] if any(obj['entities'].values())]

# Draw at most 100 records (fewer if the pool is smaller).
sample_size = min(100, len(data_with_entities))
# A dedicated, seeded generator makes the draw reproducible: rerunning this cell
# selects the same records instead of overwriting the saved test CSV with a
# different random sample each time.
test_data_with_entities = random.Random(42).sample(data_with_entities, sample_size)

def convert_to_csv(json_objects):
    """Build CSV rows from dataset records.

    Args:
        json_objects: Iterable of dicts, each providing the keys
            'Original_INSTRUCTION', 'text' and 'entities'.

    Returns:
        A list of two-element rows ``[chat_sample, 'ner_data']``. Each
        ``chat_sample`` is the original instruction, an "### Instruction:"
        section holding the text, and a "### Response:" section holding the
        entities serialised as JSON with a two-space indent.
    """
    def _render(record):
        # Adjacent f-strings are concatenated; fields are read left to right.
        return (
            f"{record['Original_INSTRUCTION']}\n\n"
            f"### Instruction:\n{record['text']}\n\n"
            f"### Response:\n{json.dumps(record['entities'], indent=2)}\n"
        )

    return [[_render(record), 'ner_data'] for record in json_objects]

# Save the sampled test records to Drive as a two-column CSV: a header row
# followed by one row per record produced by convert_to_csv.
test_csv_file_path_with_entities = '/content/drive/My Drive/266_project/mistral_7b_data/test_data_with_entities_100.csv'
header = ['chat_sample', 'source']
with open(test_csv_file_path_with_entities, mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(header)
    for row in convert_to_csv(test_data_with_entities):
        csv_writer.writerow(row)

print(f'Test CSV file with entities saved at: {test_csv_file_path_with_entities}')

Test CSV file with entities saved at: /content/drive/My Drive/266_project/mistral_7b_data/test_data_with_entities_100.csv


## Generate Predictions and Store Results

In [None]:
import re

def extract_json_from_response(response):
    """Return the first JSON object that follows the ``[/INST]`` marker.

    Args:
        response: Generated text expected to contain the prompt, the
            ``[/INST]`` marker, and then the model's completion.

    Returns:
        The decoded ``dict``, or ``None`` when the marker is missing, when no
        ``{`` follows it, or when the text starting at the first ``{`` is not
        valid JSON.
    """
    marker = "[/INST]"
    marker_index = response.find(marker)
    # Check the raw find() result. Adding len(marker) before the comparison made
    # the "not found" test unreachable (-1 + 7 == 6), so a response without the
    # marker was silently sliced from index 6 and parsed anyway.
    if marker_index == -1:
        return None

    response_part = response[marker_index + len(marker):].strip()

    brace_index = response_part.find("{")
    if brace_index == -1:
        return None

    try:
        # raw_decode parses exactly one JSON value starting at the brace and
        # ignores any trailing text. Unlike a non-greedy regex, it is not
        # fooled by a '}' inside a string value or by nested objects.
        json_dict, _ = json.JSONDecoder().raw_decode(response_part, brace_index)
    except json.JSONDecodeError:
        return None
    return json_dict

In [None]:
import pandas as pd

# Run the few-shot prompt over every saved test case and collect, per case, the
# input text, the reference entities and the parsed model prediction.
test_cases = pd.read_csv("/content/drive/My Drive/266_project/mistral_7b_data/test_data_with_entities_100.csv")

predictions = []
instructions = []
true_values = []

# Loop-invariant prompt pieces.
system_prompt = prompt_inst
B_INST, E_INST = "[INST]", "[/INST]"

# Build the pipeline once. It depends only on the model and tokenizer, so
# constructing it inside the loop repeated the same setup for every row.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Sampling is enabled below; seed torch's generators so a rerun draws the same
# random numbers (best effort — low-level kernels may still be non-deterministic).
torch.manual_seed(42)

for index, row in test_cases.iterrows():
    chat_sample = row['chat_sample']

    # The input text sits between the "### Instruction:" and "### Response:" headers.
    instruction = chat_sample.split("### Instruction:\n")[1].split("\n### Response:")[0].strip()

    # Everything after the "### Response:" header is the reference JSON.
    true_response = json.loads(chat_sample.split("### Response:\n")[1].strip())

    prompt = f"{system_prompt}{B_INST}{instruction.strip()}\n{E_INST}"

    sequences = pipe(
        prompt,
        do_sample=True,
        max_new_tokens=200,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
    )

    generated_response = sequences[0]['generated_text']

    # None when no parseable JSON object is found in the generated text.
    prediction = extract_json_from_response(generated_response)

    # Append to all three lists together so they stay index-aligned even if an
    # earlier step in this iteration raises.
    instructions.append(instruction)
    true_values.append(true_response)
    predictions.append(prediction)

In [None]:
# Side-by-side table of input text, reference entities and model prediction.
# The column mapping gets its own name instead of rebinding `data`: `data` holds
# the list of dataset records, and overwriting it with this dict made the
# test-set cell (which slices `data[1000:]`) fail when rerun.
prediction_columns = {
    'Instruction': instructions,
    'True Value': true_values,
    'Prediction': predictions
}

df = pd.DataFrame(prediction_columns)
df.head()

In [None]:
# Write the prediction table to Drive as CSV, leaving out the DataFrame index,
# then report where it was saved.
file_path = '/content/drive/My Drive/266_project/mistral_7b_data/few_shot_entity_predictions_100.csv'

df.to_csv(path_or_buf=file_path, index=False)

print(f'CSV file saved at: {file_path}')

CSV file saved at: /content/drive/My Drive/266_project/mistral_7b_data/few_shot_entity_predictions_100.csv
