## Notebook Setup

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write project files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import os
import pandas as pd
import json
import csv
from random import sample

In [None]:
# Install the fine-tuning stack into the Colab runtime.
# NOTE(review): most packages are unpinned and three of them track the GitHub main
# branch, so a later re-run may resolve to different versions than the recorded run.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece

# torch is pinned; the companion packages are upgraded afterwards without a pin.
!pip install -q torch==2.2.2
!pip install -q -U torchaudio torchtext torchvision

# NOTE(review): transformers was already installed from GitHub above, so this line
# looks redundant — confirm before removing.
!pip install transformers
!pip install datasets==2.16.0 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━

In [None]:
import torch
import torchaudio
import torchtext
import torchvision

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb, platform, gradio, warnings

from datasets import load_dataset

from trl import SFTTrainer
from huggingface_hub import notebook_login

import wandb

# Report the installed version of each torch-family package, one per line.
for _lib in (torch, torchaudio, torchtext, torchvision):
    print(f"{_lib.__name__}: {_lib.__version__}")

torch: 2.2.2+cu121
torchaudio: 2.2.2+cu121
torchtext: 0.17.2+cpu
torchvision: 0.17.2+cu121


## Data Preprocessing

In [None]:
# Read the JSONL corpus (one JSON object per line) into a list of records.
# The path is absolute, like the other Drive paths in this notebook, so the cell
# does not depend on the kernel's current working directory.
output_jsonl_file_path = '/content/drive/MyDrive/266_project/json_data/main_data.jsonl'

with open(output_jsonl_file_path, 'r', encoding='utf-8') as file:
    # Skip blank lines (e.g. a trailing newline) instead of failing inside json.loads.
    data = [json.loads(line) for line in file if line.strip()]

len(data)

49877

In [None]:
# Instruction text that is prepended to every training/test sample. It lists the
# nine entity labels the model must return and the output rules (JSON, verbatim
# spans, an empty list when a label has no entities).
prompt = """
Extract the entities for the specified labels from the given medical text and provide the results in JSON format
- Entities must be extracted precisely as they appear in the text.
- Return each entity under its label without creating new labels.
- Provide a list of entities for each label. If no entities are found for a label, return an empty list.
- Prioritize accuracy and relevance in the identification of entities.

Here are the entity labels and their descriptions:
1. Drug: Extract any mentioned medications or drugs.
2. Duration: Extract the duration of treatment or medication usage.
3. Dosage: Extract dosages related to medications, including units.
4. Frequency: Extract how often the medication or treatment is to be taken or administered.
5. Strength: Extract the concentration or potency of the medication.
6. Form: Extract the form in which the medication is to be used.
7. Route: Extract the method of administration for a medication.
8. Reason: Extract the reason or condition the medication is prescribed for.
9. ADE: Extract adverse drug events or side effects mentioned.

Make sure to go through the text carefully and extract all entities mentioned above if they are present. Do not create fictitious data.
"""

# Drop the leading/trailing newline that the triple-quoted literal introduces.
instruction_value = prompt.strip()

In [None]:
# Tag every record (in place) with the shared extraction instruction.
for record in data:
    record.update({"Original_INSTRUCTION": instruction_value})

len(data)

49877

## Create Train Set

In [None]:
import pandas as pd
import json
import csv
from random import sample

# Training split: the first 1,000 records (or fewer if the corpus is smaller).
training_data = data[:1000]

def convert_to_csv(json_objects):
    """Turn records into two-column CSV rows ``[chat_sample, 'ner_data']``.

    Each ``chat_sample`` is the record's instruction, then the source text under
    an ``### Instruction:`` header, then the gold entities (JSON, 2-space indent)
    under a ``### Response:`` header.

    Args:
        json_objects: Iterable of dicts with the keys ``Original_INSTRUCTION``,
            ``text`` and ``entities``.

    Returns:
        A list with one ``[chat_sample, 'ner_data']`` row per input record.
    """
    return [
        [
            f"{record['Original_INSTRUCTION']}\n\n"
            f"### Instruction:\n{record['text']}\n\n"
            f"### Response:\n{json.dumps(record['entities'], indent=2)}\n",
            'ner_data',
        ]
        for record in json_objects
    ]

# Persist the rendered training rows to Drive as a two-column CSV (header first).
training_csv_file_path = '/content/drive/My Drive/266_project/mistral_7b_data/train_data.csv'
with open(training_csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    train_writer = csv.writer(csv_file)
    train_writer.writerow(['chat_sample', 'source'])
    train_writer.writerows(convert_to_csv(training_data))

training_csv_file_path

## Mistral 7b Training

In [None]:
# Model identifier passed to from_pretrained for both the model and the tokenizer.
base_model = "mistralai/Mistral-7B-v0.1"

In [None]:
# Load the training CSV written above; its rows are accessed under the 'train' split.
dataset = load_dataset('csv', data_files=training_csv_file_path)

first_example = dataset['train'][0]
print(first_example)

Generating train split: 0 examples [00:00, ? examples/s]

{'chat_sample': 'Extract the entities for the specified labels from the given medical text and provide the results in JSON format\n- Entities must be extracted precisely as they appear in the text.\n- Return each entity under its label without creating new labels.\n- Provide a list of entities for each label. If no entities are found for a label, return an empty list.\n- Prioritize accuracy and relevance in the identification of entities.\n\nHere are the entity labels and their descriptions:\n1. Drug: Extract any mentioned medications or drugs.\n2. Duration: Extract the duration of treatment or medication usage.\n3. Dosage: Extract dosages related to medications, including units.\n4. Frequency: Extract how often the medication or treatment is to be taken or administered.\n5. Strength: Extract the concentration or potency of the medication.\n6. Form: Extract the form in which the medication is to be used.\n7. Route: Extract the method of administration for a medication.\n8. Reason: Extr

In [None]:
len(dataset['train'])

1000

In [None]:
# 4-bit NF4 quantization settings used when loading the base model.
# NOTE(review): the compute dtype is bfloat16 here while TrainingArguments later
# sets fp16=True — confirm that mix is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the quantized base model; device_map maps the root module to device 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
)

# The generation cache is switched off for training and re-enabled after trainer.train().
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama config field — presumably a no-op
# for Mistral; confirm.
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
# Pad with <unk> rather than EOS. The default language-modeling collator masks every
# label equal to pad_token_id, so with pad == EOS the appended EOS is never trained
# on and the fine-tuned model does not learn where to stop generating.
tokenizer.pad_token = tokenizer.unk_token
# Append EOS to every training sample so the end of the JSON response is learnable.
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

(True, True)

In [None]:
# LoRA adapter settings: rank 16, alpha 16, 5% dropout, no bias terms, applied to
# the attention projections (q/k/v/o) and the MLP gate projection.
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]
)

# Prepare the quantized model for training, then wrap it with the LoRA adapters.
# NOTE(review): the same peft_config is also passed to SFTTrainer below although
# the model is already wrapped here — confirm the adapters are not applied twice.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [None]:
# Authenticate with Weights & Biases without embedding the API key in the notebook.
# wandb.login() picks the key up from the WANDB_API_KEY environment variable or an
# existing login, and otherwise prompts for it interactively.
# SECURITY: an API key was previously hard-coded on this line — revoke and rotate it.
wandb.login()

wandb.init(project='Finetuned Mistral 7B NER', job_type="training", anonymous="allow")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mashoksun01[0m ([33mmids-berkeley[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Training hyperparameters. Effective batch size is 4 x 2 = 8 sequences per
# optimizer step per device.
training_arguments = TrainingArguments(
    output_dir="/content/drive/My Drive/266_project/mistral_7b_data/mistral_7b_results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_8bit",
    # NOTE(review): the logged run ends at global_step 375, below this interval, so
    # presumably no intermediate checkpoint is ever written — confirm intended.
    save_steps=1000,
    logging_steps=30,
    learning_rate=2e-4,
    weight_decay=0.001,
    # NOTE(review): fp16 here vs. bfloat16 compute dtype in the quantization config.
    fp16=True,
    max_grad_norm= 0.3,
    max_steps= -1,
    # NOTE(review): with lr_scheduler_type="constant" the logged learning rate stays
    # flat at 2e-4, so warmup_ratio appears to have no effect; "constant_with_warmup"
    # would be needed if a warmup phase is wanted.
    warmup_ratio= 0.3,
    group_by_length= True,
    lr_scheduler_type= "constant",
    report_to="wandb",
)

# Supervised fine-tuning on the 'chat_sample' text column, one sample per sequence
# (packing disabled).
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="chat_sample",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Release cached GPU memory before training starts.
torch.cuda.empty_cache()
trainer.train()
# Close the W&B run opened by wandb.init.
wandb.finish()
# Back to inference settings: re-enable the cache that was disabled before training
# and switch the model to eval mode (the cell displays the model repr as its output).
model.config.use_cache = True
model.eval()



Step,Training Loss
30,0.3235
60,0.1916
90,0.1718
120,0.1303
150,0.1135
180,0.1184
210,0.1117
240,0.1156
270,0.0758
300,0.0821


VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▂▂▃▃▄▅▅▆▆▇██
train/global_step,▁▂▂▃▃▄▅▅▆▆▇██
train/grad_norm,▃▁█▃▄▃▄▅▅▆▄▆
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▄▄▃▂▂▂▂▁▁▁▁

0,1
total_flos,5.223687157825536e+16
train/epoch,3.0
train/global_step,375.0
train/grad_norm,0.22227
train/learning_rate,0.0002
train/loss,0.0852
train_loss,0.12947
train_runtime,1375.4711
train_samples_per_second,2.181
train_steps_per_second,0.273


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer):

In [None]:
model_save_path = '/content/drive/My Drive/266_project/official_mistral_7b'

# Create the output directory on first use; exist_ok guards against the directory
# appearing between the check and the creation.
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path, exist_ok=True)
    print(f"Directory {model_save_path} created")

tokenizer.save_pretrained(model_save_path)

# Unwrap a DataParallel-style wrapper if one is present.
model_to_save = model.module if hasattr(model, 'module') else model

# Save the LoRA adapter in PEFT's own format so it can be re-attached to the base
# model with PeftModel.from_pretrained; a raw state_dict alone cannot be loaded
# through from_pretrained.
model_to_save.save_pretrained(model_save_path)

# Raw state_dict kept for backward compatibility with anything that reads
# pytorch_model.bin directly.
torch.save(model_to_save.state_dict(), os.path.join(model_save_path, 'pytorch_model.bin'))

model.config.to_json_file(os.path.join(model_save_path, 'config.json'))

print(f"Model and tokenizer have been manually saved to {model_save_path}")

Directory /content/drive/My Drive/266_project/official_mistral_7b created
Model and tokenizer have been manually saved to /content/drive/My Drive/266_project/official_mistral_7b


## Model Inference


In [None]:
# Instruction text used at inference time. It matches the training instruction; the
# stray literal "/n" (a mistyped newline escape) that used to sit on the last line
# has been removed so those two characters are no longer fed to the model.
prompt_inst = """
Extract the entities for the specified labels from the given medical text and provide the results in JSON format
- Entities must be extracted precisely as they appear in the text.
- Return each entity under its label without creating new labels.
- Provide a list of entities for each label. If no entities are found for a label, return an empty list.
- Prioritize accuracy and relevance in the identification of entities.

Here are the entity labels and their descriptions:
1. Drug: Extract any mentioned medications or drugs.
2. Duration: Extract the duration of treatment or medication usage.
3. Dosage: Extract dosages related to medications, including units.
4. Frequency: Extract how often the medication or treatment is to be taken or administered.
5. Strength: Extract the concentration or potency of the medication.
6. Form: Extract the form in which the medication is to be used.
7. Route: Extract the method of administration for a medication.
8. Reason: Extract the reason or condition the medication is prescribed for.
9. ADE: Extract adverse drug events or side effects mentioned.

Make sure to go through the text carefully and extract all entities mentioned above if they are present. Do not create fictitious data.
"""

In [None]:
def stream(user_prompt, max_new_tokens=150):
    """Generate the model's entity JSON for one text, streaming tokens to stdout.

    The prompt mirrors the template the model was fine-tuned on (instruction,
    then the text under ``### Instruction:``, then an open ``### Response:``)
    instead of the ``[INST]`` wrapper, which never appears in the training data.

    Args:
        user_prompt: Raw medical text to extract entities from.
        max_new_tokens: Upper bound on generated tokens; the default keeps the
            previously hard-coded budget of 150.

    Returns:
        The full decoded sequence (prompt followed by the generated continuation)
        with special tokens removed.
    """
    prompt = (
        f"{prompt_inst.strip()}\n\n"
        f"### Instruction:\n{user_prompt.strip()}\n\n"
        f"### Response:\n"
    )

    # Follow the model's device rather than assuming "cuda:0".
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # The tokenizer is configured with add_eos_token=True for training. An EOS at
    # the end of the prompt marks the sequence as finished, so drop it before
    # generating.
    if inputs["input_ids"][0, -1].item() == tokenizer.eos_token_id:
        inputs = {name: tensor[:, :-1] for name, tensor in inputs.items()}

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    output_ids = model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
stream("""125 mg of Tylenol daily""")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.




### Instruction:
[ * * Hospital1 * * ]

### Response:
{
 "Drug": [
   "Tylenol"
 ],
 "Duration": [],
 "Dosage": [],
 "Frequency": [
   "daily"
 ],
 "Strength": [
   "125 mg"
 ],
 "Form": [],
 "Route": [],
 "Reason": [],
 "ADE": []
}

### Response:
{
 "Drug": [
   "Tylenol"
 ],
 "Duration": [],
 "Dos


'\nExtract the entities for the specified labels from the given medical text and provide the results in JSON format\n- Entities must be extracted precisely as they appear in the text.\n- Return each entity under its label without creating new labels.\n- Provide a list of entities for each label. If no entities are found for a label, return an empty list.\n- Prioritize accuracy and relevance in the identification of entities.\n\nHere are the entity labels and their descriptions:\n1. Drug: Extract any mentioned medications or drugs.\n2. Duration: Extract the duration of treatment or medication usage.\n3. Dosage: Extract dosages related to medications, including units.\n4. Frequency: Extract how often the medication or treatment is to be taken or administered.\n5. Strength: Extract the concentration or potency of the medication.\n6. Form: Extract the form in which the medication is to be used.\n7. Route: Extract the method of administration for a medication.\n8. Reason: Extract the reason

## Create Test Set

In [None]:
import random

# Candidate pool: records after the first 1,000 (the training slice) that have at
# least one non-empty entity list.
data_with_entities = [obj for obj in data[1000:] if any(obj['entities'].values())]

# Draw from a dedicated, seeded generator so that re-running the cell regenerates
# the same test set instead of overwriting the CSV with a different sample. The
# global random state is left untouched.
TEST_SAMPLE_SEED = 42
sample_size = min(100, len(data_with_entities))
test_data_with_entities = random.Random(TEST_SAMPLE_SEED).sample(data_with_entities, sample_size)

def convert_to_csv(json_objects):
    """Build ``[chat_sample, 'ner_data']`` rows from records.

    Same contract as the helper of the same name defined in the train-set cell:
    each sample is the instruction, the text under ``### Instruction:`` and the
    JSON-encoded entities (2-space indent) under ``### Response:``.

    Args:
        json_objects: Iterable of dicts with the keys ``Original_INSTRUCTION``,
            ``text`` and ``entities``.

    Returns:
        A list with one two-element row per record.
    """
    def _as_row(record):
        sample_text = (
            f"{record['Original_INSTRUCTION']}\n\n"
            f"### Instruction:\n{record['text']}\n\n"
            f"### Response:\n{json.dumps(record['entities'], indent=2)}\n"
        )
        return [sample_text, 'ner_data']

    return list(map(_as_row, json_objects))

# Persist the sampled test rows to Drive as a two-column CSV (header first).
test_csv_file_path_with_entities = '/content/drive/My Drive/266_project/mistral_7b_data/test_data_with_entities_100.csv'
with open(test_csv_file_path_with_entities, 'w', newline='', encoding='utf-8') as csv_file:
    test_writer = csv.writer(csv_file)
    test_writer.writerow(['chat_sample', 'source'])
    test_writer.writerows(convert_to_csv(test_data_with_entities))

print(f'Test CSV file with entities saved at: {test_csv_file_path_with_entities}')

Test CSV file with entities saved at: /content/drive/My Drive/266_project/mistral_7b_data/test_data_with_entities_100.csv


## Generate Predictions and Store Results

In [None]:
import pandas as pd

# Reload the persisted test rows and collect, per sample, the input text, the
# model's parsed prediction and the gold entities.
test_cases = pd.read_csv("/content/drive/My Drive/266_project/mistral_7b_data/test_data_with_entities_100.csv")

predictions = []
instructions = []
true_values = []

for chat_sample in test_cases['chat_sample']:
    # Source text sits between the "### Instruction:" and "### Response:" markers.
    instruction = chat_sample.split("### Instruction:\n")[1].split("\n### Response:")[0].strip()
    instructions.append(instruction)

    # NOTE(review): extract_json_from_response is not defined in the visible part of
    # this notebook — it must be defined before this cell runs.
    prediction = extract_json_from_response(stream(instruction))
    predictions.append(prediction)

    # Gold entities are the JSON that follows the "### Response:" marker.
    gold_json = chat_sample.split("### Response:\n")[1].strip()
    true_values.append(json.loads(gold_json))

In [None]:
# Collect the evaluation columns in one frame. The dict is passed inline instead of
# being bound to `data`, because `data` already names the list of corpus records
# loaded at the top of the notebook; rebinding it to a dict breaks the earlier
# cells that slice `data` when they are re-run.
df = pd.DataFrame({
    'Instruction': instructions,
    'True Value': true_values,
    'Prediction': predictions,
})

In [None]:
# Write the prediction table to Drive without the index column.
file_path = '/content/drive/My Drive/266_project/mistral_7b_data/entity_predictions_100.csv'

df.to_csv(path_or_buf=file_path, index=False)

print(f'CSV file saved at: {file_path}')

CSV file saved at: /content/drive/My Drive/266_project/mistral_7b_data/entity_predictions_100.csv
