# MedAlpaca Training

## Initiate config settings

In [33]:
# %cd medalpaca-7b-capstone/

# nvidia-smi
# # sudo kill -9 29431 (29431 is PID in nvidia-smi)
# df -h
# du -h --max-depth=1 /home/jupyter | sort -rh
# rm -rf /home/jupyter/.cache/*

import warnings
warnings.filterwarnings("ignore")

In [34]:
# Config Settings
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, LLaMAForCausalLM, LLaMATokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model

# Setting for A100
MICRO_BATCH_SIZE = 4
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 2 # paper uses 3
LEARNING_RATE = 2e-5  # from the original paper
CUTOFF_LEN = 512
TRAIN_STEPS = 300 # adjust for training time and efficiency
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

## Loading MedAlpaca 7B Model and Tokenizer

In [35]:
from transformers import AutoTokenizer, AutoModelForCausalLM, LLaMATokenizer

model = AutoModelForCausalLM.from_pretrained(
    "medalpaca/medalpaca-7b",
    load_in_8bit=True,
    device_map="auto",
)
tokenizer = LLaMATokenizer.from_pretrained(
    "medalpaca/medalpaca-7b", add_eos_token=True
)

tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LlamaTokenizer'. 
The class this function is called from is 'LLaMATokenizer'.


### Essential methods 

In [36]:
cutoff_len = 512

def generate_prompt(data_point, cutoff_len):
    available_space = cutoff_len - len("### Instruction:\n") - len(data_point["instruction"]) - len("\n### Input:\n") - len("\n### Response:\n")
    
    # Truncate the input text to fit within the available space
    truncated_input = data_point["input"][:available_space]
    
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}"""

    return prompt

def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True, # truncation was set to True, making the response not shown properly when the input is too long
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < cutoff_len
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point, 512)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

## Process Dataset

In [37]:
model = prepare_model_for_int8_training(model)

In [38]:
# Prepare Data
data = load_dataset("json", data_files="CT_InstructionTuning.json")

Found cached dataset json (/home/jupyter/.cache/huggingface/datasets/json/default-6606eaf40512cd75/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [39]:
# Initiate LoraConfig
from peft import LoraConfig, get_peft_model, set_peft_model_state_dict, get_peft_model_state_dict

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

In [40]:
# Create validation dataset 
train_val = data["train"].train_test_split(test_size=300, shuffle=True, seed=42)

train_data = (train_val["train"].shuffle().map(generate_and_tokenize_prompt))
val_data = (train_val["test"].shuffle().map(generate_and_tokenize_prompt))

Loading cached split indices for dataset at /home/jupyter/.cache/huggingface/datasets/json/default-6606eaf40512cd75/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-b170efa1078d55ae.arrow and /home/jupyter/.cache/huggingface/datasets/json/default-6606eaf40512cd75/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-768a7e750b79d253.arrow


Map:   0%|          | 0/7700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [43]:
max_len = 0
max_id = 0
for i in range(7700):
    length = len(train_data[i]['input_ids'])
    if max_len < length:
        max_len = length
        max_id = i
print(max_id)
print(max_len)

# print('\n',train_data[max_id]['input'])
# print('\n',train_data[max_id]['output'])

print(generate_prompt(train_data[max_id], 512))

0
512
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Generate impression based on clinical information and findings.
### Input:
Status post type a aortic dissection repair history: status post repair. Interval development of large left pleural effusion. Moderate to severe upper lobe predominant emphysema. Right lung base subpleural bullae. Interval repair of ascending aortic aneurysm. Interval aneurysmal dilation of the descending aorta. At the level of the carina the descending aorta measures 4.9 x 5.7 cm , previously 3.5 x 3.9 cm. At the level of the diaphragmatic hiatus the descending aorta measures 4.3 x 4.7 cm , previously 3.7 x 4.1 cm. There is slow enhancement of the false lumen indicating a fenestration of the dissection flap. Right carotid stent in place. Dilated pulmonary artery compatible with pulmonary hypertension. Subcarinal lymph node measuring 

In [9]:
model.print_trainable_parameters()

trainable params: 4194304 || all params: 6742618112 || trainable%: 0.06220586618327525


In [10]:
# Initiate Training with Lora Config
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        max_steps = TRAIN_STEPS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=50,
        save_steps=50,
        output_dir="medalpaca_results",
        save_total_limit=3,
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        )
)
model.config.use_cache = False

old_state_dict = model.state_dict
state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(
        self, old_state_dict()
    )
).__get__(model, type(model))()

set_peft_model_state_dict(model, state_dict)

trainer.train()

Step,Training Loss,Validation Loss
50,2.6346,2.622983
100,2.3284,2.276741
150,1.966,1.936463
200,1.7954,1.784597
250,1.8061,1.741016
300,1.7158,1.724318


TrainOutput(global_step=300, training_loss=2.100595785776774, metrics={'train_runtime': 4153.9806, 'train_samples_per_second': 4.622, 'train_steps_per_second': 0.072, 'total_flos': 3.609343397666488e+17, 'train_loss': 2.100595785776774, 'epoch': 2.5})

In [11]:
# save the trained model
model.save_pretrained("medalpaca_7b_capstone")

# Save model to huggingface page

In [12]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
model.push_to_hub("Danieljyc/medalpaca_7b_capstone", use_auth_token=True)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Danieljyc/medalpaca_7b_capstone/commit/d8cde8c50ad1a300c609cb9d67981af568826cba', commit_message='Upload model', commit_description='', oid='d8cde8c50ad1a300c609cb9d67981af568826cba', pr_url=None, pr_revision=None, pr_num=None)

## Evaluation

In [14]:
from rouge import Rouge
import nltk

In [44]:
from peft import PeftModel
from transformers import LLaMATokenizer, LLaMAForCausalLM, GenerationConfig
tokenizer = LLaMATokenizer.from_pretrained("medalpaca/medalpaca-7b")

model = LLaMAForCausalLM.from_pretrained(
    "medalpaca/medalpaca-7b",
    load_in_8bit=True,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, "Danieljyc/medalpaca_7b_capstone")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LlamaTokenizer'. 
The class this function is called from is 'LLaMATokenizer'.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [45]:
# config generation settings
generation_config = GenerationConfig(
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.15,
)

In [47]:
generated_impressions = []
original_impressions = []
generated_counts = 0

for data_point in val_data:
    inputs = generate_and_tokenize_prompt(data_point)
    
    input_ids = torch.tensor(inputs["input_ids"]).unsqueeze(0)
    generation_output = model.generate(
        input_ids=input_ids.to('cuda'),
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=512,
    )
    generated_counts+=1
    
    for s in generation_output.sequences:
        impression_tokens = tokenizer.decode(s)
    
    if generated_counts % 50 == 0:
        print(f'Generating... Generated {generated_counts} impressions.')
    
    generated_impression = impression_tokens.split("### Response:")[-1].split("</s>")[0].strip()
    
    generated_impressions.append(generated_impression)
    original_impressions.append(data_point['output'])

Generating... Generated 50 impressions.
Generating... Generated 100 impressions.
Generating... Generated 150 impressions.
Generating... Generated 200 impressions.
Generating... Generated 250 impressions.
Generating... Generated 300 impressions.


### ROUGE Score

In [48]:
import nltk
import rouge_score

In [49]:
from datasets import load_metric
rouge_result = load_metric('rouge')

Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [50]:
rouge_result.compute(predictions= generated_impressions, references= original_impressions)

{'rouge1': AggregateScore(low=Score(precision=0.8718436880237352, recall=0.9030677166912301, fmeasure=0.860219247125902), mid=Score(precision=0.903166478217011, recall=0.9241705214603966, fmeasure=0.8901430408063447), high=Score(precision=0.9322995823400029, recall=0.944964532202922, fmeasure=0.9192132211433116)),
 'rouge2': AggregateScore(low=Score(precision=0.859028023338243, recall=0.8585548059326111, fmeasure=0.8439612262686172), mid=Score(precision=0.8913050020031232, recall=0.8890489431961077, fmeasure=0.8769190538164033), high=Score(precision=0.9241021865237158, recall=0.9168513901057519, fmeasure=0.9091053080124466)),
 'rougeL': AggregateScore(low=Score(precision=0.8649746303132518, recall=0.876137503508571, fmeasure=0.8486046824932391), mid=Score(precision=0.8975516146484162, recall=0.9029343766983999, fmeasure=0.88307768766514), high=Score(precision=0.9288085755338452, recall=0.9287765408860261, fmeasure=0.9150868265775309)),
 'rougeLsum': AggregateScore(low=Score(precision=0

### BLEU Score

In [51]:
bleu_result = load_metric('bleu')

Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

In [88]:
import re
split_generated_impressions = [re.split(r'\s+|-', text) for text in generated_impressions]
list_of_split_generated_impressions = [list(inner_array) for inner_array in split_generated_impressions]

from nltk.tokenize import sent_tokenize, word_tokenize
tokenized_sentences = [sent_tokenize(paragraph) for paragraph in original_impressions]
split_original_impressions = [[word_tokenize(sentence) for sentence in paragraph] for paragraph in tokenized_sentences]

In [90]:
bleu_result.compute(predictions= list_of_split_generated_impressions, references= split_original_impressions)

{'bleu': 0.3100091749178359,
 'precisions': [0.38318009734991887,
  0.34040681693238045,
  0.28988261598658466,
  0.24427263941788416],
 'brevity_penalty': 1.0,
 'length_ratio': 7.1252408477842,
 'translation_length': 18490,
 'reference_length': 2595}

## other codes

In [46]:
id = 254

inputs = generate_and_tokenize_prompt(val_data[id])
input_ids = torch.tensor(inputs["input_ids"]).unsqueeze(0)
generation_output = model.generate(
        input_ids=input_ids.to('cuda'),
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
for s in generation_output.sequences:
    print(tokenizer.decode(s))
    
print('\n',val_data[id]['output'])

  Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Generate impression based on clinical information and findings.
### Input:
persistent headache and neck pain, dizziness. History of gunshot wound with residual shrapnel to t4/t5. CervicalThe cervical vertebral bodies are appropriate height. Alignment is maintained. No fractures are identified in the cervical spine. No destructive osseous lesions are identified in the cervical spine. Nonspecific lytic lesion involving the right upper aspect of the C4 vertebral body as well as the posterior aspect of the T1 vertebral body. There is also a sclerotic focus involving the T2 vertebral body. Small osseous protuberance along the left inferior facet of C6 may represent a tiny osteochondroma. C2-3: No significant compromise to the spinal canal or neural foramina. No significant compromise to the spinal canal or neural for

In [32]:
print(generate_prompt(val_data[254], 512))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Generate impression based on clinical information and findings.
### Input:
 There is transitional lumbosacral anatomy, with partial lumbarization of the S1 vertebra. For the purposes of this exam, the last fully formed disc is at S1-S2 with last rib-bearing vertebra designated as T12. Postoperative changes are seen from posterior surgical fusion of L5 and S1, with bilateral pedicle screws and interconnecting rods. There is no evidence of instrumentation complication. There is no cortical breach. There is evidence of left facetectomy at the L5-S1 level. Interbody spacer is present with associated bone graft material. Streak artifact from the instrumentation limits evaluation of surrounding structures. There are a few foci of air scattered in the subcutaneous soft tissues as well as a few in the left ventral epidura

In [188]:
input_ids = torch.tensor(val_data[1]['input_ids']).unsqueeze(0)
generation_output = model.generate(
        input_ids=input_ids.to('cuda'),
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=512,
    )
for s in generation_output.sequences:
    print(tokenizer.decode(s))

  Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
            ### Instruction:
            Generate impression based on clinical information and findings.
            ### Input:
            Esophageal cancer status post chemo rads on chemo, creatine kinase response. Scattered pulmonary micronodules, some of which are calcified, indicating healed granulomatous disease. No suspicious pulmonary nodule or mass. No focal consolidation or pleural effusion. Again seen is bulky, heterogeneous, eccentric thickening of the distal esophagus, that appears to invade the gastric wall as well, and appears to have increased in size from the most recent prior as well as significantly when compared to the original study. This area measures approximately 33 mm in diameter, previously 26 mm, and before that 19 mm. Right chest port tip at the cavoatrial junction. The heart size is within normal lim