In [None]:


!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/244.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [None]:
import os

import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer


In [None]:
# Base checkpoint on the Hugging Face hub; read by both the tokenizer and the
# model loaders further down
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Hub dataset id handed to load_dataset(..., split="train")
dataset_name = "mlabonne/guanaco-llama2-1k"

# Name under which the fine-tuned model would be saved.
# Only referenced by the commented-out save_pretrained call in the training cell.
new_model = "Llama-2-7b-chat-finetune"

################################################################################
# QLoRA parameters (forwarded to LoraConfig in the training cell)
################################################################################

# LoRA attention dimension; passed to LoraConfig as `r`
lora_r = 64

# Alpha parameter for LoRA scaling; passed to LoraConfig as `lora_alpha`
lora_alpha = 16

# Dropout probability for LoRA layers; passed to LoraConfig as `lora_dropout`
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters (forwarded to BitsAndBytesConfig in the training cell)
################################################################################

# Activate 4-bit precision base model loading; passed as `load_in_4bit`
use_4bit = True

# Compute dtype for 4-bit base models.
# Kept as a string and resolved later with getattr(torch, ...), so it must be
# the name of a torch dtype attribute.
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters (forwarded to TrainingArguments in the training
# cell unless noted otherwise)
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
# NOTE(review): never passed to TrainingArguments in the training cell, so this
# value currently has no effect.
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
# NOTE(review): never passed to TrainingArguments in the training cell, so this
# value currently has no effect.
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4


# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save a checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters (forwarded to SFTTrainer in the training cell)
################################################################################

# Maximum sequence length to use; None is forwarded to SFTTrainer unchanged
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0 (passed as `device_map` to from_pretrained)
device_map = {"": 0}
# Tokenizer of the base checkpoint. The EOS token is reused as the padding token
# and padding goes on the right (workaround for an overflow issue with fp16 training).
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token, tokenizer.padding_side = tokenizer.eos_token, "right"


Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Fetch the train split of the hub dataset and print its summary
# (feature names and row count)
dataset = load_dataset(path=dataset_name, split="train")
print(dataset)

Downloading readme:   0%|          | 0.00/782 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 1000
})


In [None]:
import pandas as pd

# Location of the MeQSum spreadsheet (absolute path under /content/drive)
meqsum_path = r'/content/drive/MyDrive/bart/MeQSum_ACL2019_BenAbacha_Demner-Fushman.xlsx'

# Read the sheet and keep only the CHQ text and its Summary
dataset = pd.read_excel(meqsum_path).loc[:, ['CHQ', 'Summary']]


In [None]:
from sklearn.model_selection import train_test_split

# 80% train; the remaining 20% is halved into test and validation.
# The fixed random_state makes both splits repeatable.
train_data, temp = train_test_split(dataset, test_size=0.2, random_state=42)
test_data, val_data = train_test_split(temp, test_size=0.5, random_state=42)

# Renumber the rows of every split from 0
train_data, val_data, test_data = (
    split.reset_index(drop=True) for split in (train_data, val_data, test_data)
)


In [None]:
# Display the test split produced by the second train_test_split call
test_data

Unnamed: 0,CHQ,Summary
0,Im suffering from Beta Thalassemia Major from ...,Is mylostat 500 a treatment for Beta Thalassem...
1,SUBJECT: loss of taste\nMESSAGE: i have gia...,What causes loss of taste and is it related t...
2,SUBJECT: shingles\nMESSAGE: I am having sympto...,I am having symptoms of shingles; is it too la...
3,SUBJECT: Rubella\nMESSAGE: My grandson (4 yrs ...,What is the prognosis of rubella in a child?
4,SUBJECT: Ochoa syndrome\nMESSAGE: Where could ...,Where can I find information on ochoa syndrome?
...,...,...
95,SUBJECT: ClinicalTrials.gov - Question - gener...,What are the treatments for ocular albinism?
96,Vitamins.\n Can I take zinc and zinc citrate a...,Can zinc and zinc citrate be taken together?
97,SUBJECT: thalamic lacunar strokes\nMESSAGE: I ...,Where can I find information on thalamic lacun...
98,SUBJECT: ingredient in apremilast.\nMESSAGE: I...,What are the ingredients in apremilast?


In [None]:
# Print the full, untruncated CHQ text of every test row, one after another
for question in test_data['CHQ']:
    print(question)

Im suffering from Beta Thalassemia Major from birth. I have been transfusing every 22 days. Now i heard about a tablet named mylostat 500 (Hydroxyurea). I wanted to know wheather the same will be suited for me.
SUBJECT: loss of  taste
MESSAGE: i  have  giant  cell arteritis.  on  steroids.  lost  taste  .  is  this  related  to  the  illness  (  was  it  ever  reported  in  GCA  ?  )  or  side  rx  to  prednisone.  i   wish  a  rheumatologist  at  NIH  responds
SUBJECT: shingles
MESSAGE: I am having symptoms of shingles, no rash or blisters, is it too late to get the vaccine? I have had the chicken pox and take acylovir on a as needed basis for blisters I get on my buttocks
SUBJECT: Rubella
MESSAGE: My grandson (4 yrs old) has contracted Rubella.  I know for sure he has had at least one of the vaccines.  They are living in Mexico right now.  Should we be worried?
SUBJECT: Ochoa syndrome
MESSAGE: Where could I find more information about the rare disease, Ochoa syndrome. I think a sibli

In [None]:
def func(x):
    """Return ``x`` lower-cased; used to normalise the training text."""
    return x.lower()


# Build the SFT text from the training split only, so the validation and test
# rows stay held out. The newline keeps the last word of the question from
# fusing with the "summary:" marker.
train_dataset = pd.DataFrame()
train_dataset['text'] = train_data['CHQ'] + "\nsummary: " + train_data['Summary']
train_dataset['text'] = train_dataset['text'].map(func)

# SFTTrainer is given `dataset`, so hand it the MeQSum text as a Hugging Face
# Dataset instead of reloading the guanaco corpus over it. This also keeps the
# cell re-runnable, because its input (`train_data`) is no longer overwritten.
dataset = Dataset.from_pandas(train_dataset, preserve_index=False)
print(train_dataset)

                                                  text
0    subject: who and where to get cetirizine - d\n...
1    who makes bromocriptine\ni am wondering what c...
2    subject: nulytely\nmessage: hello can you tell...
3    williams' syndrome\ni would like to have my da...
4    clinicaltrials.gov - question - general inform...
..                                                 ...
995  subject: after surgery of ear drum still same ...
996  subject: clinicaltrials.gov - question - speci...
997  message: i have numbness/tingling in my lower ...
998  subject: sleep apnea\nmessage: i was diagnosed...
999  subject: home resources for amd people\nmessag...

[1000 rows x 1 columns]


In [None]:


# Resolve the configured dtype name (e.g. "float16") to the torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings applied when the base model is loaded
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Advisory only: with 4-bit loading and float16 compute, point out that a GPU
# whose major compute capability is 8 or higher could run with bf16 instead
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        banner = "=" * 80
        print(
            banner,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            banner,
            sep="\n",
        )

# Base model, quantized per bnb_config and placed according to device_map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
# Config overrides applied before training: cache off, pretraining_tp = 1
model.config.use_cache, model.config.pretraining_tp = False, 1

# Tokenizer of the same checkpoint. EOS is reused as the padding token and
# padding goes on the right (workaround for an overflow issue with fp16 training).
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token, tokenizer.padding_side = tokenizer.eos_token, "right"

# LoRA adapter settings for a causal-LM task, taken from the QLoRA parameters
# defined in the configuration cell
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

# Trainer hyper-parameters, all read from the configuration cell
training_arguments = TrainingArguments(
    # where results go and how often to save / log
    output_dir=output_dir,
    save_steps=save_steps,
    logging_steps=logging_steps,
    report_to="tensorboard",
    # run length and batching
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # optimizer and schedule
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    # mixed precision switches
    fp16=fp16,
    bf16=bf16,
)

# Supervised fine-tuning trainer: combines the loaded model, the LoRA config and
# the training arguments, and reads training text from the "text" field of `dataset`
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)

# Training and saving are currently disabled; uncomment to run them.
# trainer.train()
# trainer.model.save_pretrained(new_model)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



AttributeError: ignored

In [None]:
path='/content/drive/MyDrive/vocabulary.txt'

# One vocabulary entry per line; surrounding whitespace (including the trailing
# newline) is stripped from each entry
with open(path, "r") as file:
    vocabulary = [line.strip() for line in file]

In [None]:
# Lower-case the vocabulary so it lines up with the lower-cased text it is matched against
vocabulary = [item.lower() for item in vocabulary]

# Everyday words that must not be reported as clinical phrases
item_to_remove = ['supply','who','costs','price','cost','prescription','daughter','female','parents','health','mother','farther','family','texas','today','sister','brother','military','father','son','time','use','work','woman','man','male','lunch','dinner']

# Drop EVERY occurrence of an excluded word. list.remove() deletes only the first
# match, so an excluded word listed more than once in the vocabulary file would
# otherwise survive the filter.
excluded_terms = set(item_to_remove)
vocabulary = [word for word in vocabulary if word not in excluded_terms]
clinical_dict = {"Clinical Phrase": vocabulary}

In [None]:
import re
from nltk.util import ngrams
from nltk import word_tokenize
import nltk
nltk.download('punkt')


def match_clinical_phrases(text, phrases, max_n=3):
    """Find the vocabulary phrases that occur in ``text`` as word n-grams.

    Args:
        text: Raw text to search. It is lower-cased and tokenised with
            ``word_tokenize`` before matching.
        phrases: Candidate phrases; a multi-word phrase matches when it equals
            the tokens of an n-gram joined by single spaces.
        max_n: Largest n-gram size to test; every size from 1 to ``max_n`` is tried.

    Returns:
        A list with ``max_n`` entries. Entry ``n - 1`` holds the lower-cased
        phrases equal to some n-gram of ``text``, in the order they appear in
        ``phrases`` (duplicates in ``phrases`` are kept).
    """
    tokens = word_tokenize(text.lower())
    lowered_phrases = [phrase.lower() for phrase in phrases]
    matches_by_size = []
    for size in range(1, max_n + 1):
        # A set makes each membership test constant-time, so the vocabulary is
        # scanned once per n-gram size instead of once per (phrase, n-gram) pair.
        text_ngrams = {' '.join(gram) for gram in ngrams(tokens, size)}
        matches_by_size.append(
            [phrase for phrase in lowered_phrases if phrase in text_ngrams]
        )
    return matches_by_size


clinical=[]
text_list=['SUBJECT: diabetes MESSAGE: It was really helpful after reading about type 1 diabetes but would like some more suggestions from you .my friend is 35 year old and its been 1 year of his diabetes n his sugar level is around 100 or 120 OR SO.. each time he tests his sugar level. is there ANY COMPLICATIONS? HE ALSO GO FOR EXERCISES AND BLOOD TEST EVERY 2 OR 3 MONTHS N TAKES A PROPER DIET LIKE GREEN JUICE,OLIVE OIL AND ALL. WAITING FOR YOUR FEED BACK. THANK YOU.']

for text in text_list:
    # Appends three lists per text: the 1-, 2- and 3-word matches, in that order
    clinical.extend(match_clinical_phrases(text, clinical_dict['Clinical Phrase']))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Show the matched phrases: one inner list per n-gram size (1-, 2-, then 3-word matches)
clinical

[['blood',
  'suggestions',
  'back',
  'reading',
  'diet',
  'sugar',
  'olive',
  'friend',
  'exercises',
  'complications',
  'blood'],
 ['blood test', 'olive oil'],
 ['type 1 diabetes']]

In [None]:

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Prompt: asks for a keyword-guided summary of one question and includes two
# worked (context, summary) examples for the model to imitate
prompt = "Could you summarize the paragraph with keyword ['blood','suggestions','back','reading','diet','sugar','olive','friend','exercises','complications','blood test', 'olive oil','type 1 diabetes'] (SUBJECT: diabetes MESSAGE: It was really helpful after reading about type 1 diabetes but would like some more suggestions from you .my friend is 35 year old and its been 1 year of his diabetes n his sugar level is around 100 or 120 OR SO.. each time he tests his sugar level. is there ANY COMPLICATIONS? HE ALSO GO FOR EXERCISES AND BLOOD TEST EVERY 2 OR 3 MONTHS N TAKES A PROPER DIET LIKE GREEN JUICE,OLIVE OIL AND ALL. WAITING FOR YOUR FEED BACK. THANK YOU.) after you learn from these two pair of summarization(###[orginal context]:SUBJECT: Stroke mobility equipment MESSAGE: Husband has had 3 strokes since 1990.  He refuses to take anymore maintenance therapy.  It is becoming more challenging to help him be mobile.  There does not seem to be any source to assist me in learning what equipment, etc.  I could or should get to help him and to help me in my role as caregiver.  Any help you can give would be much appreciated!###[summrization]: Where can I find information on stroke resources, including mobility equipment and caregiver support groups?|||###[original context]: SUBJECT: grievance counseling MESSAGE: My father passed away [DATE].  Hospice helped our family with his passing and offered us counseling which at that time our family didn't use.   My brother is now interested in counseling services and I'm wondering if I can get some information on this.   Thank you. ###[summarization:]Where can I get information on grief counseling and support groups?)"
# Alternative smoke-test prompt; not used below
prompt1 = "What is a large language model?"

# Run text generation with the model loaded above (training is commented out in
# the training cell, so no fine-tuning has been applied at this point).
# max_length counts prompt tokens plus generated tokens, so it is capped at the
# model's own position limit instead of a hard-coded 10000.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=model.config.max_position_embeddings,
)
result = pipe(f"[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

[INST] Could you summarize the paragraph with keyword ['blood','suggestions','back','reading','diet','sugar','olive','friend','exercises','complications','blood test', 'olive oil','type 1 diabetes'] (SUBJECT: diabetes MESSAGE: It was really helpful after reading about type 1 diabetes but would like some more suggestions from you .my friend is 35 year old and its been 1 year of his diabetes n his sugar level is around 100 or 120 OR SO.. each time he tests his sugar level. is there ANY COMPLICATIONS? HE ALSO GO FOR EXERCISES AND BLOOD TEST EVERY 2 OR 3 MONTHS N TAKES A PROPER DIET LIKE GREEN JUICE,OLIVE OIL AND ALL. WAITING FOR YOUR FEED BACK. THANK YOU.) after you learn from these two pair of summarization(###[orginal context]:SUBJECT: Stroke mobility equipment MESSAGE: Husband has had 3 strokes since 1990.  He refuses to take anymore maintenance therapy.  It is becoming more challenging to help him be mobile.  There does not seem to be any source to assist me in learning what equipment

In [None]:
!pip install rouge

NotImplementedError: ignored

In [None]:
from rouge import Rouge

# Reference summaries of the test split, in row order
reference_summaries = test_data['Summary'].tolist()

# NOTE(review): `predictions` is not assigned in any cell visible here; presumably
# it should hold one generated summary per test row. Confirm where it is produced.
rouge = Rouge()
rouge_scores = rouge.get_scores(predictions, reference_summaries, avg=True)
rouge_l_scores = rouge_scores['rouge-l']

print("ROUGE-L Scores:", rouge_l_scores)

In [None]:
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
import nltk
nltk.download('punkt')
nltk.download('wordnet')

# Word-level tokens for every reference summary and every prediction
tokenized_reference_summaries = list(map(word_tokenize, reference_summaries))
tokenized_predictions = list(map(word_tokenize, predictions))

# One METEOR score per (reference, prediction) pair; each prediction is scored
# against a single reference
meteor_scores = []
for ref_tokens, pred_tokens in zip(tokenized_reference_summaries, tokenized_predictions):
    meteor_scores.append(meteor_score([ref_tokens], pred_tokens))
avg_meteor_score = sum(meteor_scores) / len(meteor_scores)

print("Average METEOR Score:", avg_meteor_score)


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu takes, for each candidate, a list of tokenised references;
# here every candidate has exactly one whitespace-split reference
references = []
for summary in reference_summaries:
    references.append([summary.split()])

# Whitespace-split tokens of each generated prediction
candidates = [prediction.split() for prediction in predictions]

bleu_score = corpus_bleu(references, candidates)

print("BLEU Score:", bleu_score)
