In [1]:
pip install transformers datasets peft

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
   ---------------------------------------- 0.0/480.6 kB ? eta -:--:--
   --------- ------------------------------ 112.6/480.6 kB 3.2 MB/s eta 0:00:01
   ---------------------------------------  471.0/480.6 kB 5.8 MB/s eta 0:00:01
   ---------------------------------------- 480.6/480.6 kB 5.0 MB/s eta 0:00:00
Downloading peft-0.13.2-py3-none-any.whl


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\ss1516\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import json
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
import torch
from peft import get_peft_model, LoraConfig

# 1. Load the distilgpt2 model and tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# model = GPT2LMHeadModel.from_pretrained('distilgpt2')
# # Set the pad_token to eos_token
# tokenizer.pad_token = tokenizer.eos_token
# model.config.pad_token_id = tokenizer.eos_token_id


tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'})

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

print("model loaded")

# 2. Apply LoRA
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print("LoRA applied")
model.print_trainable_parameters()
print("trainable parameters printed")

# 3. Load and format your dataset (same as before)
# ... (use the same load_dataset function as earlier)
# def load_dataset(file_path):
#     with open(file_path, 'r') as file:
#         data = json.load(file)
    
#     # Limit to 1000 examples
#     data = data[:100]
    
#     formatted_data = []
#     for item in data:
#         instruction = item.get('instruction', '')
#         input_text = item.get('input', '')
#         output_text = item.get('output', '')
        
#         # Construct the prompt
#         if input_text:
#             prompt = f"{instruction}\n\nInput: {input_text}\n\nResponse:"
#         else:
#             prompt = f"{instruction}\n\nResponse:"
        
#         # Combine prompt and output
#         full_text = prompt + " " + output_text
        
#         formatted_data.append({'text': full_text})
    
#     return Dataset.from_dict({'text': [item['text'] for item in formatted_data]})
def load_dataset(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    
   
    data = data[:1000] #2k 
    
    formatted_data = []
    for item in data:
        instruction = "You are an experienced medical doctor specializing in mental health. Provide a detailed and compassionate response to the patient's description." #item.get('instruction', '')
        input_text = item.get('input', '')
        output_text = item.get('output', '')
        
        # Construct the prompt
        if input_text:
            prompt = f"{instruction}\n\nPatient's Description: {input_text}\n\nDoctor's Response"
        else:
            prompt = f"{instruction}\n\nDoctor's Response"
        
        formatted_data.append({
            'text': prompt,
            'label': output_text
        })
    
    return Dataset.from_dict({
        'text': [item['text'] for item in formatted_data],
        'label': [item['label'] for item in formatted_data]
    })


dataset = load_dataset('chatdoctor5k.json')  #'chatdoctor5k.json')

# 4. Tokenize the dataset
# def tokenize_function(examples):
#     return tokenizer(examples['text'], truncation=True, padding="longest", max_length=128 )

# tokenized_dataset = dataset.map(tokenize_function, batched=True)
def tokenize_function(examples):
    inputs = tokenizer(
        examples['text'],
        truncation=True,
        max_length=128,
        padding='longest'
    )
    outputs = tokenizer(
        examples['label'],
        truncation=True,
        max_length=128,
        padding='longest'
    )
    
    # Concatenate the inputs and outputs
    input_ids = []
    labels = []
    attention_masks = []
    for i in range(len(inputs['input_ids'])):
        input_id = inputs['input_ids'][i] + outputs['input_ids'][i]
        attention_mask = inputs['attention_mask'][i] + outputs['attention_mask'][i]
        label = [-100] * len(inputs['input_ids'][i]) + outputs['input_ids'][i]
        
        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        labels.append(label)
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }

# tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names  # Remove original columns
)

# 5. Prepare data collator
from transformers import default_data_collator

data_collator = default_data_collator

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 6. Set training arguments
training_args = TrainingArguments(
    output_dir='./lora_results',
    overwrite_output_dir=True,
    learning_rate=3e-4,
    per_device_train_batch_size=4, #4 or 8
    num_train_epochs=3, #3 or more
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    remove_unused_columns=True,
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=1,
    # evaluation_strategy="steps",
    # eval_steps=200,
)

# 7. Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# 8. Train the model
trainer.train()

# 9. Save the LoRA adapter
model.save_pretrained('./trained_lora_2k')
tokenizer.save_pretrained('./trained_lora_2k')


In [50]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./trained_lora')
tokenizer = GPT2Tokenizer.from_pretrained('./trained_lora')

model.to('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()

# Prepare a prompt
instruction = "You are an experienced medical doctor specializing in mental health. Provide a detailed and compassionate response to the patient's description."
input_text = "I have been experiencing sudden and frequent panic attacks."
prompt = f"{instruction}\n\nPatient's Description: {input_text}\n\nDoctor's Response:"

inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

with torch.no_grad():
    # outputs = model.generate(
    #     **inputs,
    #     max_new_tokens=100,
    #     do_sample=True,
    #     top_p=0.9,
    #     temperature=0.75,
    # )
    outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
)


generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are an experienced medical doctor specializing in mental health. Provide a detailed and compassionate response to the patient's description.

Patient's Description: I have been experiencing sudden and frequent panic attacks.

Doctor's Response: It is normal for you, especially if your symptoms aren't completely consistent with what we've described here. However there may be some other conditions that might indicate something more serious than sleep apnea or dizziness. We'll need blood tests to confirm those factors (including any abnormal oxygenation levels), neurological imaging procedures to check your breathing, psychological evaluation of possible medications used to treat this condition such as medication changes, etc., so that we can diagnose it correctly.
