In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under the Kaggle input mount,
# then print them one per line.
input_file_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
import os

In [None]:
# Fetch the secret stored under the label "HF_TOKEN" from this notebook's Kaggle secrets.
# It is used below as the Hugging Face Hub login token.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

In [None]:
# Do not print the secret itself: cell output is stored in the notebook and would expose
# the token to anyone the notebook is shared with. Report only whether a token was retrieved.
print("HF_TOKEN loaded." if hf_token else "HF_TOKEN is missing or empty.")

In [None]:
# Authenticate this session with the Hugging Face Hub using the token held in `hf_token`.
# NOTE(review): presumably required because the meta-llama checkpoints loaded later are gated — confirm.
login(token=hf_token)

In [None]:
import json
from datasets import Dataset


# Show what the attached Kaggle dataset directory contains.
print("Available datasets:")
dataset_files = os.listdir("/kaggle/input/multi-classs-email-dataset")
print(dataset_files)

# Parse the raw email records from the dataset's JSON file.
with open("/kaggle/input/multi-classs-email-dataset/email_dataset.json", "r", encoding="utf-8") as email_file:
    emails = json.loads(email_file.read())


def format_email(e):
    """Render one email record as a single training-text example.

    Args:
        e: Mapping with the keys ``'subject'``, ``'body'`` and ``'labels'``;
            ``'labels'`` is joined with ``', '``, so it must be an iterable of strings.

    Returns:
        A dict with one key, ``"text"``, holding the subject, body and labels
        laid out under ``### Subject:``, ``### Body:`` and ``### Labels:`` headers.

    Raises:
        KeyError: If any of the three keys is missing from ``e``.
    """
    subject, body = e['subject'], e['body']
    label_line = ', '.join(e['labels'])
    prompt = f"### Subject:\n{subject}\n\n### Body:\n{body}\n\n### Labels:\n{label_line}"
    return {"text": prompt}

# Turn every raw email into its training-text record and wrap the records in a Dataset.
formatted_data = [format_email(e) for e in emails]
dataset = Dataset.from_list(formatted_data)
# Hold out 10% for evaluation. The shuffle seed is fixed so the train/test split
# (and the eval metrics computed on it) is the same on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
!pip install -U transformers accelerate datasets trl peft bitsandbytes


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch

# Base checkpoint to fine-tune.
# NOTE(review): the training output directory is named "llama-3.2-1b-lora" although this is
# the Llama-3.1-8B-Instruct checkpoint — confirm which base model is intended.
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Pick the 4-bit compute dtype from what the GPU supports: bfloat16 where available,
# float16 otherwise, so the quantized matmuls run in the same half-precision format
# used for mixed-precision training instead of a hard-coded float16.
bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
compute_dtype = torch.bfloat16 if bf16_supported else torch.float16

# 4-bit NF4 quantization with double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# trust_remote_code is not passed: the Llama architecture ships with transformers,
# so there is no reason to allow code from the model repository to be executed.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# LoRA adapter settings: rank 8, alpha 16, 10% dropout, no bias terms trained.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the quantized base model so only the LoRA adapter weights are trained.
model = get_peft_model(model, lora_config)


In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

# Use bf16 mixed precision only where the GPU supports it and fall back to fp16 otherwise;
# a hard-coded bf16=True can be rejected by TrainingArguments on hardware without bf16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Effective batch size per device is 2 * 4 = 8 (batch size x gradient accumulation).
# A checkpoint is saved and an evaluation is run at the end of each of the 3 epochs.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=10,
    output_dir="./llama-3.2-1b-lora",
    save_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=2e-4,
    bf16=use_bf16,
    fp16=not use_bf16,
    report_to="none",
)

# `model` has already been wrapped with get_peft_model, so no peft_config is passed here.
# Handing SFTTrainer both a PeftModel and a peft_config is redundant and, depending on the
# installed TRL version, is either ignored or refused.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    args=training_args,
)


In [None]:
# Run fine-tuning with the configured trainer. With save_strategy="epoch", checkpoints are
# written under the training output_dir at the end of each epoch.
trainer.train()


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch

from peft import PeftConfig

lora_model_path = "/kaggle/working/llama-3.2-1b-lora/checkpoint-711"

# Take the base checkpoint name from the adapter's own config rather than hard-coding it.
# LoRA weights only fit the architecture they were trained on, and the training cell
# fine-tunes meta-llama/Llama-3.1-8B-Instruct, not Llama-3.2-1B. The explicit name is only
# a fallback for adapter configs that did not record a base model.
adapter_config = PeftConfig.from_pretrained(lora_model_path)
base_model_name = adapter_config.base_model_name_or_path or "meta-llama/Llama-3.1-8B-Instruct"

# Load tokenizer (trust_remote_code is not needed: Llama is implemented in transformers itself)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Load base model in half precision
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach LoRA adapter and switch to inference mode
model = PeftModel.from_pretrained(base_model, lora_model_path)
model.eval()


In [None]:
# Text-generation pipeline around the fine-tuned model.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)


# The prompt must follow the template the model was fine-tuned on:
# "### Subject:\n...\n\n### Body:\n...\n\n### Labels:\n<labels>".
# It therefore ends with the "### Labels:" header so the continuation is the label list itself,
# instead of leaving the model to produce the header (or more body text) first.
prompt = """### Subject:
Reminder: Upcoming Meeting with Client

### Body:
Please note that we have a scheduled meeting with the client this Thursday at 3 PM. Ensure all reports are ready and shared beforehand.

### Labels:
"""

# NOTE(review): sampling is enabled, so the predicted labels can differ between runs.
# 'generated_text' contains the prompt followed by the generated continuation.
output = pipe(prompt, max_new_tokens=50, do_sample=True, top_k=50, top_p=0.95, temperature=0.7)[0]['generated_text']
print(output)


In [None]:
!zip -r checkpoint-710.zip /kaggle/working/llama-3.2-1b-lora/checkpoint-711
