<a href="https://colab.research.google.com/github/cineming9-svg/spam-ham/blob/main/finetune_llama3_2_spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from datasets import Dataset

# Load the raw SMS spam CSV. Only columns "v1" (ham/spam tag) and "v2" (message
# text) are used below; any other columns in the file are dropped.
raw_df = pd.read_csv("/content/spam.csv", encoding="latin1")

# Select and rename in one step, and take an explicit copy so the label
# assignment below writes to an independent frame (avoids pandas'
# SettingWithCopyWarning on a column-sliced view).
df = raw_df[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"}).copy()

# Convert 'ham' and 'spam' to numerical labels
label_map = {"ham": 0, "spam": 1}
df["label"] = df["label"].map(label_map)

# Series.map yields NaN for any value missing from label_map; fail loudly here
# instead of handing NaN/float labels to the classifier later on.
if df["label"].isna().any():
    raise ValueError("Found labels other than 'ham'/'spam' in column 'v1'")
df["label"] = df["label"].astype(int)

dataset = Dataset.from_pandas(df)

# Fixed seed: the later evaluation cell reads dataset["test"], so the split
# must be identical across re-runs of this cell, otherwise test rows can end up
# being rows the model was trained on.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
print(dataset["train"].column_names)

['label', 'text']


In [None]:
from transformers import DistilBertTokenizerFast, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

# Single checkpoint name shared by the tokenizer and the classifier.
model_name = "distilbert-base-uncased"

# Fast tokenizer matching the checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# LoRA hyper-parameters for sequence classification. The adapters are attached
# to the modules named "q_lin" and "v_lin" (presumably DistilBERT's attention
# query/value projections -- verify against the model's module names).
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["q_lin", "v_lin"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Load the two-class base classifier and wrap it with the LoRA adapters.
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2),
    lora_config,
)


def tokenize_function(example):
    """Tokenize the "text" field of ``example`` with the module-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 128 tokens.

    Args:
        example: Mapping with a "text" entry (a single string, or a list of
            strings when called through ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for that text.
    """
    # Fixed-length encoding so every example has the same shape.
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    texts = example["text"]
    return tokenizer(texts, **encode_options)


# Tokenize both splits. `dataset` was already split into train/test when it was
# built, so the split is not repeated here. With batched=True,
# tokenize_function receives a batch of rows at a time, so example["text"] is a
# list of strings.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer

import torch  # only used to detect whether a CUDA device is available

# Training configuration for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",
    eval_strategy="epoch",
    # Log the training loss once per epoch as well; with the default step-based
    # logging interval the first epoch's training loss shows up as "No log".
    logging_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,   # larger batch
    num_train_epochs=2,               # fewer epochs for testing
    weight_decay=0.01,
    # Mixed precision is only requested when a CUDA device is present, so the
    # cell also runs on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,
    logging_dir="./logs",
    save_strategy="epoch",
)

# Train on the tokenized train split and evaluate on the tokenized test split
# at the end of every epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()




Epoch,Training Loss,Validation Loss
1,No log,0.026153
2,0.057600,0.024555




TrainOutput(global_step=558, training_loss=0.0553730743333003, metrics={'train_runtime': 4758.3608, 'train_samples_per_second': 1.873, 'train_steps_per_second': 0.117, 'total_flos': 300266768160768.0, 'train_loss': 0.0553730743333003, 'epoch': 2.0})

⚠️ Common Pitfalls
Dataset keys: Hugging Face datasets often use "label" instead of "labels". Double‑check with dataset["test"].column_names.

Batching: Passing the entire test set at once can exceed GPU/CPU memory. Use a loop or DataLoader for large datasets:

# Example: evaluate in mini-batches instead of one giant forward pass.
# Assumes DataLoader (torch.utils.data), torch, accuracy_score (sklearn.metrics),
# tokenizer, model and dataset are already in scope.
preds, labels = [], []
with torch.no_grad():  # inference only: do not keep the autograd graph
    for batch in DataLoader(dataset["test"], batch_size=32):
        inputs = tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt")
        outputs = model(**inputs)
        preds.extend(outputs.logits.argmax(-1).cpu().tolist())
        # The column in this dataset is "label" (not "labels"); the default
        # collate function hands it back as a tensor, hence .tolist().
        labels.extend(batch["label"].tolist())
acc = accuracy_score(labels, preds)


In [None]:
# from sklearn.metrics import accuracy_score

# # Run prediction
# preds = trainer.predict(dataset["test"])

# # Extract labels and predictions
# y_true = dataset["test"]["labels"]
# y_pred = preds.predictions.argmax(-1)

# # Compute accuracy
# acc = accuracy_score(y_true, y_pred)
# print("Test Accuracy:", acc)


from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel
from sklearn.metrics import accuracy_score

import torch  # needed for no_grad and device selection below

# Save the fine-tuned PEFT model (adapter files) to disk.
model.save_pretrained("./lora-distilbert")


# Reload base + adapter. num_labels is passed explicitly so the reloaded head
# has the same two-class shape that was used for training.
base_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
model = PeftModel.from_pretrained(base_model, "./lora-distilbert")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Run inference on the GPU when one is available, and make sure dropout is off.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Prepare test data
test_texts = list(dataset["test"]["text"])
y_true = list(dataset["test"]["label"])

# Run predictions in mini-batches under no_grad: pushing the whole test split
# through a single forward pass with autograd enabled can exhaust memory.
eval_batch_size = 32
pred_chunks = []
with torch.no_grad():
    for start in range(0, len(test_texts), eval_batch_size):
        batch_texts = test_texts[start:start + eval_batch_size]
        inputs = tokenizer(
            batch_texts, padding=True, truncation=True, return_tensors="pt"
        ).to(device)
        outputs = model(**inputs)
        pred_chunks.append(outputs.logits.argmax(-1).cpu())
# Same type as before: a 1-D NumPy array of predicted class ids.
y_pred = torch.cat(pred_chunks).numpy()

# Accuracy
acc = accuracy_score(y_true, y_pred)
print("Test Accuracy:", acc)

# Show column names
print(dataset["test"].column_names)

# Show first 5 rows
print(dataset["test"][:5])



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.9919282511210762
['label', 'text']
{'label': [0, 0, 0, 0, 1], 'text': ['Weightloss! No more girl friends. Make loads of money on ebay or something. And give thanks to God.', 'Shall i send that exe to your mail id.', "Yo do you know anyone  &lt;#&gt;  or otherwise able to buy liquor? Our guy flaked and right now if we don't get a hold of somebody its just 4 loko all night", 'Aight well keep me informed', '449050000301 You have won a å£2,000 price! To claim, call 09050000301.']}
