<a href="https://colab.research.google.com/github/ethanknights/tune-LLM/blob/main/tune_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem so results can be written to it.
drive.mount('/content/drive')

# Directory (inside the mounted Drive) that receives the training outputs;
# it is passed to TrainingArguments(output_dir=...) further down.
output_dir = '/content/drive/My Drive/tune-LLM-results'

# exist_ok=True makes this cell idempotent without a separate (racy)
# os.path.exists() check: re-running it never fails when the folder is there.
os.makedirs(output_dir, exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install torch transformers pandas



In [4]:
!pip install --upgrade nvidia-pyindex
!nvidia-smi

Collecting nvidia-pyindex
  Downloading nvidia-pyindex-1.0.9.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nvidia-pyindex
  Building wheel for nvidia-pyindex (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-pyindex: filename=nvidia_pyindex-1.0.9-py3-none-any.whl size=8419 sha256=02e7753084c9130ec49938dfebb34b5f33b3535c2326f02ebfb7441f4de58b8e
  Stored in directory: /root/.cache/pip/wheels/2c/af/d0/7a12f82cab69f65d51107f48bcd6179e29b9a69a90546332b3
Successfully built nvidia-pyindex
Installing collected packages: nvidia-pyindex
Successfully installed nvidia-pyindex-1.0.9
/bin/bash: line 1: nvidia-smi: command not found


In [5]:
!pip install accelerate



In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
import logging

# force=True replaces any handlers already attached to the root logger.
# Without it basicConfig() is a silent no-op when the hosting environment has
# pre-configured logging, and the logging.info() progress messages emitted
# around training would be dropped at the default WARNING level.
logging.basicConfig(level=logging.INFO, force=True)


In [None]:

def generate_dataset(train_fraction=0.8):
    """Build the toy complaint-classification dataset and split it.

    The examples are ``(text, label)`` tuples covering five categories with
    three examples each.

    The raw list is ordered by category, so slicing it directly would place
    every example of the last category in the validation split and none in
    the training split. To avoid that, examples are interleaved round-robin
    across categories (first example of each category, then the second of
    each, ...) before the split is taken. The procedure is fully
    deterministic.

    Args:
        train_fraction: Fraction of the examples assigned to the training
            split. Defaults to 0.8 (12 training / 3 validation examples).

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple of lists of
        ``(text, label)`` tuples.
    """
    dataset = [
        ("There's a large pothole on Main Street.", "Pothole"),
        ("I hit a pothole on Elm Avenue and my tire popped.", "Pothole"),
        ("There's a deep pothole near the intersection of Oak and Pine streets.", "Pothole"),
        ("There's graffiti on the park wall.", "Graffiti"),
        ("Someone spray-painted graffiti on the bus stop.", "Graffiti"),
        ("There's offensive graffiti on the side of the building downtown.", "Graffiti"),
        ("There's loud construction noise coming from the building next door.", "Noise Complaint"),
        ("My neighbor's party is too loud and it's keeping me awake.", "Noise Complaint"),
        ("The neighbors are playing loud music late at night.", "Noise Complaint"),
        ("The trash bins haven't been emptied for days.", "Trash Pickup Request"),
        ("There's trash scattered all over the sidewalk.", "Trash Pickup Request"),
        ("The garbage truck missed our street on pickup day.", "Trash Pickup Request"),
        ("The street light at the corner of Maple and Elm streets is out.", "Street Light Outage"),
        ("There's a dark area on the street because the light isn't working.", "Street Light Outage"),
        ("The street light flickers on and off intermittently.", "Street Light Outage"),
    ]

    # Group examples by label, keeping first-seen label order (dicts preserve
    # insertion order) and the original order within each label.
    by_label = {}
    for example in dataset:
        by_label.setdefault(example[1], []).append(example)
    groups = list(by_label.values())

    # Round-robin across labels so that any prefix of the list contains a
    # mix of categories instead of one category at a time.
    longest = max(len(group) for group in groups)
    interleaved = [
        group[i]
        for i in range(longest)
        for group in groups
        if i < len(group)
    ]

    train_size = int(train_fraction * len(interleaved))
    train_dataset = interleaved[:train_size]
    valid_dataset = interleaved[train_size:]

    return train_dataset, valid_dataset


In [None]:

# Materialise the two splits as lists of (text, label) tuples ...
train_dataset, valid_dataset = generate_dataset()

# ... and wrap each split in a DataFrame with named columns.
frame_columns = ["text", "label"]
train_df, valid_df = (
    pd.DataFrame(rows, columns=frame_columns)
    for rows in (train_dataset, valid_dataset)
)


In [None]:
class CustomDataset(Dataset):
    """Torch dataset that turns (text, label) rows into causal-LM examples.

    Each row is rendered into a single training string via
    ``prompt_template`` so that the category label is part of the sequence
    the language model is trained to produce. The string is tokenised,
    truncated/padded to ``max_length``, and returned with per-token labels.

    Args:
        df: DataFrame whose first column holds the text and whose second
            column holds the category label (accessed positionally).
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors. It must be able to pad (i.e. have a
            pad token configured) because ``padding='max_length'`` is used.
        max_length: Length every example is truncated/padded to.
        prompt_template: ``str.format`` template with ``{text}`` and
            ``{label}`` fields used to build the training string.
    """

    def __init__(self, df, tokenizer, max_length,
                 prompt_template="{text}\nCategory: {label}"):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prompt_template = prompt_template

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenised example at position ``idx``.

        Returns:
            dict with 1-D tensors of length ``max_length``:
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
            ``'labels'`` is a copy of ``'input_ids'`` in which every padded
            position (attention mask 0) is set to -100.
        """
        text = self.df.iloc[idx, 0]
        label = self.df.iloc[idx, 1]
        prompt = self.prompt_template.format(text=text, label=label)

        encoding = self.tokenizer(
            prompt,
            add_special_tokens=True,
            max_length=self.max_length,
            return_attention_mask=True,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
        )

        # The tokenizer returns a batch dimension of 1; drop it.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # A causal LM needs one label per token (same shape as input_ids),
        # not a scalar. Mask via the attention mask rather than the pad id so
        # this stays correct even if the pad token equals a real token such
        # as EOS. -100 is the value the loss is expected to ignore.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }

In [None]:
# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_ID = "unsloth/llama-3-8b-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# CustomDataset tokenises with padding='max_length', which requires the
# tokenizer to have a pad token. Fall back to EOS when the checkpoint does
# not define one (no-op if a pad token is already configured).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# device_map='auto' lets the library decide where to place the model weights.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map='auto')


In [None]:

# Wrap the DataFrames in tokenising datasets. NOTE: this rebinds
# train_dataset / valid_dataset, which previously held the raw tuple lists.
train_dataset = CustomDataset(train_df, tokenizer, max_length=512)
valid_dataset = CustomDataset(valid_df, tokenizer, max_length=512)

# Stand-alone loaders for manual inspection. The Trainer in this notebook is
# given the datasets themselves, not these loaders.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

# The per-epoch evaluation keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases; since the install is not
# pinned, pick whichever name the installed version declares.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Evaluation runs once per epoch. The former `eval_steps=500` was dropped:
# it only applies to step-based evaluation and contradicted the 'epoch'
# strategy actually in effect.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    **{_eval_strategy_key: 'epoch'},
)

def compute_metrics(pred):
    """Compute accuracy from an evaluation prediction object.

    Args:
        pred: Object exposing ``predictions`` (logits array, or a tuple whose
            first element is the logits array) and ``label_ids`` (array of
            integer labels).

    Returns:
        ``{"accuracy": float}``.

        * Per-example labels (1-D ``label_ids``): fraction of examples whose
          arg-max class equals the label.
        * Per-token labels (2-D ``label_ids``, as produced for a causal LM):
          fraction of scored tokens predicted correctly. The logits at
          position ``t`` are compared with the label at position ``t + 1``,
          and positions labelled -100 are excluded. Returns 0.0 when no
          position is scored.
    """
    labels = pred.label_ids
    logits = pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    preds = logits.argmax(-1)

    if preds.ndim == 2 and labels.ndim == 2:
        # Next-token prediction: output at step t predicts the token at t+1,
        # so drop the last prediction and the first label to align them.
        preds = preds[:, :-1]
        labels = labels[:, 1:]

    scored = labels != -100  # ignore masked (e.g. padded) positions
    total = int(scored.sum())
    if total == 0:
        return {"accuracy": 0.0}

    correct = int(((preds == labels) & scored).sum())
    return {"accuracy": correct / total}

# Collect everything the Trainer needs in one place, then build it:
# the model to fine-tune, the run configuration, both data splits and the
# metric callback used during evaluation.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)



In [None]:
# Run the fine-tuning loop, bracketed by INFO log lines on the root logger so
# the start and end of the (potentially long) run are visible in the output.
logging.info("Starting training...")
# The value returned by train() is not captured here.
trainer.train()
logging.info("Training completed.")