In [6]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType
)
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


# Custom dataset class
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize the text
        encodings = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Evaluation metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'classification_report': classification_report(labels, predictions)
    }

In [7]:
# Determine the device to use
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Initialize model with correct number of labels
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # Adjust this based on your number of classes
    # torch_dtype=torch.float16
)

# Move model to device
model = model.to(device)

# Prepare model for LoRA
model = prepare_model_for_kbit_training(model)

# Configure LoRA
lora_config = LoraConfig(
    r=16,  # rank
    lora_alpha=32,
    target_modules=["q_lin", "v_lin"],  # DistilBERT specific attention modules
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# Get PEFT model
model = get_peft_model(model, lora_config)

Using device: mps


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Load and preprocess data
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
except FileNotFoundError:
    # Try parquet files if CSV files don't exist
    raise FileNotFoundError("CSV files not found. Please ensure train.csv and test.csv exist in the current directory.")

# Convert text labels to numeric values
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])

# Get number of unique classes
num_labels = len(label_encoder.classes_)
print(f"Number of classes: {num_labels}")
print(f"Class mapping: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")


Number of classes: 2
Class mapping: {'democratic': np.int64(0), 'republican': np.int64(1)}


In [9]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./distilbert_classification_output",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    # fp16=True, # Remove fp16 for MPS compatibility
    save_strategy="epoch",
    logging_steps=100,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    no_cuda=True,  # Disable CUDA
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=test_df,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer)
)

# Train the model
trainer.train()

# Save the model
trainer.save_model("./distilbert_classification_final")

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


KeyError: 55022

In [None]:
# Evaluate on test set
eval_results = trainer.evaluate()
print("\nEvaluation Results:")
print(eval_results)