## Prepare for finetuning

In [39]:
# import necessary libraries
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load the 20 Newsgroups dataset
categories = ['comp.graphics', 'sci.med', 'soc.religion.christian']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)
data = pd.DataFrame({'text': newsgroups.data, 'label': newsgroups.target})


In [40]:
# Clean the text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

data["cleaned_text"] = data["text"].apply(clean_text)

In [41]:
data.label.value_counts()

label
2    997
1    990
0    973
Name: count, dtype: int64

In [42]:
data.head()

Unnamed: 0,text,label,cleaned_text
0,From: marco@sdf.lonestar.org (Steve Giammarco)...,1,from marcosdflonestarorg steve giammarco subje...
1,From: u0mrm@csc.liv.ac.uk (M.R. Mellodew)\nSub...,2,from u0mrmcsclivacuk mr mellodew subject re if...
2,From: oved3b@kih.no (Ove Petter Tro)\nSubject:...,0,from oved3bkihno ove petter tro subject re nee...
3,From: pkhalsa@wpi.WPI.EDU (Partap S Khalsa)\nS...,1,from pkhalsawpiwpiedu partap s khalsa subject ...
4,From: jhilmer@ruc.dk (Jakob Hilmer)\nSubject: ...,1,from jhilmerrucdk jakob hilmer subject need va...


In [43]:

# Split the dataset into training and testing sets
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

len(train_data), len(val_data), len(test_data)


(2072, 444, 444)

In [44]:
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

In [45]:
train_dataset

Dataset({
    features: ['text', 'label', 'cleaned_text', '__index_level_0__'],
    num_rows: 2072
})

In [46]:
val_dataset

Dataset({
    features: ['text', 'label', 'cleaned_text', '__index_level_0__'],
    num_rows: 444
})

In [None]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples["cleaned_text"], padding="max_length", truncation=True, max_length=512)

# Tokenize the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

In [48]:
train_dataset.column_names

['text',
 'label',
 'cleaned_text',
 '__index_level_0__',
 'input_ids',
 'token_type_ids',
 'attention_mask']

In [49]:
# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["text", "cleaned_text", "__index_level_0__"])
val_dataset = val_dataset.remove_columns(["text", "cleaned_text", "__index_level_0__"])
test_dataset = test_dataset.remove_columns(["text", "cleaned_text", "__index_level_0__"])

# Convert labels to int if they are not already
train_dataset = train_dataset.map(lambda x: {"label": int(x["label"])})
val_dataset = val_dataset.map(lambda x: {"label": int(x["label"])})
test_dataset = test_dataset.map(lambda x: {"label": int(x["label"])})

# Print a sample to confirm input_ids exist
print(train_dataset[0])

Map:   0%|          | 0/2072 [00:00<?, ? examples/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

{'label': 1, 'input_ids': [101, 2013, 8788, 9818, 5358, 7382, 4140, 9006, 8788, 2924, 3395, 2128, 2003, 5796, 2290, 14639, 3565, 16643, 3508, 7514, 3406, 8788, 4783, 6169, 9006, 7382, 4140, 9006, 3029, 29583, 2455, 4684, 3688, 4753, 1050, 3372, 9397, 14122, 2075, 15006, 2102, 13741, 14526, 21472, 19481, 3210, 4464, 1999, 3720, 2857, 9331, 2099, 16932, 12521, 23833, 22610, 16048, 21619, 2549, 21246, 2015, 23499, 2692, 7712, 21709, 11261, 2213, 2585, 21246, 2015, 23499, 2692, 7712, 21709, 11261, 2213, 2585, 2726, 7009, 27166, 12462, 18933, 5666, 11201, 15472, 2389, 18155, 20821, 2226, 6249, 7009, 2003, 2045, 2107, 1037, 2518, 2004, 5796, 2290, 18847, 6499, 12811, 1043, 7630, 28282, 2618, 14639, 1045, 2387, 1999, 1996, 6396, 2335, 4465, 2008, 6529, 2031, 14914, 2077, 2019, 17473, 7319, 5997, 2008, 10821, 2055, 5796, 2290, 14639, 2024, 3565, 16643, 3508, 10334, 2182, 2031, 3325, 2000, 1996, 10043, 10047, 2214, 2438, 2000, 3342, 2008, 1996, 3277, 2038, 2272, 2039, 2012, 2560, 1037, 3232, 19

In [50]:
print(len(input_ids))
print(len(attention_masks))
print(len(labels))

3960
3960
3960


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    return {'f1': f1}

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results_traditional',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    eval_strategy="epoch",
)

# Fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start fine-tuning
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.088029
2,No log,0.052866
3,No log,0.078432


TrainOutput(global_step=390, training_loss=0.08375586485251402, metrics={'train_runtime': 425.0046, 'train_samples_per_second': 14.626, 'train_steps_per_second': 0.918, 'total_flos': 1635513004597248.0, 'train_loss': 0.08375586485251402, 'epoch': 3.0})

In [52]:
# Evaluate the model on test data
results = trainer.evaluate()
print(results)

{'eval_loss': 0.07843217998743057, 'eval_runtime': 8.4933, 'eval_samples_per_second': 52.277, 'eval_steps_per_second': 6.593, 'epoch': 3.0}


In [59]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # could be CAUSAL_LM, TOKEN_CLS, etc.
    r=8,                         # rank
    lora_alpha=32,               # scaling
    lora_dropout=0.1,            # dropout for stability
    bias="none"
)

In [None]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

model = get_peft_model(model, lora_config)

In [61]:
model.print_trainable_parameters()

trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.2707


In [62]:
# Fine-tune the LoRA model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.078432
2,No log,0.078432
3,No log,0.078432


TrainOutput(global_step=390, training_loss=0.0020633193162771374, metrics={'train_runtime': 339.2382, 'train_samples_per_second': 18.323, 'train_steps_per_second': 1.15, 'total_flos': 1641188565467136.0, 'train_loss': 0.0020633193162771374, 'epoch': 3.0})

In [66]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "key", "value"],  # valid for BERT
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 444,675 || all params: 109,929,222 || trainable%: 0.4045


In [None]:
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    output_dir="./qlora-output",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

In [None]:
trainer.train()