In [1]:
# Paths to the dataset files in Google Drive
import os

# Adjust these paths to match the actual location of your files
drive_path = '/work/google-drive-hlt-files/'  # Base Google Drive path

# os.path.join inserts a separator only when one is missing, so the trailing
# '/' on drive_path no longer yields paths containing '//'.
train_path = os.path.join(drive_path, "train_subset.csv")
val_path = os.path.join(drive_path, "val_subset.csv")
test_path = os.path.join(drive_path, "remaining_data.csv")

# Load Tokenizer

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; a later cell reuses it to load the classification model.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

<hr>

In [3]:
from datasets import load_dataset

# Read the three CSV files as the named splits of a single dataset object.
dataset = load_dataset(
    "csv",
    data_files=dict(train=train_path, validation=val_path, test=test_path),
)

Generating train split: 20000 examples [00:00, 23315.49 examples/s]
Generating validation split: 5000 examples [00:00, 16969.97 examples/s]
Generating test split: 457235 examples [00:26, 17055.30 examples/s]


<hr>

In [None]:
import torch

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the notebook-level ``tokenizer``.

    The tokenizer is called with ``padding='max_length'`` and ``truncation=True``.

    Args:
        examples: Mapping with a ``'text'`` entry holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    batch_texts = examples['text']
    return tokenizer(batch_texts, truncation=True, padding='max_length')

import nltk
import ssl

# Download required NLTK data.
# The downloads run with ssl's default HTTPS context: no unverified context is
# installed, so certificate verification stays enabled.
for nltk_resource in (
    'punkt',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

import nlpaug.augmenter.word as naw

from collections import Counter
from transformers import Trainer
from torch import nn
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Trainer subclass that applies the class weights computed further down in this cell.

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level ``class_weights``.

    ``class_weights`` is looked up when ``compute_loss`` runs, so it only needs to
    exist before training starts, not when this class is defined.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose ``.logits``.
            inputs: Batch mapping; its ``"labels"`` entry is removed (popped) here.
            return_outputs: When true, return ``(loss, outputs)`` instead of only the loss.
            num_items_in_batch: Accepted for signature compatibility; not used.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when ``return_outputs`` is true.
        """
        # Labels are taken out of the batch so only the remaining entries reach the model.
        labels = inputs.pop("labels").long()
        outputs = model(**inputs)
        # Training is configured with fp16; compute the loss in float32.
        logits = outputs.logits.float()
        # Use CrossEntropyLoss with class weights. The weights are moved to the
        # logits' device/dtype so the loss works wherever the model actually runs,
        # instead of relying on the device chosen when class_weights was created.
        weight = class_weights.to(device=logits.device, dtype=logits.dtype)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss


# Check class distribution
train_labels = dataset["train"]["generated"]
class_distribution = Counter(train_labels)
print("Original class distribution:", class_distribution)

# If any class holds less than 40% of the training rows, append one
# synonym-augmented copy of every row of the smallest class.
total = sum(class_distribution.values())
if any(class_distribution[k] / total < 0.4 for k in class_distribution):
    minority_class = min(class_distribution, key=class_distribution.get)

    def augment_text(examples):
        """Replace each string in ``examples['text']`` with a synonym-augmented variant.

        Non-string entries are passed through unchanged. When the augmenter
        returns a list, its first element is used; an empty list (no
        augmentation produced) falls back to the original text instead of
        raising ``IndexError``.

        Args:
            examples: Batch mapping with a ``'text'`` list.

        Returns:
            The same mapping with ``'text'`` replaced by the augmented texts.
        """
        aug = naw.SynonymAug(aug_min=1, aug_max=3)
        augmented_texts = []
        for text in examples['text']:
            if not isinstance(text, str):
                augmented_texts.append(text)
                continue
            augmented = aug.augment(text)
            # augment may return a list; keep a single string per row
            if isinstance(augmented, list):
                augmented = augmented[0] if augmented else text
            augmented_texts.append(augmented)
        examples['text'] = augmented_texts
        return examples

    # Apply to minority class only
    minority_dataset = dataset["train"].filter(
        lambda x: x["generated"] == minority_class
    ).map(augment_text, batched=True)

    # Combine with original data
    from datasets import concatenate_datasets
    balanced_dataset = concatenate_datasets([dataset["train"], minority_dataset])
    dataset["train"] = balanced_dataset

    # Verify new distribution
    new_distribution = Counter(dataset["train"]["generated"])
    print("Balanced class distribution:", new_distribution)

# Tokenize every split, drop the raw text column and expose the target column
# under the name 'labels' (Trainer expects 'labels').
tokenized_datasets = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("generated", "labels")
)
tokenized_datasets.set_format("torch")

# Training labels as a numpy array, whether the column comes back as a tensor or not.
train_labels = tokenized_datasets["train"]["labels"]
train_labels = (
    train_labels.cpu().numpy()
    if torch.is_tensor(train_labels)
    else np.array(train_labels)
)

# 'balanced' class weights over the classes present in the training labels
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
print("Class weights:", class_weights)

# Keep the weights as a float32 tensor on the GPU when one is available.
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)

# Fine Tuning portion

## Load Pre-trained Model for Classification

In [None]:
import os

from transformers import AutoModelForSequenceClassification
from huggingface_hub import login


# Classification model with two output labels, loaded from the `model_name` checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


# Login to huggingface
# The access token is read from the HF_TOKEN environment variable so it is never
# stored in the notebook. When the variable is unset or empty, token=None is
# passed and login() asks for the token interactively.
hf_token = os.environ.get("HF_TOKEN") or None
login(token=hf_token, new_session=True, write_permission=True)

## Define Training Arguments

In [6]:
# Import check: the message below is printed only if `accelerate` imports without error.
from accelerate import Accelerator
print("Accelerate loaded successfully!")

Accelerate loaded successfully!


In [None]:
from huggingface_hub import whoami

# Resolve the logged-in account name; any failure leaves `username` as None.
username = None
try:
    user_info = whoami()
    username = user_info["name"]
    print(f"Logged in to Hugging Face as: {username}")
except Exception as err:
    print(f"Error getting Hugging Face user info: {err}")
    print("Make sure you're properly logged in with notebook_login()")
    username = None

# Target repository on the Hub, or None when no account name is available.
hub_model_id = f"{username}/hlt-bert-text-classification" if username else None
if hub_model_id is None:
    print("Will not push to Hub due to authentication issues")

from transformers import EarlyStoppingCallback, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy="steps", # Make sure eval_strategy is "steps" or "epoch"
    eval_steps=100,       # How often to evaluate and check for early stopping
    save_strategy="steps",
    save_steps=100,       # Same cadence as eval_steps
    load_best_model_at_end=True,    # Important for early stopping
    metric_for_best_model="f1",     # Metric to monitor
    greater_is_better=True,         # Whether a higher value of the metric is better
    learning_rate=2e-5,
    gradient_accumulation_steps=2,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU,
    # matching the CPU fallback used for class_weights.
    fp16=torch.cuda.is_available(),

    # Hub parameters: push only when a username was resolved. The same truthiness
    # test is used as for hub_model_id, so the two can never disagree.
    push_to_hub=bool(username),
    hub_model_id=hub_model_id,
    # hub_strategy must always be a valid strategy name (None is rejected), so
    # use the library default when nothing will be pushed.
    hub_strategy="checkpoint" if username else "every_save",

    # Early stopping is not configured here; it is set up through
    # EarlyStoppingCallback when the trainer is created.
)

## Define Evaluation Metrics

In [8]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy") # Load standard accuracy metric (or f1, precision, recall)

# Load every metric once, when this cell runs, instead of on each evaluation call.
_EVAL_METRICS = {
    "accuracy": metric,
    "f1": evaluate.load("f1"),
    "precision": evaluate.load("precision"),
    "recall": evaluate.load("recall"),
}


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and ``"recall"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        name: scorer.compute(predictions=predictions, references=labels)[name]
        for name, scorer in _EVAL_METRICS.items()
    }


Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 10.3MB/s]


## Initialize Trainer

In [None]:
from transformers import Trainer

# Convert labels to Python ints
def convert_labels_to_long(examples):
    """Cast every entry of ``examples["labels"]`` to ``int`` in place.

    Args:
        examples: Batch mapping with a ``"labels"`` iterable.

    Returns:
        The same mapping, with ``"labels"`` replaced by a list of ints.
    """
    examples["labels"] = list(map(int, examples["labels"]))
    return examples

tokenized_datasets = tokenized_datasets.map(convert_labels_to_long, batched=True)

# Return torch tensors, restricted to these three columns
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Early stopping: patience of 3 evaluations, improvement threshold 0.005
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.005,
)

# Weighted-loss trainer over the train / validation splits
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

## Learning Rate Scheduler

In [10]:


# Learning Rate Scheduler
import math

from transformers import get_linear_schedule_with_warmup

# Calculate total optimizer steps
num_train_examples = len(tokenized_datasets["train"])
per_device_batch_size = training_args.per_device_train_batch_size
gradient_accumulation_steps = training_args.gradient_accumulation_steps
# Number of GPUs reported by the training arguments; counts as 1 on CPU.
num_gpus = max(1, training_args.n_gpu)

total_batch_size = per_device_batch_size * gradient_accumulation_steps * num_gpus

if training_args.max_steps > 0:
    # An explicit max_steps takes precedence over the epoch count.
    total_steps = training_args.max_steps
else:
    # Round up at both levels: a final partial batch and a final partial
    # accumulation group still produce a step. Rounding down would make the
    # schedule reach a learning rate of 0 before training finishes.
    batches_per_epoch = math.ceil(num_train_examples / (per_device_batch_size * num_gpus))
    steps_per_epoch = max(1, math.ceil(batches_per_epoch / gradient_accumulation_steps))
    total_steps = math.ceil(steps_per_epoch * training_args.num_train_epochs)

# Create optimizer and scheduler
from torch.optim import AdamW

optimizer = AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay
)

# Linear warmup for warmup_steps, then linear decay over the remaining steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=total_steps
)

# Add optimizer and scheduler to trainer BEFORE training
trainer.optimizer = optimizer
trainer.lr_scheduler = scheduler


## Train

In [11]:
# Run fine-tuning; the call is the last expression so its return value is shown as the cell output.
trainer.train()

Step,Training Loss,Validation Loss


Downloading builder script: 100%|██████████| 6.79k/6.79k [00:00<00:00, 11.5MB/s]
Downloading builder script: 100%|██████████| 7.56k/7.56k [00:00<00:00, 16.4MB/s]
Downloading builder script: 100%|██████████| 7.38k/7.38k [00:00<00:00, 14.6MB/s]


TrainOutput(global_step=800, training_loss=0.14013625741004943, metrics={'train_runtime': 1119.9737, 'train_samples_per_second': 122.525, 'train_steps_per_second': 3.83, 'total_flos': 6735643017216000.0, 'train_loss': 0.14013625741004943, 'epoch': 0.9324009324009324})

## Push Model

In [None]:

# Push the model to Hugging Face Hub if it hasn't been automatically pushed during training.
# Only attempt the upload when pushing is enabled and a target repository is
# configured; otherwise the call would fail and the URL below would end in "None".
if training_args.push_to_hub and training_args.hub_model_id:
    print("Pushing model to Hugging Face Hub...")
    trainer.push_to_hub()
    print(f"Model pushed successfully to: https://huggingface.co/{training_args.hub_model_id}")
else:
    print("Skipping push to Hugging Face Hub: no authenticated user / hub_model_id configured")

# Evaluation

In [None]:
# Evaluate on the held-out test split
test_results = trainer.evaluate(tokenized_datasets["test"])
print(test_results)
# Convert to DataFrame (one row, one column per metric)
import pandas as pd
results_df = pd.DataFrame([test_results])

# Save to CSV locally first
csv_path = 'test_results_final.csv'
results_df.to_csv(csv_path, index=False)

# No `files` download helper is imported anywhere in this notebook, so calling
# files.download() here raised NameError. Report where the file was written instead.
print(f"Test results saved to {csv_path}")

In [None]:
import pandas as pd

# Write the test metrics as a one-row CSV in the working directory
csv_path = 'test_results_final.csv'
results_df = pd.DataFrame([test_results])
results_df.to_csv(csv_path, index=False)


> Download the results file saved at `csv_path`.

In [None]:
import shutil

# Relocate the results CSV from the working directory into /work
shutil.move(src='test_results_final.csv', dst='/work/test_results_final.csv')

The file `test_results_final.csv` has been moved and is now available at `/work/test_results_final.csv`. You can download it from there.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=107e1980-0943-4584-b9d8-50a47211e48c' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>