In [1]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
import torch
from torch.nn.functional import softmax
import datasets
from datasets import load_dataset, Dataset
import random
from torch.utils.data import DataLoader, Subset
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
from torchmetrics import F1Score
import pandas as pd
import numpy as np
pd.options.mode.copy_on_write = True

if we don't do data balancing (i.e. 290 per label)

In [32]:
fpath = "/Users/barrychen/Desktop/IELTSWritingHelper/datasets_ready/Task_Achievement.csv"
df3 = pd.read_csv(fpath)
df3['score'] = df3['score'].round(1)

df_filtered3 = df3[(df3['score'] > 3.0) & (df3['score'] < 12.0)]

reverse_mapping_3 = {
    3.5: 0, 4.0: 0,
    4.5: 1, 5.0: 1,
    5.5: 2, 6.0: 2,
    6.5: 3, 7.0: 3,
    7.5: 4, 8.0: 4,
    8.5: 5, 9.0: 5
}

# Apply mapping
df_filtered3['score'] = df_filtered3['score'].map(reverse_mapping_3)

# Find the maximum sample size for each class
max_sample_size = df_filtered3['score'].value_counts().min()

# Sample the maximum available size for each class
df_sampled3 = df_filtered3.groupby('score', group_keys=False).apply(
    lambda x: x.sample(len(x), random_state=42)
).reset_index(drop=True)

dataset = Dataset.from_pandas(df_sampled3)

  df_sampled3 = df_filtered3.groupby('score', group_keys=False).apply(


In [33]:
df_sampled3

Unnamed: 0,prompt,essay,text,score
0,Some people say that to prevent illness and di...,Emerging disease is a complex matter as it IS ...,The candidate has effectively addressed the gi...,0
1,Nowadays celebrities are more famous for their...,"In this present world, famous personalities a...",The essay adequately addresses the task and at...,0
2,More people decided to have children in their ...,"currently, there are more and more people make...",The essay generally addresses the task by disc...,0
3,Many people believe that the current system of...,Once every month no private vehicles a day ca...,The essay fails to address the prompt effectiv...,0
4,Some people think that instead of preventing c...,"In today's time, human activities are having a...",The essay effectively addresses the given task...,0
...,...,...,...,...
9197,Men and women are different in terms of their ...,Owing to the different physical and mental abi...,The essay effectively addresses the prompt and...,5
9198,Some people think technology makes life comple...,There is no denying of the fact that for some ...,The candidate effectively addresses the given ...,5
9199,"In cities and towns all over the world, the hi...","For the past decades,traffic jam has been one ...",The essay effectively addresses the given task...,5
9200,Some people believe that teenagers should be r...,Doing voluntary jobs is one of many ways to co...,The candidate has adequately addressed the giv...,5


In [34]:
value_counts_df = df_sampled3["score"].value_counts().reset_index()
value_counts_df.columns = ["score", "count"]
value_counts_df = value_counts_df.sort_values(by="score", ascending=False).reset_index(drop=True)
value_counts_df

Unnamed: 0,score,count
0,5,291
1,4,1266
2,3,3364
3,2,2257
4,1,694
5,0,1330


In [35]:
dataset

Dataset({
    features: ['prompt', 'essay', 'text', 'score'],
    num_rows: 9202
})

batch=16, lr=2e-5, epoch=20

In [37]:
num_labels = 6

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("nickmuchi/distilroberta-finetuned-financial-text-classification")
model = RobertaForSequenceClassification.from_pretrained("nickmuchi/distilroberta-finetuned-financial-text-classification", num_labels=num_labels, ignore_mismatched_sizes=True)

# Tokenization function
def tokenize_function(examples):
    # Concatenate the input columns for each example in the batch
    combined_text = [
        p + " " + e + " " + t for p, e, t in zip(examples["prompt"], examples["essay"], examples["text"])
    ]
    # Tokenize the concatenated text
    return tokenizer(combined_text, padding="max_length", truncation=True, max_length=512)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["prompt", "essay", "text"])
tokenized_datasets = tokenized_datasets.rename_column("score", "labels")
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Get the labels from the tokenized dataset
labels = tokenized_datasets["labels"]

# Get the unique labels
unique_labels = np.unique(labels)

# Store the indices for each label
label_to_indices = {label: np.where(labels == label)[0] for label in unique_labels}

# Lists to hold the train and validation indices
train_indices = []
val_indices = []

# For each label, split the indices into train and validation
for label, indices in label_to_indices.items():
    # Shuffle the indices within each label to ensure random splitting
    np.random.shuffle(indices)
    
    # Split 80% for training, 20% for validation
    split_idx = int(0.8 * len(indices))
    train_indices.extend(indices[:split_idx])
    val_indices.extend(indices[split_idx:])

# Convert indices to tensors
train_indices = torch.tensor(train_indices)
val_indices = torch.tensor(val_indices)

# Create Subsets for train and validation datasets
train_dataset = Subset(tokenized_datasets, train_indices)
eval_dataset = Subset(tokenized_datasets, val_indices)

# Dataloaders
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
eval_dataloader = DataLoader(eval_dataset, batch_size=16)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 20
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Move model to device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Initialize lists to track training/validation losses and accuracies
train_losses = []
val_losses = []
val_f1_scores = []

# Training loop
progress_bar = tqdm(range(num_training_steps))

# Initialize F1 score metric (weighted-averaged for multi-class classification)
f1_metric = F1Score(task="multiclass", num_classes=num_labels, average="weighted").to(device)

for epoch in range(num_epochs):
    epoch_train_loss = 0
    epoch_val_loss = 0
    f1_metric.reset()
    model.train()

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        epoch_train_loss += loss.item()
        progress_bar.update(1)

    # Record training loss for the epoch
    train_losses.append(epoch_train_loss / len(train_dataloader))

    # Evaluate the model
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        loss = F.cross_entropy(logits, batch["labels"])

        epoch_val_loss += loss.item()
        f1_metric(predictions, batch["labels"])  # Update F1 metric with predictions

    # Record validation loss and accuracy
    val_losses.append(epoch_val_loss / len(eval_dataloader))
    val_f1 = f1_metric.compute().item()
    val_f1_scores.append(val_f1)

    print(f"Epoch {epoch + 1}/{num_epochs}: train loss {train_losses[-1]:.4f}, val loss {val_losses[-1]:.4f}, val f1 score {val_f1_scores[-1]:.4f}")

# Plotting function
def eval_plot(train_losses, val_losses, val_f1_scores):
    epochs = range(1, len(train_losses) + 1)

    plt.figure(figsize=(12, 6))

    # Training and validation loss
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, label="Training Loss")
    plt.plot(epochs, val_losses, label="Validation Loss")
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Validation Loss')

    # Validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(epochs, val_f1_scores, label="Validation F1")
    plt.xlabel('Epoch')
    plt.ylabel('F1')
    plt.legend()
    plt.title('Validation F1')

    plt.tight_layout()
    plt.show()

# Plot train loss, validation loss, validation accuracy
eval_plot(train_losses, val_losses, val_f1_scores)

# Print final validation accuracy
print(f"Final validation F1: {val_f1_scores[-1]:.4f}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at nickmuchi/distilroberta-finetuned-financial-text-classification and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9202 [00:00<?, ? examples/s]

  0%|          | 0/9200 [02:30<?, ?it/s]


Epoch 1/20: train loss 1.4157, val loss 1.3975, val f1 score 0.3998




Epoch 2/20: train loss 1.3175, val loss 1.3541, val f1 score 0.4180




Epoch 3/20: train loss 1.2456, val loss 1.3143, val f1 score 0.4424




Epoch 4/20: train loss 1.1555, val loss 1.3505, val f1 score 0.4361




Epoch 5/20: train loss 1.0694, val loss 1.3098, val f1 score 0.4648




Epoch 6/20: train loss 0.9667, val loss 1.4227, val f1 score 0.4597




Epoch 7/20: train loss 0.8566, val loss 1.5195, val f1 score 0.4532




Epoch 8/20: train loss 0.7557, val loss 1.6378, val f1 score 0.4506




Epoch 9/20: train loss 0.6589, val loss 1.8191, val f1 score 0.4467




Epoch 10/20: train loss 0.5693, val loss 1.8616, val f1 score 0.4514




Epoch 11/20: train loss 0.4804, val loss 1.9742, val f1 score 0.4515




Epoch 12/20: train loss 0.4185, val loss 2.1554, val f1 score 0.4398




KeyboardInterrupt: 