<a href="https://colab.research.google.com/github/felipe-matsuoka123/bert-enem-regression/blob/main/BERT_ENEM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, AdamW, BertModel

from sklearn.metrics import mean_squared_error, cohen_kappa_score
from sklearn.model_selection import train_test_split

from math import sqrt
import numpy as np
import pandas as pd
import warnings
import logging

# Silence library chatter: drop every log record at WARNING severity or lower
# and suppress all Python warnings (this also hides deprecation notices).
logging.disable(logging.WARNING)
warnings.filterwarnings("ignore")
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def round_to_nearest_grade(value):
    """Snap ``value`` to the closest grade on the 0, 40, ..., 200 scale.

    When ``value`` is exactly halfway between two grades the lower grade is
    returned, because a candidate only replaces the current best when it is
    strictly closer.
    """
    best_grade = 0
    best_gap = abs(0 - value)
    for grade in range(40, 201, 40):
        gap = abs(grade - value)
        if gap < best_gap:
            best_grade, best_gap = grade, gap
    return best_grade

class EssayDataset(Dataset):
    """Torch dataset that tokenizes a (theme, essay) pair for each sample.

    Args:
        themes: Indexable collection of prompt themes, one per essay.
        essays: Indexable collection of essay texts.
        scores: Indexable collection of per-essay score vectors.
        tokenizer: Tokenizer exposing ``encode_plus`` (e.g. ``BertTokenizer``).
        max_len: Every sample is truncated or padded to exactly this many tokens.
    """

    def __init__(self, themes, essays, scores, tokenizer, max_len):
        self.themes = themes
        self.essays = essays
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of essays in the dataset."""
        return len(self.essays)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return its tensors.

        Returns:
            dict with the raw concatenated text (``theme_essay_text``), the
            flattened ``input_ids`` and ``attention_mask`` tensors of length
            ``max_len``, and the ``scores`` as a float32 tensor.
        """
        theme = str(self.themes[item])
        essay = str(self.essays[item])
        scores = self.scores[item]
        encoding = self.tokenizer.encode_plus(
            theme,
            essay,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
            # `pad_to_max_length=True` is deprecated; `padding='max_length'`
            # is the supported way to pad every sample to `max_length`.
            padding='max_length',
            return_attention_mask=True,
            return_overflowing_tokens=False,
            return_tensors='pt',
        )
        # NOTE: token_type_ids are requested from the tokenizer above but are
        # not part of the returned sample.
        return {
            'theme_essay_text': theme + " <SEP> " + essay,
            # The tokenizer returns (1, max_len) tensors; flatten to (max_len,)
            # so the DataLoader collates them into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'scores': torch.tensor(scores, dtype=torch.float),
        }

def calculate_total_qwk(predictions_cat, actuals_cat):
    """Quadratic weighted kappa over all criteria pooled together.

    Both inputs hold scores normalised by 200. They are rescaled to points and
    snapped to the grade grid 0, 40, ..., 200 before computing kappa.

    Rounding only to the nearest integer (as was done before) turns every
    distinct predicted integer into its own category, and because
    ``cohen_kappa_score`` weights disagreement by label *rank*, the quadratic
    penalty no longer reflects grade distance. Passing ``labels`` explicitly
    keeps the rank spacing equal to the grade spacing even when some grade
    never occurs in the data.

    NOTE(review): assumes true scores are multiples of 40, the same grid that
    ``round_to_nearest_grade`` uses — confirm against the dataset.

    Args:
        predictions_cat: Tensor of normalised predictions.
        actuals_cat: Tensor of normalised true scores, same shape.

    Returns:
        The pooled quadratic weighted kappa as a float.
    """
    grade_step, max_grade = 40, 200
    grade_labels = list(range(0, max_grade + grade_step, grade_step))

    def to_grades(normalised):
        # Rescale to points, snap to the nearest multiple of `grade_step`,
        # and clip predictions that fall outside the valid range.
        points = normalised.detach().cpu() * max_grade
        snapped = torch.round(points / grade_step) * grade_step
        return snapped.clamp(0, max_grade).to(torch.int64).reshape(-1).numpy()

    return cohen_kappa_score(
        to_grades(predictions_cat),
        to_grades(actuals_cat),
        labels=grade_labels,
        weights='quadratic',
    )

def calculate_qwk_scores(predictions_cat, actuals_cat):
    """Quadratic weighted kappa for each criterion (column) separately.

    Both inputs hold scores normalised by 200. They are rescaled to points and
    snapped to the grade grid 0, 40, ..., 200 before computing kappa, so that
    the categories are the actual grades rather than arbitrary integers, and
    ``labels`` is passed so the quadratic weights follow grade distance.

    NOTE(review): assumes true scores are multiples of 40, the same grid that
    ``round_to_nearest_grade`` uses — confirm against the dataset.

    Args:
        predictions_cat: 2-D tensor (samples x criteria) of normalised predictions.
        actuals_cat: 2-D tensor of normalised true scores, same shape.

    Returns:
        List with one kappa value per column, in column order.
    """
    grade_step, max_grade = 40, 200
    grade_labels = list(range(0, max_grade + grade_step, grade_step))

    def to_grades(normalised):
        # Rescale to points, snap to the nearest multiple of `grade_step`,
        # and clip predictions that fall outside the valid range.
        points = normalised.detach().cpu() * max_grade
        snapped = torch.round(points / grade_step) * grade_step
        return snapped.clamp(0, max_grade).to(torch.int64)

    predicted_grades = to_grades(predictions_cat)
    actual_grades = to_grades(actuals_cat)

    # Iterate over however many criteria the tensors carry instead of a fixed 5.
    return [
        cohen_kappa_score(
            predicted_grades[:, column].numpy(),
            actual_grades[:, column].numpy(),
            labels=grade_labels,
            weights='quadratic',
        )
        for column in range(predicted_grades.shape[1])
    ]

def validate(model, val_loader):
    """Compute the mean squared error of ``model`` over ``val_loader``.

    The error is averaged over every predicted value in the loader, so a
    shorter final batch is not over-weighted the way an average of per-batch
    means would be.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)``.
        val_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``scores`` tensors.

    Returns:
        The mean squared error as a Python float, or ``nan`` when the loader
        yields no samples.
    """
    model.eval()  # disable dropout for evaluation

    # Run on whatever device the model lives on rather than relying on a
    # notebook-level global; fall back to CPU for parameter-free modules.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    squared_error_sum = 0.0
    value_count = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(run_device)
            attention_mask = batch['attention_mask'].to(run_device)
            scores = batch['scores'].to(run_device)

            outputs = model(input_ids, attention_mask=attention_mask)
            batch_error = torch.nn.functional.mse_loss(
                outputs.squeeze(), scores.squeeze(), reduction='sum'
            )
            squared_error_sum += batch_error.item()
            value_count += scores.numel()

    if value_count == 0:
        return float('nan')
    return squared_error_sum / value_count
# Load the tokenizer that matches the 'neuralmind/bert-base-portuguese-cased' checkpoint;
# the EssayDataset instances built in a later cell receive this object.
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

In [None]:
# Read the essays and their prompts from Drive, then attach to every essay the
# prompt row whose `id` equals the essay's `prompt` value.
essays_df = pd.read_csv('/content/drive/MyDrive/CodingProjects/BERT_ENEM_grader/extended_essay-br.csv')
themes_df = pd.read_csv('/content/drive/MyDrive/CodingProjects/BERT_ENEM_grader/prompts.csv')
merged_df = essays_df.merge(themes_df, left_on='prompt', right_on='id')

# Divide the five criterion columns by the maximum score in one vectorised step.
max_score = 200
criteria = ['c1', 'c2', 'c3', 'c4', 'c5']
merged_df[criteria] = merged_df[criteria].div(max_score)

In [None]:
max_len = 512
batch_size = 16
test_size = 0.30
# Fixed seed so the train/validation/test partition is the same on every run.
# Without it each run draws a different split, which makes the reported
# metrics irreproducible.
# NOTE(review): a checkpoint trained under a different split may already have
# seen rows that land in this test set — confirm how `model.pt` was trained.
split_seed = 42

# Splitting the data: 70% train, then the remaining 30% halved into
# validation and test.
train_data, testing = train_test_split(merged_df, test_size=test_size, random_state=split_seed)
validation_data, test_data = train_test_split(testing, test_size=0.5, random_state=split_seed)
assert len(train_data) + len(validation_data) + len(test_data) == len(merged_df)

criteria_columns = ['c1', 'c2', 'c3', 'c4', 'c5']


def build_essay_dataset(frame):
    """Wrap one split of the merged frame in an EssayDataset."""
    return EssayDataset(
        frame['title_y'].tolist(),
        frame['essay'].tolist(),
        frame[criteria_columns].values.tolist(),
        tokenizer,
        max_len,
    )


train_dataset = build_essay_dataset(train_data)
test_dataset = build_essay_dataset(test_data)
validation_dataset = build_essay_dataset(validation_data)

# DataLoaders: only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size)

print(f"Train dataset size:{len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


Train dataset size:4603
Validation dataset size: 987
Test dataset size: 987


In [None]:
# NOTE(review): `tokenizer` is already created from this same checkpoint in an
# earlier cell, so this second load looks redundant.
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

class BERTTimbauRegression(nn.Module):
    """BERT encoder followed by dropout and a linear regression head.

    Args:
        model_name: Hugging Face checkpoint to load the encoder from.
        num_outputs: Number of continuous values predicted per input.
        dropout: Dropout probability applied to the pooled representation.

    The defaults reproduce the original fixed configuration (Portuguese BERT,
    5 outputs, dropout 0.3), so ``BERTTimbauRegression()`` is unchanged.
    """

    def __init__(self, model_name='neuralmind/bert-base-portuguese-cased',
                 num_outputs=5, dropout=0.3):
        super().__init__()
        # Load the pre-trained BERT encoder
        self.bert = BertModel.from_pretrained(model_name)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        # Linear head; its input width is read from the encoder config instead
        # of hard-coding 768, so other encoder sizes work as well.
        self.regressor = nn.Linear(self.bert.config.hidden_size, num_outputs)

    def forward(self, input_ids, attention_mask):
        """Return a ``(batch, num_outputs)`` tensor of raw regression outputs."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # `pooler_output` is the `[CLS]` hidden state after BERT's pooling
        # layer (a dense layer followed by tanh), not the raw hidden state.
        pooled_output = self.dropout(outputs.pooler_output)

        # Pass through the regressor
        return self.regressor(pooled_output)

# Restore the fine-tuned model from Drive. The checkpoint is a whole pickled
# module, so building a fresh BERTTimbauRegression first would only load the
# pretrained weights to throw them away; to train from the pretrained encoder
# instead, use `model = BERTTimbauRegression().to(device)`.
# `map_location` lets a checkpoint saved on GPU load on a CPU-only runtime, and
# `.to(device)` guarantees the model sits on the same device as the batches.
# SECURITY: torch.load unpickles arbitrary objects — only load trusted files.
# NOTE(review): on PyTorch releases where torch.load defaults to
# weights_only=True, loading a full pickled module needs weights_only=False.
model = torch.load(
    '/content/drive/MyDrive/CodingProjects/BERT_ENEM_grader/model.pt',
    map_location=device,
).to(device)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Fine-tune the model, recording the average training and validation loss per epoch.
training_losses = []
val_losses = []
num_epochs = 5
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

for epoch in range(num_epochs):
    model.train()  # validate() switches to eval mode, so re-enable dropout each epoch
    total_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        #Forward pass
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        scores = batch['scores'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = torch.nn.functional.mse_loss(outputs.squeeze(), scores.squeeze())
        # Accumulate a plain float: adding the tensor itself would keep the
        # autograd history of every batch alive and store tensors in
        # `training_losses` instead of numbers.
        total_loss += loss.item()

        #Backward pass
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = validate(model, validation_loader)
    training_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)
    print(f'Epoch {epoch}: Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}')

Epoch 0: Training Loss: 0.019248507916927338, Validation Loss: 0.020405121297845916
Epoch 1: Training Loss: 0.016650820150971413, Validation Loss: 0.018646912604209878
Epoch 2: Training Loss: 0.015080888755619526, Validation Loss: 0.016442165972905293
Epoch 3: Training Loss: 0.013749651610851288, Validation Loss: 0.019116276394455664
Epoch 4: Training Loss: 0.012879371643066406, Validation Loss: 0.017701562832019502


In [None]:
# Persist only the weights (the state_dict) to `pytorch_model.bin` in the
# current working directory.
# NOTE(review): the loading cell unpickles a whole module from `model.pt` on
# Drive, so this file is not what that cell reads back — confirm which
# checkpoint format and location are intended.
torch.save(model.state_dict(), 'pytorch_model.bin')

RuntimeError: ignored

In [None]:
# Run the model over `test_loader` without tracking gradients and collect the
# per-batch outputs and targets on the CPU.
model.eval()
predictions, actuals = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, scores = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'scores')
        )
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions.append(outputs.cpu())
        actuals.append(scores.cpu())

# Stack the per-batch tensors along the sample dimension.
predictions_cat = torch.cat(predictions, dim=0)
actuals_cat = torch.cat(actuals, dim=0)

In [None]:
# Per-criterion RMSE, with scores scaled back up by 200.
rmse_scores = []
for i in range(5):
    criterion_mse = mean_squared_error(actuals_cat[:, i] * 200, predictions_cat[:, i] * 200)
    rmse_scores.append(sqrt(criterion_mse))

# RMSE of the summed score across the criteria.
total_mse = mean_squared_error(actuals_cat.sum(axis=1) * 200, predictions_cat.sum(axis=1) * 200)
total_rmse = sqrt(total_mse)

# Agreement metrics: pooled over all criteria, then one value per criterion.
total_qwk = calculate_total_qwk(predictions_cat=predictions_cat, actuals_cat=actuals_cat)
qwk_scores = calculate_qwk_scores(predictions_cat=predictions_cat, actuals_cat=actuals_cat)

# Report the results.
print("RMSE Scores for each criterion:", rmse_scores)
print("QWK Scores for each criterion:", qwk_scores)
print("Total RMSE:", total_rmse)
print("Total QWK:", total_qwk)

RMSE Scores for each criterion: [21.765426601976255, 24.36610790330431, 25.30244302526685, 24.15164847424369, 34.02874438376969]
QWK Scores for each criterion: [0.7381248150407327, 0.7781586824270629, 0.7561296362364799, 0.8373269654106734, 0.7870129226701628]
Total RMSE: 90.95734319586298
Total QWK: 0.7893125063767017
