In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
pip install transformers




# Data Preparation

## Quick EDA

## Unique emotions

**Imbalance can be seen in the labels**

### Statistics about length of text

## Column Encoding

## Emotion Dataset

In [None]:
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df.reset_index(drop=True)  # Reset the index to ensure sequential values
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.df)
    def __getitem__(self, index):
        essay = str(self.df.loc[index, 'essay'])
        emotion = self.df.loc[index, 'emotion']
        encoding = self.tokenizer.encode_plus(
            essay,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'essay': essay,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'emotion': torch.tensor(emotion, dtype=torch.long)
        }

## Tokenizer

In [None]:
# create an instance of the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Model

In [None]:

import torch.nn as nn
from transformers import BertModel

class BertSentimentClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BertSentimentClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 128)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)  # Apply dropout once
        output = self.fc1(pooled_output)
        logits = self.fc2(output)

        return logits

In [None]:
bert_model_name = "bert-base-uncased"
#num_classes = train_data["emotion"].nunique()
#model = BertSentimentClassifier(bert_model_name, num_classes)

In [None]:
criterion = nn.CrossEntropyLoss()

# Training

In [None]:
def train(model, data_loader, optimizer, criterion, device):
    model.train()
    total_loss, total_accuracy = 0, 0
    for data in tqdm(data_loader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['emotion'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)

        loss = criterion(outputs, labels)
        total_loss += loss.item()

        logits = outputs.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        total_accuracy += flat_accuracy(logits, label_ids)

        loss.backward()
        optimizer.step()

    # calculate the average loss
    avg_loss = total_loss / len(data_loader)
    avg_acc = total_accuracy / len(data_loader)


    return avg_loss, avg_acc

In [None]:
def evaluate(model, data_loader, criterion, device):
    # set the model to eval mode
    model.eval()

    # initialize the loss and accuracy variables
    total_loss = 0
    total_accuracy = 0

    # iterate over the data loader
    for data in tqdm(data_loader):
        # move the inputs to the device
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['emotion'].to(device)

        # disable gradient computation
        with torch.no_grad():
            # get the model's predictions
            outputs = model(input_ids, attention_mask)

            # get the loss
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # calculate the number of correct predictions
            logits = outputs.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()

            total_accuracy += flat_accuracy(logits, label_ids)

    # calculate the average loss and accuracy
    avg_loss = total_loss / len(data_loader)
    accuracy = total_accuracy / len(data_loader)

    return avg_loss, accuracy

In [None]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
def calculate_metrics(model, dataloader, device):
    model.to(device)
    predictions, labels = [], []

    val_predictions, val_labels = [], []
    for data in val_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['emotion'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask)

        logits = outputs.detach().cpu().numpy()
        predicted_labels = np.argmax(logits, axis=1)
        val_predictions.extend(predicted_labels)
        val_labels.extend(labels.to('cpu').numpy())

    f1 = f1_score(val_labels, val_predictions, average='weighted')
    precision = precision_score(val_labels, val_predictions, average='weighted')
    recall = recall_score(val_labels, val_predictions, average='weighted')
    return f1, precision, recall

In [None]:
def label_encoder(data, column_name):
    """
    Encodes categorical data using LabelEncoder.

    Args:
        data: Pandas DataFrame containing the data to be encoded.
        column_name: Name of the column containing categorical data to be encoded.

    Returns:
        Pandas DataFrame with the encoded categorical data.
    """

    encoder = LabelEncoder()
    encoded_data = encoder.fit_transform(data[column_name])
    data[column_name] = encoded_data

    return data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold

from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
import matplotlib.pyplot as plt

# Load and split data
file_path = 'isear.csv'
df = pd.read_csv(file_path)
df = df.drop(columns=['Unnamed: 2'])
df = df.reset_index(drop=True)

# Perform K-fold cross-validation with 80:20 split
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
train_losses = []
val_losses = []
val_accuracies = []
prec_scores_per_epoch = []
rec_scores_per_epoch = []
f1_scores_per_epoch = []  # List to store F1 scores for each epoch

for train_index, val_index in kfold.split(df):
    train_data = df.iloc[train_index]
    val_data = df.iloc[val_index]

    # Prepare data for BERT model
    le = LabelEncoder()
    train_data.loc[:, 'emotion'] = le.fit_transform(train_data['emotion'])
    val_data.loc[:, 'emotion'] = le.transform(val_data['emotion'])
# set the batch size
    batch_size = 8

# set the maximum sequence length
    max_len = 150
    train_dataset = EmotionDataset(train_data, tokenizer, max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = EmotionDataset(val_data, tokenizer, max_len)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize and train the model
    num_classes = train_data["emotion"].nunique()
    model = BertSentimentClassifier(bert_model_name, num_classes)
    lr = 2e-5
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Initialize epoch-wise F1, precision, recall, and loss lists
    epoch_prec_scores = []
    epoch_rec_scores = []
    epoch_f1_scores = []
    epoch_val_losses = []
    epoch_val_acc  =[]
    num_epochs = 10

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}:")

        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)

        # Calculate F1, precision, and recall scores for validation data
        f1, precision, recall = calculate_metrics(model, val_loader, device)

        # Append epoch-wise results to lists
        epoch_prec_scores.append(precision)
        epoch_rec_scores.append(recall)
        epoch_f1_scores.append(f1)
        epoch_val_losses.append(val_loss)
        epoch_val_acc.append(val_acc)

        print(f"Training Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | F1 Score: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

    # Append epoch-wise results to lists of all results
    prec_scores_per_epoch.append(epoch_prec_scores)
    rec_scores_per_epoch.append(epoch_rec_scores)
    f1_scores_per_epoch.append(epoch_f1_scores)
    val_losses.append(epoch_val_losses)
    val_accuracies.append(epoch_val_acc)

# Calculate and print average metrics for each epoch
average_prec_scores = np.mean(prec_scores_per_epoch, axis=0)
average_rec_scores = np.mean(rec_scores_per_epoch, axis=0)
average_f1_scores = np.mean(f1_scores_per_epoch, axis=0)
average_val_losses = np.mean(val_losses, axis=0)
average_val_acc = np.mean(val_accuracies, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, 'emotion'] = le.fit_transform(train_data['emotion'])
  train_data.loc[:, 'emotion'] = le.fit_transform(train_data['emotion'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data.loc[:, 'emotion'] = le.transform(val_data['emotion'])
  val_data.loc[:, 'emotion'] = le.transform(val_data['emotion'])


Epoch 1:


100%|██████████| 752/752 [02:40<00:00,  4.68it/s]
100%|██████████| 188/188 [00:13<00:00, 13.61it/s]


Training Loss: 1.2685 | Train Acc: 0.5447 | Val Loss: 0.9404 | Val Acc: 0.6789 | F1 Score: 0.6759 | Precision: 0.7172 | Recall: 0.6789
Epoch 2:


100%|██████████| 752/752 [02:40<00:00,  4.69it/s]
100%|██████████| 188/188 [00:13<00:00, 13.59it/s]


Training Loss: 0.6806 | Train Acc: 0.7706 | Val Loss: 0.9038 | Val Acc: 0.6981 | F1 Score: 0.6946 | Precision: 0.7099 | Recall: 0.6981
Epoch 3:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.56it/s]


Training Loss: 0.4067 | Train Acc: 0.8639 | Val Loss: 1.0216 | Val Acc: 0.6948 | F1 Score: 0.6920 | Precision: 0.7072 | Recall: 0.6948
Epoch 4:


100%|██████████| 752/752 [02:40<00:00,  4.69it/s]
100%|██████████| 188/188 [00:13<00:00, 13.60it/s]


Training Loss: 0.2275 | Train Acc: 0.9295 | Val Loss: 1.0744 | Val Acc: 0.7141 | F1 Score: 0.7163 | Precision: 0.7274 | Recall: 0.7141
Epoch 5:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.59it/s]


Training Loss: 0.1295 | Train Acc: 0.9623 | Val Loss: 1.3110 | Val Acc: 0.6769 | F1 Score: 0.6794 | Precision: 0.7046 | Recall: 0.6769
Epoch 6:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.60it/s]


Training Loss: 0.1025 | Train Acc: 0.9706 | Val Loss: 1.4264 | Val Acc: 0.6908 | F1 Score: 0.6920 | Precision: 0.6975 | Recall: 0.6908
Epoch 7:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.61it/s]


Training Loss: 0.0771 | Train Acc: 0.9769 | Val Loss: 1.4209 | Val Acc: 0.7068 | F1 Score: 0.7058 | Precision: 0.7108 | Recall: 0.7068
Epoch 8:


100%|██████████| 752/752 [02:40<00:00,  4.69it/s]
100%|██████████| 188/188 [00:13<00:00, 13.64it/s]


Training Loss: 0.0610 | Train Acc: 0.9822 | Val Loss: 1.5509 | Val Acc: 0.6888 | F1 Score: 0.6897 | Precision: 0.7010 | Recall: 0.6888
Epoch 9:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.55it/s]


Training Loss: 0.0614 | Train Acc: 0.9807 | Val Loss: 1.6026 | Val Acc: 0.6955 | F1 Score: 0.6949 | Precision: 0.6985 | Recall: 0.6955
Epoch 10:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.56it/s]


Training Loss: 0.0647 | Train Acc: 0.9804 | Val Loss: 1.5707 | Val Acc: 0.7008 | F1 Score: 0.7005 | Precision: 0.7048 | Recall: 0.7008


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, 'emotion'] = le.fit_transform(train_data['emotion'])
  train_data.loc[:, 'emotion'] = le.fit_transform(train_data['emotion'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data.loc[:, 'emotion'] = le.transform(val_data['emotion'])
  val_data.loc[:, 'emotion'] = le.transform(val_data['emotion'])


Epoch 1:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.56it/s]


Training Loss: 1.2507 | Train Acc: 0.5559 | Val Loss: 0.9266 | Val Acc: 0.6791 | F1 Score: 0.6770 | Precision: 0.6840 | Recall: 0.6793
Epoch 2:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.57it/s]


Training Loss: 0.6645 | Train Acc: 0.7780 | Val Loss: 0.9222 | Val Acc: 0.6832 | F1 Score: 0.6830 | Precision: 0.6871 | Recall: 0.6833
Epoch 3:


100%|██████████| 752/752 [02:40<00:00,  4.69it/s]
100%|██████████| 188/188 [00:13<00:00, 13.61it/s]


Training Loss: 0.3942 | Train Acc: 0.8727 | Val Loss: 1.0638 | Val Acc: 0.6831 | F1 Score: 0.6798 | Precision: 0.6880 | Recall: 0.6833
Epoch 4:


100%|██████████| 752/752 [02:40<00:00,  4.69it/s]
100%|██████████| 188/188 [00:13<00:00, 13.56it/s]


Training Loss: 0.2261 | Train Acc: 0.9280 | Val Loss: 1.1946 | Val Acc: 0.6898 | F1 Score: 0.6903 | Precision: 0.6996 | Recall: 0.6900
Epoch 5:


100%|██████████| 752/752 [02:40<00:00,  4.69it/s]
100%|██████████| 188/188 [00:13<00:00, 13.56it/s]


Training Loss: 0.1323 | Train Acc: 0.9613 | Val Loss: 1.2532 | Val Acc: 0.6899 | F1 Score: 0.6905 | Precision: 0.6996 | Recall: 0.6900
Epoch 6:


100%|██████████| 752/752 [02:40<00:00,  4.69it/s]
100%|██████████| 188/188 [00:13<00:00, 13.57it/s]


Training Loss: 0.0767 | Train Acc: 0.9799 | Val Loss: 1.3592 | Val Acc: 0.6903 | F1 Score: 0.6915 | Precision: 0.6978 | Recall: 0.6906
Epoch 7:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.61it/s]


Training Loss: 0.0797 | Train Acc: 0.9738 | Val Loss: 1.5032 | Val Acc: 0.6846 | F1 Score: 0.6808 | Precision: 0.6846 | Recall: 0.6846
Epoch 8:


100%|██████████| 752/752 [02:40<00:00,  4.69it/s]
100%|██████████| 188/188 [00:13<00:00, 13.54it/s]


Training Loss: 0.0561 | Train Acc: 0.9820 | Val Loss: 1.6544 | Val Acc: 0.6918 | F1 Score: 0.6922 | Precision: 0.7001 | Recall: 0.6919
Epoch 9:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.57it/s]


Training Loss: 0.0533 | Train Acc: 0.9838 | Val Loss: 1.5751 | Val Acc: 0.6786 | F1 Score: 0.6818 | Precision: 0.6901 | Recall: 0.6786
Epoch 10:


100%|██████████| 752/752 [02:40<00:00,  4.69it/s]
100%|██████████| 188/188 [00:13<00:00, 13.58it/s]


Training Loss: 0.0533 | Train Acc: 0.9849 | Val Loss: 1.7062 | Val Acc: 0.6900 | F1 Score: 0.6885 | Precision: 0.6943 | Recall: 0.6900


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, 'emotion'] = le.fit_transform(train_data['emotion'])
  train_data.loc[:, 'emotion'] = le.fit_transform(train_data['emotion'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data.loc[:, 'emotion'] = le.transform(val_data['emotion'])
  val_data.loc[:, 'emotion'] = le.transform(val_data['emotion'])


Epoch 1:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.58it/s]


Training Loss: 1.2908 | Train Acc: 0.5335 | Val Loss: 0.9125 | Val Acc: 0.6888 | F1 Score: 0.6884 | Precision: 0.6955 | Recall: 0.6886
Epoch 2:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.60it/s]


Training Loss: 0.7095 | Train Acc: 0.7615 | Val Loss: 0.8797 | Val Acc: 0.7127 | F1 Score: 0.7103 | Precision: 0.7197 | Recall: 0.7126
Epoch 3:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.62it/s]


Training Loss: 0.4367 | Train Acc: 0.8560 | Val Loss: 0.9494 | Val Acc: 0.7048 | F1 Score: 0.7093 | Precision: 0.7241 | Recall: 0.7046
Epoch 4:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.58it/s]


Training Loss: 0.2606 | Train Acc: 0.9212 | Val Loss: 1.1134 | Val Acc: 0.6894 | F1 Score: 0.6870 | Precision: 0.6921 | Recall: 0.6893
Epoch 5:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.60it/s]


Training Loss: 0.1494 | Train Acc: 0.9593 | Val Loss: 1.2636 | Val Acc: 0.7007 | F1 Score: 0.7008 | Precision: 0.7115 | Recall: 0.7006
Epoch 6:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.59it/s]


Training Loss: 0.1058 | Train Acc: 0.9714 | Val Loss: 1.4151 | Val Acc: 0.6914 | F1 Score: 0.6949 | Precision: 0.7291 | Recall: 0.6913
Epoch 7:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.58it/s]


Training Loss: 0.0907 | Train Acc: 0.9731 | Val Loss: 1.4960 | Val Acc: 0.6881 | F1 Score: 0.6907 | Precision: 0.7103 | Recall: 0.6880
Epoch 8:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.59it/s]


Training Loss: 0.0717 | Train Acc: 0.9784 | Val Loss: 1.4944 | Val Acc: 0.6953 | F1 Score: 0.6918 | Precision: 0.7020 | Recall: 0.6953
Epoch 9:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.60it/s]


Training Loss: 0.0553 | Train Acc: 0.9835 | Val Loss: 1.5417 | Val Acc: 0.6934 | F1 Score: 0.6929 | Precision: 0.6980 | Recall: 0.6933
Epoch 10:


100%|██████████| 752/752 [02:40<00:00,  4.70it/s]
100%|██████████| 188/188 [00:13<00:00, 13.55it/s]


In [None]:
print("\nAverage Precision, Recall, F1 Score, and Validation Loss per Epoch:")
for i, (prec, rec, f1, val_loss,val_acc) in enumerate(zip(average_prec_scores, average_rec_scores, average_f1_scores, average_val_losses,average_val_acc)):
    print(f"Epoch {i + 1}:")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Acc: {val_acc:.4f}")



Average Precision, Recall, F1 Score, and Validation Loss per Epoch:
Epoch 1:
Precision: 0.6883
Recall: 0.6869
F1 Score: 0.6824
Validation Loss: 0.9099
Validation Acc: 0.6869
Epoch 2:
Precision: 0.7226
Recall: 0.7111
F1 Score: 0.7125
Validation Loss: 0.8542
Validation Acc: 0.7111
Epoch 3:
Precision: 0.7155
Recall: 0.7002
F1 Score: 0.7015
Validation Loss: 0.9725
Validation Acc: 0.7001
Epoch 4:
Precision: 0.7157
Recall: 0.7104
F1 Score: 0.7104
Validation Loss: 1.0610
Validation Acc: 0.7103
Epoch 5:
Precision: 0.7088
Recall: 0.6936
F1 Score: 0.6961
Validation Loss: 1.2578
Validation Acc: 0.6936
Epoch 6:
Precision: 0.7090
Recall: 0.6965
F1 Score: 0.6978
Validation Loss: 1.3372
Validation Acc: 0.6964
Epoch 7:
Precision: 0.7057
Recall: 0.6945
F1 Score: 0.6945
Validation Loss: 1.4280
Validation Acc: 0.6945
Epoch 8:
Precision: 0.7078
Recall: 0.6941
F1 Score: 0.6956
Validation Loss: 1.4890
Validation Acc: 0.6941
Epoch 9:
Precision: 0.7069
Recall: 0.6994
F1 Score: 0.7007
Validation Loss: 1.4721


In [None]:
sample_text = df['essay'][5874]
sample_text

'see toilet unclean'

In [None]:
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Teks: {sample_text}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

 Teks: see toilet unclean
   Tokens: ['see', 'toilet', 'uncle', '##an']
Token IDs: [2156, 11848, 4470, 2319]


In [None]:
encoding = tokenizer.encode_plus(
  sample_text,
  max_length=14,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  #pad_to_max_length=True,
  padding='max_length',
  #truncation=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)

encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
print(len(encoding['input_ids'][0]))
input_ids = encoding['input_ids'].to(device)
input_ids

14


tensor([[  101,  2156, 11848,  4470,  2319,   102,     0,     0,     0,     0,
             0,     0,     0,     0]], device='cuda:0')

In [None]:
print(len(encoding['attention_mask'][0]))
attention_mask = encoding['attention_mask'].to(device)
attention_mask

14


tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')

In [None]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['[CLS]',
 'see',
 'toilet',
 'uncle',
 '##an',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [None]:
outputs = model(
          input_ids = input_ids,
          attention_mask = attention_mask
      )

In [None]:
outputs

tensor([[-3.1357e-03,  7.3692e+00, -7.8224e-01, -2.4882e+00, -1.5418e+00,
         -6.3470e-01, -5.1442e-01]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
_, preds = torch.max(outputs, dim=1)

In [None]:
preds

tensor([1], device='cuda:0')

In [None]:
import torch.nn.functional as F
probs = F.softmax(outputs, dim=1)
probs

tensor([[6.2724e-04, 9.9819e-01, 2.8779e-04, 5.2263e-05, 1.3465e-04, 3.3354e-04,
         3.7617e-04]], device='cuda:0', grad_fn=<SoftmaxBackward0>)