In [None]:
import time
from collections import Counter
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertForSequenceClassification, AdamW, BertTokenizer, DistilBertTokenizer, DistilBertModel

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

seed = 42

df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df_200 = df.sample(n=200, random_state=seed).reset_index(drop=True)

df_100 = df_200.iloc[:100].reset_index(drop=True)
df_test = df_200.iloc[100:].reset_index(drop=True)
print("100-sample dataset:", df_100.shape)

100-sample dataset: (100, 2)


In [None]:
def clean_data(data_df):
    data_df['text_length'] = data_df['review'].apply(len)
    data_df['target'] = data_df['sentiment'].map({'positive': 1, 'negative': 0})
    all_words = [word for review in data_df["review"] for word in review.split(" ")]
    vocab = Counter(all_words)
    counts = list(vocab.values())
    data_df["word_count"] = data_df["review"].apply(lambda x: len(x.split(" ")))
    data_df["word_count"].describe()
    data_df[['text_length', 'target']].groupby('target').mean()['text_length']
    data_df[data_df["target"] == 0]["review"].values[1]
    def remove_hashtag(text):
        pattern= "#[\w\d]+"
        return re.sub(pattern, "", text)

    def remove_url(text):
        pattern = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
        return re.sub(pattern, "", text)
        
    data_df['text_without_hastag'] = data_df['review'].apply(remove_hashtag)
    data_df['text_without_url'] = data_df['text_without_hastag'].apply(remove_url)
    pattern = r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'

    def count_non_english(text):
        return len(re.findall(pattern, text))

    data_df['non_english_count'] = data_df['text_without_url'].apply(count_non_english)
    def remove_non_english(text):
        pattern = r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
        return re.sub(pattern, "", text)
    data_df['text_without_non_english'] = data_df['text_without_url'].apply(remove_non_english)
    def remove_markdown(text):
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text) 
        text = re.sub(r'\*(.*?)\*', r'\1', text)
        return text

    data_df['cleaned'] = data_df['text_without_non_english'].apply(remove_markdown)

    train_df, val_df= train_test_split(data_df[['cleaned', 'target']], test_size=0.2, random_state=90,shuffle=True, stratify=data_df['target'])
    train_df, val_df = train_df.reset_index(drop=True), val_df.reset_index(drop=True)
    return train_df, val_df

train_df_100, val_df_100 = clean_data(df_100)

In [None]:
#Load the Dataset and Tokenize
class SentimentDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_length=128):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        text = self.reviews[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True, 
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

max_length = 256
train_dataset_100 = SentimentDataset(train_df_100['cleaned'].to_list(), train_df_100['target'].to_list(), tokenizer, max_length=max_length)
val_dataset_100 = SentimentDataset(val_df_100['cleaned'].to_list(), val_df_100['target'].to_list(), tokenizer, max_length=max_length)

batch_size = 32
train_dataloader_100 = DataLoader(train_dataset_100, batch_size=batch_size, shuffle=True)
val_dataloader_100 = DataLoader(val_dataset_100, batch_size=batch_size)

In [None]:
def train_model(train_dataloader, val_dataloader, num_epochs=50, patience=5, clip_value=1.0, learning_rate=2e-5, save_name="classifier.pt"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2, 
        output_attentions=False,
        output_hidden_states=False
    )
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
    criterion = nn.CrossEntropyLoss()
    best_val_loss = float('inf')
    epochs_without_improvement = 0

    # Tracking metrics
    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []
    train_f1s, val_f1s = [], []
    epoch_times = []

    for epoch in range(num_epochs):
        start_time = time.time()
        model.train()
        total_train_loss = 0
        all_train_preds, all_train_labels = [], []

        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            total_train_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip_value)  # Gradient clipping
            optimizer.step()

            # Collect predictions for accuracy calculation
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()
            train_f1 = f1_score(preds, labels, average="weighted")
            all_train_preds.extend(preds)
            all_train_labels.extend(labels)

        avg_train_loss = total_train_loss / len(train_dataloader)
        train_accuracy = accuracy_score(all_train_labels, all_train_preds)

        train_losses.append(avg_train_loss)
        train_accuracies.append(train_accuracy)
        train_f1s.append(train_f1)

        # Validation phase
        model.eval()
        total_val_loss = 0
        all_val_preds, all_val_labels = [], []

        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
                total_val_loss += loss.item()

                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                labels = labels.cpu().numpy()
                val_f1 = f1_score(preds, labels, average="weighted")
                all_val_preds.extend(preds)
                all_val_labels.extend(labels)

        avg_val_loss = total_val_loss / len(val_dataloader)
        val_accuracy = accuracy_score(all_val_labels, all_val_preds)

        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)
        val_f1s.append(val_f1)

        end_time = time.time()
        epoch_time = end_time - start_time
        epoch_times.append(epoch_time)

        print(f"Epoch {epoch+1}/{num_epochs} - Time: {epoch_time:.2f}s")
        print(f"Train Loss: {avg_train_loss:.3f}, Train Accuracy: {train_accuracy:.3f}, Train F1: {train_f1:.3f}")
        print(f"Val Loss: {avg_val_loss:.3f}, Val Accuracy: {val_accuracy:.3f}, Val F1: {val_f1:.3f}")
        scheduler.step(avg_val_loss)

        # Early stopping check
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0
            torch.save(model.state_dict(), save_name)
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping triggered.")
            break

    # Training complete
    print("Training complete.")
    print(f"Average epoch time: {sum(epoch_times) / len(epoch_times):.2f} seconds")
    print(f"Total training time: {sum(epoch_times):.2f} seconds")

    return {
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_accuracies': train_accuracies,
        'val_accuracies': val_accuracies
    }

In [None]:
train_results_100 = train_model(train_dataloader_100, val_dataloader_100, save_name="encoder_100.pt")
print(train_results_100["train_losses"])
print(train_results_100["val_losses"])
print(train_results_100["train_accuracies"])
print(train_results_100["val_accuracies"])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50 - Time: 4.70s
Train Loss: 0.698, Train Accuracy: 0.500, Train F1: 0.431
Val Loss: 0.708, Val Accuracy: 0.400, Val F1: 0.406
Epoch 2/50 - Time: 4.68s
Train Loss: 0.597, Train Accuracy: 0.750, Train F1: 0.684
Val Loss: 0.697, Val Accuracy: 0.600, Val F1: 0.617
Epoch 3/50 - Time: 4.73s
Train Loss: 0.530, Train Accuracy: 0.762, Train F1: 0.754
Val Loss: 0.684, Val Accuracy: 0.650, Val F1: 0.658
Epoch 4/50 - Time: 4.70s
Train Loss: 0.427, Train Accuracy: 0.900, Train F1: 0.868
Val Loss: 0.679, Val Accuracy: 0.650, Val F1: 0.651
Epoch 5/50 - Time: 4.72s
Train Loss: 0.345, Train Accuracy: 0.988, Train F1: 1.000
Val Loss: 0.774, Val Accuracy: 0.600, Val F1: 0.617
Epoch 6/50 - Time: 4.68s
Train Loss: 0.279, Train Accuracy: 0.950, Train F1: 1.000
Val Loss: 0.773, Val Accuracy: 0.600, Val F1: 0.617
Epoch 7/50 - Time: 4.70s
Train Loss: 0.224, Train Accuracy: 1.000, Train F1: 1.000
Val Loss: 0.766, Val Accuracy: 0.450, Val F1: 0.463
Epoch 8/50 - Time: 4.69s
Train Loss: 0.215, Train Accur

In [None]:
class Encoder(nn.Module):
    def __init__(self, bert_encoder, hidden_dim, z_dim):
        super(Encoder, self).__init__()
        self.bert = bert_encoder
        self.hidden2mean = nn.Linear(hidden_dim, z_dim)
        self.hidden2logvar = nn.Linear(hidden_dim, z_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0, :]
        mean = self.hidden2mean(hidden_state)
        logvar = self.hidden2logvar(hidden_state)
        return mean, logvar
    
class Decoder(nn.Module):
    def __init__(self, z_dim, hidden_dim, vocab_size):
        super(Decoder, self).__init__()
        self.fc = nn.Linear(z_dim, hidden_dim)
        self.embedding = nn.Embedding(vocab_size, hidden_dim)  # Embedding layer for input tokens
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, vocab_size)

    def forward(self, z, target_ids=None, teacher_forcing_ratio=0.5):
        h = torch.tanh(self.fc(z)).unsqueeze(0)  # Initial hidden state from latent vector z
        batch_size = z.size(0)
        max_length = target_ids.size(1) if target_ids is not None else 20  # Set max length

        # Initialize output tensor to store logits
        outputs = torch.zeros(batch_size, max_length, self.output_layer.out_features).to(z.device)
        
        # Initialize input token (you may replace this with the start token if available)
        input_token = torch.zeros(batch_size, 1, hidden_dim).to(z.device)

        for t in range(max_length):
            output, h = self.gru(input_token, h)
            output_logits = self.output_layer(output.squeeze(1))
            outputs[:, t, :] = output_logits
            
            # Teacher forcing: use ground truth with probability teacher_forcing_ratio
            if target_ids is not None and torch.rand(1).item() < teacher_forcing_ratio:
                input_token = self.embedding(target_ids[:, t]).unsqueeze(1)
            else:
                _, top_token = output_logits.max(dim=1)
                input_token = self.embedding(top_token).unsqueeze(1) 
        
        return outputs


class SentenceVAE(nn.Module):
    def __init__(self, encoder, decoder, z_dim):
        super(SentenceVAE, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.z_dim = z_dim

    def sample_z(self, mean, logvar):
        # Reparameterization trick
        std = torch.exp(0.5 * logvar)  # Standard deviation
        epsilon = torch.randn_like(std)  # Sample noise
        z = mean + std * epsilon  # Sample z
        return z

    def forward(self, input_ids, attention_mask, target_ids=None, teacher_forcing_ratio=1.0):
        # Encode to obtain mean and logvar
        mean, logvar = self.encoder(input_ids, attention_mask)
        z = self.sample_z(mean, logvar)
        # Decode the latent vector z, using target_ids and teacher_forcing_ratio if provided
        recon_x = self.decoder(z, target_ids=target_ids, teacher_forcing_ratio=teacher_forcing_ratio)
        return recon_x, mean, logvar

def sample_from_logits(logits, temperature=1.0):
    logits = logits / temperature 
    probabilities = torch.softmax(logits, dim=-1) 
    return torch.multinomial(probabilities, 1).squeeze(-1)

def load_state_dict(model, filepath):
    state_dict = torch.load(filepath, map_location=torch.device('cpu'))
    new_state_dict = {}
    for key, value in state_dict.items():
        new_key = key.replace("module.", "") if key.startswith("module.") else key
        new_state_dict[new_key] = value
    model.load_state_dict(new_state_dict)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
hidden_dim = 768
z_dim = 16  # Latent space dimensionality
vocab_size = tokenizer.vocab_size

encoder = Encoder(distilbert_encoder, hidden_dim, z_dim)
decoder = Decoder(z_dim, hidden_dim, vocab_size)
model = SentenceVAE(encoder, decoder, z_dim=16)  # Adjust based on your model's structure

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
load_state_dict(model, "/kaggle/input/best-sen/pytorch/default/1/best_sentence_model (3).pt")
model.eval()

  state_dict = torch.load(filepath, map_location=torch.device('cpu'))


SentenceVAE(
  (encoder): Encoder(
    (bert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn): FFN(
              (dropout):

In [None]:
def generate_variations(model, input_text, max_length=256, temperature=0.7, top_k=50, num_variations=4, perturb_scale=0.1):
    encoding = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        mean, logvar = model.encoder(input_ids, attention_mask)
        
        variations = []
        for _ in range(num_variations):
            noise = torch.randn_like(mean) * perturb_scale
            z = model.sample_z(mean, logvar) + noise
            
            generated_ids = [tokenizer.cls_token_id]
            input_token = model.decoder.embedding(torch.tensor([[tokenizer.cls_token_id]]).to(device))
            h = torch.tanh(model.decoder.fc(z)).unsqueeze(0)
            
            for _ in range(max_length):
                output, h = model.decoder.gru(input_token, h)
                logits = model.decoder.output_layer(output.squeeze(1)) / temperature

                k = min(top_k, logits.size(-1))
                top_k_values, top_k_indices = torch.topk(logits, k)
                probabilities = F.softmax(top_k_values, dim=-1)
                
                next_token_index = torch.multinomial(probabilities, 1).item()
                next_token_id = top_k_indices[0, next_token_index].item()

                generated_ids.append(next_token_id)
                if next_token_id == tokenizer.sep_token_id:
                    break

                input_token = model.decoder.embedding(torch.tensor([[next_token_id]]).to(device))

            generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
            variations.append(generated_text)
        
    return variations

# Dataset augmentation function
def augment_dataset(df, num_variations=4, max_length=256, temperature=0.8, top_k=50, perturb_scale=0.1):
    augmented_texts = []
    augmented_labels = []

    for _, row in df.iterrows():
        input_text = row['review']
        label = row['sentiment']

        variations = generate_variations(
            model, input_text, max_length=max_length,
            temperature=temperature, top_k=top_k, 
            num_variations=num_variations, perturb_scale=perturb_scale
        )

        augmented_texts.extend(variations)
        augmented_labels.extend([label] * num_variations)

    augmented_df = pd.DataFrame({'review': augmented_texts, 'sentiment': augmented_labels})
    combined_df = pd.concat([df, augmented_df], ignore_index=True)
    
    return combined_df

In [None]:
augmented_df_400 = augment_dataset(df_100, num_variations=1)
augmented_df_1000 = augment_dataset(df_100, num_variations=4)

In [27]:
augmented_train_df_400, augmented_val_df_400 = clean_data(augmented_df_400)
augmented_train_df_1000, augmented_val_df_1000 = clean_data(augmented_df_1000)

max_length = 256
augmented_train_dataset_400 = SentimentDataset(augmented_train_df_400['cleaned'], augmented_train_df_400['target'], tokenizer)
augmented_val_dataset_400 = SentimentDataset(augmented_val_df_400['cleaned'], augmented_val_df_400['target'], tokenizer)
augmented_train_dataset_1000 = SentimentDataset(augmented_train_df_1000['cleaned'], augmented_train_df_1000['target'], tokenizer)
augmented_val_dataset_1000 = SentimentDataset(augmented_val_df_1000['cleaned'], augmented_val_df_1000['target'], tokenizer)

batch_size = 32
augmented_train_dataloader_400 = DataLoader(augmented_train_dataset_400, batch_size=batch_size)
augmented_val_dataloader_400 = DataLoader(augmented_val_dataset_400, batch_size=batch_size)
augmented_train_dataloader_1000 = DataLoader(augmented_train_dataset_1000, batch_size=batch_size)
augmented_val_dataloader_1000 = DataLoader(augmented_val_dataset_1000, batch_size=batch_size)

In [28]:
augmented_train_results_400 = train_model(augmented_train_dataloader_400, augmented_val_dataloader_400, save_name="augmented_400_encoder")
print(augmented_train_results_400["train_losses"])
print(augmented_train_results_400["val_losses"])
print(augmented_train_results_400["train_accuracies"])
print(augmented_train_results_400["val_accuracies"])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50 - Time: 4.39s
Train Loss: 0.739, Train Accuracy: 0.444, Train F1: 0.564
Val Loss: 0.703, Val Accuracy: 0.525, Val F1: 0.385
Epoch 2/50 - Time: 4.39s
Train Loss: 0.660, Train Accuracy: 0.606, Train F1: 0.614
Val Loss: 0.701, Val Accuracy: 0.475, Val F1: 0.477
Epoch 3/50 - Time: 4.44s
Train Loss: 0.620, Train Accuracy: 0.669, Train F1: 0.699
Val Loss: 0.691, Val Accuracy: 0.625, Val F1: 0.750
Epoch 4/50 - Time: 4.46s
Train Loss: 0.533, Train Accuracy: 0.863, Train F1: 0.907
Val Loss: 0.687, Val Accuracy: 0.625, Val F1: 0.767
Epoch 5/50 - Time: 4.37s
Train Loss: 0.435, Train Accuracy: 0.938, Train F1: 0.877
Val Loss: 0.678, Val Accuracy: 0.625, Val F1: 0.767
Epoch 6/50 - Time: 4.45s
Train Loss: 0.356, Train Accuracy: 0.944, Train F1: 0.938
Val Loss: 0.706, Val Accuracy: 0.550, Val F1: 0.767
Epoch 7/50 - Time: 4.41s
Train Loss: 0.253, Train Accuracy: 0.981, Train F1: 0.969
Val Loss: 0.715, Val Accuracy: 0.550, Val F1: 0.767
Epoch 8/50 - Time: 4.43s
Train Loss: 0.188, Train Accur

In [29]:
augmented_train_results_1000 = train_model(augmented_train_dataloader_1000, augmented_val_dataloader_1000, save_name="augmented_1000_encoder")
print(augmented_train_results_1000["train_losses"])
print(augmented_train_results_1000["val_losses"])
print(augmented_train_results_1000["train_accuracies"])
print(augmented_train_results_1000["val_accuracies"])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50 - Time: 9.86s
Train Loss: 0.710, Train Accuracy: 0.502, Train F1: 0.625
Val Loss: 0.707, Val Accuracy: 0.520, Val F1: 0.000
Epoch 2/50 - Time: 9.81s
Train Loss: 0.686, Train Accuracy: 0.540, Train F1: 0.571
Val Loss: 0.735, Val Accuracy: 0.540, Val F1: 0.000
Epoch 3/50 - Time: 9.73s
Train Loss: 0.648, Train Accuracy: 0.605, Train F1: 0.600
Val Loss: 0.827, Val Accuracy: 0.520, Val F1: 0.000
Epoch 4/50 - Time: 9.69s
Train Loss: 0.590, Train Accuracy: 0.705, Train F1: 0.937
Val Loss: 0.951, Val Accuracy: 0.480, Val F1: 0.000
Epoch 5/50 - Time: 9.71s
Train Loss: 0.540, Train Accuracy: 0.748, Train F1: 0.882
Val Loss: 0.756, Val Accuracy: 0.520, Val F1: 0.100
Epoch 6/50 - Time: 9.72s
Train Loss: 0.426, Train Accuracy: 0.860, Train F1: 1.000
Val Loss: 0.766, Val Accuracy: 0.560, Val F1: 0.333
Early stopping triggered.
Training complete.
Average epoch time: 9.75 seconds
Total training time: 58.51 seconds
[0.7104106958095844, 0.6863334133074834, 0.6479951326663678, 0.58983260163894

In [None]:
def evaluate_model_on_test_dataset(dataloader, model_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()
    
    # Track metrics
    all_preds = []
    all_labels = []
    total_loss = 0
    
    # Evaluate
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            # Calculate loss for each batch
            loss = F.cross_entropy(logits, labels)
            total_loss += loss.item()
            
            # Calculate predictions
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Calculate average loss, accuracy, and F1 score
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")

    # Output metrics
    print(f"Validation Loss: {avg_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")
    
    return avg_loss, accuracy, f1, all_preds, all_labels

In [None]:
_, test_df_100 = clean_data(df_test)
test_dataset_100 = SentimentDataset(test_df_100['cleaned'].to_list(), test_df_100['target'].to_list(), tokenizer, max_length=max_length)
test_dataloader_100 = DataLoader(test_dataset_100, batch_size=batch_size)


val_loss, val_accuracy, val_f1, val_preds, val_labels = evaluate_model_on_test_dataset(test_dataloader_100, "/kaggle/working/augmented_400_encoder")
print(f"Validation Accuracy on initial 100-sample dataset: {val_accuracy:.4f}")

val_loss, val_accuracy, val_f1, val_preds, val_labels = evaluate_model_on_test_dataset(test_dataloader_100, "/kaggle/working/augmented_1000_encoder")
print(f"Validation Accuracy on initial 1000-sample dataset: {val_accuracy:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path))


Validation Loss: 0.5492
Validation Accuracy: 0.6500
Validation F1 Score: 0.6267
Validation Accuracy on initial 200-sample dataset: 0.6500


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path))


Validation Loss: 0.6641
Validation Accuracy: 0.8500
Validation F1 Score: 0.8496
Validation Accuracy on initial 1000-sample dataset: 0.8500
