<a href="https://colab.research.google.com/github/dcsweezy/EE4483_Sentiment-Analysis-with-Varying-Model-Parameters/blob/main/bert_sentiment_output_Muru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face library
!pip install transformers

# Import libraries
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW  # <-- Corrected AdamW import
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive

# One shared seed for every RNG used in this notebook (PyTorch CPU, PyTorch CUDA, NumPy)
seed_val = 42
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
np.random.seed(seed_val)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:

# Make Google Drive visible inside the Colab runtime
drive.mount('/content/drive')

# Project folder on Drive; the trailing slash matters because file names are appended directly
base_path = '/content/drive/MyDrive/IE4483_Project/'

# Read the training and test JSON files into DataFrames
train_df, test_df = (
    pd.read_json(f"{base_path}{file_name}")
    for file_name in ('train.json', 'test.json')
)

print("Files loaded successfully from Google Drive!")

# Tokenizer belonging to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: Indexable sequence of review texts; each item is converted with ``str``.
        labels: Indexable sequence of integer class labels aligned with ``texts``,
            or ``None`` for unlabelled data (a dummy label ``0`` is returned then).
        tokenizer: Callable tokenizer accepting the Hugging Face ``__call__`` keyword
            arguments used in ``__getitem__`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every item is padded or truncated to.

    Raises:
        ValueError: If ``labels`` is provided but its length differs from ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Fail early instead of hitting an IndexError (or silently ignoring
        # surplus labels) in the middle of training.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        text = str(self.texts[idx])
        # Unlabelled data (e.g. the test set) gets a dummy label so batches keep one schema.
        label = self.labels[idx] if self.labels is not None else 0

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten it away
        # so the DataLoader can stack items into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

Mounted at /content/drive
Files loaded successfully from Google Drive!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:

# Wrap the labelled frame: 'reviews' supplies the text, 'sentiments' the class label.
full_dataset = ReviewDataset(
    texts=train_df['reviews'].values,
    labels=train_df['sentiments'].values, # Use 'sentiments' column
    tokenizer=tokenizer
)

# Hold out 10% of the labelled data for validation.
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Use a dedicated, explicitly seeded generator so the partition does not depend on
# how much of the global RNG stream earlier cells consumed. Re-running this cell
# then always produces the same train/validation split.
split_generator = torch.Generator().manual_seed(seed_val)
train_dataset, val_dataset = random_split(
    full_dataset, [train_size, val_size], generator=split_generator
)

BATCH_SIZE = 16
# Only the training data is shuffled; validation order is irrelevant to the metrics.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

print(f"Total training samples: {train_size}")
print(f"Total validation samples: {val_size}")

Total training samples: 6660
Total validation samples: 741


In [None]:

# Load the 'bert-base-uncased' checkpoint with a two-label sequence-classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Place the model's parameters on the device selected earlier
model.to(device)
print("Model loaded and moved to GPU.")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to GPU.


In [None]:

# Number of full passes over train_loader, and the resulting count of batches
EPOCHS = 3
total_steps = EPOCHS * len(train_loader)

# AdamW over all model parameters, starting at a learning rate of 3e-5
optimizer = AdamW(params=model.parameters(), lr=3e-5)

# Linear learning-rate schedule spanning total_steps, with no warmup steps
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tune for EPOCHS passes over train_loader; after each pass, score the model on val_loader.
print("Starting training...")

for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch + 1}/{EPOCHS} ---")

    # --- Training Phase ---
    model.train()
    total_train_loss = 0.0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Passing labels makes the model return the classification loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before stepping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    print(f"  Average training loss: {total_train_loss / len(train_loader):.4f}")

    # --- Validation Phase ---
    model.eval()
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Labels are not passed: only the logits are needed for the metrics
            logits = model(input_ids, attention_mask=attention_mask).logits

            val_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
            val_labels.extend(batch['labels'].tolist())

    # Score the whole validation set in one go. Averaging per-batch scores would
    # over-weight the smaller final batch, and a mean of per-batch weighted F1
    # values is not the weighted F1 of the full set.
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average='weighted')

    print(f"  Validation Accuracy: {val_accuracy:.4f}")
    print(f"  Validation F1 Score: {val_f1:.4f}")

# --- End of Training ---
print("\nTraining complete!")

Starting training...

--- Epoch 1/3 ---
  Average training loss: 0.2092
  Validation Accuracy: 0.9681
  Validation F1 Score: 0.9661

--- Epoch 2/3 ---
  Average training loss: 0.0853
  Validation Accuracy: 0.9628
  Validation F1 Score: 0.9600

--- Epoch 3/3 ---
  Average training loss: 0.0355
  Validation Accuracy: 0.9668
  Validation F1 Score: 0.9656

Training complete!


In [None]:
print("Starting prediction on test set...")

# Wrap the test reviews; labels=None makes the dataset emit a dummy label per item
test_dataset = ReviewDataset(
    texts=test_df['reviews'].values,
    labels=None,
    tokenizer=tokenizer
)
# No shuffling, so predictions come out in the same order as the rows of test_df
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# --- Prediction Phase ---
model.eval()  # Evaluation mode
all_predictions = []

with torch.no_grad():
    for batch in test_loader:
        batch_logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        # Predicted class = index of the largest logit in each row
        all_predictions.extend(batch_logits.argmax(dim=1).cpu().numpy())

print("Predictions generated.")

# --- Save Results to Google Drive ---

# Single-column frame: one predicted label per test review, under 'sentiment'
submission_df = pd.DataFrame({'sentiment': all_predictions})

# Write the CSV (without the index column) next to the input data on Drive
save_path = f"{base_path}submission.csv"
submission_df.to_csv(save_path, index=False)

print(f"\nSuccessfully saved submission.csv to: {save_path}")

Starting prediction on test set...
Predictions generated.

Successfully saved submission.csv to: /content/drive/MyDrive/IE4483_Project/submission.csv


Predictions generated.
submission.csv file saved to your Google Drive at: /content/drive/MyDrive/IE4483_Project/submission.csv
