In [None]:
import pandas as pd

# load processed datasets
train_data = pd.read_csv("../Dataset/train_preprocessed.csv")
val_data = pd.read_csv("../Dataset/val_preprocessed.csv")
test_data = pd.read_csv("../Dataset/test_preprocessed.csv")

print(train_data.head())

   PhraseId  SentenceId                             Phrase  Sentiment  \
0     18674         819                             paulin          2   
1     15525         665                             tri go          2   
2    119919        6413                          week live          2   
3     17938         781                essenti collect bit          2   
4     98852        5185  director fake backdrop state pace          1   

   words_num  
0          1  
1          3  
2          4  
3          5  
4         11  


In [2]:
from transformers import BertTokenizer

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_data(df):
    return tokenizer(
        df["Phrase"].tolist(),  # Ensure we're tokenizing the "Phrase" column
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

# set the Phrase column as str
train_data["Phrase"] = train_data["Phrase"].astype(str)
val_data["Phrase"] = val_data["Phrase"].astype(str)
test_data["Phrase"] = test_data["Phrase"].astype(str)

# Tokenize train, val, and test data
train_encodings = tokenize_data(train_data)
val_encodings = tokenize_data(val_data)
test_encodings = tokenize_data(test_data)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}  # Tokenized input
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)  # Labels
        return item


In [4]:
# Convert Sentiment labels to tensors
train_labels = torch.tensor(train_data["Sentiment"].values, dtype=torch.long)
val_labels = torch.tensor(val_data["Sentiment"].values, dtype=torch.long)
test_labels = torch.tensor(test_data["Sentiment"].values, dtype=torch.long)

# Create datasets using the tokenized encodings and labels
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move labels to GPU
train_labels = train_labels.to(device)
val_labels = val_labels.to(device)
test_labels = test_labels.to(device)


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
)

# Move model to GPU
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.3, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:
import transformers
import accelerate

print("Torch version:", torch.__version__)
print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)

Torch version: 2.5.1+cu121
Transformers version: 4.49.0
Accelerate version: 0.28.0


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="Model/",
    num_train_epochs=7,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="logs/",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=2,
    max_grad_norm=1.0,
    load_best_model_at_end=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8), None),  # Custom optimizer
)




In [None]:
training_args = TrainingArguments(
    gradient_accumulation_steps=2  # Effective batch size increases
)

from transformers import EarlyStoppingCallback
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))


NameError: name 'TrainingArguments' is not defined

In [3]:
trainer.train()
model.save_pretrained("../Model/sentiment_bert", save_config=True)
tokenizer.save_pretrained("../Model/sentiment_bert")

NameError: name 'trainer' is not defined

In [10]:
results = trainer.evaluate(test_dataset)
print(results)

  item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)  # Labels


{'eval_loss': 0.9135347604751587, 'eval_runtime': 102.0823, 'eval_samples_per_second': 272.937, 'eval_steps_per_second': 34.12, 'epoch': 7.0}


In [11]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}  # Move to GPU if available
    outputs = model(**inputs)
    prediction = outputs.logits.argmax().item()  # Get the predicted class
    return prediction

sentence = "This movie was absolutely amazing!"
print(f"Predicted Sentiment: {predict_sentiment(sentence)}")


Predicted Sentiment: 4
