1. Data Loading
Assume the training and test sets are saved as CSV files named train_data.csv and test_data.csv:



In [1]:
import pandas as pd

# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train_data.csv", "test_data.csv")
)

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,sentence,sentiment,encoded,lengths
0,awww that s a bummer you shoulda got david car...,0,"[101, 22091, 2860, 2860, 2008, 1055, 1037, 263...",128
1,is upset that he can t update his facebook by ...,0,"[101, 2003, 6314, 2008, 2002, 2064, 1056, 1065...",128
2,i dived many times for the ball managed to sav...,0,"[101, 1045, 11529, 2094, 2116, 2335, 2005, 199...",128
3,my whole body feels itchy and like its on fire,0,"[101, 2026, 2878, 2303, 5683, 2009, 11714, 199...",128
4,no it s not behaving at all i m mad why am i h...,0,"[101, 2053, 2009, 1055, 2025, 2022, 3270, 6455...",128


2. Tokenizer Initialization
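The dataset already contains encoded tokens, but to guarantee consistency with the model we re-tokenize the sentences with the tokenizer that matches the checkpoint (bert-base-uncased), padding or truncating every sentence to 128 tokens: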


In [10]:
from transformers import AutoTokenizer
from tqdm import tqdm

# Register tqdm's pandas integration so the `progress_*` methods are available.
tqdm.pandas()

# Load the tokenizer published with the `bert-base-uncased` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

def tokenize_and_pad(sentence, max_length=128):
    """Encode one sentence as a fixed-length list of token IDs.

    Uses the notebook-level ``tokenizer``. Sentences shorter than
    ``max_length`` tokens are padded up to that length; longer ones are
    truncated to it.

    Args:
        sentence: Text to encode.
        max_length: Target sequence length in tokens. Defaults to 128, the
            length this notebook uses throughout.

    Returns:
        The ``input_ids`` entry of the tokenizer output, as a plain Python
        list of integers.
    """
    encoding = tokenizer(
        sentence,
        padding='max_length',   # pad every sequence up to max_length
        truncation=True,        # cut sequences longer than max_length
        max_length=max_length,
        return_tensors=None,    # keep plain Python lists, not framework tensors
    )
    return encoding['input_ids']

# Re-encode every training sentence (with a tqdm progress bar) and store the
# resulting token-ID lists in the `encoded` column.
train_data['encoded'] = (
    train_data['sentence']
    .progress_map(tokenize_and_pad)
)

100%|██████████| 1218942/1218942 [01:16<00:00, 15974.45it/s]


In [5]:
from transformers import BertTokenizer


# Load the `bert-base-uncased` tokenizer through the BERT-specific class.
# NOTE: this rebinds the notebook-level name `tokenizer`.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
)

3. Dataset Preparation for PyTorch
You need to convert the data into a PyTorch-compatible dataset.

In [11]:
import torch
from torch.utils.data import Dataset

class SentimentAnalysisDataset(Dataset):
    """Map-style dataset that serves token IDs and sentiment labels as tensors.

    Args:
        data: Column-addressable table (e.g. a pandas DataFrame) providing
            the columns ``encoded`` (one token-ID sequence per row),
            ``sentiment`` (integer class label) and ``lengths``.

    Each item is a dict with the keys ``input_ids`` and ``labels``, both
    ``torch.long`` tensors.
    """

    def __init__(self, data):
        self.encoded = list(data['encoded'])
        self.labels = list(data['sentiment'])
        # Kept for callers that want it; __getitem__ does not read it.
        self.lengths = list(data['lengths'])

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    @staticmethod
    def _as_id_sequence(raw_ids):
        """Return ``raw_ids`` as a sequence of ints.

        A list column written to CSV comes back as text such as
        ``"[101, 2026, 102]"``; such strings are parsed into a list here.
        Anything that is not a string is returned unchanged.
        """
        if isinstance(raw_ids, str):
            import ast  # local import: only needed for CSV round-tripped rows

            return ast.literal_eval(raw_ids)
        return raw_ids

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as ``{'input_ids': ..., 'labels': ...}``."""
        token_ids = self._as_id_sequence(self.encoded[idx])
        input_ids = torch.tensor(token_ids, dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return {'input_ids': input_ids, 'labels': label}
        

4. Create DataLoaders
Split the dataset into training and validation sets:

In [12]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE: `train_data` is rebound to the training portion here, so re-running this
# cell without reloading the data splits the already-reduced frame again.
train_data, val_data = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a PyTorch dataset.
train_dataset, val_dataset = map(SentimentAnalysisDataset, (train_data, val_data))

# Mini-batch loaders: only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=False)

5. Load Model
Load a pre-trained transformer model like BERT for binary classification:



In [8]:
from torch.optim import AdamW
from transformers import BertForSequenceClassification

# Load the pre-trained BERT encoder with a 2-class classification head on top.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Use PyTorch's AdamW: the AdamW class that transformers used to export is
# deprecated in favor of torch.optim.AdamW and is not importable from newer
# transformers releases. `eps` and `weight_decay` are pinned to the defaults of
# the transformers implementation so the optimizer settings stay the same.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

2025-01-22 21:42:57.344347: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-22 21:42:57.510501: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-22 21:42:58.259331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-01-22 21:42:58.259388: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

6. Training Loop
Define the training loop for multiple epochs:

In [None]:

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Token ID that marks padding, used below to build the attention mask.
# Falls back to 0 (the [PAD] ID of the bert-base-uncased vocabulary) if the
# model config does not define one.
pad_token_id = model.config.pad_token_id
if pad_token_id is None:
    pad_token_id = 0

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # The inputs are padded to a fixed length, so mark real tokens with 1
        # and padding with 0. Without a mask the model treats every position,
        # including padding, as real input.
        attention_mask = (input_ids != pad_token_id).long()

        # Forward pass (the model computes the loss when labels are given)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch loss over the epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")