In [1]:
!pip install torch transformers pandas scikit-learn numpy tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
from torch.optim import AdamW

# Seed the torch and NumPy global random number generators with a fixed value
# so repeated runs start from the same random state.
torch.manual_seed(42)
np.random.seed(42)

# Training configuration (everything a reader is likely to tune)
MODEL_NAME = "bert-base-uncased"  # Checkpoint name passed to both from_pretrained() calls
MAX_LEN = 128  # Token length every example is padded/truncated to by SentimentDataset
BATCH_SIZE = 16  # Batch size used by both the train and the test DataLoader
EPOCHS = 2  # Number of full passes over the training loader
LEARNING_RATE = 2e-5  # Learning rate handed to the AdamW optimizer

# Input CSV locations read by load_data(); each file needs (or is renamed to
# have) a 'text' and a 'sentiment' column.
TRAIN_PATH = "/content/train.csv"
TEST_PATH = "/content/test.csv"

# Define dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text/sentiment row per item.

    Args:
        dataframe: DataFrame with a "text" column and a "sentiment" column.
            Sentiment values must be one of the keys of ``sentiment_mapping``
            for the row to be retrievable.
        tokenizer: Tokenizer object that is called as
            ``tokenizer(text, ...)`` and returns a mapping with
            "input_ids" and "attention_mask" tensors.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Map sentiment labels to numerical values
        self.sentiment_mapping = {"negative": 0, "neutral": 1, "positive": 2}

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the row at positional ``index``.

        Returns:
            dict with 1-D "input_ids" and "attention_mask" tensors and a
            scalar ``torch.long`` "label" tensor.

        Raises:
            KeyError: if the row's sentiment is not in ``sentiment_mapping``
                (this includes missing/NaN sentiments).
        """
        # Fetch the row once instead of re-running .iloc for every column.
        row = self.data.iloc[index]

        # A missing text cell is NaN; str(NaN) would feed the literal word
        # "nan" to the model, so treat missing text as an empty string.
        raw_text = row["text"]
        if isinstance(raw_text, str):
            text = raw_text
        elif pd.isna(raw_text):
            text = ""
        else:
            text = str(raw_text)

        # Convert sentiment to numerical label
        label = self.sentiment_mapping[row["sentiment"]]

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated encode_plus(); the arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        # return_tensors="pt" yields a leading batch axis of size 1; flatten
        # it away so the DataLoader can stack items into (batch, max_len).
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long)
        }

# Function to load and preprocess data
def load_data(file_path):
    """Read a sentiment CSV and normalise it to 'text' / 'sentiment' columns.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        DataFrame that has a 'text' and a 'sentiment' column, with string
        sentiment values lower-cased. Non-string sentiments (e.g. NaN) are
        left as they are.

    Raises:
        ValueError: if the file does not have enough columns to provide both
            'text' and 'sentiment'.
    """
    # Try encodings from strictest to most permissive. latin1 maps every
    # possible byte to a character and therefore never fails, so it has to be
    # last; placing it before cp1252 would make the cp1252 attempt unreachable.
    last_error = None
    for encoding in ("utf-8", "cp1252", "latin1"):
        try:
            df = pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc
        else:
            break
    else:
        # Not expected in practice (latin1 decodes anything), but never fall
        # through with `df` undefined.
        raise last_error

    # Ensure 'text' and 'sentiment' columns exist
    required = ("text", "sentiment")
    missing = [name for name in required if name not in df.columns]
    if missing:
        # Fill the missing names, in order, from the leading columns that are
        # not already one of the required names. With neither column present
        # this is "first column -> text, second column -> sentiment"; with one
        # present it avoids creating a duplicate or swapped column.
        candidates = [col for col in df.columns if col not in required]
        if len(candidates) < len(missing):
            raise ValueError("Could not find required columns in the dataset")
        df = df.rename(columns=dict(zip(candidates, missing)))

    # Convert sentiment labels to lowercase for consistency. Only strings are
    # touched so a column without any string values does not raise the way the
    # .str accessor would.
    df['sentiment'] = df['sentiment'].map(
        lambda value: value.lower() if isinstance(value, str) else value
    )

    return df

# Function to clean dataset by removing NaN values
def clean_dataset(dataset):
    """Return a Subset of ``dataset`` restricted to rows with a known label.

    A row is kept when its 'sentiment' value is a string that is a key of
    ``dataset.sentiment_mapping``; everything else (NaN, unknown labels) is
    left out. The number of kept rows is printed.
    """
    known_labels = dataset.sentiment_mapping
    row_count = len(dataset.data)

    # Pull the label column out once and scan it by position.
    kept_positions = [
        position
        for position, value in enumerate(dataset.data["sentiment"].tolist())
        if isinstance(value, str) and value in known_labels
    ]

    print(f"Found {len(kept_positions)} valid entries out of {row_count}")
    return Subset(dataset, kept_positions)

# Function to train the model
def train_model(model, train_loader, optimizer, device):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    Each batch is a dict with "input_ids", "attention_mask" and "label"
    tensors; they are moved to ``device`` and the loss reported by the model
    itself (``outputs.loss``) is back-propagated.
    """
    model.train()
    batch_losses = []

    progress = tqdm(train_loader, desc="Training")
    for batch in progress:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["label"].to(device),
        )

        step_loss = outputs.loss
        batch_losses.append(step_loss.item())

        step_loss.backward()
        optimizer.step()

    # Average over the number of batches in the loader.
    return sum(batch_losses) / len(train_loader)

# Function to evaluate the model
def evaluate_model(model, test_loader, device):
    """Predict a class for every batch of ``test_loader``.

    Evaluation is best-effort: a batch that raises is reported and skipped
    rather than aborting the run. Because skipped batches shrink the sample
    the metrics are computed on, the number of skipped batches is summarised
    once the loop finishes.

    Returns:
        (predictions, true_labels): two equally long lists of int class ids,
        in loader order, covering every batch that was processed.
    """
    model.eval()
    predictions = []
    true_labels = []
    skipped_batches = 0

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            try:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                # Labels are only compared on the host, so they are not moved
                # to the device first.
                batch_labels = batch["label"].cpu().tolist()

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

                # Index of the largest logit per row is the predicted class.
                _, preds = torch.max(outputs.logits, dim=1)
                batch_preds = preds.cpu().tolist()
            except Exception as e:
                skipped_batches += 1
                print(f"Error processing batch: {e}")
                continue

            # Extend both lists only after the whole batch succeeded so they
            # can never get out of step with each other.
            predictions.extend(batch_preds)
            true_labels.extend(batch_labels)

    if skipped_batches:
        print(
            f"Warning: skipped {skipped_batches} batch(es); "
            f"metrics cover {len(true_labels)} examples only"
        )

    return predictions, true_labels

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3  # 3 sentiment classes: negative, neutral, positive
)
model.to(device)

try:
    # Load and preprocess data
    train_df = load_data(TRAIN_PATH)
    test_df = load_data(TEST_PATH)

    print(f"Train data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")

    # Create datasets
    train_dataset = SentimentDataset(train_df, tokenizer, MAX_LEN)
    test_dataset = SentimentDataset(test_df, tokenizer, MAX_LEN)

    # Drop rows whose sentiment is missing or not a known label
    print("Cleaning train dataset...")
    clean_train_dataset = clean_dataset(train_dataset)
    print("Cleaning test dataset...")
    clean_test_dataset = clean_dataset(test_dataset)

    # Create data loaders with cleaned datasets (only training is shuffled)
    train_loader = DataLoader(clean_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(clean_test_dataset, batch_size=BATCH_SIZE)

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

    # Train the model
    for epoch in range(EPOCHS):
        print(f"\nEpoch {epoch+1}/{EPOCHS}")
        train_loss = train_model(model, train_loader, optimizer, device)
        print(f"Training loss: {train_loss:.4f}")

    # Evaluate the model
    print("Evaluating model...")
    predictions, true_labels = evaluate_model(model, test_loader, device)

    # Generate classification report. The class ids are passed explicitly so
    # the three target names always line up with ids 0/1/2; without `labels`,
    # a class that is absent from both the true and the predicted labels makes
    # the number of classes disagree with `target_names` and the call fails.
    label_names = ["negative", "neutral", "positive"]
    label_ids = list(range(len(label_names)))
    report = classification_report(
        true_labels,
        predictions,
        labels=label_ids,
        target_names=label_names,
    )
    print("\nClassification Report:")
    print(report)

    # Save the model and tokenizer side by side so the directory can be
    # reloaded with from_pretrained()
    model.save_pretrained("sentiment_analysis_model")
    tokenizer.save_pretrained("sentiment_analysis_model")
    print("Model saved successfully!")

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so the cell visibly fails with a full traceback instead of
    # looking like a successful run that merely printed one line.
    raise

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train data shape: (27481, 10)
Test data shape: (4815, 9)
Cleaning train dataset...
Found 27481 valid entries out of 27481
Cleaning test dataset...
Found 3534 valid entries out of 4815

Epoch 1/2


Training: 100%|██████████| 1718/1718 [09:42<00:00,  2.95it/s]


Training loss: 0.5726

Epoch 2/2


Training: 100%|██████████| 1718/1718 [09:42<00:00,  2.95it/s]


Training loss: 0.4096
Evaluating model...


Evaluating: 100%|██████████| 221/221 [00:26<00:00,  8.34it/s]



Classification Report:
              precision    recall  f1-score   support

    negative       0.78      0.81      0.80      1001
     neutral       0.75      0.77      0.76      1430
    positive       0.86      0.81      0.83      1103

    accuracy                           0.79      3534
   macro avg       0.80      0.80      0.80      3534
weighted avg       0.79      0.79      0.79      3534

Model saved successfully!
