In [1]:
pip install transformers torch scikit-learn pandas numpy



In [10]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
data = pd.read_csv("hi-test (1).csv")
data.columns = ["sentiment", "text"]

# Load tokenizer and model from Hugging Face
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformer_model = AutoModel.from_pretrained(model_name)

# Convert sentiments to numeric labels
data["sentiment"] = data["sentiment"].astype("category").cat.codes
texts = data["text"].tolist()
labels = data["sentiment"].tolist()

# Custom Dataset Class
class HindiDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

# Split data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Parameters
MAX_LENGTH = 128
BATCH_SIZE = 16

# Prepare Dataloaders
train_dataset = HindiDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
test_dataset = HindiDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Define CNN Model
class SentimentCNN(nn.Module):
    def __init__(self, embedding_dim, num_classes):
        super(SentimentCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding=1)
        self.fc = nn.Linear(64, num_classes)

    def forward(self, embeddings):
        x = F.relu(self.conv1(embeddings))
        x = F.max_pool1d(x, kernel_size=2)
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, kernel_size=2)
        x = x.mean(dim=2)
        x = self.fc(x)
        return x

# Define training and evaluation functions
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with torch.no_grad():
            outputs = transformer_model(input_ids, attention_mask=attention_mask)
            embeddings = outputs.last_hidden_state.permute(0, 2, 1)

        logits = model(embeddings)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()

    return total_loss / len(train_loader), correct / len(train_loader.dataset)

def evaluate_model(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = transformer_model(input_ids, attention_mask=attention_mask)
            embeddings = outputs.last_hidden_state.permute(0, 2, 1)

            logits = model(embeddings)
            loss = criterion(logits, labels)

            total_loss += loss.item()
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return total_loss / len(data_loader), correct / len(data_loader.dataset), all_preds, all_labels

# Calculate classification report
def generate_classification_report(model, data_loader, device):
    model.eval()
    _, _, all_preds, all_labels = evaluate_model(model, data_loader, criterion, device)
    report = classification_report(all_labels, all_preds, target_names=data["sentiment"].astype("category").cat.categories)
    return report

# Training parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_dim = 768
num_classes = len(data["sentiment"].unique())
model = SentimentCNN(embedding_dim, num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Train the model
EPOCHS = 5
for epoch in range(EPOCHS):
    train_loss, train_acc = train_model(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc, _, _ = evaluate_model(model, test_loader, criterion, device)
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

# Generate and print the classification report
print("\nClassification Report:")
classification_report = generate_classification_report(model, test_loader, device)
print(classification_report)




Epoch 1/5
Train Loss: 1.0397, Train Acc: 0.4796
Val Loss: 0.9878, Val Acc: 0.4857
Epoch 2/5
Train Loss: 0.9451, Train Acc: 0.5516
Val Loss: 0.9659, Val Acc: 0.5048
Epoch 3/5
Train Loss: 0.9216, Train Acc: 0.5971
Val Loss: 0.9559, Val Acc: 0.5238
Epoch 4/5
Train Loss: 0.8690, Train Acc: 0.6235
Val Loss: 0.9240, Val Acc: 0.5524
Epoch 5/5
Train Loss: 0.8165, Train Acc: 0.6643
Val Loss: 0.9009, Val Acc: 0.5619

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TypeError: object of type 'int' has no len()

In [11]:
from sklearn.metrics import accuracy_score

def calculate_accuracy(model, data_loader, device):
    model.eval()
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # Extract embeddings
            outputs = transformer_model(input_ids, attention_mask=attention_mask)
            embeddings = outputs.last_hidden_state.permute(0, 2, 1)

            # Forward pass through CNN
            logits = model(embeddings)
            preds = logits.argmax(dim=1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct / total
    return accuracy, all_preds, all_labels

# Calculate training accuracy
train_accuracy, train_preds, train_labels = calculate_accuracy(model, train_loader, device)
print(f"Training Accuracy: {train_accuracy:.4f}")


Training Accuracy: 0.6906
