In [3]:
# importing all required libraries

import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
import os

import pandas as pd


def load_balanced_tweets(data_file, samples_per_class=35000, random_state=42):
    """Load the tweet CSV and return a class-balanced DataFrame.

    The CSV must provide a ``clean_text`` column and a ``category`` column
    holding -1, 0 or 1. Categories are remapped to 0, 1 and 2 respectively
    so they can be used directly as class indices.

    Args:
        data_file: Path to the CSV file.
        samples_per_class: Maximum number of rows drawn for each category.
            Categories with fewer rows contribute all of their rows instead
            of raising.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        A DataFrame with a fresh integer index containing the sampled rows
        of every category, in order of first appearance of the category.
    """
    raw_df = pd.read_csv(data_file)

    # Drop rows missing either field. Filtering only on 'category' would let
    # astype(str) turn a missing text into the literal string 'nan'.
    cleaned_df = raw_df.dropna(subset=['category', 'clean_text']).copy()

    # Convert data types and remap category values (-1/0/1 -> 0/1/2).
    cleaned_df['clean_text'] = cleaned_df['clean_text'].astype(str)
    cleaned_df['category'] = cleaned_df['category'].astype(int).replace({1: 2, 0: 1, -1: 0})

    sampled_data = []
    for category in cleaned_df['category'].unique():
        category_data = cleaned_df[cleaned_df['category'] == category]
        # Cap at the available row count so a small class does not raise.
        n_rows = min(samples_per_class, len(category_data))
        sampled_data.append(category_data.sample(n=n_rows, random_state=random_state))

    return pd.concat(sampled_data, ignore_index=True)


# os.path.join keeps the path valid on non-Windows systems as well.
data_file = os.path.join("files", "Twitter_Data (2).csv")
try:
    df = load_balanced_tweets(data_file)

    # Extract texts and labels
    texts = df['clean_text'].tolist()
    labels = df['category'].tolist()

except Exception as e:
    # Best-effort: report the problem and leave explicit None markers.
    print("Error:", e)
    texts, labels = None, None


SyntaxError: invalid syntax (3359038636.py, line 1)

In [None]:


class TextClassificationDataset(Dataset):
    """Dataset that pairs raw texts with labels and tokenizes on access."""

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the texts, their labels, the tokenizer and the target length."""
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it with its label.

        Returns a dict with flattened 'input_ids' and 'attention_mask'
        tensors and a 'label' tensor built from the stored label.
        """
        text, label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        sample = {key: tokenized[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(label)
        return sample


In [None]:

class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer."""

    def __init__(self, bert_model_name, num_classes):
        """Load the pretrained encoder and build the classification head.

        Args:
            bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))


In [None]:

def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        data_loader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask' and 'label' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        The cross-entropy loss averaged over batches as a float, or 0.0 when
        ``data_loader`` yields no batches.
    """
    model.train()
    # Built once: the loss module does not change between batches.
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Accumulate a detached tensor so no per-step host transfer is forced.
        running_loss = running_loss + loss.detach()
        num_batches += 1

    if num_batches == 0:
        return 0.0
    return float(running_loss / num_batches)
        


In [None]:
def evaluate(model, data_loader, device):
    """Score the model on every batch of ``data_loader`` without gradients.

    Each batch is a dict with 'input_ids', 'attention_mask' and 'label'
    tensors. The predicted class is the index of the largest logit.

    Returns:
        A tuple ``(accuracy, report)`` holding the results of
        ``accuracy_score`` and ``classification_report`` over all batches.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(input_ids=ids, attention_mask=mask)
            best = torch.max(logits, dim=1).indices
            predicted += best.cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report


In [None]:

def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text and return a human-readable sentiment name.

    Args:
        text: The text to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        tokenizer: Callable producing 'input_ids' and 'attention_mask'.
        device: Device the encoded tensors are moved to.
        max_length: Length the text is padded or truncated to.

    Returns:
        "Positive Tweet", "Neutral Tweet" or "Negative Tweet" for predicted
        class 2, 1 or 0, and "Unknown Label" for any other index.
    """
    model.eval()
    tokens = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    ids = tokens['input_ids'].to(device)
    mask = tokens['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask)
        best = torch.max(logits, dim=1).indices
    class_index = best.item()

    names = {2: "Positive Tweet", 1: "Neutral Tweet", 0: "Negative Tweet"}
    return names.get(class_index, "Unknown Label")


In [None]:

# Set up parameters
# These module-level settings are read by the cells below that build the
# tokenizer, datasets, model, optimizer and training loop.
bert_model_name = 'bert-base-uncased'  # identifier passed to from_pretrained for tokenizer and encoder
num_classes = 3  # number of output logits; labels are remapped to 0, 1 and 2
max_length = 120  # length each training/validation text is padded or truncated to
batch_size = 16  # batch size for both the training and validation DataLoader
num_epochs = 2  # passes over the training data; also scales the scheduler's total steps
learning_rate = 2e-5  # learning rate handed to the optimizer


In [None]:

# Hold out 20% of the examples for validation; the fixed seed makes the
# split repeatable across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:


# Load the tokenizer from the library's default cache location. The previous
# cache_dir argument was a placeholder absolute path ("/path/to/cache/directory")
# that does not refer to a real directory; the encoder loaded by BERTClassifier
# also uses the default cache.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Wrap each split in a dataset that tokenizes on access.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)

# Only the training loader shuffles; validation keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)



In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier, then move its parameters to the chosen device.
model = BERTClassifier(bert_model_name, num_classes)
model = model.to(device)


In [None]:

# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to
# the values the transformers implementation used by default (1e-6 and 0.0),
# because torch.optim.AdamW defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:

# Train for num_epochs passes; after each pass, report validation accuracy
# and the per-class report.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train(model, train_dataloader, optimizer, scheduler, device)

    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

In [None]:
# Persist only the model's state_dict (its parameters and buffers) to a file
# in the current working directory, not the model object itself.
torch.save(model.state_dict(), "bert_classifier_three_labeled.pth")

In [None]:
# Smoke test: run the trained model on one hand-written tweet and print the
# tweet followed by the predicted sentiment.
test_text = "PM Modi's unwavering dedication to economic development and his efforts to uplift the marginalized communities are truly commendable. His leadership has been instrumental in India's progress towards becoming a global powerhouse. #RespectForModi"
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print(test_text, f"Predicted sentiment: {sentiment}", sep="\n")

In [None]:
# Smoke test: run the trained model on a second hand-written tweet and print
# the tweet followed by the predicted sentiment.
test_text = "The rising unemployment rate and the widening gap between the rich and the poor under PM Modi's rule are alarming. His economic policies seem to favor the wealthy at the expense of the common people."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print(test_text, f"Predicted sentiment: {sentiment}", sep="\n")