# Importing Necessary Libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim

# Importing Data for Training

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset from the specified path
path = "/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Twitter_Data.csv"
data = pd.read_csv(path)

# Drop rows with missing values (NaN) from the dataset
data.dropna(inplace=True)

# Drop duplicate rows based on the 'clean_text' column
data.drop_duplicates(subset=['clean_text'], inplace=True)

# Extract texts and labels from the cleaned dataset
texts = data['clean_text'].values
labels = data['category'].values  

# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

## Encoding the labels

In [3]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
encoded_labels_train = label_encoder.fit_transform(train_labels)

# Save the mapping between original labels and encoded labels
label_mapping = {original_label: int_label for original_label, int_label in zip(train_labels, encoded_labels_train)}
label_mapping

{1.0: 2, 0.0: 1, -1.0: 0}

In [4]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
encoded_labels_valid = label_encoder.fit_transform(val_labels)

# Save the mapping between original labels and encoded labels
label_mapping = {original_label: int_label for original_label, int_label in zip(val_labels, encoded_labels_valid)}
label_mapping

{0.0: 1, 1.0: 2, -1.0: 0}

# Importing the RoBERTa Model

In [5]:
# Import the necessary classes from the transformers library
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Initialize a tokenizer using the 'roberta-base' pre-trained model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Initialize a model for sequence classification using the 'roberta-base' pre-trained weights
# Set the number of labels to 3
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should pr

In [19]:
from torch.utils.data import Dataset, DataLoader

# Define a custom Dataset class for sentiment classification
class Sentiment(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize and encode the text using the provided tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        # Return a dictionary containing tokenized data and label
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Use the encoded labels 
train_labels = encoded_labels_train
val_labels = encoded_labels_valid

# Create Sentiment instances for training and validation data
train_data = Sentiment(train_texts, train_labels, tokenizer, max_len=128)
val_data = Sentiment(val_texts, val_labels, tokenizer, max_len=128)

# Create DataLoader instances for training and validation data
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)

In [20]:
# Import the AdamW optimizer class from the transformers library
from transformers import AdamW

# Initialize the AdamW optimizer with the parameters of the model
# Set the learning rate (lr) to 1e-5
optimizer = AdamW(model.parameters(), lr=1e-5)

In [22]:
import torch
from sklearn.metrics import accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

import torch
from sklearn.metrics import accuracy_score

# Check if CUDA (GPU) is available, and move the model to the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loop over training epochs (here, only 1 epoch as training time is too long for a single epoch)
for epoch in range(1):
    model.train()  # Set the model to training mode
    for batch in train_loader:
        optimizer.zero_grad()  # Clear gradients
        input_ids = batch['input_ids'].to(device)  # Move input to the device
        attention_mask = batch['attention_mask'].to(device)  # Move attention mask to the device
        labels = batch['labels'].to(device)  # Move labels to the device
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)  # Forward pass
        loss = outputs[0]  # Get the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model parameters using gradients

    print('I am here')  # Print a message to indicate completion of an epoch
    
    # Validation loop
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            val_loss += loss.item()

            # Calculate accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            accuracy = accuracy_score(labels.cpu(), predictions.cpu())
            val_accuracy += accuracy

    val_loss /= len(val_loader)  # Calculate average validation loss
    val_accuracy /= len(val_loader)  # Calculate average validation accuracy
    print(f'Validation Loss: {val_loss}')

Validation Loss: 0.08897072631558194  
Validation Accuracy: 0.9727128993566677

# Importing Test data

In [4]:
import pandas as pd

# Load the dataset from the specified path
reddit = pd.read_csv('/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Reddit_Data.csv')

# Rename the 'clean_comment' column to 'clean_text'
reddit.rename(columns={'clean_comment': 'clean_text'}, inplace=True)

# Drop rows with missing values (NaN) from the dataset
reddit.dropna(inplace=True)

# Drop duplicate rows based on the 'clean_text' column
reddit.drop_duplicates(subset=['clean_text'], inplace=True)

# Reset the index after dropping rows
reddit.reset_index(drop=True, inplace=True)

# Print the cleaned test data
reddit

Unnamed: 0,clean_text,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
36794,jesus,0
36795,kya bhai pure saal chutiya banaya modi aur jab...,1
36796,downvote karna tha par upvote hogaya,0
36797,haha nice,1


In [44]:
from torch.utils.data import DataLoader

# Transform the labels of the test data using the label encoder
test_enc_labels = label_encoder.transform(reddit['category'])

# Create a test dataset using the Sentiment class we defined earlier
test_dataset = Sentiment(reddit['clean_text'], test_enc_labels, tokenizer, max_len=128)

# Create a DataLoader for the test dataset
# DataLoader helps manage batches and shuffling of data during testing
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [45]:
model.eval()  # Set the model to evaluation mode

# Lists to store predicted and true labels
all_predictions = []
all_true_labels = []

# Disable gradient calculation for inference
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Perform forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        # Extend the lists with predicted and true labels
        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

# Convert encoded predictions back to original labels using the label encoder
predicted_labels = label_encoder.inverse_transform(all_predictions)
true_labels = label_encoder.inverse_transform(all_true_labels)



## Test Results

In [46]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy using true_labels and predicted_labels
accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy: {accuracy}')

# Generate a classification report using true_labels and predicted_labels
report = classification_report(true_labels, predicted_labels)
print(report)

Accuracy: 0.9576618929862225
              precision    recall  f1-score   support

        -1.0       0.92      0.92      0.92      8250
         0.0       0.98      0.97      0.98     12778
         1.0       0.95      0.96      0.96     15771

    accuracy                           0.96     36799
   macro avg       0.95      0.95      0.95     36799
weighted avg       0.96      0.96      0.96     36799



In [47]:
# Save the state dictionary of the model to the specified file
torch.save(model.state_dict(), 'Pre-trained-Sentiment.pth')