In [None]:
#importing required packages
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from gensim.models import Word2Vec


In [165]:
# Loading the twitter data and labels
train_features = pd.read_csv('/Users/chathurya/Downloads/Chathurya/datasets/sentiment/train_text.txt', 
                             sep='\t', header=None, names=['text'])
train_labels = pd.read_csv('/Users/chathurya/Downloads/Chathurya/datasets/sentiment/train_labels.txt', 
                           sep='\t', header=None, names=['label'])
train_data = pd.concat([train_features, train_labels], axis=1)

train_data = train_data[train_data['text'].apply(lambda x: type(x) == str)]

# Extracting the features and labels
train_features = train_data.drop('label', axis=1).values
train_labels = train_data['label'].values

# Splitting the data and labels into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(train_features, train_labels, 
                                                                    test_size=0.3, random_state = 25)


In [167]:
#Tokenize tweets and create word2vec embeddings
tweets = data['text'].apply(lambda x: x.split())
model = Word2Vec(tweets, min_count=1)
word_vectors = model.wv


In [168]:
# Define the model architecture
import torch.nn as nn

class TweetClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim):
        super(TweetClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.mean(dim=1)
        log = self.fc(embedded)
        log = self.relu(log)
        log = self.softmax(log)
        
        return log

In [169]:
#collecting unique words from the dataset
uniq_words = set()
for tweet in train_data:
    words = tweet[0].split()
    uniq_words.update(words)
    
#creating a vocabulary to index mapping
vocab = list(uniq_words)
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

In [170]:
train_ind = []
for tweet in train_data:
    if tweet[0]:
        words = tweet[0].split()
        indices = [word_to_idx[word] if word in word_to_idx else 0 for word in words]
        train_ind.append(indices)

test_ind = []
for tweet in test_data:
    if tweet[0]:
        words = tweet[0].split()
        indices = [word_to_idx[word] if word in word_to_idx else 0 for word in words]
        test_ind.append(indices)




In [171]:
#Create a PyTorch Dataset class
class TweetDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        tweet = self.X[idx]
        label = self.y[idx]
        return torch.tensor(tweet), torch.tensor(label)


In [172]:
# Create a PyTorch DataLoader class
from torch.nn.utils.rnn import pad_sequence

#pad the sequence within a batch

def collate_fn(batch):
    inputs, labels = zip(*batch)
    padded_inputs = pad_sequence(inputs, batch_first=True)
    return padded_inputs, torch.stack(labels)

In [173]:
from torch.utils.data import Dataset, DataLoader

batch_size = 32
train_dataset = TweetDataset(train_ind, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_dataset = TweetDataset(test_ind, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [174]:
#Initializing the sentiment classifier

# Define the embedding layer
embedding_dim = 100
vocab_size = len(vocab)
embedding = nn.Embedding(vocab_size, embedding_dim)


In [175]:
# Define the model, loss function, and optimizer
model = TweetClassifier(vocab_size, embedding_dim=embedding_dim, output_dim=3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [176]:
# Train the model
criterion = nn.CrossEntropyLoss()
num_epochs = 50
for epoch in range(num_epochs):
    for tweets, labels in train_loader:
        labels = labels.unsqueeze(1)
        #Forward pass
        outputs = model(tweets)
        
        #Computing the loss
        loss = criterion(outputs, labels.squeeze(1))
        
        #Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [180]:
#Evaluating the model on the test dataset
model.eval()
predict = []
true_labels = []
correct = 0
total = 0

with torch.no_grad():
    for tweets, labels in test_loader:
        #Forward Pass
        outputs = model(tweets)
        #Computing predictions
        _, predicted = torch.max(outputs, dim=1)
         
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        #collect predicted and t labels
        predict.extend(predicted.tolist())
        true_labels.extend(labels.tolist())
    print(f"Test accuracy: {100 * correct / total:.2f}%")
    print('Accuracy of the network on the test tweets: %d %%' % (100 * correct / total))
    #Generating the confusion matrix
    matrix = confusion_matrix(true_labels, predict)
    print('Confusion Matrix:', matrix)

Test accuracy: 37.45%
Accuracy of the network on the test tweets: 37 %
Confusion Matrix: [[ 309  826  614]
 [ 868 2319 1676]
 [ 803 2038 1459]]


In [181]:
# Comparison of model performances
from sklearn.metrics import classification_report
report = classification_report(true_labels, predict)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.18      0.17      1749
           1       0.45      0.48      0.46      4863
           2       0.39      0.34      0.36      4300

    accuracy                           0.37     10912
   macro avg       0.33      0.33      0.33     10912
weighted avg       0.38      0.37      0.38     10912

