In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

df = pd.read_csv('text.csv')[0:20000]
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


#### Text Preprocessing
1) Lowercasing
2) Removing Punctuation and Special Characters
3) Tokenization - Splitting the individual text sentences into a string of tokens, where each token represents a particular subset of the sentence.
4) Removing Stop Words - Removing common words in the English language that won't provide much value.
5) Stemming - Reducing words to their base form.
6) Encoding - Converting categorical labels into a numerical format.

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder


data = pd.DataFrame()

# Download NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

# Initialize stop words - Removing commonly used words in English sentences
stop_words = set(stopwords.words('english'))
# print(stop_words)

# Function to preprocess text - Takes in each individual sentence
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Removing punctuation and special characters
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stop words
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Apply preprocessing to the text column - Column-wise operation across the entire dataset
data['processed_text'] = df['text'].apply(preprocess_text)
data['label'] = df['label']

# Inspect the DataFrame
print(data.head())

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/esvaranarun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/esvaranarun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                      processed_text  label
0                 feel really helpless heavy hearted      4
1  ive enjoyed able slouch relax unwind frankly n...      0
2            gave internship dmrg feeling distraught      4
3                                dont know feel lost      0
4  kindergarten teacher thoroughly weary job take...      4


In [4]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

# Create a vocabulary and encode text
vocab = set(token for text in data['processed_text'] for token in text)
word2idx = {word: i+1 for i, word in enumerate(vocab)}  # Start indexing from 1

def encode_text(text):
    return [word2idx[word] for word in text]

data['encoded_text'] = data['processed_text'].apply(encode_text)

# Pad sequences to ensure uniform length
MAX_LEN = max(len(text) for text in data['encoded_text'])
data['padded_text'] = data['encoded_text'].apply(lambda x: x + [0]*(MAX_LEN - len(x)))

# Split into training and testing datasets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Create a custom dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Prepare data loaders
train_dataset = TextDataset(train_data['padded_text'].tolist(), train_data['label'].tolist())
test_dataset = TextDataset(test_data['padded_text'].tolist(), test_data['label'].tolist())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [6]:
# Define a more complex model
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary


class ImprovedTextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes):
        super(ImprovedTextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, num_classes)
    
    def forward(self, x):
        x = self.embedding(x)  # [batch_size, seq_len, embed_size]
        _, (hn, _) = self.lstm(x)  # [1, batch_size, hidden_size]
        x = hn.squeeze(0)  # [batch_size, hidden_size]
        x = self.fc1(x)
        x = self.relu(x)
        out = self.fc2(x)
        return out

    
# Hyperparameters
VOCAB_SIZE = len(vocab) + 1  # +1 for padding index
EMBED_SIZE = 100
HIDDEN_SIZE = 128
NUM_CLASSES = 6  # Example number of classes

# Initialize the model, loss function, and optimizer
model = ImprovedTextClassificationModel(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, NUM_CLASSES)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Total number of parameters: {count_parameters(model)}')

Total number of parameters: 129206


In [9]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

Epoch [1/20], Loss: 1.5695
Epoch [2/20], Loss: 1.5668
Epoch [3/20], Loss: 1.5670
Epoch [4/20], Loss: 1.5660
Epoch [5/20], Loss: 1.5664
Epoch [6/20], Loss: 1.5669
Epoch [7/20], Loss: 1.5664
Epoch [8/20], Loss: 1.5664
Epoch [9/20], Loss: 1.5666
Epoch [10/20], Loss: 1.5662
Epoch [11/20], Loss: 1.5660
Epoch [12/20], Loss: 1.5653
Epoch [13/20], Loss: 1.5655
Epoch [14/20], Loss: 1.5655
Epoch [15/20], Loss: 1.5657
Epoch [16/20], Loss: 1.5657
Epoch [17/20], Loss: 1.5654
Epoch [18/20], Loss: 1.5657
Epoch [19/20], Loss: 1.5654
Epoch [20/20], Loss: 1.5653


In [8]:
# Evaluation
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 33.00%
