In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
target = {0:'sadness', 1:'joy', 2:'love', 3:'anger', 4:'fear', 5:'surprise'}

In [3]:
df = pd.read_parquet('twitter_data.parquet')
df.sample(10)

Unnamed: 0,text,label
298263,i feel like i fucked up my grad school applica...,3
287984,im also having a few pretzels which i feel gui...,0
169731,my younger brother and i had been fighting ove...,3
103982,i feel ashamed using it i only saw the movie o...,0
119042,i feel like i am pretty talented,1
166772,i want to frame these blog entries around a se...,1
112647,i feel unsuccessful at work my personal life i...,0
258019,i feel more confident about one than my theori...,1
310051,i do feel like a fucking needy little shit rig...,0
374501,im not feeling particularly clever tonight may...,1


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stops =stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocessText(text):
  tokens = word_tokenize(text.lower())
  tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stops]
  return ' '.join(tokens)

df['text'] = df['text'].apply(preprocessText)
df.sample(10)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Devve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Devve\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Devve\AppData\Roaming\nltk_data...


Unnamed: 0,text,label
360291,feel privileged part club,1
395529,positive idea sat feeling groggy waiting somet...,0
156528,post feel heart heart await ridicule almost su...,2
271779,write name description last time stand sweatin...,4
143863,feel perfect song escape artist never die brou...,1
269597,want others understand importance looking good...,1
84741,starting feel seasonally overwhelmed,5
362421,want feel ok admit life easy right,1
108092,really enjoyed feeling sweet spirit,1
372613,feel heartbroken loss fantastic genius someone...,0


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(df['text'])

In [6]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
  def __init__(self, text_data, labels, vectorizer):
    self.text_data = text_data
    self.labels = labels
    self.vectorizer = vectorizer

  def __len__(self):
    return len(self.text_data)

  def __getitem__(self, index):
    text_vector = self.vectorizer.transform([self.text_data[index]]).toarray()[0]
    label = self.labels[index]
    return torch.tensor(text_vector,dtype=torch.float32), torch.tensor(label,dtype=torch.long)

In [7]:
labels = df['label'].values
text_data = df['text'].values


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(text_data, labels, test_size=0.01, random_state=42)


dataset = TextDataset(X_train,y_train,vectorizer)
dataloader = DataLoader(dataset,batch_size=32,shuffle=True)

In [9]:
import torch.nn as nn
import torch.optim as optim

class TextClassifier(nn.Module):
  def __init__(self,input_size):
    super(TextClassifier,self).__init__()
    self.fc1 = nn.Linear(input_size,128)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(128,64)
    self.fc3 = nn.Linear(64,6)

  def forward(self,x):
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)
    x = self.relu(x)
    x = self.fc3(x)
    return x

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

model = TextClassifier(5000).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.001)

In [13]:
for epoch in range(10):
    for batch_features, batch_labels in dataloader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.12573200464248657
Epoch 2, Loss: 0.13748891651630402
Epoch 3, Loss: 0.26515278220176697
Epoch 4, Loss: 0.3547509014606476
Epoch 5, Loss: 0.1502569317817688
Epoch 6, Loss: 0.0965069830417633
Epoch 7, Loss: 0.1277620792388916
Epoch 8, Loss: 0.08728886395692825
Epoch 9, Loss: 0.0887359231710434
Epoch 10, Loss: 0.06963247060775757


In [14]:
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score


# Create a TextDataset for the test data
test_dataset = TextDataset(X_test, y_test, vectorizer)

# Create a DataLoader for the test set
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)  # Adjust batch_size as needed

# Now you can use test_dataloader in your evaluation code
model.eval()  # Set model to evaluation mode

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch_features, batch_labels in test_dataloader:
        # Move data to device
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        # Get model outputs
        outputs = model(batch_features)

        # Get the predicted classes
        _, predictions = torch.max(outputs, dim=1)

        # Collect predictions and true labels
        all_predictions.extend(predictions.cpu().numpy())  # Move to CPU for easy processing
        all_labels.extend(batch_labels.cpu().numpy())

# Calculate accuracy or any other evaluation metric
accuracy = accuracy_score(all_labels, all_predictions)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 89.69%


In [15]:
# Save the model
torch.save(model.state_dict(), 'model.pth')
print("Model saved successfully.")

Model saved successfully.
