In [49]:
import numpy as np
import pandas as pd
import torch

In [65]:
target = {0:'sad', 1:'joy', 2:'love', 3:'anger', 4:'fear', 5:'surprise'}

In [51]:
df = pd.read_parquet('twitter_data.parquet')
df.sample(10)

Unnamed: 0,text,label
124913,i feel embarrassed handing them money that and...,0
35353,ive been feeling a desire to be alone for quit...,0
289073,im just in one of those moods where im feeling...,3
397894,i feel almost rude to celebrate in front of you,3
268927,i feel like the people who really need to get ...,0
19371,i just feel so fucking furious with myself,3
304774,i wouldnt feel any disgrace about it but why s...,3
362631,i feel happy at work when i dont have to think...,1
378040,i feel extremely bad,0
12314,i can just lie there and feel like im supporti...,2


In [52]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stops =stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocessText(text):
  tokens = word_tokenize(text.lower())
  tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stops]
  return ' '.join(tokens)

df['text'] = df['text'].apply(preprocessText)
df.sample(10)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Devve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Devve\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Devve\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,label
73337,done something point stay closed guard feel li...,2
252981,smiled timidly feeling pained guilt chest dece...,0
322054,feel like perfect time ask whether anyone knew...,1
302226,almost feel petty mentioning,3
84960,excited continue personal learning skill learn...,1
265937,wore feel cute wore feel cute href http www,1
341191,feel distraught worried panicked sick scared sad,4
137826,could actually feel curious effect system also...,5
34556,cant help feeling perhaps could friendly phone,1
21945,ill eat im done huge box spring mix hopefully ...,1


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(df['text'])

In [54]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
  def __init__(self, text_data, labels, vectorizer):
    self.text_data = text_data
    self.labels = labels
    self.vectorizer = vectorizer

  def __len__(self):
    return len(self.text_data)

  def __getitem__(self, index):
    text_vector = self.vectorizer.transform([self.text_data[index]]).toarray()[0]
    label = self.labels[index]
    return torch.tensor(text_vector,dtype=torch.float32), torch.tensor(label,dtype=torch.long)

In [55]:
labels = df['label'].values
text_data = df['text'].values


In [56]:
from sklearn.model_selection import train_test_split

# First, split off the test set
X_train, X_temp, y_train, y_temp = train_test_split(text_data, labels, test_size=0.3, random_state=42)

# Then, split the temporary set into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)



train_dataset = TextDataset(X_train,y_train,vectorizer)
train_loader = DataLoader(train_dataset,batch_size=32,shuffle=True)

val_dataset = TextDataset(X_val,y_val,vectorizer)
val_loader = DataLoader(val_dataset,batch_size=32,shuffle=True)

In [57]:
import torch.nn as nn
import torch.optim as optim

class TextClassifier(nn.Module):
  def __init__(self,input_size):
    super(TextClassifier,self).__init__()
    self.fc1 = nn.Linear(input_size,128)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(128,64)
    self.fc3 = nn.Linear(64,6)

  def forward(self,x):
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)
    x = self.relu(x)
    x = self.fc3(x)
    return x

In [58]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

model = TextClassifier(5000).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.001)

Using cuda


In [59]:
from sklearn.metrics import accuracy_score
# Number of epochs
num_epochs = 10

# To store metrics
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    # Training Phase
    model.train()  # Set model to training mode
    running_loss = 0.0
    all_labels = []
    all_preds = []

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Track accuracy for training
        _, preds = torch.max(outputs, 1)  # Get predictions
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

    # Calculate and store metrics
    train_loss = running_loss / len(train_loader)
    train_acc = accuracy_score(all_labels, all_preds)
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)

    # Validation Phase
    model.eval()  # Set model to evaluation mode
    val_running_loss = 0.0
    val_all_labels = []
    val_all_preds = []

    with torch.no_grad():  # Disable gradient calculation for validation
        for val_inputs, val_labels in val_loader:
            val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
            
            # Forward pass
            val_outputs = model(val_inputs)
            val_loss = criterion(val_outputs, val_labels)
            val_running_loss += val_loss.item()

            # Track accuracy for validation
            _, val_preds = torch.max(val_outputs, 1)
            val_all_labels.extend(val_labels.cpu().numpy())
            val_all_preds.extend(val_preds.cpu().numpy())

    # Calculate and store metrics
    val_loss = val_running_loss / len(val_loader)
    val_acc = accuracy_score(val_all_labels, val_all_preds)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

    # Print metrics for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}] - "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


Epoch [1/10] - Train Loss: 0.2832, Train Acc: 0.8781, Val Loss: 0.1921, Val Acc: 0.9013
Epoch [2/10] - Train Loss: 0.1719, Train Acc: 0.9111, Val Loss: 0.1839, Val Acc: 0.9024
Epoch [3/10] - Train Loss: 0.1494, Train Acc: 0.9195, Val Loss: 0.1947, Val Acc: 0.9001
Epoch [4/10] - Train Loss: 0.1322, Train Acc: 0.9255, Val Loss: 0.2087, Val Acc: 0.8980
Epoch [5/10] - Train Loss: 0.1178, Train Acc: 0.9319, Val Loss: 0.2278, Val Acc: 0.8940
Epoch [6/10] - Train Loss: 0.1058, Train Acc: 0.9374, Val Loss: 0.2722, Val Acc: 0.8919
Epoch [7/10] - Train Loss: 0.0969, Train Acc: 0.9410, Val Loss: 0.2923, Val Acc: 0.8934
Epoch [8/10] - Train Loss: 0.0905, Train Acc: 0.9440, Val Loss: 0.3300, Val Acc: 0.8926
Epoch [9/10] - Train Loss: 0.0860, Train Acc: 0.9455, Val Loss: 0.3617, Val Acc: 0.8858
Epoch [10/10] - Train Loss: 0.0817, Train Acc: 0.9478, Val Loss: 0.3975, Val Acc: 0.8849


In [60]:
from torch.utils.data import DataLoader


# Create a TextDataset for the test datas
test_dataset = TextDataset(X_test, y_test, vectorizer)

# Create a DataLoader for the test set
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)  # Adjust batch_size as needed

# Now you can use test_dataloader in your evaluation code
model.eval()  # Set model to evaluation mode

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch_features, batch_labels in test_dataloader:
        # Move data to device
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        # Get model outputs
        outputs = model(batch_features)

        # Get the predicted classes
        _, predictions = torch.max(outputs, dim=1)

        # Collect predictions and true labels
        all_predictions.extend(predictions.cpu().numpy())  # Move to CPU for easy processing
        all_labels.extend(batch_labels.cpu().numpy())

# Calculate accuracy or any other evaluation metric
accuracy = accuracy_score(all_labels, all_predictions)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 88.28%


In [61]:
# Save the model
torch.save(model.state_dict(), 'model.pth')
print("Model saved successfully.")

Model saved successfully.


In [82]:
import torch.nn.functional as F

def predict(text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available

    text = preprocessText(text)
    text_vector = vectorizer.transform([text]).toarray()  # Make sure it's a list
    text_vector = torch.tensor(text_vector, dtype=torch.float32).to(device)  # Move tensor to the device
    
    model.to(device)  # Move model to the same device
    model.eval()  # Set model to evaluation mode

    with torch.no_grad():  # Disable gradient tracking for inference
        output = model(text_vector)
    
    probabilities = F.softmax(output, dim=1)

    # Get the predicted class index
    predicted_class = torch.argmax(probabilities, dim=1).item()

    return target[predicted_class]

In [83]:
text = "I'm speechless! I knew you could do this, that's awesome!!"
predict(text)

'joy'

In [84]:
import pickle

with open("vectorizer.pkl","wb") as file:
    pickle.dump(vectorizer,file)