In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import nltk
from collections import Counter
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l1
from tensorflow.keras.optimizers import Adam

In [3]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^A-Za-z\s.,!?]', '', text)  # Keeps .,!? punctuation
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

df = pd.read_csv('data/tweet_emotions.csv')
df = df.drop(columns='tweet_id')

df['content'] = df['content'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Drago\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
all_words = ' '.join(df['content']).split()
vocab_size = len(set(all_words))
print(f'Vocabulary Size: {vocab_size}')

tweet_lengths = df['content'].apply(lambda x: len(x.split()))
max_length = np.percentile(tweet_lengths, 95)
max_length = int(max_length)
print(f'Max Length: {max_length}')

Vocabulary Size: 68055
Max Length: 15


In [5]:
X = df['content']
Y = df['sentiment']

label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y_encoded, test_size=0.2, random_state=42)

embedding_dim = 100

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)


X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, truncating='post', padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, truncating='post', padding='post')


In [6]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(13, activation='softmax')
])


model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [8]:
trained_model = model.fit(X_train_pad, Y_train, epochs=15, validation_data=(X_test_pad, Y_test), batch_size=32)

test_loss, test_acc = model.evaluate(X_test_pad, Y_test)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

Epoch 1/15
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 61ms/step - accuracy: 0.2380 - loss: 2.1437 - val_accuracy: 0.3256 - val_loss: 1.9588
Epoch 2/15
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 59ms/step - accuracy: 0.3758 - loss: 1.8362 - val_accuracy: 0.3293 - val_loss: 1.9728
Epoch 3/15
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 59ms/step - accuracy: 0.4986 - loss: 1.4946 - val_accuracy: 0.3018 - val_loss: 2.2009
Epoch 4/15
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 60ms/step - accuracy: 0.6336 - loss: 1.1402 - val_accuracy: 0.2901 - val_loss: 2.5080
Epoch 5/15
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 59ms/step - accuracy: 0.7282 - loss: 0.8644 - val_accuracy: 0.2784 - val_loss: 2.8079
Epoch 6/15
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 59ms/step - accuracy: 0.7829 - loss: 0.6684 - val_accuracy: 0.2730 - val_loss: 3.3290
Epoc

In [9]:
new_df = pd.read_csv('data/tweet_emotions.csv')
new_df['content'] = new_df['content'].apply(clean_text)

new_sequences = tokenizer.texts_to_sequences(new_df['content'])
new_padded = pad_sequences(new_sequences, maxlen=max_length, padding='post', truncating='post')

predictions = model.predict(new_padded)
predicted_classes = np.argmax(predictions, axis=1)
predicted_sentiments = label_encoder.inverse_transform(predicted_classes)

new_df['predicted_sentiment'] = predicted_sentiments

new_df

[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step


Unnamed: 0,tweet_id,sentiment,content,predicted_sentiment
0,1956967341,empty,tiffanylue know listenin bad habit earlier sta...,empty
1,1956967666,sadness,layin n bed headache ughhhh...waitin call...,sadness
2,1956967696,sadness,funeral ceremony...gloomy friday...,sadness
3,1956967789,enthusiasm,wants hang friends soon!,surprise
4,1956968416,neutral,dannycastillo want trade someone houston ticke...,worry
...,...,...,...,...
39995,1753918954,neutral,johnlloydtaylor,neutral
39996,1753919001,love,happy mothers day love,love
39997,1753919005,love,"happy mothers day mommies there, woman man lon...",love
39998,1753919043,happiness,niariley wassup beautiful!!! follow me!! peep ...,happiness


In [10]:
new_df[new_df['sentiment'] == new_df['predicted_sentiment']].shape[0] / df.shape[0]

0.79935