In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
!wget https://raw.githubusercontent.com/databyhuseyn/DeepLearning/refs/heads/main/helper_functions.py

--2025-05-01 13:16:59--  https://raw.githubusercontent.com/databyhuseyn/DeepLearning/refs/heads/main/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2025-05-01 13:16:59 (15.3 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [3]:
# Set up Kaggle Environment
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle

In [4]:
# Download dataset
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis

Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
License(s): CC0-1.0


In [5]:
from helper_functions import unzip_data

In [6]:
unzip_data('/content/twitter-entity-sentiment-analysis.zip')

In [130]:
training_data = pd.read_csv('/content/twitter_training.csv')
training_data

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [131]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [132]:
training_data.describe()

Unnamed: 0,2401
count,74681.0
mean,6432.640149
std,3740.423819
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [135]:
# Rename columns
training_data.rename(columns={'2401':'tweet_id','Borderlands':'message',
                              'Positive':'class',
                              'im getting on borderlands and i will murder you all ,':'text'},
                     inplace=True)

In [136]:
training_data

Unnamed: 0,tweet_id,message,class,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [137]:
# Check null values
training_data.isna().sum()

Unnamed: 0,0
tweet_id,0
message,0
class,0
text,686


In [139]:
# Remove them
training_data.dropna(subset=['text'],axis=0,inplace=True)

In [140]:
# Shuffle data
train_shuffled_df = training_data.sample(frac=1,random_state=42)
train_shuffled_df

Unnamed: 0,tweet_id,message,class,text
61734,4984,GrandTheftAuto(GTA),Irrelevant,Do you think you can hurt me?
11260,13136,Xbox(Xseries),Positive,About The time!!
55969,11207,TomClancysRainbowSix,Neutral,Calls from _ z1rv _ & @ Tweet98 got me this so...
4111,1909,CallOfDutyBlackopsColdWar,Negative,So CoD: Black Ops Cold War is gonna be ass? @ ...
2308,1604,CallOfDutyBlackopsColdWar,Negative,Y HAPPY ABOUT THIS.
...,...,...,...,...
37541,5238,Hearthstone,Irrelevant,Liked on YouTube: Hearthstone Felfire Festival...
6332,289,Amazon,Neutral,Not even gonna change the 7-2 loss ffs
55392,2311,CallOfDuty,Negative,Fuck this call of duty update..
864,2553,Borderlands,Positive,I should get up & feed my dogs & stuff when th...


In [144]:
train_shuffled_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73995 entries, 61734 to 15956
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  73995 non-null  int64 
 1   message   73995 non-null  object
 2   class     73995 non-null  object
 3   text      73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.8+ MB


In [145]:
# Split the data into feature and target
X = train_shuffled_df['text']
y = train_shuffled_df['class']

In [146]:
# Check wether our dataset is balanced or not
y.value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
Negative,22358
Positive,20654
Neutral,18108
Irrelevant,12875


In [148]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Convert categorical dtaa into numeric
onehot = OneHotEncoder(sparse_output=False,handle_unknown='ignore')

# Fit and transform
# transform into 2D
y_train = onehot.fit_transform(y_train.values.reshape(-1,1))
y_test = onehot.transform(y_test.values.reshape(-1,1))

In [149]:
X_test.head()

Unnamed: 0,text
45300,We are pleased to announce that Dave Matthews ...
62802,Finally I downloaded GTA 5 on my new Xbox One.
42302,@PUBG hi I m facing some in game problem where...
61696,I really forgot how fun GTA 2011 is
61087,The latter tells us what we need to know about...


In [150]:
# Calculate the avg number of words in X_train sentences
max_length = round(sum([len(i.split()) for i in X_train if isinstance(i, str)]) / len(X_train))
max_length

19

In [151]:
# Convert sentences to numeric vectors
max_vocab_length = 10000
tweet_text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_vocab_length,
                                                          output_mode='int',
                                                          output_sequence_length=max_length)

In [152]:
# Adapt the training sentences to the model to build the vocabulary
tweet_text_vectorizer.adapt(X_train.astype(str))

In [153]:
# Convert into NumPy array
X_train_vectorized = tweet_text_vectorizer(X_train.astype(str))
X_train_vectorized_np = X_train_vectorized.numpy()

In [154]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_vectorized_np,y_train)

In [155]:
sample_sentence = "He lost his grandma"
tweet_text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 19), dtype=int64, numpy=
array([[ 118,  365,  166, 4580,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]])>

In [156]:
tf.random.set_seed(42)
embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             embeddings_initializer='uniform',
                             input_length=max_length,
                             name='embedding')



# Build Model 0: LSTM

In [157]:
inputs = layers.Input(shape=(1,),dtype='string')
x = tweet_text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(64,activation='relu',kernel_regularizer='l2')(x)
outputs = layers.Dense(len(training_data['class'].value_counts()),activation='sigmoid')(x)
model_0 = tf.keras.Model(inputs,outputs,name='model_0')

In [158]:
model_0.compile(loss='categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [159]:
model_0.summary()

In [160]:
model_0_history = model_0.fit(X_train.astype(str).to_numpy(),
                              y_train,
                              epochs=5,
                              validation_data=(X_test.astype(str).to_numpy(),y_test))

Epoch 1/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 38ms/step - accuracy: 0.4238 - loss: 1.4039 - val_accuracy: 0.6474 - val_loss: 0.9010
Epoch 2/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 36ms/step - accuracy: 0.6915 - loss: 0.8120 - val_accuracy: 0.7187 - val_loss: 0.7625
Epoch 3/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 35ms/step - accuracy: 0.7722 - loss: 0.6261 - val_accuracy: 0.7325 - val_loss: 0.7719
Epoch 4/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 35ms/step - accuracy: 0.8086 - loss: 0.5449 - val_accuracy: 0.7302 - val_loss: 0.8293
Epoch 5/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 37ms/step - accuracy: 0.8291 - loss: 0.4893 - val_accuracy: 0.7206 - val_loss: 0.9689


In [161]:
# Make a prediction based on the sample sentence
prediction_position = np.argmax(model_0.predict(tf.constant(['She lost her mum'])))
prediction_position



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step


np.int64(0)

In [162]:
model_0.predict(tf.constant(['She lost her mum']))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


array([[0.75585514, 0.46520174, 0.3463955 , 0.16841334]], dtype=float32)

In [163]:
# Extract the class with the highest probability
models_prediction = onehot.get_feature_names_out()[prediction_position][3:]
models_prediction

'Irrelevant'

# Build Model 1: GRU

In [164]:
inputs = layers.Input(shape=(1,),dtype='string')
x = tweet_text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64,activation='relu',kernel_regularizer='l2')(x)
outputs = layers.Dense(len(training_data['class'].value_counts()),activation='sigmoid')(x)
model_1 = tf.keras.Model(inputs,outputs,name='model_1')

In [165]:
model_1.compile(loss='categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [166]:
model_1.summary()

In [167]:
model_1_history = model_0.fit(X_train.astype(str).to_numpy(),
                              y_train,
                              epochs=5,
                              validation_data=(X_test.astype(str).to_numpy(),y_test))

Epoch 1/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 35ms/step - accuracy: 0.8447 - loss: 0.4506 - val_accuracy: 0.7365 - val_loss: 0.9497
Epoch 2/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 35ms/step - accuracy: 0.8579 - loss: 0.4148 - val_accuracy: 0.7343 - val_loss: 1.0018
Epoch 3/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 38ms/step - accuracy: 0.8653 - loss: 0.3926 - val_accuracy: 0.7380 - val_loss: 0.9538
Epoch 4/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 35ms/step - accuracy: 0.8723 - loss: 0.3711 - val_accuracy: 0.7358 - val_loss: 0.9953
Epoch 5/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 35ms/step - accuracy: 0.8793 - loss: 0.3548 - val_accuracy: 0.7368 - val_loss: 1.0723


In [168]:
prediction_position1 = np.argmax(model_1.predict(tf.constant(['She lost her mum'])))
prediction_position1

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297ms/step


np.int64(0)

In [169]:
model_1.predict(tf.constant(['She lost her mum']))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step


array([[0.507824  , 0.5024548 , 0.50766504, 0.5039015 ]], dtype=float32)

In [170]:
model_1_prediction = onehot.get_feature_names_out()[prediction_position1][3:]
model_1_prediction

'Irrelevant'

# Build Model 2: Bidirectional LSTM

In [171]:
inputs = layers.Input(shape=(1,),dtype='string')
x = tweet_text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(64,activation='relu',kernel_regularizer='l2'))(x)
outputs = layers.Dense(len(training_data['class'].value_counts()),activation='sigmoid')(x)
model_2 = tf.keras.Model(inputs,outputs,name='model_2')

In [172]:
model_2.compile(loss='categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [173]:
model_2.summary()

In [174]:
model_2_history = model_2.fit(X_train.astype(str).to_numpy(),
                              y_train,
                              epochs=5,
                              validation_data=(X_test.astype(str).to_numpy(),y_test))

Epoch 1/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 61ms/step - accuracy: 0.7363 - loss: 1.1811 - val_accuracy: 0.7302 - val_loss: 1.1603
Epoch 2/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 59ms/step - accuracy: 0.8744 - loss: 0.3888 - val_accuracy: 0.7190 - val_loss: 1.2429
Epoch 3/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 61ms/step - accuracy: 0.8843 - loss: 0.3592 - val_accuracy: 0.7293 - val_loss: 1.1075
Epoch 4/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 61ms/step - accuracy: 0.8891 - loss: 0.3432 - val_accuracy: 0.7186 - val_loss: 1.1474
Epoch 5/5
[1m1850/1850[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 60ms/step - accuracy: 0.8958 - loss: 0.3244 - val_accuracy: 0.7208 - val_loss: 1.1803


In [175]:
# Make a prediction based on the sample sentence
prediction_position2 = np.argmax(model_2.predict(tf.constant(['She lost her mum'])))
prediction_position2

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 379ms/step


np.int64(2)

In [176]:
model_2.predict(tf.constant(['She lost her mum']))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


array([[0.05113928, 0.5821769 , 0.8228262 , 0.0316528 ]], dtype=float32)

In [177]:
# Extract the class with the highest probability
model_2_prediction = onehot.get_feature_names_out()[prediction_position2][3:]
model_2_prediction

'Neutral'