In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from joblib import dump
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Collecting tensorflow-intel==2.16.1 (from tensorflow)
  Downloading tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow-

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
data = pd.read_csv("Twitter_Data.csv")

In [7]:
data.sample(10)

Unnamed: 0,textID,text,selected_text,sentiment
27009,71d8145311,received her first pressie. Happy happy.,Happy,positive
8741,1cf21feb71,My goodbey pressents http://mobypicture.com/?...,My goodbey pressents http://mobypicture.com/?...,neutral
8082,7d41dbe15c,... but i could barely speak! He was probably...,sorry,negative
2710,bd2b4a032d,on my 42nd written ****.. i get less ideas eve...,"i get less ideas every day, and every day",negative
19392,1bd0b9380f,LMAO. Joey is mine I cant wait to meet them ...,I cant wait to meet them,positive
16761,5abd2744bd,"just put the brats to bed, now im chillin with...",im chillin,positive
12076,943aea9981,It`s 4 am and I`m hungry,It`s 4 am and I`m hungry,neutral
9110,97abfda038,"no, not yet i have 3 weeks left!! :O do you ...","no, not yet i have 3 weeks left!! :O do you h...",neutral
18766,e989780e68,"my lil sister Charese, her best friend passed...",passed,positive
21867,040e7230e2,that sucks man,that sucks man,negative


In [8]:
import re
data['text'] = data['text'].astype(str)
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s.]+', '', x).lower())
data['text']

0                        id have responded if i were going
1               sooo sad i will miss you here in san diego
2                                my boss is bullying me...
3                            what interview leave me alone
4         sons of  why couldnt they put them on the rel...
                               ...                        
27476     wish we could come see u on denver  husband l...
27477     ive wondered about rake to.  the client has m...
27478     yay good for both of you. enjoy the break  yo...
27479                               but it was worth it  .
27480       all this flirting going on  the atg smiles....
Name: text, Length: 27481, dtype: object

In [9]:
x = data['text']
y = data['sentiment']

In [10]:
unique_sentiments = y.unique()
print("Unique Sentiments:", unique_sentiments)
y = y.replace({'negative': 0, 'neutral': 1, 'positive': 2})
y

Unique Sentiments: ['neutral' 'negative' 'positive']


0        1
1        0
2        0
3        0
4        0
        ..
27476    0
27477    0
27478    2
27479    2
27480    1
Name: sentiment, Length: 27481, dtype: int64

In [21]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [22]:
tokenizer = Tokenizer(num_words=9000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=250, padding='pre')
X_test_pad = pad_sequences(X_test_seq, maxlen=250, padding='pre')
X_test_pad

array([[   0,    0,    0, ...,   25,  160,  778],
       [   0,    0,    0, ...,   21,   12, 1065],
       [   0,    0,    0, ...,  501,  629,  947],
       ...,
       [   0,    0,    0, ...,    1,   85,    5],
       [   0,    0,    0, ..., 5995,  108,  216],
       [   0,    0,    0, ..., 3813,   12,   90]])

In [23]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_test_encoded

array([2, 1, 1, ..., 0, 0, 1], dtype=int64)

In [24]:
num_classes = len(unique_sentiments)
y_train_onehot = tf.keras.utils.to_categorical(y_train_encoded, num_classes=num_classes)
y_test_onehot = tf.keras.utils.to_categorical(y_test_encoded, num_classes=num_classes)
y_test_onehot

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [28]:
# model = tf.keras.Sequential([
#     Embedding(input_dim=7000, output_dim=100, input_length=200),
#     LSTM(128, return_sequences=True),
#     LSTM(64),
#     Dense(32, activation='relu'),
#     Dropout(0.5),
#     Dense(num_classes, activation='softmax')
# ])

model = tf.keras.Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=250),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(32, activation='relu'),
    Dropout(0.2),  # Adjusted dropout rate
    Dense(num_classes, activation='softmax')
])


In [29]:
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [30]:
# model.fit(X_train_pad, y_train_onehot, epochs=15, batch_size=32, validation_data=(X_test_pad, y_test_onehot))
model.fit(X_train_pad, y_train_onehot, epochs=10, batch_size=64, validation_data=(X_test_pad, y_test_onehot))


Epoch 1/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 305ms/step - accuracy: 0.5008 - loss: 0.9728 - val_accuracy: 0.6930 - val_loss: 0.7174
Epoch 2/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 336ms/step - accuracy: 0.7562 - loss: 0.6100 - val_accuracy: 0.7180 - val_loss: 0.6921
Epoch 3/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 318ms/step - accuracy: 0.8156 - loss: 0.4833 - val_accuracy: 0.7133 - val_loss: 0.7022
Epoch 4/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 321ms/step - accuracy: 0.8569 - loss: 0.3948 - val_accuracy: 0.7072 - val_loss: 0.7530
Epoch 5/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 313ms/step - accuracy: 0.8778 - loss: 0.3456 - val_accuracy: 0.7111 - val_loss: 0.8189
Epoch 6/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 377ms/step - accuracy: 0.8961 - loss: 0.2953 - val_accuracy: 0.6981 - val_loss: 0.9668
Epoch 7/

<keras.src.callbacks.history.History at 0x227959cb3d0>

In [31]:
model.save('sentiment_model.h5')
dump(tokenizer, 'tokenizer.joblib')



['tokenizer.joblib']