In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Loading the Twitter Dataset

In [30]:
# The file is read with header=None, so pandas numbers the columns (0..3)
# and keeps the first record as data rather than using it as column labels.
tweet = pd.read_csv(
    '/content/twitter_training.csv',
    header=None,
)

In [31]:
# Preview the first rows of the raw frame (columns are still numbered 0..3 here)
tweet.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [32]:
# (number of rows, number of columns) of the training frame
tweet.shape

(74682, 4)

In [33]:
# Column labels as loaded: integer positions, because the CSV was read with header=None
tweet.columns

Index([0, 1, 2, 3], dtype='int64')

In [34]:
# Labels for the four positional columns, listed in file order
column_names = [
    "id",
    "entity",
    "sentiment",
    "text",
]

In [35]:
# Assign column names to the DataFrame (replaces the positional labels 0..3 in place)
tweet.columns = column_names

# Preview the first rows again to confirm the new column labels
tweet.head()

Unnamed: 0,id,entity,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [36]:
# Number of rows per sentiment label, to see how balanced the classes are
tweet['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Negative,22542
Positive,20832
Neutral,18318
Irrelevant,12990


In [37]:
# Encode sentiment labels as integers
label_map = {"Negative": 0, "Positive": 1, "Neutral": 2, "Irrelevant": 3}

# Map only while the column still holds the raw string labels. Re-running this
# cell on an already-encoded column would otherwise turn every value into NaN,
# because the integer codes are not keys of label_map.
if not pd.api.types.is_numeric_dtype(tweet["sentiment"]):
    tweet["sentiment"] = tweet["sentiment"].map(label_map)

In [38]:
# Number of rows per entity (the topic each tweet is about)
tweet['entity'].value_counts()

Unnamed: 0_level_0,count
entity,Unnamed: 1_level_1
Microsoft,2400
MaddenNFL,2400
TomClancysRainbowSix,2400
LeagueOfLegends,2394
CallOfDuty,2394
Verizon,2382
CallOfDutyBlackopsColdWar,2376
ApexLegends,2376
Facebook,2370
WorldOfCraft,2364


In [39]:
# Features and labels taken from the full (unsplit) frame:
# X is the tweet text coerced to str, y is the integer-encoded sentiment.
X, y = (
    tweet["text"].astype(str).values,
    tweet["sentiment"].astype(int).values,
)

In [40]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): the preview of `tweet` shows several near-identical rows sharing one id,
# so a row-level random split may put variants of the same tweet on both sides of the
# split -- confirm whether the split should be grouped by id.
train_data, test_data = train_test_split(
    tweet,
    test_size=0.2,
    random_state=42,
)

In [41]:
# Row/column counts of the two splits, training first, one per line
print(train_data.shape, test_data.shape, sep='\n')

(59745, 4)
(14937, 4)


Data Preprocessing

In [42]:
# Fill NaN with empty strings and convert to string type.
# .assign() returns new frames, so the splits produced by train_test_split are
# not written to in place (assigning a column on them directly can raise
# pandas' SettingWithCopyWarning). The cell also stays safe to re-run.
train_data = train_data.assign(text=train_data['text'].fillna('').astype(str))
test_data = test_data.assign(text=test_data['text'].fillna('').astype(str))

# Fit the tokenizer on the training text only, so the vocabulary is not
# influenced by the test split; num_words=5000 caps the vocabulary used
# when encoding.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['text'])

# Encode both splits with that tokenizer and pad/truncate every tweet to 200 token ids.
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=200)

In [43]:
# Show the padded training matrix (NumPy elides the middle); rows start with 0s where padding was added
print(X_train)

[[   0    0    0 ...    4  957 3738]
 [   0    0    0 ...  608    6 4646]
 [   0    0    0 ... 1089 2497  140]
 ...
 [   0    0    0 ... 2595    5 2845]
 [   0    0    0 ...  828  202   48]
 [   0    0    0 ...  890   10  341]]


In [44]:
# Show the padded test matrix, encoded with the tokenizer fitted on the training text
print(X_test)

[[   0    0    0 ...   98  798  134]
 [   0    0    0 ...   83  158 2423]
 [   0    0    0 ...    6 2337 2269]
 ...
 [   0    0    0 ...  194 1941 2652]
 [   0    0    0 ... 1048  945 1212]
 [   0    0    0 ... 4843  116 2816]]


In [45]:
# Integer-encoded sentiment labels for each split (pandas Series, original row index kept)
Y_train, Y_test = train_data['sentiment'], test_data['sentiment']

In [46]:
# Show the training labels; the left column is the row index carried over from `tweet`
print(Y_train)

8581     2
71534    1
67252    2
41061    1
16591    1
        ..
37194    2
6265     2
54886    0
860      0
15795    2
Name: sentiment, Length: 59745, dtype: int64


Building the LSTM (Long Short-Term Memory) Model

In [47]:
# Build the model: token ids -> 128-dimensional embeddings -> LSTM -> 4-way softmax.
#
# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates the
# argument and warns when it is given. The input shape is supplied by the
# model.build(input_shape=(None, 200)) call that follows.
#
# input_dim=5000 matches the tokenizer's num_words, and the 4 output units
# correspond to the four codes in label_map.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(4, activation='softmax'),
])



In [48]:
# Build the model for batches of any size (None) holding 200 token ids each, matching maxlen=200
model.build(input_shape=(None, 200))

In [49]:
# Print the layer-by-layer summary of the built model
model.summary()

In [50]:
# The "sparse" cross-entropy is used because the labels are integer class ids
# (0..3 from label_map) rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Training the Model

In [51]:
# Read with header=None, exactly like the training CSV, so the first record is
# kept as data instead of being consumed as column labels (the labels are
# assigned explicitly afterwards).
# NOTE(review): assumes the validation file has the same header-less layout as
# the training file -- confirm against the CSV.
val_data = pd.read_csv("/content/twitter_validation.csv", header=None)

In [52]:
# Preprocess validation data the same way as the training data
val_data.columns = ["id", "entity", "sentiment", "text"]
val_data['text'] = val_data['text'].fillna('').astype(str)

# Encode the labels only while they are still raw strings. Re-running this cell
# on an already-encoded column would map every value to NaN and make the
# astype(int) below raise.
if not pd.api.types.is_numeric_dtype(val_data["sentiment"]):
    val_data["sentiment"] = val_data["sentiment"].map(label_map)

# Reuse the tokenizer fitted on the training text and the same sequence length
X_val = pad_sequences(tokenizer.texts_to_sequences(val_data['text']), maxlen=200)
Y_val = val_data['sentiment'].astype(int).values

# Train for 5 epochs in batches of 64, reporting loss/accuracy on the validation set after each epoch
model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_data=(X_val, Y_val))

Epoch 1/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 624ms/step - accuracy: 0.5084 - loss: 1.1306 - val_accuracy: 0.7538 - val_loss: 0.6338
Epoch 2/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m576s[0m 617ms/step - accuracy: 0.7166 - loss: 0.7267 - val_accuracy: 0.8388 - val_loss: 0.4743
Epoch 3/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m573s[0m 614ms/step - accuracy: 0.7771 - loss: 0.5770 - val_accuracy: 0.8669 - val_loss: 0.4245
Epoch 4/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m624s[0m 617ms/step - accuracy: 0.8236 - loss: 0.4705 - val_accuracy: 0.8779 - val_loss: 0.3754
Epoch 5/5
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m577s[0m 617ms/step - accuracy: 0.8464 - loss: 0.4078 - val_accuracy: 0.8869 - val_loss: 0.3695


<keras.src.callbacks.history.History at 0x7d4ad8107e90>

Model Evaluation

In [55]:
import numpy as np

# Score the held-out test split. The labels are converted from a pandas Series
# to a NumPy array before being handed to Keras; the result is unpacked as
# the loss followed by the accuracy metric set in compile().
loss, accuracy = model.evaluate(x=X_test, y=np.array(Y_test))
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 83ms/step - accuracy: 0.7772 - loss: 0.6115
Test Loss: 0.6098723411560059
Test Accuracy: 0.7818169593811035


In [57]:
# Predicted class = index of the largest softmax score in each row
Y_pred = model.predict(X_test).argmax(axis=-1)

# Rows are true classes, columns are predicted classes (label ids 0..3)
cm = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:\n", cm)

[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 88ms/step
Confusion Matrix:
 [[3934  238  232  115]
 [ 331 3364  356  179]
 [ 480  330 2584  202]
 [ 277  294  225 1796]]


In [59]:
# Per-class precision / recall / F1; target_names are listed in label-id order (0..3)
report = classification_report(
    Y_test,
    Y_pred,
    target_names=['Negative', 'Positive', 'Neutral', 'Irrelevant'],
)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

    Negative       0.78      0.87      0.82      4519
    Positive       0.80      0.80      0.80      4230
     Neutral       0.76      0.72      0.74      3596
  Irrelevant       0.78      0.69      0.74      2592

    accuracy                           0.78     14937
   macro avg       0.78      0.77      0.77     14937
weighted avg       0.78      0.78      0.78     14937



Building a predictive system

In [62]:
def predict_sentiment(review):
    """Classify a single piece of text with the trained model.

    The text is encoded with the notebook-level ``tokenizer``, padded to 200
    token ids and passed to the notebook-level ``model``.

    Args:
        review: The text to classify.

    Returns:
        One of 'Negative', 'Positive', 'Neutral' or 'Irrelevant': the label
        whose position matches the highest-scoring model output.
    """
    # Position i in this tuple is the label for model output i.
    class_labels = ('Negative', 'Positive', 'Neutral', 'Irrelevant')

    # The tokenizer works on a list of texts, so wrap the single review.
    encoded = pad_sequences(tokenizer.texts_to_sequences([review]), maxlen=200)

    # One input row was passed in, so only the first row of scores is needed.
    scores = model.predict(encoded)[0]

    return class_labels[np.argmax(scores)]

In [63]:
# example usage: classify one hand-written sentence and print the returned label
new_review = 'This movie was fantastic. I loves it.'
sentiment = predict_sentiment(new_review)
print(f'The sentiment of the review is: {sentiment}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
The sentiment of the review is: Positive


In [64]:
# Second example: a sentence containing a negation
new_review = 'This Movie was not that good'
sentiment = predict_sentiment(new_review)
print(f'The sentiment of the review is: {sentiment}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
The sentiment of the review is: Negative


In [68]:
# Third example: a lukewarm sentence
new_review = 'It’s okay, nothing too great.'
sentiment = predict_sentiment(new_review)
print(f'The sentiment of the review is: {sentiment}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
The sentiment of the review is: Neutral
