# **Project: Sentiment Analysis with Deep Learning (LSTM)**

In [73]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM,Dense, Dropout



In [74]:
# load dataset
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [75]:
# Convert labels to numbers
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
print(df.head())

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


# Data Preprocessing

In [76]:
# Remove HTML Tags & Special Characters
import re
def clean_text(text):
  # remote html tags
  text = re.sub(r'<.*?>', '', text)
  # remove special characters
  text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
  # Lowercase everything
  text = text.lower()

  return text


In [77]:
df['review'] = df['review'].apply(clean_text)
print(df.head())

                                              review  sentiment
0  one of the other reviewers has mentioned that ...          1
1  a wonderful little production the filming tech...          1
2  i thought this was a wonderful way to spend ti...          1
3  basically theres a family where a little boy j...          0
4  petter matteis love in the time of money is a ...          1


In [78]:
# Remove Stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [79]:
df['review'] = df['review'].apply(remove_stopwords)
print(df.head())

                                              review  sentiment
0  one reviewers mentioned watching 1 oz episode ...          1
1  wonderful little production filming technique ...          1
2  thought wonderful way spend time hot summer we...          1
3  basically theres family little boy jake thinks...          0
4  petter matteis love time money visually stunni...          1


In [80]:
# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['review'])

sequences = tokenizer.texts_to_sequences(df['review'])

In [81]:
# Padding Sequences
max_len = 200  # truncate/pad reviews to 200 words
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')


In [82]:
print(padded_sequences)

[[   4 1809  941 ...    0    0    0]
 [ 278   38  253 ...    0    0    0]
 [  97  278   27 ...    0    0    0]
 ...
 [3399 4148    1 ...    0    0    0]
 [  55   71 2856 ...    0    0    0]
 [   4 5594  236 ...    0    0    0]]


## Train-Test Split

In [85]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['sentiment'],
                                                    test_size=0.2, random_state=42)

# Build Model

In [86]:
max_len = 200
vocab_size = 10000
embedding_dim = 64

In [87]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))  # sans input_length
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Construire le modèle explicitement pour afficher le résumé
model.build(input_shape=(None, max_len))
model.summary()



In [88]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [89]:
# train model
history = model.fit(X_train, y_train,
                    epochs=5,
                    batch_size=64,
                    validation_data=(X_test, y_test))

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.5092 - loss: 0.6914 - val_accuracy: 0.6521 - val_loss: 0.6362
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.5474 - loss: 0.6743 - val_accuracy: 0.6964 - val_loss: 0.6507
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.6338 - loss: 0.6288 - val_accuracy: 0.7588 - val_loss: 0.5415
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.8187 - loss: 0.4251 - val_accuracy: 0.8785 - val_loss: 0.2861
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - accuracy: 0.9209 - loss: 0.2130 - val_accuracy: 0.8879 - val_loss: 0.2709


In [90]:
# evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8922 - loss: 0.2679
Test Accuracy: 0.89


In [97]:
# interactif test

# Fonction pour prédire le sentiment d'une phrase
def predict_sentiment(text):
    # Nettoyage comme pour les données d'entraînement
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # Tokenizer + padding
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')

    # Prédiction
    pred = model.predict(padded)[0][0]
    if pred > 0.5:
        print(f"Prediction: POSITIVE ({pred:.2f})")
    else:
        print(f"Prediction: NEGATIVE ({pred:.2f})")

# Boucle d'entrée utilisateur
while True:
    user_input = input("Tape une phrase à tester (ou 'exit' pour quitter) : ")
    if user_input.lower() == 'exit':
        break
    predict_sentiment(user_input)


Tape une phrase à tester (ou 'exit' pour quitter) : This movie was terrible and a complete waste of time
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step
Prediction: NEGATIVE (0.02)
Tape une phrase à tester (ou 'exit' pour quitter) : Amazing plot and wonderful characters, highly recommended.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Prediction: POSITIVE (0.96)
Tape une phrase à tester (ou 'exit' pour quitter) : exit
