In [9]:
import re

import numpy as np
import pandas as pd
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

  if not hasattr(np, "object"):


In [10]:
# Read the reviews dataset from the working directory; later cells rely on
# its `review` (text) and `sentiment` (label) columns.
df = pd.read_csv(filepath_or_buffer="imdb.csv")

# A bare trailing expression makes the notebook render the frame as a table.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [11]:
def clean_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML-like tags (``<...>``) with a space;
      3. drop every character that is neither an ASCII letter nor whitespace;
      4. collapse each whitespace run into one space and trim both ends.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example
            ``None`` or a NaN cell) is treated as an empty review.

    Returns:
        The cleaned string; ``""`` when nothing usable remains.
    """
    # Missing values would otherwise fail on `.lower()` with AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Substitute a space (not an empty string) so the words on either side of
    # a tag stay separate: "good<br />movie" -> "good movie".
    text = re.sub(r'<.*?>', ' ', text)
    # Keep all whitespace here (not only ' '); deleting newlines/tabs at this
    # point would glue the neighbouring words together before the next step.
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Run every review through clean_text, overwriting the raw text column.
df["review"] = df["review"].map(clean_text)

# Show the frame again so the cleaned text can be inspected.
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,i am a catholic taught in parochial elementary...,negative
49998,im going to have to disagree with the previous...,negative


In [12]:
from sklearn.preprocessing import LabelEncoder

# Learn the mapping from sentiment strings to integer class ids, then
# overwrite the column with the encoded values.
label_encoder = LabelEncoder()
label_encoder.fit(df["sentiment"])
df["sentiment"] = label_encoder.transform(df["sentiment"])

# Plain arrays for the split below: review texts as features, encoded
# sentiment as target.
X, y = df["review"].values, df["sentiment"].values

In [13]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the samples for testing; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [14]:
# Vocabulary cap handed to the tokenizer, and the fixed sequence length that
# every review is padded or truncated to.
MAX_WORDS, MAX_LEN = 10000, 200

# Fit the vocabulary on the training texts only, so nothing from the test
# split influences the word index.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(X_train)

# Texts -> lists of word indices (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Word-index lists -> arrays in which every row has length MAX_LEN.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=MAX_LEN) for seqs in (X_train_seq, X_test_seq)
)

In [15]:
# Binary sentiment classifier: token ids -> 128-d embeddings -> LSTM(128)
# -> dropout -> a single sigmoid unit.
model = Sequential([
    # Declare the input shape with an explicit Input layer. The Embedding
    # `input_length` argument used previously is deprecated in Keras 3 and
    # ignored there, which leaves the model unbuilt so that summary() cannot
    # report output shapes or parameter counts.
    Input(shape=(MAX_LEN,)),
    # input_dim matches the vocabulary cap given to the tokenizer.
    Embedding(input_dim=MAX_WORDS, output_dim=128),
    LSTM(128),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),
])

# Sigmoid output paired with binary cross-entropy; accuracy is tracked as the
# reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()



In [16]:
# Watch the validation loss with a patience of 2 epochs, and ask the callback
# to restore the best weights when it stops training.
early_stop = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

# Train for at most 10 epochs in batches of 64; validation_split=0.2 reserves
# part of the training arrays for the validation metrics the callback monitors.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stop],
)

Epoch 1/10
[1m394/500[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m10s[0m 101ms/step - accuracy: 0.6763 - loss: 0.5708

KeyboardInterrupt: 

In [None]:
# Score the trained model on the held-out test arrays; evaluate() returns the
# loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test)

print(f"Accuracy test : {accuracy:.4f}")

In [None]:
def predict_sentiment(review):
    """Classify one raw review string as "positive" or "negative".

    The review goes through the same pipeline as the training data
    (clean_text, the fitted tokenizer, padding to MAX_LEN) before being
    scored by the model.

    Args:
        review: Raw review text.

    Returns:
        "positive" when the model's score is strictly above 0.5,
        otherwise "negative".
    """
    cleaned = clean_text(review)
    token_ids = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(token_ids, maxlen=MAX_LEN)

    # predict() returns one row per input with a single score in it.
    score = model.predict(model_input)[0][0]

    if score > 0.5:
        return "positive"
    return "negative"

In [None]:
# Smoke-test the full pipeline on one clearly positive review.
example = "This movie was absolutely fantastic, I loved it!"
print("Critique :", example)

# Predict only after echoing the review, so the output order is unchanged.
predicted_label = predict_sentiment(example)
print("Prédiction :", predicted_label)