<a href="https://colab.research.google.com/github/clemgi0/movie-analyser_deep-learning-proyecto/blob/main/03_arquitectura_de_linea_de_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###DATASET
https://www.kaggle.com/datasets/harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows

In [112]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import kagglehub
import os
import nltk
from nltk import word_tokenize

In [113]:
path = kagglehub.dataset_download("harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows")

files_in_path = os.listdir(path)
csv_files = [f for f in files_in_path if f.endswith('.csv')]

if csv_files:
    data_file = os.path.join(path, csv_files[0])
    df = pd.read_csv(data_file)

    df = df.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffle the datas to avoid linear IMDB rating

    data = df.to_numpy()
    data = data[:, [1, 5, 7, 9, 6, 8]] # Name of the movie / Genre / Overview / Director / IMDB rating / meta-score
    print("Data shape:", data[:3,:])
else:
    print("No CSV files found in the specified path. Please specify which file to load if it's not a CSV or has a different extension.")

Using Colab cache for faster access to the 'imdb-dataset-of-top-1000-movies-and-tv-shows' dataset.
Data shape: [['Trois couleurs: Bleu' 'Drama, Music, Mystery'
  'A woman struggles to find a way to live her life after the death of her husband and child.'
  'Krzysztof Kieslowski' 7.9 85.0]
 ['Captain America: The Winter Soldier' 'Action, Adventure, Sci-Fi'
  'As Steve Rogers struggles to embrace his role in the modern world, he teams up with a fellow Avenger and S.H.I.E.L.D agent, Black Widow, to battle a new threat from history: an assassin known as the Winter Soldier.'
  'Anthony Russo' 7.7 70.0]
 ['Wreck-It Ralph' 'Animation, Adventure, Comedy'
  'A video game villain wants to be a hero and sets out to fulfill his dream, but his quest brings havoc to the whole arcade where he lives.'
  'Rich Moore' 7.7 72.0]]


In [114]:
nltk.download('punkt_tab')
nltk.download('stopwords')
stopwords_en = nltk.corpus.stopwords.words('english')

# Remove the stopwords from nltk english stopwords dictionnary to get a clean dataset
cleaned_texts = np.array([])
for text in data[:,2]:
    tokens = [word.lower() for word in nltk.word_tokenize(text) if word.lower() not in stopwords_en]
    cleaned_texts = np.append(cleaned_texts, ' '.join(tokens))

# Tokenize the cleaned dataset of movie's Overview
max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(cleaned_texts)
tokenizer.word_index.update({'<pad>': 0})
X_cleaned = tokenizer.texts_to_sequences(cleaned_texts)

# Retrieve the differents data's
#x_train = data[:800, [0, 1, 2, 3]] # Name of the movie / Genre / Director
x_train = X_cleaned[:800]
y_train = data[:800, [4, 5]] # IMDB rating / meta-score

#x_test = data[800:, [0, 1, 2, 3]] # Name of the movie / Genre / Director
x_test = X_cleaned[800:]
y_test = data[800:, [4, 5]] # IMDB rating / meta-score


# Normalization of the goal's datas
y_train[:, 0] = y_train[:, 0] / 10.0   # IMDB rating
y_train[:, 1] = y_train[:, 1] / 100.0  # Meta-score

y_test[:, 0] = y_test[:, 0] / 10.0
y_test[:, 1] = y_test[:, 1] / 100.0

print("\n3 firsts samples of x_train:", x_train[:3], '\nfor\n', cleaned_texts[:3])
print("\n3 firsts samples of x_test:", x_test[:3], '\nfor\n', cleaned_texts[800:][:3])
print("\n3 firsts samples of y_train:", y_train[:3,:])
print("\n3 firsts samples of y_test:", y_test[:3,:])

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



3 firsts samples of x_train: [[10, 82, 14, 65, 68, 4, 60, 135, 69], [2219, 2220, 82, 2221, 508, 659, 6, 921, 228, 1312, 55, 1313, 229, 922, 660, 509, 66, 109, 413, 56, 7, 271, 230, 136, 414, 923, 272], [2222, 197, 1314, 415, 2223, 79, 510, 329, 165, 273, 1315, 924, 2224, 31]] 
for
 ['woman struggles find way live life death husband child .'
 'steve rogers struggles embrace role modern world , teams fellow avenger s.h.i.e.l.d agent , black widow , battle new threat history : assassin known winter soldier .'
 'video game villain wants hero sets fulfill dream , quest brings havoc whole arcade lives .']

3 firsts samples of x_test: [[1227, 146, 1308, 108, 4876, 1654, 5, 416, 57, 207, 34], [2161, 300, 914, 100, 237, 1296, 1297, 95, 4877, 4878], [11, 2181, 91, 128, 802, 2101, 505, 101, 582, 4879, 1242, 4880, 4881, 4882, 662, 4883, 4884, 2050]] 
for
 ['psychopath forces tennis star comply theory two strangers get away murder .'
 'oddball journalist psychopathic lawyer travel las vegas series

In [115]:
cleaned_texts = np.array([])
for text in data[:,2]:
    tokens = [word.lower() for word in nltk.word_tokenize(text) if word.lower() not in stopwords_en]
    cleaned_texts = np.append(cleaned_texts, ' '.join(tokens))
cleaned_texts[:10]

array(['woman struggles find way live life death husband child .',
       'steve rogers struggles embrace role modern world , teams fellow avenger s.h.i.e.l.d agent , black widow , battle new threat history : assassin known winter soldier .',
       'video game villain wants hero sets fulfill dream , quest brings havoc whole arcade lives .',
       'summer 1962 , new kid town taken wing young baseball prodigy rowdy team , resulting many adventures .',
       'life lawyer became famed leader indian revolts british rule philosophy nonviolent protest .',
       'lone survivor onslaught flesh-possessing spirits holes cabin group strangers demons continue attack .',
       'family determined get young daughter finals beauty pageant take cross-country trip vw bus .',
       '2027 , chaotic world women become somehow infertile , former activist agrees help transport miraculously pregnant woman sanctuary sea .',
       "sex-repulsed woman disapproves sister 's boyfriend sinks depression horrif

In [143]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_features = 5000
embedding_dim = 64
max_len = 60

X_train_seq = pad_sequences(X_cleaned[:800], maxlen=max_len)
X_test_seq  = pad_sequences(X_cleaned[800:], maxlen=max_len)

y_train_reg = y_train[:, 0].astype(np.float32)     # IMDB rating
y_test_reg  = y_test[:, 0].astype(np.float32)

In [144]:
y_train_reg[:50]

array([0.79, 0.77, 0.77, 0.78, 0.8 , 0.78, 0.78, 0.79, 0.77, 0.82, 0.77,
       0.84, 0.78, 0.76, 0.76, 0.76, 0.81, 0.76, 0.77, 0.81, 0.79, 0.82,
       0.8 , 0.79, 0.81, 0.81, 0.83, 0.76, 0.76, 0.76, 0.8 , 0.82, 0.78,
       0.79, 0.8 , 0.81, 0.78, 0.78, 0.76, 0.84, 0.8 , 0.83, 0.81, 0.76,
       0.81, 0.81, 0.76, 0.78, 0.8 , 0.77], dtype=float32)

In [145]:
# --- Architecture ---
inputs = Input(shape=(max_len,))
x = Embedding(max_features, embedding_dim)(inputs)
x = GlobalAveragePooling1D()(x)
x = Dense(64, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)

model = Model(inputs, outputs)

In [146]:
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

model.summary()

In [147]:
model.fit(X_train_seq, y_train_reg, epochs=15, batch_size=128, verbose=1)

Epoch 1/15
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.0379 - mae: 0.1652
Epoch 2/15
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0115 - mae: 0.1013
Epoch 3/15
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0025 - mae: 0.0412
Epoch 4/15
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0022 - mae: 0.0366
Epoch 5/15
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0018 - mae: 0.0354
Epoch 6/15
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0013 - mae: 0.0275
Epoch 7/15
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0014 - mae: 0.0295
Epoch 8/15
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0012 - mae: 0.0269
Epoch 9/15
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0012 - mae:

<keras.src.callbacks.history.History at 0x7f932c461940>

In [148]:
loss, mae = model.evaluate(X_test_seq, y_test_reg, verbose=1)
print("MSE (loss) :", loss)
print("MAE :", mae)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0011 - mae: 0.0276      
MSE (loss) : 0.0011439939262345433
MAE : 0.02805621176958084


In [149]:
y_pred = model.predict(X_test_seq)

for i in range(10):
    print("Overview:", data[800+i, 2][:80], "...")
    print("Real rating :", y_test_reg[i], " – Prediction :", y_pred[i][0])
    print("---")



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Overview: A psychopath forces a tennis star to comply with his theory that two strangers c ...
Real rating : 0.79  – Prediction : 0.8110066
---
Overview: An oddball journalist and his psychopathic lawyer travel to Las Vegas for a seri ...
Real rating : 0.76  – Prediction : 0.81397665
---
Overview: A story that revolves around drug abuse in the affluent north Indian State of Pu ...
Real rating : 0.78  – Prediction : 0.7783048
---
Overview: Danny Ocean and his ten accomplices plan to rob three Las Vegas casinos simultan ...
Real rating : 0.77  – Prediction : 0.80995965
---
Overview: A mythological story about a goddess who created the entire universe. The plot r ...
Real rating : 0.83  – Prediction : 0.79330766
---
Overview: After his son is captured in the Great Barrier Reef and taken to Sydney, a timid ...
Real rating : 0.81  – Prediction : 0.80290115
---
Overview: A poor but hopeful boy seeks one of the five covet