<a href="https://colab.research.google.com/github/clemgi0/movie-analyser_deep-learning-proyecto/blob/main/02_preprocesado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import kagglehub
import os
import nltk
from nltk import word_tokenize

In [56]:
path = kagglehub.dataset_download("harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows")

files_in_path = os.listdir(path)
csv_files = [f for f in files_in_path if f.endswith('.csv')]

if csv_files:
    data_file = os.path.join(path, csv_files[0])
    df = pd.read_csv(data_file)
    data = df.to_numpy()
    data = data[:, [1, 5, 7, 9, 6, 8]]
    print("Data shape:", data[:3,:])
else:
    print("No CSV files found in the specified path. Please specify which file to load if it's not a CSV or has a different extension.")

Using Colab cache for faster access to the 'imdb-dataset-of-top-1000-movies-and-tv-shows' dataset.
Data shape: [['The Shawshank Redemption' 'Drama'
  'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'
  'Frank Darabont' 9.3 80.0]
 ['The Godfather' 'Crime, Drama'
  "An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son."
  'Francis Ford Coppola' 9.2 100.0]
 ['The Dark Knight' 'Action, Crime, Drama'
  'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.'
  'Christopher Nolan' 9.0 84.0]]


In [75]:
# Calculate the number of unique words in the vocabulary
unique_words_count = len(tokenizer.word_index) - 1 # Subtract 1 for the <pad> token
print(f"The number of different words in the dataset is: {unique_words_count}")

The number of different words in the dataset is: 5586


In [57]:
nltk.download('punkt')
nltk.download('stopwords')
stopwords_en = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
def tokenize_and_remove_stopwords(text_list):
    cleaned_texts = []
    for text in text_list:
        tokens = [word.lower() for word in nltk.word_tokenize(text) if word.lower() not in stopwords_en]
        cleaned_texts.append(' '.join(tokens))
    return cleaned_texts

cleaned_descriptions = tokenize_and_remove_stopwords(data[:,2])
print(cleaned_descriptions[:3])

['two imprisoned men bond number years , finding solace eventual redemption acts common decency .', "organized crime dynasty 's aging patriarch transfers control clandestine empire reluctant son .", 'menace known joker wreaks havoc chaos people gotham , batman must accept one greatest psychological physical tests ability fight injustice .']


In [78]:
from tensorflow.keras.preprocessing.text import Tokenizer

max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(cleaned_descriptions)
tokenizer.word_index.update({'<pad>': 0})
X_cleaned = tokenizer.texts_to_sequences(cleaned_descriptions)

# Convert each element of X_cleaned to a numpy array
data[:,2] = np.array([np.array(x) for x in X_cleaned], dtype=object)

print("Original X (first 10):")
print(X[:10])
print("\nCleaned X (first 10):")
print(data[:10,2])

Original X (first 10):
[[26, 337, 92, 226, 93, 1, 4, 58, 582, 5, 338, 94, 4], [8, 90, 339, 4, 7, 293, 3, 7, 61], [25, 2, 486, 19, 2, 5, 12, 2, 95, 4, 739, 740, 49, 39, 4, 2, 5, 487, 4, 7, 3, 137], [2, 194, 23, 5, 195, 4, 6, 31, 110, 75, 9, 42, 7, 61, 741, 5, 7, 12, 2, 41, 90, 742], [1, 340, 3, 488, 1, 4, 15, 7, 3, 2], [5, 341, 2, 27, 4, 92, 54, 145, 3, 7, 18, 5, 19, 28, 10, 2, 39, 294], [2, 69, 4, 26, 227, 1, 342, 1, 228, 5, 7, 68, 5, 1, 402, 4, 743, 6, 138, 583, 4, 229, 5, 338], [6, 120, 584, 46, 27, 35, 99, 53, 11, 7, 196, 22, 16, 15, 2, 489], [1, 490, 14, 744, 94, 2, 403, 4, 404, 9, 745, 2, 4, 8, 32, 2, 4, 1, 585, 586], [8, 230, 5, 1, 343, 263, 405, 8, 137, 24, 32, 746, 131]]

Cleaned X (first 10):
[array([   5,  271,   48,  135, 1312,   25,  508, 1313, 2219,  272, 1314,
         921, 2220])
 array([2221,   45, 2222,    1,  273, 1315, 2223,  922, 2224,  197,  923,
          26])
 array([2225,  413,  924, 1316, 1317, 1318,   46,  659,  660,   16, 2226,
          13,  925, 1319,  414,

In [77]:
tokenizer.word_index

{"'s": 1,
 'young': 2,
 'man': 3,
 'life': 4,
 'two': 5,
 'world': 6,
 'new': 7,
 'family': 8,
 'war': 9,
 'woman': 10,
 'story': 11,
 'love': 12,
 'one': 13,
 'find': 14,
 'old': 15,
 'must': 16,
 'finds': 17,
 'boy': 18,
 'help': 19,
 'father': 20,
 'wife': 21,
 'becomes': 22,
 'girl': 23,
 'american': 24,
 'years': 25,
 'son': 26,
 'friends': 27,
 'year': 28,
 'former': 29,
 'three': 30,
 'city': 31,
 'lives': 32,
 'murder': 33,
 'town': 34,
 'time': 35,
 'mother': 36,
 "''": 37,
 'team': 38,
 'mysterious': 39,
 'tries': 40,
 'home': 41,
 'school': 42,
 'small': 43,
 'group': 44,
 'crime': 45,
 'people': 46,
 'friend': 47,
 'men': 48,
 'ii': 49,
 'become': 50,
 'police': 51,
 'daughter': 52,
 'day': 53,
 'search': 54,
 's': 55,
 'battle': 56,
 'high': 57,
 'back': 58,
 'get': 59,
 'york': 60,
 'first': 61,
 'takes': 62,
 'death': 63,
 'past': 64,
 'way': 65,
 'u': 66,
 'agent': 67,
 'german': 68,
 'set': 69,
 'journey': 70,
 'leads': 71,
 'live': 72,
 'child': 73,
 'save': 74,
 "'":

In [73]:
x_train = data[:800, [0, 1, 2, 3]] # Columns 2, 6, 8, 10 (1-based indexing)
y_train = data[:800, [4, 5]] # Columns 7, 9 (1-based indexing)

x_test = data[800:, [0, 1, 2, 3]] # Columns 2, 6, 8, 10 (1-based indexing)
y_test = data[800:, [4, 5]] # Columns 7, 9 (1-based indexing)
print("Shapes of x_train and x_test:", x_train.shape, x_test.shape)
print("\nShapes of y_train and y_test:", y_train.shape, y_test.shape)

Shapes of x_train and x_test: (800, 4) (200, 4)

Shapes of y_train and y_test: (800, 2) (200, 2)


In [74]:
x_train[:10, :], y_train[:10, :], x_test[:10, :], y_test[:10, :]

(array([['The Shawshank Redemption', 'Drama',
         array([  5, 271,  48, 135,  25, 508, 272, 921]),
         'Frank Darabont'],
        ['The Godfather', 'Crime, Drama',
         array([ 45,   1, 273, 922, 197, 923,  26]),
         'Francis Ford Coppola'],
        ['The Dark Knight', 'Action, Crime, Drama',
         array([413, 924,  46, 659, 660,  16,  13, 925, 414,  82]),
         'Christopher Nolan'],
        ['The Godfather: Part II', 'Crime, Drama',
         array([136,   4, 137, 926,   7,  60,  31,  26, 661,   8,  45, 662]),
         'Francis Ford Coppola'],
        ['12 Angry Men', 'Crime, Drama', array([274, 415, 927]),
         'Sidney Lumet'],
        ['The Lord of the Rings: The Return of the King',
         'Action, Adventure, Drama',
         array([275,   6,  48, 928,   1,  89, 929,  13, 228]),
         'Peter Jackson'],
        ['Pulp Fiction', 'Crime, Drama',
         array([ 32,   5, 165, 276, 138,  21, 329, 663,  83, 509, 166, 272]),
         'Quentin Tarantino'],