In [43]:
!pip install tensorflow



In [1]:
import kagglehub
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

path = kagglehub.dataset_download("harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows")
print(f"Dataset path: {path}")

import pandas as pd
df = pd.read_csv(f'{path}/imdb_top_1000.csv')

df.head()

Dataset path: C:\Users\F8091169\.cache\kagglehub\datasets\harshitshankhdhar\imdb-dataset-of-top-1000-movies-and-tv-shows\versions\1


Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

class PositionalEncoding(layers.Layer):
    def __init__(self, embed_dim, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.encoding = self.positional_encoding(embed_dim, max_len)

    def positional_encoding(self, embed_dim, max_len):
        position = np.arange(0, max_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, embed_dim, 2) * -(np.log(10000.0) / embed_dim))
        encoding = np.zeros((max_len, embed_dim))
        encoding[:, 0::2] = np.sin(position * div_term)
        encoding[:, 1::2] = np.cos(position * div_term)
        return tf.convert_to_tensor(encoding, dtype=tf.float32)

    def call(self, inputs):
        seq_len = tf.shape(inputs)[1]  # Evita erro com NoneType
        return inputs + self.encoding[:seq_len, :]

class TransformerEncoder(tf.keras.Model):
    def __init__(self, embed_dim, num_heads, ff_dim, num_layers, vocab_size, max_len):
        super(TransformerEncoder, self).__init__()

        self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)

        self.positional_encoding = PositionalEncoding(embed_dim, max_len)

        self.encoder_layers = [layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim) for _ in range(num_layers)]
        self.ffn_layers = [layers.Dense(embed_dim, activation="relu") for _ in range(num_layers)]  # Corrigido: ff_dim = embed_dim
        self.output_layers = [layers.Dense(embed_dim) for _ in range(num_layers)]

        self.layer_norm = layers.LayerNormalization(epsilon=1e-6)

        self.fc_out = layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.positional_encoding(x)
        x = self.layer_norm(x)

        for encoder_layer, ffn_layer, output_layer in zip(self.encoder_layers, self.ffn_layers, self.output_layers):
            
            attn_output = encoder_layer(x, x, x)
            x = x + attn_output  
            x = self.layer_norm(x)

            ffn_output = ffn_layer(x)
            x = x + ffn_output  
            x = self.layer_norm(x)

            x = output_layer(x)

        return self.fc_out(x[:, 0, :])  

In [15]:
dummy_input = tf.keras.Input(shape=(max_len,), dtype=tf.int32)
dummy_output = model(dummy_input)
model.build(input_shape=(None, max_len))
model.summary()

In [16]:
import numpy as np

threshold = 7.0
df['Sentiment'] = np.where(df['IMDB_Rating'] >= threshold, 'positive', 'negative')

In [17]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train, X_test, y_train, y_test = train_test_split(df['Overview'], df['Sentiment'], test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

y_train_bin = np.where(y_train == 'positive', 1, 0)
y_test_bin = np.where(y_test == 'positive', 1, 0)


In [18]:
model

<TransformerEncoder name=transformer_encoder_3, built=True>

In [19]:
history = model.fit(
    X_train_pad, y_train_bin,
    epochs=5,
    batch_size=64,
    validation_data=(X_test_pad, y_test_bin)
)

Epoch 1/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 3s/step - accuracy: 0.7669 - loss: 0.2668 - val_accuracy: 1.0000 - val_loss: 2.6803e-06
Epoch 2/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 3s/step - accuracy: 1.0000 - loss: 2.0683e-06 - val_accuracy: 1.0000 - val_loss: 1.0900e-06
Epoch 3/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 3s/step - accuracy: 1.0000 - loss: 9.8613e-07 - val_accuracy: 1.0000 - val_loss: 7.6576e-07
Epoch 4/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 3s/step - accuracy: 1.0000 - loss: 7.2783e-07 - val_accuracy: 1.0000 - val_loss: 6.3552e-07
Epoch 5/5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 3s/step - accuracy: 1.0000 - loss: 6.1508e-07 - val_accuracy: 1.0000 - val_loss: 5.6142e-07


In [20]:
test_loss, test_acc = model.evaluate(X_test_pad, y_test_bin, verbose=2)
print(f"Test accuracy: {test_acc}")

7/7 - 2s - 295ms/step - accuracy: 1.0000 - loss: 5.6142e-07
Test accuracy: 1.0


In [24]:
model.save('../transformer-architecture/model/transformer_imdb_model.keras')