In [1]:
import tensorflow as tf
print("GPU disponible :", tf.config.list_physical_devices('GPU'))
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print("GPU configuré avec memory growth")
    except RuntimeError as e:
        print(e)

GPU disponible : [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU configuré avec memory growth


In [2]:
import os
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
from sklearn.model_selection import train_test_split

In [3]:
!wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
!unzip -o cornell_movie_dialogs_corpus.zip
!ls "cornell movie-dialogs corpus"

URL transformed to HTTPS due to an HSTS policy
--2025-04-09 12:30:37--  https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.53
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9916637 (9.5M) [application/zip]
Saving to: ‘cornell_movie_dialogs_corpus.zip.2’


2025-04-09 12:30:40 (4.27 MB/s) - ‘cornell_movie_dialogs_corpus.zip.2’ saved [9916637/9916637]

Archive:  cornell_movie_dialogs_corpus.zip
  inflating: cornell movie-dialogs corpus/.DS_Store  
  inflating: __MACOSX/cornell movie-dialogs corpus/._.DS_Store  
  inflating: cornell movie-dialogs corpus/chameleons.pdf  
  inflating: __MACOSX/cornell movie-dialogs corpus/._chameleons.pdf  
  inflating: cornell movie-dialogs corpus/movie_characters_metadata.txt  
  inflating: cornell movie-dialogs corpus/movie_conversations.txt  
  inflating: cornell movie-dia

In [4]:
lines_path = "cornell movie-dialogs corpus/movie_lines.txt"
convs_path = "cornell movie-dialogs corpus/movie_conversations.txt"

with open(lines_path, encoding='iso-8859-1') as f:
    lines = f.read().split("\n")

id2line = {}
for line in lines:
    parts = line.split(" +++$+++ ")
    if len(parts) == 5:
        id2line[parts[0]] = parts[4]

with open(convs_path, encoding='iso-8859-1') as f:
    conversations = f.read().split("\n")

questions = []
answers = []

for conv in conversations:
    parts = conv.split(" +++$+++ ")
    if len(parts) == 4:
        ids = eval(parts[3])
        for i in range(len(ids)-1):
            if ids[i] in id2line and ids[i+1] in id2line:
                questions.append(id2line[ids[i]])
                answers.append(id2line[ids[i+1]])

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9?.!,¿]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

questions = [clean_text(q) for q in questions]
answers = [clean_text(a) for a in answers]

In [6]:
vocab_size = 10000
max_len = 40

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(questions + answers)

question_seq = tokenizer.texts_to_sequences(questions)
answer_seq = tokenizer.texts_to_sequences(answers)

question_pad = pad_sequences(question_seq, maxlen=max_len, padding='post')
answer_pad = pad_sequences(answer_seq, maxlen=max_len, padding='post')

# Préparer les données pour le modèle Seq2Seq
decoder_input_data = answer_pad[:, :-1]
decoder_target_data = answer_pad[:, 1:]

decoder_input_data = pad_sequences(decoder_input_data, maxlen=max_len, padding='post')
decoder_target_data = pad_sequences(decoder_target_data, maxlen=max_len, padding='post')

In [7]:
embedding_dim = 256
lstm_units = 256

encoder_inputs = Input(shape=(max_len,), name='encoder_input')
x = Embedding(vocab_size, embedding_dim, name='embedding')(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(lstm_units, return_state=True, name='encoder_lstm'))
encoder_outputs, f_h, f_c, b_h, b_c = encoder_lstm(x)
state_h = tf.keras.layers.Concatenate()([f_h, b_h])
state_c = tf.keras.layers.Concatenate()([f_c, b_c])
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_len,), name='decoder_input')
decoder_emb = Embedding(vocab_size, embedding_dim, name='decoder_embedding')(decoder_inputs)
decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax', name='output')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [8]:
X_train_enc, X_val_enc, X_train_dec, X_val_dec, y_train, y_val = train_test_split(
    question_pad, decoder_input_data, decoder_target_data, test_size=0.2)

model.fit([X_train_enc, X_train_dec],
          np.expand_dims(y_train, -1),
          validation_data=([X_val_enc, X_val_dec], np.expand_dims(y_val, -1)),
          batch_size=128, epochs=5)

Epoch 1/5
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 150ms/step - accuracy: 0.7638 - loss: 1.7229 - val_accuracy: 0.7876 - val_loss: 1.2885
Epoch 2/5
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 150ms/step - accuracy: 0.7889 - loss: 1.2611 - val_accuracy: 0.7924 - val_loss: 1.2239
Epoch 3/5
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 151ms/step - accuracy: 0.7953 - loss: 1.1838 - val_accuracy: 0.7948 - val_loss: 1.1911
Epoch 4/5
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 150ms/step - accuracy: 0.7970 - loss: 1.1464 - val_accuracy: 0.7965 - val_loss: 1.1731
Epoch 5/5
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 151ms/step - accuracy: 0.7991 - loss: 1.1108 - val_accuracy: 0.7975 - val_loss: 1.1641


<keras.src.callbacks.history.History at 0x790d23707a10>

In [14]:
# 🔁 Modèles d'inférence

# Encodeur
encoder_model = Model(encoder_inputs, encoder_states)

# Décodeur
decoder_state_input_h = Input(shape=(lstm_units*2,), name='input_h')
decoder_state_input_c = Input(shape=(lstm_units*2,), name='input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# On redéfinit ici la couche d'embedding du décodeur
decoder_inputs_inf = Input(shape=(max_len,), name='decoder_input_inf')
decoder_embedding_layer = Embedding(vocab_size, embedding_dim, name='decoder_embedding_inf')
decoder_emb_inf = decoder_embedding_layer(decoder_inputs_inf)

# Même couche LSTM et Dense
decoder_outputs_inf, state_h, state_c = decoder_lstm(
    decoder_emb_inf, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs_inf)

decoder_model = Model([decoder_inputs_inf] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states)


In [19]:
def decode_sequence(input_seq, tokenizer, max_len=40):
    # États initiaux à partir de l'encodeur
    states_value = encoder_model.predict(input_seq)

    # Séquence de démarrage pour le décodeur : un seul token (ex: <OOV> comme <start>)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index.get('<OOV>', 1)

    decoded_sentence = ''

    for _ in range(max_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word in ['', None, '<end>']:
            break

        decoded_sentence += ' ' + sampled_word

        # Mise à jour pour la prochaine prédiction
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence.strip()


In [None]:
# 💬 Tester le modèle

input_text = "how are you today?"
input_text = clean_text(input_text)

seq = tokenizer.texts_to_sequences([input_text])
padded = pad_sequences(seq, maxlen=max_len, padding='post')

response = decode_sequence(padded, tokenizer)
print("💬 Input:", input_text)
print("🤖 Bot:", response)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
🤖 Bot: 
