In [None]:
import tensorflow as tf
# Show which GPU devices TensorFlow can see
print("GPU disponible :", tf.config.list_physical_devices(device_type='GPU'))

In [None]:
# Téléchargement et extraction du jeu de données Cornell Movie Dialogs si nécessaire
import os
if(not os.path.exists("cornell movie-dialogs corpus")):
    !wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
    !unzip -o cornell_movie_dialogs_corpus.zip
    !ls "cornell movie-dialogs corpus"
else:
    print("Le jeu de données est déjà téléchargé.")

In [None]:
import os
import re

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Quick text normalisation
def clean_text(text):
    """Lower-case `text` and reduce it to a single-spaced string.

    Every character other than an ASCII letter, a digit, '.', '!', '?' or an
    apostrophe is turned into a space, then runs of whitespace are collapsed
    and leading/trailing whitespace is removed.

    Args:
        text: the raw string to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    lowered = text.lower().strip()
    words = re.sub(r"[^a-zA-Z0-9.!?']", " ", lowered).split()
    return " ".join(words)

# Load the corpus and build (prompt, reply) pairs (simplified example)
# Each record of movie_lines.txt holds five fields separated by " +++$+++ ":
# line id, character id, movie id, character name, utterance text.
FIELD_SEP = " +++$+++ "
CORPUS_DIR = "cornell movie-dialogs corpus"

# The archive extracts into CORPUS_DIR; fall back to the working directory,
# which is where this cell historically looked for the file.
lines_path = os.path.join(CORPUS_DIR, "movie_lines.txt")
if not os.path.exists(lines_path):
    lines_path = "movie_lines.txt"

# `with` guarantees the file handle is closed once the content is read.
with open(lines_path, encoding='utf-8', errors='ignore') as corpus_file:
    lines = corpus_file.read().split('\n')

# Keep only the utterance text. Blank or malformed records become "" instead
# of being dropped, so the two-by-two grouping of the file is not shifted.
utterances = []
for record in lines:
    fields = record.split(FIELD_SEP, 4)
    utterances.append(fields[4] if len(fields) == 5 else "")

# The file lists utterances newest-first (e.g. L1045 is followed by L1044),
# so inside each adjacent couple the SECOND line is the prompt and the FIRST
# one is the reply. Couples are taken blindly two by two, hence some of them
# straddle two different conversations (accepted limitation of this example).
pairs = [(utterances[i + 1], utterances[i]) for i in range(0, len(utterances) - 1, 2)]

# Clean the pairs, skip those with an empty side, and stop at MAX_PAIRS.
MAX_PAIRS = 50000
cleaned_pairs = []
for raw_prompt, raw_reply in pairs:
    prompt, reply = clean_text(raw_prompt), clean_text(raw_reply)
    if prompt and reply:
        cleaned_pairs.append((prompt, reply))
    if len(cleaned_pairs) == MAX_PAIRS:
        break
if not cleaned_pairs:
    raise ValueError(f"No usable dialogue pair found in {lines_path!r}")

# Explicit start/end markers let a decoder learn where a reply begins and
# ends. Plain alphabetic markers are used because the Tokenizer's default
# filters strip characters such as '<' and '>'.
START_TOKEN, END_TOKEN = "startseq", "endseq"
questions = [q for q, _ in cleaned_pairs]
answers = [f"{START_TOKEN} {a} {END_TOKEN}" for _, a in cleaned_pairs]

# Tokenisation
NUM_WORDS = 8000
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(questions + answers)
# texts_to_sequences never emits an id >= num_words (rarer words map to the
# OOV id), so the model vocabulary is capped by NUM_WORDS rather than by the
# full size of word_index.
VOCAB_SIZE = min(NUM_WORDS, len(tokenizer.word_index) + 1)

# Sequences
MAX_LEN = 15  # kept short for speed
# Prompts keep the default left padding / left truncation.
input_seqs = pad_sequences(tokenizer.texts_to_sequences(questions), maxlen=MAX_LEN)
# Replies are right-padded and right-truncated so that the start marker is
# always at position 0 (an over-long reply loses its tail and end marker).
target_seqs = pad_sequences(
    tokenizer.texts_to_sequences(answers),
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Encoder: embed the prompt ids and keep only the final LSTM states
encoder_inputs = Input(shape=(MAX_LEN,))
encoder_embed = Embedding(input_dim=VOCAB_SIZE, output_dim=128)(encoder_inputs)
encoder_lstm = LSTM(units=128, return_state=True)  # unidirectional, for speed
_, state_h, state_c = encoder_lstm(encoder_embed)
encoder_states = [state_h, state_c]

# Decoder: variable-length input, initialised with the encoder states
decoder_inputs = Input(shape=(None,))
decoder_embed = Embedding(input_dim=VOCAB_SIZE, output_dim=128)(decoder_inputs)
decoder_lstm = LSTM(units=128, return_sequences=True)
decoder_outputs = decoder_lstm(decoder_embed, initial_state=encoder_states)
# One softmax over the vocabulary per decoder time step
outputs = Dense(units=VOCAB_SIZE, activation='softmax')(decoder_outputs)

# Targets are integer ids, hence the sparse variant of the cross-entropy
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
# Data preparation: the decoder input drops the last target token and the
# expected output drops the first one, so position t is trained to predict
# the token found at position t + 1.
decoder_input_data = target_seqs[:, :-1]
decoder_output_data = target_seqs[:, 1:]

# Short training run
model.fit(
    x=[input_seqs, decoder_input_data],
    y=decoder_output_data[..., np.newaxis],  # trailing axis of size 1, as before
    epochs=10,
    batch_size=256,  # large batch, sized for a T4
    validation_split=0.1,
)

In [None]:
def generate_response(text, max_len=15):
    """Greedily decode a reply to `text` with the trained seq2seq `model`.

    The prompt is cleaned, tokenised and padded like the training prompts.
    The reply is then produced one token at a time: at every step the full
    model is run on the prompt plus the reply prefix generated so far, and
    the most probable next token is appended to that prefix.

    The full model is used (rather than individual layers picked by position
    in `model.layers`) because Keras layers have no `predict` method and the
    decoder LSTM of the training graph does not return its states, so they
    cannot be carried from one step to the next.

    Relies on the notebook globals `model`, `tokenizer`, `MAX_LEN`,
    `clean_text`, `pad_sequences` and `np`.

    Args:
        text: the user prompt.
        max_len: maximum number of decoding steps.

    Returns:
        The generated words joined by single spaces (possibly an empty string).
    """
    encoder_input = pad_sequences(
        tokenizer.texts_to_sequences([clean_text(text)]), maxlen=MAX_LEN
    )

    vocab = tokenizer.word_index
    # Use dedicated start/end markers when the vocabulary has them; otherwise
    # start from the '<OOV>' id and decode for exactly `max_len` steps.
    start_id = vocab.get('startseq', vocab['<OOV>'])
    end_id = vocab.get('endseq')  # None when there is no end marker

    prefix_ids = [start_id]
    response = []
    for _ in range(max_len):
        decoder_input = np.array([prefix_ids], dtype="int32")
        # Direct call (not .predict) so the growing prefix length does not
        # go through predict's per-call batching machinery at every step.
        probs = np.asarray(model([encoder_input, decoder_input], training=False))
        next_token = int(np.argmax(probs[0, -1, :]))
        if end_id is not None and next_token == end_id:
            break
        word = tokenizer.index_word.get(next_token, '')
        if word:  # id 0 (padding) and unknown ids have no word: emit nothing
            response.append(word)
        prefix_ids.append(next_token)

    return ' '.join(response)

# Smoke test: generate a reply to a sample prompt
sample_prompt = "Hello how are you?"
print(generate_response(sample_prompt))