## Projet BiLSTM

Liste des imports

In [28]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re, ast
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from sklearn.model_selection import train_test_split

In [29]:
##Si l'archive n'est pas présente, on la télécharge et on la dézippe
if not os.path.exists("cornell_movie_dialogs_corpus.zip"):
  print("Downloading data (234 MB)")
  !wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
  !unzip cornell_movie_dialogs_corpus.zip




In [30]:
import tensorflow as tf
print("GPU disponible :", tf.config.list_physical_devices('GPU'))


GPU disponible : [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [31]:
# Your existing data loading code with improvements
def load_cornell_data(movie_lines_path, movie_conversations_path):
    # Load movie lines
    with open(movie_lines_path, encoding='iso-8859-1') as file:
        lines = file.read().split("\n")

    # Dictionary: line_id -> text
    id2line = {}
    for line in lines:
        parts = line.split(" +++$+++ ")
        if len(parts) == 5:
            id2line[parts[0]] = parts[4]

    print(id2line['L194'])

    # Load conversations
    with open(movie_conversations_path, encoding='iso-8859-1') as file:
        conversations = file.read().split("\n")



    # Create question-answer pairs
    questions = []
    answers = []

    for conv in conversations:
        parts = conv.split(" +++$+++ ")
        if len(parts) == 4:
            try:
                utterance_ids = eval(parts[3])  # Safer than eval
                for i in range(len(utterance_ids) - 1):
                    if utterance_ids[i] in id2line and utterance_ids[i+1] in id2line:
                        questions.append(id2line[utterance_ids[i]])
                        answers.append(id2line[utterance_ids[i+1]])
            except:
                print("Error in conversation:", conv)
                continue

    return questions, answers

# Paths to your data files
movie_lines_path = "cornell movie-dialogs corpus/movie_lines.txt"
movie_conversations_path = "cornell movie-dialogs corpus/movie_conversations.txt"

# Load and verify data
questions, answers = load_cornell_data(movie_lines_path, movie_conversations_path)
print(f"Loaded {len(questions)} question-answer pairs")
print("Sample pair:")
print("Q:", questions[0])
print("A:", answers[0])

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Loaded 221616 question-answer pairs
Sample pair:
Q: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
A: Well, I thought we'd start with pronunciation, if that's okay with you.


## Nettoyage du texte

In [32]:
def clean_text(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Replace contractions
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    # Remove special characters
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean all questions and answers
clean_questions = [clean_text(q) for q in questions]
clean_answers = [clean_text(a) for a in answers]

# Filter out empty pairs
filtered_questions = []
filtered_answers = []
for q, a in zip(clean_questions, clean_answers):
    if q and a:  # Only keep non-empty pairs
        filtered_questions.append(q)
        filtered_answers.append(a)

print(f"After cleaning, kept {len(filtered_questions)} pairs")

After cleaning, kept 221277 pairs


## Tokenisation

In [33]:
vocab_size = 10000
max_len = 40

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(questions + answers)

question_seq = tokenizer.texts_to_sequences(questions)
answer_seq = tokenizer.texts_to_sequences(answers)

question_pad = pad_sequences(question_seq, maxlen=max_len, padding='post')
answer_pad = pad_sequences(answer_seq, maxlen=max_len, padding='post')


## Entraînement du modèle

In [34]:
model = Sequential([
    Embedding(vocab_size, 256, input_length=max_len),
    Bidirectional(LSTM(256, return_sequences=True)),
    Bidirectional(LSTM(256)),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [35]:
X_train, X_val, y_train, y_val = train_test_split(question_pad, answer_pad, test_size=0.2)

# On prédit le premier mot de la réponse comme approximation simple
model.fit(X_train, np.expand_dims(y_train[:, 0], -1),
          validation_data=(X_val, np.expand_dims(y_val[:, 0], -1)),
          batch_size=128, epochs=10)


Epoch 1/10
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 50ms/step - accuracy: 0.0937 - loss: 5.5781 - val_accuracy: 0.0972 - val_loss: 5.3167
Epoch 2/10
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 46ms/step - accuracy: 0.0987 - loss: 5.2435 - val_accuracy: 0.0988 - val_loss: 5.3078
Epoch 3/10
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 49ms/step - accuracy: 0.0984 - loss: 5.1751 - val_accuracy: 0.0992 - val_loss: 5.3176
Epoch 4/10
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 50ms/step - accuracy: 0.1022 - loss: 5.0994 - val_accuracy: 0.0960 - val_loss: 5.3487
Epoch 5/10
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 46ms/step - accuracy: 0.1039 - loss: 5.0134 - val_accuracy: 0.0976 - val_loss: 5.3878
Epoch 6/10
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 50ms/step - accuracy: 0.1049 - loss: 4.9279 - val_accuracy: 0.0972 - val_loss: 5.4498
Epoc

<keras.src.callbacks.history.History at 0x79887e3f63d0>

In [36]:
## On évalue le modèle sur les données de validation
loss, accuracy = model.evaluate(X_val, np.expand_dims(y_val[:, 0], -1))
print(f"Validation Loss: {loss}")
print(f"Validation Accuracy: {accuracy}")

# On évaluera aussi la perplexité du modèle
y_pred = model.predict(X_val)
perplexity = np.exp(np.mean(np.log(y_pred) - np.log(y_val[:, 0])))
print(f"Validation Perplexity: {perplexity}")
# On sauvegarde le modèle
model.save("movie_dialog_model.h5")

[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step - accuracy: 0.0898 - loss: 5.8726
Validation Loss: 5.821889877319336
Validation Accuracy: 0.08990614861249924
[1m1386/1386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step


  perplexity = np.exp(np.mean(np.log(y_pred) - np.log(y_val[:, 0])))


ValueError: operands could not be broadcast together with shapes (44324,10000) (44324,) 

In [None]:
def generate_reply(input_text):
    input_text = clean_text(input_text)
    seq = tokenizer.texts_to_sequences([input_text])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')
    prediction = model.predict(padded)
    word_index = np.argmax(prediction[0])

    for word, index in tokenizer.word_index.items():
        if index == word_index:
            return word
    return "..."

# Exemple :
generate_reply("Are you ready for the adventure?")
