<a href="https://colab.research.google.com/github/fatazeouedraogo/Deep-Learning-for-Healthcare-Brain-Tumor-Classification-using-VGG16/blob/main/Chatbot%20with%20Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Chatbot with Seq2Seq or Transformers**

In [None]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate
import os

# 1. Load the Cornell Movie Dialogs data
print("Chargement des données Cornell Movie Dialogs...")

def load_cornell_data(base_path="cornell movie-dialogs corpus", max_pairs=10000):
    """Load (utterance, reply) pairs from the Cornell Movie Dialogs corpus.

    Reads ``movie_lines.txt`` and ``movie_conversations.txt`` from *base_path*.
    Both files use ``" +++$+++ "`` as field separator. Every two consecutive
    utterances of a conversation whose ids are both present in the lines file
    become one ``(utterance, reply)`` pair.

    Args:
        base_path: Directory containing the two corpus files.
        max_pairs: Maximum number of pairs to return; ``None`` returns all.

    Returns:
        A list of ``(input_text, reply_text)`` string tuples, in file order.

    Raises:
        OSError: If either corpus file cannot be opened.
    """
    # Local import so the block does not depend on a new file-level import.
    import ast

    separator = " +++$+++ "

    # Map each line id (first field) to its utterance text (fifth field).
    lines = {}
    with open(os.path.join(base_path, "movie_lines.txt"), "r",
              encoding="iso-8859-1", errors="ignore") as f:
        for line in f:
            parts = line.split(separator)
            if len(parts) >= 5:
                lines[parts[0]] = parts[4].strip()

    # Build the pairs from the list of line ids stored in the fourth field.
    conversations = []
    with open(os.path.join(base_path, "movie_conversations.txt"), "r",
              encoding="iso-8859-1", errors="ignore") as f:
        for line in f:
            # Stop reading as soon as enough pairs have been collected.
            if max_pairs is not None and len(conversations) >= max_pairs:
                break

            parts = line.split(separator)
            if len(parts) < 4:
                continue

            # SECURITY: the field is text from a data file. literal_eval only
            # accepts Python literals, whereas eval() would execute any
            # expression found in the file.
            try:
                conv = ast.literal_eval(parts[3].strip())
            except (ValueError, SyntaxError, TypeError):
                continue  # malformed record: skip it, keep loading the rest

            # Only a list of string ids is a valid conversation record.
            if not isinstance(conv, list) or not all(isinstance(u, str) for u in conv):
                continue

            for utterance_id, reply_id in zip(conv, conv[1:]):
                if utterance_id in lines and reply_id in lines:
                    conversations.append((lines[utterance_id], lines[reply_id]))

    if max_pairs is None:
        return conversations
    return conversations[:max_pairs]

# Load the data
conversations = load_cornell_data()
print(f"Nombre de paires de conversation chargées: {len(conversations)}")

# 2. Prétraitement du texte
def preprocess_text(text):
    """Normalize an utterance for tokenization.

    Lowercases the text, expands a fixed set of English contractions,
    removes a fixed set of punctuation characters and collapses whitespace.

    Args:
        text: The raw utterance. Any non-string value is treated as empty.

    Returns:
        The cleaned string, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Applied strictly in this order: the specific forms ("won't", "can't")
    # must be rewritten before the generic "n't" rule gets to them.
    rewrites = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        # Punctuation removal
        (r"[-()\"#/@;:<>{}+=~|.?,]", ""),
        # Collapse every whitespace run into a single space
        (r"\s+", " "),
    )

    cleaned = text.lower()
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# 3. Data preparation
print("Prétraitement des données...")

input_texts = []
target_texts = []

for i, (input_text, target_text) in enumerate(conversations):
    # Progress report every 1000 pairs
    if i % 1000 == 0:
        print(f"Traitement de la paire {i}/{len(conversations)}")

    input_clean = preprocess_text(input_text)
    target_clean = preprocess_text(target_text)

    # Drop pairs where either side is too short or too long:
    # both sides must have between 2 and 19 words.
    input_word_count = len(input_clean.split())
    target_word_count = len(target_clean.split())
    if not (1 < input_word_count < 20 and 1 < target_word_count < 20):
        continue

    input_texts.append(input_clean)
    # Replies are wrapped in start/end markers for the decoder.
    target_texts.append(f"<start> {target_clean} <end>")

print(f"Paires valides après filtrage: {len(input_texts)}")

# 4. Tokenization
# A single tokenizer is fitted on inputs and targets together, so both sides
# share one vocabulary.
all_texts = input_texts + target_texts
tokenizer = Tokenizer(filters='', oov_token='<OOV>')
tokenizer.fit_on_texts(all_texts)

word_index = tokenizer.word_index
# NOTE(review): the +1 presumably accounts for index 0, which no word uses — confirm.
vocab_size = 1 + len(word_index)
print(f"Taille du vocabulaire: {vocab_size}")

# Convert the texts to integer sequences
input_sequences, target_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (input_texts, target_texts)
)

# 5. Padding
# Fallback lengths (15 / 20) apply only when there are no sequences at all.
max_input_length = max(map(len, input_sequences), default=15)
max_target_length = max(map(len, target_sequences), default=20)
max_seq_length = max(max_input_length, max_target_length)

print(f"Longueur maximale des séquences: {max_seq_length}")

# Inputs and targets are both padded at the end, to the same common length.
input_data, target_data = (
    pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    for sequences in (input_sequences, target_sequences)
)

# 6. Model construction
# Encoder-decoder LSTM with an attention layer over the encoder outputs.
print("Construction du modèle...")

embedding_dim = 128
latent_dim = 256

# Encoder: embeds the input tokens and runs an LSTM that returns both the
# per-timestep outputs (consumed by the attention layer below) and its final
# hidden/cell states (used to initialise the decoder).
# NOTE(review): neither Embedding sets mask_zero, so zero-padded positions
# presumably take part in attention, loss and accuracy — confirm.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: has its own embedding layer; its LSTM starts from the encoder's
# final states. The decoder's own returned states are discarded here.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Attention: decoder outputs are passed first and encoder outputs second.
# The result is concatenated with the decoder outputs before the softmax
# projection onto the vocabulary.
attention = Attention()([decoder_outputs, encoder_outputs])
decoder_combined = Concatenate()([decoder_outputs, attention])
decoder_dense = Dense(vocab_size, activation='softmax')
# decoder_outputs is rebound: from here on it is the per-step word distribution.
decoder_outputs = decoder_dense(decoder_combined)

# Model: takes [encoder tokens, decoder tokens] and outputs one distribution
# over the vocabulary per decoder timestep.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# 7. Training data preparation
# The decoder input drops the last position and the target drops the first,
# so at every step the target is the token that follows the decoder input.
decoder_input_data, decoder_target_data = target_data[:, :-1], target_data[:, 1:]

# 8. Training
print("Début de l'entraînement...")

training_inputs = [input_data, decoder_input_data]
# Trailing axis of size 1 added to the integer targets (same as expand_dims(-1)).
training_targets = decoder_target_data[..., np.newaxis]

history = model.fit(
    training_inputs,
    training_targets,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
    verbose=1,
)

# 9. Fonction de chat
def chat(input_text):
    """Generate a reply to *input_text* by greedy decoding with the trained model.

    The message is cleaned, tokenized and padded the same way as the training
    inputs. The reply is then built one word at a time, always taking the most
    probable next token, until ``<end>`` (or an index with no word, such as 0)
    is produced or 20 words have been generated.

    Args:
        input_text: The raw user message.

    Returns:
        The generated reply as a space-separated string (possibly empty).
    """
    # Preprocessing: same pipeline as the training inputs
    input_clean = preprocess_text(input_text)
    input_seq = tokenizer.texts_to_sequences([input_clean])
    input_seq = pad_sequences(input_seq, maxlen=max_seq_length, padding='post')

    # Reverse vocabulary, built once per call instead of scanning word_index
    # for every generated token.
    index_to_word = {idx: word for word, idx in word_index.items()}

    # Token ids decoded so far, seeded with the start marker.
    decoded_ids = [word_index.get('<start>', 1)]

    response = []
    max_response_length = 20

    while len(response) < max_response_length:
        # The model keeps no state between predict() calls: the decoder LSTM
        # restarts from the encoder states every time. The whole decoded
        # prefix must therefore be fed back, not just the last token,
        # otherwise each step forgets every previously generated word.
        target_seq = np.array([decoded_ids])
        output_tokens = model.predict([input_seq, target_seq], verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Index 0 has no entry in the vocabulary, so it maps to None and stops.
        sampled_word = index_to_word.get(sampled_token_index)
        if sampled_word is None or sampled_word == '<end>':
            break

        response.append(sampled_word)
        decoded_ids.append(sampled_token_index)

    return ' '.join(response)

# 10. Chat interface
print("\n" + "="*50)
print("CHATBOT CORNELL MOVIE DIALOGS")
print("="*50)
print("Tapez 'quit' pour quitter")

while True:
    try:
        user_input = input("\nVous: ")
    except (EOFError, KeyboardInterrupt):
        # A closed stdin or an interrupted prompt ends the chat instead of
        # aborting the script, so the trained model below is still saved.
        break

    # Surrounding whitespace is ignored when checking for the quit command.
    if user_input.strip().lower() == 'quit':
        break

    response = chat(user_input)
    print(f"Bot: {response}")

# 11. Save the model
model.save('cornell_chatbot.h5')
print("Modèle sauvegardé sous 'cornell_chatbot.h5'")

Chargement des données Cornell Movie Dialogs...
Nombre de paires de conversation chargées: 10000
Prétraitement des données...
Traitement de la paire 0/10000
Traitement de la paire 1000/10000
Traitement de la paire 2000/10000
Traitement de la paire 3000/10000
Traitement de la paire 4000/10000
Traitement de la paire 5000/10000
Traitement de la paire 6000/10000
Traitement de la paire 7000/10000
Traitement de la paire 8000/10000
Traitement de la paire 9000/10000
Paires valides après filtrage: 6011
Taille du vocabulaire: 7301
Longueur maximale des séquences: 21
Construction du modèle...


Début de l'entraînement...
Epoch 1/20
151/151 ━━━━━━━━━━━━━━━━━━━━ 235s 2s/step - accuracy: 0.5505 - loss: 4.2134 - val_accuracy: 0.6009 - val_loss: 2.8011
Epoch 2/20
151/151 ━━━━━━━━━━━━━━━━━━━━ 257s 1s/step - accuracy: 0.6185 - loss: 2.5453 - val_accuracy: 0.6057 - val_loss: 2.7627
Epoch 3/20
151/151 ━━━━━━━━━━━━━━━━━━━━ 237s 2s/step - accuracy: 0.6260 - loss: 2.4266 - val_accuracy: 0.6100 - val_loss: 2.7045
Epoch 4/20
151/151 ━━━━━━━━━━━━━━━━━━━━ 250s 1s/step - accuracy: 0.6293 - loss: 2.3280 - val_accuracy: 0.6146 - val_loss: 2.6785
Epoch 5/20
151/151 ━━━━━━━━━━━━━━━━━━━━ 260s 1s/step - accuracy: 0.6298 - loss: 2.2754 - val_accuracy: 0.6199 - val_loss: 2.6510
Epoch 6/20
151/151 ━━━━━━━━━━━━━━━━━━━━ 266s 2s/step - accuracy: 0.6417 - loss: 2.1498 - val_accuracy: 0.6237 - val_loss: 2.

# Preprocessing and Tokenization