In [1]:
import os
import time
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
import numpy as np
from tensorflow.keras.models import load_model
import pickle
from tensorflow.keras.optimizers import Adam

In [2]:
path = 'dataset/'
conversations_file = 'movie_conversations.txt'
lines_file = 'movie_lines.txt'


# Specify the parent directory where you want to save the models
parent_dir = "model/gan_model"

# Get input from the user for the number
number = input("Enter the number: ")

# Create the folder name by concatenating "model_" with the number
folder_name = f"model_{number}"

# Create the full path for the new folder
save_dir = os.path.join(parent_dir, folder_name)

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

In [3]:
subset_size = 50

In [4]:
# Load and preprocess the dataset
def load_dataset():
    conversations = open(path+conversations_file, 'r', encoding='utf-8', errors='ignore').read().split('\n')
    lines = open(path+lines_file, 'r', encoding='utf-8', errors='ignore').read().split('\n')

    id2line = {}
    for line in lines:
        parts = line.split(' +++$+++ ')
        if len(parts) == 5:
            id2line[parts[0]] = parts[4]

    # print(id2line)

    # if subset_size is not None:
      # lines = lines(subset_size)


    conversation_pairs = []
    for conversation in conversations:
        parts = conversation.split(' +++$+++ ')
        if len(parts) == 4:
            line_ids = parts[3][1:-1].replace("'", "").replace(" ", "").split(",")
            pair = [id2line[line_id] for line_id in line_ids]
            conversation_pairs.append(pair)

    print('Totla conversation_pairs',len(conversation_pairs))
    return conversation_pairs[:subset_size]
    # return conversation_pairs

conversation_pairs = load_dataset()

Totla conversation_pairs 83097


In [5]:
len(conversation_pairs)

50

In [6]:
# Prepare input and target sequences
input_texts = []
target_texts = []

for pair in conversation_pairs:
    for i in range(len(pair) - 1):
        input_texts.append(pair[i])
        target_texts.append(pair[i + 1])

In [7]:
len(conversation_pairs)

50

In [8]:
print(input_texts[0])
print(target_texts[0])

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well, I thought we'd start with pronunciation, if that's okay with you.


In [9]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_texts + target_texts)
vocab_size = len(tokenizer.word_index) + 1

In [10]:
vocab_size

465

In [11]:
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

In [12]:
len(input_sequences[0])

22

In [13]:
# Pad sequences
max_sequence_length = max(max(len(seq) for seq in input_sequences), max(len(seq) for seq in target_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

In [14]:
len(input_sequences[0])

44

In [15]:
# Prepare the training data
encoder_input_data = input_sequences
decoder_input_data = target_sequences[:, :-1]
decoder_target_data = target_sequences[:, 1:]

In [16]:
# Define the Generator network
latent_dim = 256

generator_inputs = keras.Input(shape=(max_sequence_length-1,))
generator_embedding = keras.layers.Embedding(vocab_size, latent_dim)(generator_inputs)
generator_lstm = keras.layers.LSTM(10, return_sequences=True)(generator_embedding)
generator_outputs = keras.layers.TimeDistributed(keras.layers.Dense(vocab_size, activation='softmax'))(generator_lstm)
generator_model = keras.Model(generator_inputs, generator_outputs)


In [17]:
print(generator_model.summary())
output_shape = generator_model.output_shape
print("Generator Output Shape:", output_shape)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 43)]              0         
                                                                 
 embedding (Embedding)       (None, 43, 256)           119040    
                                                                 
 lstm (LSTM)                 (None, 43, 10)            10680     
                                                                 
 time_distributed (TimeDistr  (None, 43, 465)          5115      
 ibuted)                                                         
                                                                 
Total params: 134,835
Trainable params: 134,835
Non-trainable params: 0
_________________________________________________________________
None
Generator Output Shape: (None, 43, 465)


In [18]:
# # Define the Discriminator network
discriminator_inputs = keras.Input(shape=(max_sequence_length-1,vocab_size))
# discriminator_embedding = keras.layers.Embedding(vocab_size, latent_dim)(discriminator_inputs)
# discriminator_lstm = keras.layers.LSTM(128)(discriminator_inputs)
discriminator_lstm = keras.layers.LSTM(10)(discriminator_inputs)
# discriminator_lstm = keras.layers.LSTM(30)(discriminator_lstm)
discriminator_outputs = keras.layers.Dense(1, activation='sigmoid')(discriminator_lstm)

discriminator_model = keras.Model(discriminator_inputs, discriminator_outputs)


# inputs = keras.Input(shape=(max_sequence_length-1,))
# embedding = keras.layers.Embedding(vocab_size, latent_dim, input_length=max_sequence_length)(inputs)
# lstm = keras.layers.LSTM(10, return_sequences=True)(embedding)
# flatten = keras.layers.Flatten()(lstm)
# output = keras.layers.Dense(1, activation='sigmoid')(flatten)

# discriminator_model = keras.Model(inputs=inputs, outputs=output)



In [19]:
print(discriminator_model.summary())
input_shape = discriminator_model.input_shape
print("Discriminator Input Shape:", input_shape)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 43, 465)]         0         
                                                                 
 lstm_1 (LSTM)               (None, 10)                19040     
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 19,051
Trainable params: 19,051
Non-trainable params: 0
_________________________________________________________________
None
Discriminator Input Shape: (None, 43, 465)


In [20]:
# Define the GAN model
gan_inputs = keras.Input(shape=(max_sequence_length-1,))
generated_sequences = generator_model(gan_inputs)
gan_outputs = discriminator_model(generated_sequences)
gan_model = keras.Model(gan_inputs, gan_outputs)

In [21]:
# Compile the Discriminator model
discriminator_model.compile(loss='binary_crossentropy', optimizer='adam')

# Compile the GAN model
gan_model.compile(loss='binary_crossentropy', optimizer='adam')

In [22]:
# Training parameters
batch_size = 64
epochs = 100

In [23]:
real_data=encoder_input_data

In [24]:
# Training loop
for epoch in range(epochs):
    for batch in range(0, len(real_data), batch_size):
        real_batch = real_data[batch:batch+batch_size]
        real_batch = np.expand_dims(real_batch, axis=2)
        real_batch = np.repeat(real_batch, 465, axis=2)  # Adjust the third dimension to match fake_data
        real_labels = np.ones((len(real_batch), 1))

        # Generate fake data using the Generator model
        noise = np.random.randint(0, vocab_size, size=(len(real_batch), max_sequence_length-1))
        fake_batch = generator_model.predict(noise)
        fake_batch = fake_batch[:, :max_sequence_length-1, :]
        fake_labels = np.zeros((len(fake_batch), 1))

        # Adjust the size of real_batch
        real_batch = real_batch[:, :max_sequence_length-1, :]

        # Reshape the fake_batch to match the dimensions of real_batch
        fake_batch = np.squeeze(fake_batch)

        # Combine real and fake data
        combined_data = np.concatenate((real_batch, fake_batch[:-1, :]), axis=0)
        combined_labels = np.concatenate((real_labels, fake_labels), axis=0)

        # Shuffle the combined data and labels
        # combined_data, combined_labels = shuffle(combined_data, combined_labels)

        # Train the Discriminator
        num_samples = combined_data.shape[0]
        if batch_size > num_samples:
            indices = np.arange(num_samples)
        else:
            indices = np.random.choice(num_samples, batch_size, replace=False)
        discriminator_loss = discriminator_model.train_on_batch(combined_data[indices], combined_labels[indices])

        # Train the Generator (via GAN)
        gan_noise = np.random.randint(0, vocab_size, size=(batch_size, max_sequence_length-1))
        gan_labels = np.ones((batch_size, 1))
        gan_loss = gan_model.train_on_batch(gan_noise, gan_labels)

    # Print the loss for each epoch
    print(f"Epoch {epoch+1}: Discriminator Loss={discriminator_loss}, GAN Loss={gan_loss}")
    # Save the Generator model
    # Define the optimizer and loss function
    optimizer = Adam()
    loss_function = "binary_crossentropy"

    # Compile the Generator model
    generator_model.compile(optimizer=optimizer, loss=loss_function)

    # Save the compiled Generator model
    generator_model.save(os.path.join(save_dir, "generator_model_compiled.h5"))

    tokenizer_data = {
        'tokenizer': tokenizer,
        'max_sequence_length': max_sequence_length
    }

    # Save the Tokenizer
    with open(os.path.join(save_dir, "tokenizer.pkl"), "wb") as tokenizer_file:
        pickle.dump(tokenizer_data, tokenizer_file)

    
    # Update TensorBoard metrics after each epoch
    # tensorboard_callback.on_epoch_end(epoch, logs={'discriminator_loss': discriminator_loss, 'gan_loss': gan_loss})



Epoch 1: Discriminator Loss=0.6912660598754883, GAN Loss=0.6956967711448669
Epoch 2: Discriminator Loss=0.6896840333938599, GAN Loss=0.6937199234962463
Epoch 3: Discriminator Loss=0.689294159412384, GAN Loss=0.688646674156189
Epoch 4: Discriminator Loss=0.6889013051986694, GAN Loss=0.6839730739593506
Epoch 5: Discriminator Loss=0.6883232593536377, GAN Loss=0.6783967018127441
Epoch 6: Discriminator Loss=0.6873670220375061, GAN Loss=0.6734932661056519
Epoch 7: Discriminator Loss=0.6863397359848022, GAN Loss=0.6680924892425537
Epoch 8: Discriminator Loss=0.6842513084411621, GAN Loss=0.6618421077728271
Epoch 9: Discriminator Loss=0.6816513538360596, GAN Loss=0.6549659967422485
Epoch 10: Discriminator Loss=0.6786863207817078, GAN Loss=0.6486729383468628
Epoch 11: Discriminator Loss=0.6749143004417419, GAN Loss=0.6428980827331543
Epoch 12: Discriminator Loss=0.666480302810669, GAN Loss=0.6362019181251526


KeyboardInterrupt: 

In [None]:
# # Start the TensorBoard server
# tensorboard.program.TensorBoard(
#     logdir=log_dir,
#     host="localhost",
#     port=6006,
#     reload_interval=5  # Refresh the TensorBoard server every 5 seconds
# ).main()

In [None]:
def preprocess_input(user_input):
    # Tokenize the user input
    tokens = user_input.strip().split()

    # Convert tokens to lowercase
    tokens = [token.lower() for token in tokens]

    # Perform any additional preprocessing steps
    # ...

    # Return the preprocessed input
    return tokens


def generate_response(input_tokens, generator_model):
    # Convert input tokens to numerical representation
    input_sequence = tokenizer.texts_to_sequences([input_tokens])
    input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length-1)
    # Generate response using the generator model
    generated_sequence = generator_model.predict(input_sequence)
    print('generated_sequence generated_sequence:',generated_sequence)

    # Convert numerical representation back to tokens
    # generated_tokens = tokenizer.sequences_to_texts(generated_sequence)[0].split()

    generated_tokens = [tokenizer.index_word.get(index, "") for index in np.argmax(generated_sequence, axis=-1)[0]]
    generated_tokens = [token for token in generated_tokens if token]

    print('generated_sequence function:',generated_tokens)

    # Return the generated response tokens
    return generated_tokens


def postprocess_response(response_tokens):
    # Convert tokens to string
    response_text = ' '.join(response_tokens)

    # Perform any postprocessing steps, such as capitalization or punctuation handling
    # ...

    # Return the postprocessed response
    return response_text


In [None]:
while True:
    user_input = input("User: ")
    if user_input.lower() == "exit":
        break

    # Preprocess user input
    processed_input = preprocess_input(user_input)

    # Generate response using the generator model
    generated_response = generate_response(processed_input, generator_model)
    print('generated response',generated_response)

    # Postprocess and display the generated response
    response = postprocess_response(generated_response)
    print("Chatbot: " + response)
