In [None]:
import numpy as np
import tensorflow as tf
import pickle 
from tensorflow.keras import layers, activations, models, preprocessing

# Load Data

In [None]:
with open("dialogs.txt", "r") as f:
    conversations = []
    for line in f:
        conversations.append(line.split("\t"))

In [None]:
questions = [text[0] for text in conversations]
answers = [text[1] for text in conversations]

In [None]:
print(len(questions))
print(questions[0])
print(f"Answer: {answers[0]}")

3725
hi, how are you doing?
Answer: i'm fine. how about yourself?



# Preprocess

In [None]:
import string

In [None]:
questions = ["<sos> " + ques.lower().translate(str.maketrans("", "", string.punctuation)) + " <eos>" for ques in questions]
answers = ["<sos> " + ans.lower().translate(str.maketrans("", "", string.punctuation)) + " <eos>" for ans in answers]

In [None]:
questions[1]

'<sos> im fine how about yourself <eos>'

**Tokenizer**

In [None]:
tokenizer = preprocessing.text.Tokenizer(oov_token="<oov>")
tokenizer.fit_on_texts(questions + answers)
VOCAB_SIZE = len(tokenizer.word_index) + 1
print(f"Vocab size: {VOCAB_SIZE}")

Vocab size: 2528


In [None]:
tokenizer.word_index["sos"]

2

In [None]:
from gensim.models import Word2Vec
import re

In [None]:
vocab = []
for word in tokenizer.word_index:
    vocab.append(word)

In [None]:
w2i = {}
i2w = {}
for w, i in tokenizer.word_index.items():
    w2i[w] = i
    i2w[i] = w

In [None]:
def tokenize(sentences):
    token_list = []
    vocabulary = []
    for sentence in sentences: 
        sentence = sentence.lower()
        sentence = re.sub("[^a-zA-Z<>]", "", sentence)
        tokens = sentence.split()
        vocabulary += tokens
        token_list.append(tokens)
    return token_list, vocabulary

In [None]:
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max([len(x) for x in tokenized_questions])
padded_questions = tf.keras.utils.pad_sequences(tokenized_questions, maxlen=maxlen_questions, padding="post")
encoder_input_data = np.array(padded_questions)

**Tokenize for answer**

Decoder Input là input sẽ feed vào phần decoder của model

Input này được tạo ra từ các câu trả lời trong Dataset, nó sẽ được tokenize, padding về cùng 1 độ dài

In [None]:
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max([len(x) for x in tokenized_answers])
padded_answers = tf.keras.utils.pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding="post")
decoder_input_data = np.array(padded_answers)

In [None]:
print(f"input data shape: {decoder_input_data.shape}, maxlen: {maxlen_answers}")

input data shape: (3725, 21), maxlen: 21


**Decoder output data**

Decoder Output Data cũng tương tự như trên, nhưng dữ liệu sẽ bị cắt bỏ token đầu tiên, các token còn lại chuyển thành dạng One hot vector, để model có thể dự đoán bằng hàm softmax

In [None]:
tokenized_answers = tokenizer.texts_to_sequences(answers)
for i in range(len(tokenized_answers)):
    tokenized_answers[i] = tokenized_answers[i][1:]

padded_answers = tf.keras.utils.pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding="post")
onehot_answers = tf.keras.utils.to_categorical(padded_answers, VOCAB_SIZE)
decoder_output_data = np.array(onehot_answers)
print(decoder_output_data.shape)

(3725, 21, 2528)


**Get pretrained word embedding**

Ở đây, ta sử dụng pretrained word embedding *Glove6B 200 dimensions*

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# get pretrained word embedding glove 
embedding_index = {}
with open("/content/drive/MyDrive/DEEP LEARNING/glove.6B.200d.txt", "r") as f:
    for line in f:
        word, coef = line.split(maxsplit=1)
        coefs = np.fromstring(coef, "f", sep=" ")
        embedding_index[word] = coefs

In [None]:
print(len(embedding_index))
embedding_of_hello = embedding_index["hello"]
print(embedding_of_hello.shape)

400000
(200,)


Tạo embedding matrix để có thể đưa vào lớp Embedding của model. 
Embedding matrix có số dòng bằng với số từ vựng, và mỗi dòng là một vector embedding của từ đó

In [None]:
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, 200))

for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Base Model

In [None]:
encoder_input = tf.keras.layers.Input(shape=(maxlen_questions, ))
decoder_input = tf.keras.layers.Input(shape=(maxlen_answers, ))

In [None]:
embedding_layer = tf.keras.layers.Embedding(input_dim = VOCAB_SIZE,
                            output_dim = 200, 
                            input_length = maxlen_questions,
                            weights=[embedding_matrix])

In [None]:
embedding_encoder = embedding_layer(encoder_input)
embedding_decoder =  embedding_layer(decoder_input)

In [None]:
HIDDEN_DIM = 200

# HIDDEN_DIM = 20
encoder_LSTM = tf.keras.layers.LSTM(HIDDEN_DIM, return_state=True)    
encoder_outputs, state_h, state_c = encoder_LSTM(embedding_encoder)
decoder_LSTM = tf.keras.layers.LSTM(HIDDEN_DIM, return_state=True, return_sequences=True)   
decoder_outputs, _, _ = decoder_LSTM(embedding_decoder, initial_state=[state_h, state_c])
outputs = tf.keras.layers.Dense(VOCAB_SIZE, activation="softmax")(decoder_outputs)

In [None]:
model_lstm = tf.keras.models.Model([encoder_input, decoder_input], outputs)

In [None]:
model_lstm.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=["accuracy"])

In [None]:
model_lstm.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 21)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 21)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 21, 200)      505600      ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 lstm_6 (LSTM)                  [(None, 200),        320800      ['embedding_2[0][0]']      

In [None]:
decoder_output_data.shape

(3725, 21, 2528)

In [None]:
model_lstm.fit(
    [encoder_input_data, decoder_input_data], decoder_output_data,
    batch_size=32,
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f31b00a6220>

In [None]:
def generate_respond(text):
    tokenized_questions = tokenizer.texts_to_sequences(["sos " + text + " eos"])
    padded_questions = tf.keras.utils.pad_sequences(tokenized_questions, maxlen=maxlen_questions, padding="post")
    inputs = np.array(padded_questions)

    result = ''
    dec_input_ids = [[w2i["sos"]]]
    
    for t in range(maxlen_answers):
        dec_input = tf.expand_dims(dec_input_ids, axis=1)
        predictions =  model_lstm.predict([inputs, dec_input]) # decoder(dec_input, enc_out, dec_hidden)
        predicted_id = tf.argmax(predictions[0][0]).numpy()
        if i2w[predicted_id] == "eos":
            break
        result += i2w[predicted_id] + ' '
        dec_input_ids = [predicted_id]
        
    return result

In [None]:
generate_respond("How are you")

TypeError: ignored

# Model

In [None]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, dot, concatenate

**Encoder**

In [None]:
# prepare data for this
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoder_input_data, padded_answers)

In [None]:
BATCH_SIZE = 32
BUFFER_SIZE = X_train.shape[0]
N_BATCH = BUFFER_SIZE//BATCH_SIZE
hidden_unit = 200
embedding_size = 200

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
dataset = dataset.batch(BATCH_SIZE)

In [None]:
class Encode(tf.keras.Model):
    def __init__(self, embedding_size, vocab_size, hidden_units):
        super(Encode, self).__init__()
        self.Embedding = tf.keras.layers.Embedding(vocab_size,embedding_size, weights=[embedding_matrix])
        self.GRU = tf.keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform')
        self.hidden_units = hidden_units
        
    def call(self, x, hidden_state):
        x = self.Embedding(x)
        outputs, last_state = self.GRU(x, hidden_state)
        return outputs, last_state
    
    def init_hidden_state(self, batch_size):
        return tf.zeros([batch_size, self.hidden_units])


In [None]:
class Attention(tf.keras.Model):
    def __init__(self, hidden_units):
        super(Attention, self).__init__()
        self.W_out_encode = tf.keras.layers.Dense(hidden_units)
        self.W_state = tf.keras.layers.Dense(hidden_units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, encode_outs, pre_state):
        pre_state = tf.expand_dims(pre_state, axis=1)
        pre_state = self.W_state(pre_state)
        encode_outs = self.W_out_encode(encode_outs)
        score = self.V(
            tf.nn.tanh(
                pre_state + encode_outs)
        )
        score = tf.nn.softmax(score, axis=1)
        context_vector = score*encode_outs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, score


In [None]:
class Decode(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_units):
        super(Decode, self).__init__()
        self.hidden_units = hidden_units
        self.Embedding = tf.keras.layers.Embedding(vocab_size,embedding_size, weights=[embedding_matrix])
        self.Attention = Attention(hidden_units)
        self.GRU = tf.keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.Fc = tf.keras.layers.Dense(vocab_size)
            
    def call(self, x, encode_outs, pre_state):
        x = tf.expand_dims(x, axis=1)
        x = self.Embedding(x)
        context_vector, attention_weight = self.Attention(encode_outs, pre_state)
        context_vector = tf.expand_dims(context_vector, axis=1)
        gru_inp = tf.concat([x, context_vector], axis=-1)
        out_gru, state = self.GRU(gru_inp)
        out_gru = tf.reshape(out_gru, (-1, out_gru.shape[2]))
        return self.Fc(out_gru), state


In [None]:
def loss_function(real, pred):
    mask = 1 - np.equal(real, 0)
    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
    return tf.reduce_mean(loss_)

In [None]:
    
EPOCHS = 20
optimizer = tf.optimizers.Adam()
encoder = Encode(embedding_size, vocab_size=VOCAB_SIZE, hidden_units=hidden_unit)
decoder = Decode(vocab_size=VOCAB_SIZE, embedding_size=embedding_size, hidden_units=hidden_unit)
    
for epoch in range(EPOCHS):
    total_loss = 0
    for batch_id, (x, y) in enumerate(dataset.take(N_BATCH)):
        loss = 0
        with tf.GradientTape() as tape:
            first_state = encoder.init_hidden_state(batch_size=BATCH_SIZE)
            encode_outs, last_state = encoder(x, first_state)
            decode_state = last_state
            decode_input = [w2i["sos"]]*BATCH_SIZE
            
            for i in range(1, y.shape[1]):
                decode_out, decode_state = decoder(
                        decode_input, encode_outs, decode_state
                )
                loss += loss_function(y[:, i], decode_out)
                decode_input = y[:, i]
                
            train_vars = encoder.trainable_variables \
                        + decoder.trainable_variables
            grads = tape.gradient(loss, train_vars)
            optimizer.apply_gradients(zip(grads, train_vars))
        total_loss += loss
    print(total_loss.numpy())




3498.7236
3295.7292
3294.2341
3288.8894
3251.646
3103.4075
2965.0376
2838.3845
2726.757
2620.048
2522.8022
2436.3064
2356.0508
2276.4597
2193.0273
2118.3044
2051.8335
1983.2173
1920.2454
1836.7229


In [None]:
def generate_respond(text):
    tokenized_questions = tokenizer.texts_to_sequences([text])
    padded_questions = tf.keras.utils.pad_sequences(tokenized_questions, maxlen=maxlen_questions, padding="post")
    inputs = np.array(padded_questions)

    result = ''
    hidden = encoder.init_hidden_state(batch_size=1)
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = [w2i["sos"]]
    
    for t in range(maxlen_answers):
        predictions, dec_hidden = decoder(dec_input, enc_out, dec_hidden)
        predicted_id = tf.argmax(predictions[0]).numpy()
        if i2w[predicted_id] == "eos":
            break
        result += i2w[predicted_id] + ' '
        dec_input = [predicted_id]
    return result

In [None]:
text = "How are you"

print(generate_respond(text))

dont know 




---



**Original**

In [None]:
encoder_inputs = tf.keras.layers.Input(shape=( maxlen_questions , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( maxlen_answers ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=["accuracy"])

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 21)]         0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, 21)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 21, 200)      505400      ['input_4[0][0]']                
                                                                                                  
 embedding_4 (Embedding)        (None, 21, 200)      505400      ['input_5[0][0]']                
                                                                                            

# Model fail

**Encoder**

In [None]:
encoder_inputs = tf.keras.layers.Input(shape=(maxlen_questions, ))
encoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 200, mask_zero=True, weights=[embedding_matrix])(encoder_inputs)
encoder_outputs, state_h, state_c = tf.keras.layers.LSTM(200, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

**Attention**

In [None]:
class Attention(tf.keras.Model):
    def __init__(self, hidden_units):
        super(Attention, self).__init__()
        self.W_out_encode = tf.keras.layers.Dense(hidden_units)
        self.W_state = tf.keras.layers.Dense(hidden_units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, encode_outs, pre_state):
        pre_state = tf.expand_dims(pre_state, axis=1)
        pre_state = self.W_state(pre_state)
        encode_outs = self.W_out_encode(encode_outs)
        score = self.V(
            tf.nn.tanh(
                pre_state + encode_outs)
        )
        score = tf.nn.softmax(score, axis=1)
        context_vector = score*encode_outs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, score

**Decoder**

In [None]:
decoder_inputs = tf.keras.layers.Input(shape=(maxlen_answers,))
decoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 200, mask_zero=True, weights=[embedding_matrix])(decoder_inputs)

# context_vector, attention_weights = Attention

decoder_outputs , _ , _ = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )(decoder_embedding)
outputs = tf.keras.layers.Dense(VOCAB_SIZE, activation="softmax")(decoder_outputs)

In [None]:
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], outputs)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 21)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 21, 200)      505600      ['input_2[0][0]']                
                                                                                                  
 lstm_1 (LSTM)                  [(None, 21, 200),    320800      ['embedding_1[0][0]']            
                                 (None, 200),                                                     
                                 (None, 200)]                                                     
                                                                                              

In [None]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=50, epochs=150 ) 
model.save( 'model.h5' )

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

# Talking to the chatbot

In [None]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding , initial_state=decoder_states_inputs)
    
    decoder_states = [state_h, state_c]

    decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

In [None]:
def str_to_tokens( sentence : str ):

    words = sentence.lower().split()
    tokens_list = list()
  
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')


In [None]:
ques = input("Enter your question: ")
tokens = str_to_tokens(ques)

Enter your question: how are you


In [None]:
tokenizer.word_index['sos']

1

In [None]:
enc_model , dec_model = make_inference_models()

for _ in range(10):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['sos']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'eos' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )

Enter question : hello
 winter patient <oov> <oov> <oov> <oov> <oov> decided birds back birds back elastic about balloon balloon balloon 50 about bed bed crime


KeyboardInterrupt: ignored



# Chatbot using Transformers models

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Demo Simple chatbot with BERT**

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

In [None]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<pad>", 
                                "bos_token": "<SOS>",
                                "eos_token": "<EOS>"})

In [None]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

In [None]:
class ChatbotDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attention_masks = []
        self.labels = []

        self.X = []
        for conversation in tqdm(data):
            input_text = conversation[0]
            label_text = conversation[1]

            line = "<SOS> " + input_text + " <bot>: " + label_text + " <EOS>"

            """input_encoding = self.tokenizer.encode_plus(input_text, add_special_tokens=True, 
                                                          padding="max_length", truncation=True,
                                                          max_length=64, return_tensors="pt")
            label_encoding = self.tokenizer.encode_plus(label_text, add_special_tokens=True, 
                                                          padding="max_length", truncation=True,
                                                          max_length=64, return_tensors="pt")"""
            inputs = self.tokenizer.encode_plus(line, add_special_tokens=True, 
                                                          padding="max_length", truncation=True,
                                                          max_length=128, return_tensors="pt")
            self.input_ids.append(inputs["input_ids"])     # .append(input_encoding["input_ids"])  
            self.attention_masks.append(inputs["attention_mask"])      #.append(input_encoding["attention_mask"])
        # self.labels.append(label_encoding["input_ids"])
    
    def __len__(self):
        return len(self.input_ids) 
    
    def __getitem__(self, index):
        """return {
            "input_ids": self.input_ids[index],
            "attention_mask": self.attention_masks[index],
            "labels": self.labels[index]
        }"""
        return {
            "input_ids": self.input_ids[index],
            "attention_mask": self.attention_masks[index]
        }

In [None]:
train_dataset = ChatbotDataset(conversations, tokenizer)

100%|██████████| 3725/3725 [00:01<00:00, 3035.10it/s]


In [None]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2, pin_memory=True)

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
for epoch in range(1):
    model.train()
    epoch_loss = 0
    
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # target_ids = batch['labels'].to(device)
        
        # Generate the outputs
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)

        # outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_dataset)}")
    
# Save model
model.save_pretrained("gpt2_chatbot")

100%|██████████| 233/233 [02:06<00:00,  1.84it/s]


Epoch 1 Loss: 0.02163321002217747


In [None]:
def infer(inp):
    inp = "<SOS> "+inp+" <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a, pad_token_id=50256, max_new_tokens=20)
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    return output

In [None]:
infer("how are you doing")

"<SOS> how are you doing <bot>:  i'm doing well.\n <EOS>SOS> i'm"