In [None]:
!pip install transformers

In [1]:
import os, tqdm, warnings

import pandas as pd
import tensorflow as tf
# `import tqdm` alone does not load the `tqdm.notebook` submodule used by the training loop.
import tqdm.notebook
from transformers import AutoTokenizer
from transformers import TFGPT2LMHeadModel

# Suppress every Python warning from here on (note: this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [None]:
# Load the KoGPT2 tokenizer and its TensorFlow language-model head from the same checkpoint.
PRETRAINED_NAME = 'skt/kogpt2-base-v2'

# Special tokens registered on the tokenizer. '<usr>' and '<sys>' are the markers
# this notebook places before the user utterance and the chatbot reply.
special_tokens = dict(
    bos_token='<s>',
    eos_token='</s>',
    cls_token='<usr>',
    pad_token='<pad>',
    sep_token='<sys>',
    unk_token='<unk>',
)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME, **special_tokens)
# from_pt=True: build the TF model from the PyTorch weights of the checkpoint.
model = TFGPT2LMHeadModel.from_pretrained(PRETRAINED_NAME, from_pt=True)

In [4]:
# Raw dialogue CSV; later cells read its Q, A, Q_intent and A_intent columns.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DIALOG_CSV_PATH = 'C:/Users/Lee_Hyo_Jae/Desktop/new_project/dataset/dialog_chatbot.csv'
df = pd.read_csv(DIALOG_CSV_PATH)

In [None]:
# Clean the raw dialogue frame. The cell is written so it can be re-run safely:
#   * the leftover CSV index column is dropped only if it is still present,
#   * rows with a missing question or answer are removed up front, so the string
#     masks below never contain NaN (a NaN mask makes boolean indexing raise),
#   * rows whose question or answer contains a literal '*' are removed,
#   * rows whose question or answer is MAX_TEXT_LEN characters or longer are removed.
MAX_TEXT_LEN = 50

df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df = df.dropna(subset=['Q', 'A'])

# regex=False matches the asterisk literally, so no escape sequence is needed.
has_asterisk = (
    df['Q'].str.contains('*', regex=False, na=False)
    | df['A'].str.contains('*', regex=False, na=False)
)
too_long = (df['Q'].str.len() >= MAX_TEXT_LEN) | (df['A'].str.len() >= MAX_TEXT_LEN)

df = df.loc[~(has_asterisk | too_long)]
df

In [None]:
# Distinct question intents, in the order pandas reports them.
intet_label = [intent for intent in df['Q_intent'].unique()]

# label_dict maps a running integer id (0, 1, 2, ...) to each intent label.
label_dict = dict(enumerate(intet_label))

label_dict

In [None]:
# Distinct intent labels found on the question side and on the answer side.
Q_int_li = list(set(df['Q_intent'].unique()))
A_int_li = list(set(df['A_intent'].unique()))

# Per-label row counts, computed once instead of filtering the frame for every label.
q_counts = df['Q_intent'].value_counts()
a_counts = df['A_intent'].value_counts()

# For every question intent, show how many rows carry it as Q_intent and as A_intent.
for i in Q_int_li:
    q_total = q_counts.get(i, 0)
    a_total = a_counts.get(i, 0)
    print(f"{i} / Q : {q_total} / A : {a_total}")

In [None]:
# Build a class-balanced training frame: for every intent in label_dict take at most
# MAX_ROWS_PER_INTENT rows where it is the question intent, then the same where it is
# the answer intent; finally drop exact duplicate rows and shuffle.
MAX_ROWS_PER_INTENT = 10000
SAMPLE_SEED = 42  # fixed seed so down-sampling and shuffling are reproducible across runs


def _cap_rows_per_intent(frame, intent_column, intents, cap, seed=None):
    """Return one sub-frame per intent, down-sampled to at most `cap` rows.

    Args:
        frame: DataFrame holding the dialogue rows.
        intent_column: name of the column compared against each intent.
        intents: iterable of intent labels to select.
        cap: maximum number of rows kept for a single intent.
        seed: value passed to `DataFrame.sample(random_state=...)`; None means unseeded.

    Returns:
        A list of DataFrames, in the same order as `intents`.
    """
    parts = []
    for intent in intents:
        rows = frame.loc[frame[intent_column] == intent]
        if len(rows) > cap:
            rows = rows.sample(n=cap, random_state=seed)
        parts.append(rows)
    return parts


intent_names = list(label_dict.values())
balanced_parts = (
    _cap_rows_per_intent(df, 'Q_intent', intent_names, MAX_ROWS_PER_INTENT, SAMPLE_SEED)
    + _cap_rows_per_intent(df, 'A_intent', intent_names, MAX_ROWS_PER_INTENT, SAMPLE_SEED)
)

# Concatenate once (instead of growing the frame inside the loop); guard the empty case
# because pd.concat raises on an empty list.
train_data = pd.concat(balanced_parts, ignore_index=True) if balanced_parts else pd.DataFrame()

# A row can be selected through both its Q_intent and its A_intent; keep it only once.
train_data = train_data.drop_duplicates()
train_data = train_data.sample(frac=1, random_state=SAMPLE_SEED).reset_index(drop=True)
train_data

In [116]:
# Number of dialogue samples per padded batch; also used to derive the steps-per-epoch count.
batch_size = 128

In [117]:
def get_chat_data():
    """Yield one token-id list per (Q, A) row of the global `train_data`.

    Each yielded list is: BOS id, the encoding of '<usr>' + question + '<sys>' + answer,
    then EOS id. Reads the module-level `train_data` and `tokenizer`.
    """
    bos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id
    questions = train_data['Q'].to_list()
    answers = train_data['A'].to_list()

    for question, answer in zip(questions, answers):
        # join() raises TypeError on a non-string value, as plain concatenation would.
        dialogue = ''.join(('<usr>', question, '<sys>', answer))
        token_ids = tokenizer.encode(dialogue)
        yield [bos_id] + token_ids + [eos_id]

In [118]:
# Stream the tokenised dialogues from the generator. `output_signature` replaces the
# deprecated `output_types` argument and declares each element as a variable-length
# 1-D int32 vector, so the element rank is known before padding.
dataset = tf.data.Dataset.from_generator(
    get_chat_data,
    output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int32),
)

In [119]:
# Group the sequences into batches of `batch_size`, padding each batch with the
# tokenizer's pad id up to the longest sequence in that batch.
# NOTE: this rebinds `dataset`; re-running the cell would batch the batches again.
dataset = dataset.padded_batch(
    batch_size,
    padded_shapes=(None,),
    padding_values=tokenizer.pad_token_id,
)

In [None]:
# Sanity check: print only the first padded batch.
for batch in dataset.take(1):
    print(batch)

In [121]:
# Adam optimizer applied to the model's trainable variables in the training loop
# (learning rate 3e-5, epsilon 1e-8).
adam = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

In [122]:
# Batches per epoch: ceiling division of the sample count by the batch size
# (the negated floor division rounds up, so a partial last batch is counted).
steps = -(-len(train_data) // batch_size)

In [None]:
# Fine-tune the language model with a manual training loop.
EPOCHS = 5
IGNORE_LABEL = -100  # label value the Hugging Face loss skips
pad_id = tokenizer.pad_token_id

for epoch in range(EPOCHS):
    # Float accumulator: the '.9' precision in the report below is invalid for an int,
    # which matters if the dataset yields no batch at all.
    epoch_loss = 0.0

    for batch in tqdm.notebook.tqdm(dataset, total=steps):
        # Padding exists only to make the batch rectangular: hide it from attention
        # and exclude it from the loss so the model is not trained to emit <pad>.
        is_real_token = tf.not_equal(batch, pad_id)
        attention_mask = tf.cast(is_real_token, tf.int32)
        labels = tf.where(is_real_token, batch, tf.constant(IGNORE_LABEL, dtype=batch.dtype))

        with tf.GradientTape() as tape:
            # training=True enables dropout; the model call defaults to inference mode.
            result = model(batch, attention_mask=attention_mask, labels=labels, training=True)
            # result[0] is the LM loss; reduce_mean copes with a per-token or an
            # already-averaged loss tensor.
            batch_loss = tf.reduce_mean(result[0])

        grads = tape.gradient(batch_loss, model.trainable_variables)
        adam.apply_gradients(zip(grads, model.trainable_variables))
        # Running mean of the batch losses over the epoch.
        epoch_loss += float(batch_loss) / steps

    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, epoch_loss))

In [None]:
def return_answer_by_chatbot(user_text):
    """Generate the chatbot's reply to `user_text` with the fine-tuned model.

    The prompt is BOS + '<usr>' + user_text + '<sys>', the same layout used for the
    training samples. The reply is taken from the tokens generated after the prompt,
    so it no longer depends on finding the literal text '<sys> ' in the decoded
    string. That lookup raised IndexError when no space followed '<sys>' and picked
    the wrong span when the user text itself contained '<sys> '.

    Args:
        user_text: the user's utterance as a plain string.

    Returns:
        The generated reply with the '</s>' marker removed and surrounding
        whitespace stripped.
    """
    prompt_ids = [tokenizer.bos_token_id] + tokenizer.encode('<usr>' + user_text + '<sys>')
    input_ids = tf.convert_to_tensor([prompt_ids])

    # NOTE: max_length counts the prompt tokens too, so very long inputs leave
    # little or no room for a reply.
    output = model.generate(input_ids, max_length=50, do_sample=True, top_k=20)

    # The returned sequence starts with the prompt; keep only what was generated after it.
    generated_ids = output[0].numpy().tolist()[len(prompt_ids):]
    chatbot_response = tokenizer.decode(generated_ids).replace('</s>', '')
    return chatbot_response.strip()

In [None]:
def chat():
    """Run an interactive console chat until the user enters '끝'.

    Each stripped input line is passed to `return_answer_by_chatbot` and the stripped
    reply is printed with a 'Chatbot > ' prefix.
    """
    # iter(callable, sentinel) keeps prompting until the stripped line equals the sentinel.
    for user_line in iter(lambda: input("user > ").strip(), '끝'):
        reply = return_answer_by_chatbot(user_line)
        print("Chatbot > {}".format(reply.strip()))

In [None]:
# Start the interactive console chat; entering '끝' at the prompt ends the session.
chat()

In [None]:
# Directory that receives the fine-tuned weights and tokenizer files.
MODEL_SAVE_PATH = os.path.join("koGPT_Chatbot") # change this to your preferred location
print(MODEL_SAVE_PATH)
print('=' * 50)

# Create the directory only when it is missing and report which case applied.
folder_existed = os.path.exists(MODEL_SAVE_PATH)
if not folder_existed:
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

folder_status = "Folder already exists" if folder_existed else "Folder create complete"
print(f"{MODEL_SAVE_PATH} -- {folder_status} \n")

# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(MODEL_SAVE_PATH)