# The Virtual Assistant Model Training ✨
------------------
- This project aims to develop a virtual assistant using deep learning techniques to create an intelligent and versatile system capable of understanding natural language queries and providing relevant responses. ☺️
- The goal is to address the limitations of current virtual assistant technologies and create a more interactive and personalized experience for users. With the help of transformers, the virtual assistant aims to provide a seamless and intuitive user experience. 🚀

## 1 - Install and Import Necessary Libraries 💥




In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BlenderbotTokenizer, BlenderbotForConditionalGeneration
import torch
from sklearn.model_selection import train_test_split
import os
import pickle
import logging
import torch
from transformers import AutoConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import PreTrainedTokenizer, MODEL_WITH_LM_HEAD_MAPPING, WEIGHTS_NAME
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pathlib import Path
from typing import Dict, List, Tuple

# Caching and storing of data/checkpoints
import json
import re
import random
import glob
import shutil
import numpy as np

Free any cached GPU memory that is no longer in use.

In [3]:
# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()

## 2 - Preprocessing Step 👻

Create categories here.

In [4]:
# Keyword lists, one per emotion. CategorizeDataset uses them both to score
# the user's answers and to pick matching texts out of the dataset.
# NOTE(review): "dislike" is listed in both sad_words and angry_words, so it
# counts towards both emotions.
sad_words = ["sad", "upset", "heartbreaking", "lonely", "lone", "alone", "broken", "dislike", "cry", "sorry"]
happy_words = ["happy", "cheerful", "funny", "fun", "enjoy", "playful", "like", "laugh", "joyful", "smile"]
# NOTE(review): "piece" looks like a typo for "peace" — confirm before
# changing it, because the value decides which dataset texts match.
calm_words = ["calm", "piece", "relax", "quiet", "smooth", "cool", "easygoing", "breath"]
# NOTE(review): "don't like" contains a space, while the user's answers are
# split on spaces before being compared, so it presumably never matches an
# answer token — verify.
angry_words = ["angry", "horrified", "terrible", "bad", "sucks", "don't like", "dislike", "hate", "shit"]
tired_words = ["tired", "sick", "bored", "anxiety"]
romantic_words = ["love", "flirt", "flirty", "honey", "sweet", "sweetheart", "darling", "beauty", "lovely"]

Create a class that detects the user's emotion and selects the matching texts for the dataframe.

In [5]:
class CategorizeDataset:
    """Pick an emotion from the user's answers and collect matching dataset texts.

    The emotion keyword lists (``sad_words``, ``happy_words``, ``calm_words``,
    ``angry_words``, ``tired_words`` and ``romantic_words``) are read from
    module scope when ``choose_emotion`` and ``detect_emotion`` run.
    """

    def return_objects(self, file):
        """Return every utterance text stored in a JSON conversation file.

        Args:
            file: Path of a JSON file holding a list of objects, each with an
                ``"utterances"`` list of dicts.

        Returns:
            A flat list with the ``"text"`` value of every utterance, in file
            order. An utterance without a ``"text"`` key contributes ``None``.
        """
        # The context manager closes the handle even when parsing fails.
        with open(file, "r") as data_file:
            objects = json.load(data_file)

        texts = []
        for obj in objects:
            for utterance in obj['utterances']:
                texts.append(utterance.get("text"))
        return texts

    def create_category(self, words, texts):
        """Collect the texts that contain any of the given keywords.

        Matching is a case-sensitive substring test. Results are grouped by
        keyword (in ``words`` order) and then by text order, and a text that
        contains several keywords is returned once per matching keyword.

        Args:
            words: Keywords to look for.
            texts: Candidate texts; entries that are not strings (for example
                ``None`` from utterances without text) are ignored.

        Returns:
            The list of matching texts.
        """
        category = []
        for word in words:
            for s in texts:
                # Skip missing texts instead of failing on the substring test.
                if isinstance(s, str) and word in s:
                    category.append(s)
        return category

    def choose_emotion(self, file1, file2):
        """Ask the user about their mood and return the matching dataset texts.

        Args:
            file1: Path of the first JSON conversation file.
            file2: Path of the second JSON conversation file.

        Returns:
            The matching texts of ``file1`` followed by those of ``file2`` for
            the detected emotion, or ``None`` when the detected name is not
            one of the known categories.
        """
        category_name = self.detect_emotion()
        texts1 = self.return_objects(file1)
        texts2 = self.return_objects(file2)

        words_by_category = {
            "sad_counter": sad_words,
            "happy_counter": happy_words,
            "calm_counter": calm_words,
            "angry_counter": angry_words,
            "tired_counter": tired_words,
            "romantic_counter": romantic_words,
        }
        words = words_by_category.get(category_name)
        if words is None:
            return None
        # Only the selected category is built; the other five are never used.
        return self.create_category(words, texts1) + self.create_category(words, texts2)

    def initial_form(self):
        """Ask three questions on stdin and return the answers.

        Returns:
            A 3-tuple of token lists, each answer split on single spaces.
        """
        print("Hello! My name is The Assistant. Before we start, I'd like to ask 3 questions to understand your feelings.")
        print("In terms of your emotional behavior, how would you describe yourself?")
        ans1 = input().split(' ')
        print("With which words would you describe your feelings")
        ans2 = input().split(' ')
        print("How do you feel right now?")
        ans3 = input().split(' ')

        return ans1, ans2, ans3

    def counter(self, word_list, answers_list):
        """Count how many answer tokens equal one of the keywords.

        The comparison ignores letter case, so "Sad" matches "sad".

        Args:
            word_list: Keywords of a single emotion.
            answers_list: Iterable of answers, each an iterable of tokens.

        Returns:
            The number of (token, keyword) pairs that match.
        """
        keywords = [word.casefold() for word in word_list]
        count = 0
        for answer in answers_list:
            for s in answer:
                token = s.casefold()
                count += sum(1 for word in keywords if word == token)
        return count

    def detect_emotion(self):
        """Run the questionnaire and return the name of the dominant emotion.

        Returns:
            One of ``"sad_counter"``, ``"happy_counter"``,
            ``"romantic_counter"``, ``"angry_counter"``, ``"calm_counter"`` or
            ``"tired_counter"``. On a tie the emotion listed last in that
            order wins, so ``"tired_counter"`` is returned when nothing
            matches at all.
        """
        answers = self.initial_form()

        scores = [
            ("sad_counter", self.counter(sad_words, answers)),
            ("happy_counter", self.counter(happy_words, answers)),
            ("romantic_counter", self.counter(romantic_words, answers)),
            ("angry_counter", self.counter(angry_words, answers)),
            ("calm_counter", self.counter(calm_words, answers)),
            ("tired_counter", self.counter(tired_words, answers)),
        ]

        best_name, best_score = scores[0]
        for name, score in scores[1:]:
            # ">=" keeps the established tie-break: the later emotion wins.
            if score >= best_score:
                best_name, best_score = name, score
        return best_name

Create category and the dataframe.

In [6]:
categorizer = CategorizeDataset()

# Ask the user about their mood (reads from stdin) and collect the texts of
# the detected emotion from both JSON files.
category = categorizer.choose_emotion("data.json", "data_00.json")

# One row per collected text, in a single 'Text' column.
df = pd.DataFrame(category, columns=['Text'])

Hello! My name is The Assistant. Before we start, I'd like to ask 3 questions to understand your feelings.
In terms of your emotional behavior, how would you describe myself?
sad
With which words would you describe your feelings
sad
How do you feel right now?
sad


Contextualize dataframe here.

In [7]:
# Number of earlier texts attached to every response as context.
n = 3
text = df['Text']

# Each row is the text at position `end` followed by the n texts before it,
# newest first, so every row has n + 1 entries.
contexted = [
    [text[pos] for pos in range(end, end - 1 - n, -1)]
    for end in range(n, len(text))
]
contexted[:5]

[['what did you think of the ending. personally i thought it was sad',
  "A Walk to Remember is not my favorite movie. I guess It depends on like these kinds of movies. I am not the biggest fan of the author of that movie. It's based on a book. And he kills almost everybody in his movies, so that's why I'm not really a fan. A Walk to Remember has The lead dies, so it's Even though it's a teen movie, it's very sad. It's a drama",
  "Well, I was hoping there wasn't any romance, but it was a really heartfelt story and sad in the end, and I really liked the character development they went through.",
  "Because it was sad, it was a horror movie and it was like playing a game, you want to play a game with me and they will catch people and put them in a home let's say in a house and they will lock them and they need to find the way to leave. But in order to leave a place they will have to end up dead."],
 ['Well it was a little sad, yes, but actually, you know for the future of mankind, it it

In [8]:
# 'response' and 'context' first, then one numbered column for each of the
# remaining n - 1 context texts.
columns = ['response', 'context']
columns += ['context/{}'.format(idx) for idx in range(n - 1)]
columns

['response', 'context', 'context/0', 'context/1']

In [9]:
# Rebuild df so that each row is one example: a response plus its context texts.
df = pd.DataFrame.from_records(contexted, columns=columns)
df.head()

Unnamed: 0,response,context,context/0,context/1
0,what did you think of the ending. personally i...,A Walk to Remember is not my favorite movie. I...,"Well, I was hoping there wasn't any romance, b...","Because it was sad, it was a horror movie and ..."
1,"Well it was a little sad, yes, but actually, y...",what did you think of the ending. personally i...,A Walk to Remember is not my favorite movie. I...,"Well, I was hoping there wasn't any romance, b..."
2,"I like how they're all sad, profound.","Well it was a little sad, yes, but actually, y...",what did you think of the ending. personally i...,A Walk to Remember is not my favorite movie. I...
3,"They make me happy, and make me leave the real...","I like how they're all sad, profound.","Well it was a little sad, yes, but actually, y...",what did you think of the ending. personally i...
4,It's just not the kind of movie that interests...,"They make me happy, and make me leave the real...","I like how they're all sad, profound.","Well it was a little sad, yes, but actually, y..."


## 3 - Split Dataset into Train-Validation 📈

In [13]:
# Hold out 20% of the rows for validation.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs — confirm whether that is intended.
trn_df, val_df = train_test_split(df, test_size=0.2)
# You may want to sample your data to prevent memory errors:
# trn_df = trn_df.sample(1500)
# val_df = val_df.sample(300)

# Module-level logger shared by the dataset, training and evaluation code.
logger = logging.getLogger(__name__)

def construct_convo(row, tokenizer):
    """Turn one dataframe row into a single flat list of token ids.

    Every entry of ``row`` is encoded and terminated with the tokenizer's
    end-of-sequence id. The encoded entries are then concatenated in reverse
    order, so the last entry of ``row`` comes first in the result.
    """
    encoded_turns = []
    for utterance in row:
        encoded_turns.append(tokenizer.encode(utterance) + [tokenizer.eos_token_id])

    token_ids = []
    for turn in reversed(encoded_turns):
        token_ids.extend(turn)
    return token_ids

We will use "microsoft/DialoGPT-large" as our tokenizer.

In [14]:
# Notebook-level tokenizer, used by the sanity check in the next cell.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")

In [15]:
# Sanity check: build the token sequence for the first training row only.
for _, row in trn_df.iterrows():
  conv = construct_convo(row, tokenizer)
  break

## 4 - Training our Model 🤗

Create our dataset function.

In [16]:
class ConversationDataset(Dataset):
    """Dataset of tokenized conversations built from dataframe rows.

    Each example is the flat token-id list that ``construct_convo`` produces
    for one row. The examples are pickled to a cache file inside
    ``args.cache_dir`` and reloaded from it when the file exists and
    ``args.overwrite_cache`` is false.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
        """Build the examples from ``df`` or load them from the cache file.

        Args:
            tokenizer: Tokenizer used to encode every utterance.
            args: Settings object; ``cache_dir``, ``model_type`` and
                ``overwrite_cache`` are read.
            df: Dataframe whose rows are turned into examples.
            block_size: Nominal sequence length. Here it only contributes to
                the cache file name; examples are not truncated to it.
        """
        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        # NOTE(review): the file name depends only on the model type and block
        # size, so datasets built from different dataframes share one cache
        # file. With overwrite_cache disabled, a later dataset would load the
        # examples an earlier one stored.
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # pickle can execute code while loading: only point cache_dir at
            # files written by this notebook.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_convo(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            # open(..., "wb") does not create missing folders, so make sure
            # the cache directory exists before writing into it.
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return example ``item`` as a 1-D ``torch.long`` tensor."""
        return torch.tensor(self.examples[item], dtype=torch.long)

In [17]:
def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    """Build a ConversationDataset from the validation frame when ``evaluate``
    is truthy, otherwise from the training frame."""
    if evaluate:
        frame = df_val
    else:
        frame = df_trn
    return ConversationDataset(tokenizer, args, frame)

In [18]:
def set_seed(args):
    """Seed Python's, NumPy's and PyTorch's random generators with ``args.seed``."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(args.seed)
    # The CUDA generators are only seeded when at least one GPU is in use.
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

In [19]:
def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
   ordering_and_checkpoint_path = []


   glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))


   for path in glob_checkpoints:
       if use_mtime:
           ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
       else:
           regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
           if regex_match and regex_match.groups():
               ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))


   checkpoints_sorted = sorted(ordering_and_checkpoint_path)
   checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
   return checkpoints_sorted

In [20]:
def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
   if not args.save_total_limit:
       return
   if args.save_total_limit <= 0:
       return


   # Check if we should delete older checkpoint(s)
   checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
   if len(checkpoints_sorted) <= args.save_total_limit:
       return

   number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
   checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
   for checkpoint in checkpoints_to_be_deleted:
       logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
       shutil.rmtree(checkpoint)

from transformers import PreTrainedModel, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm, trange

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

Create our train function.

In [21]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer,
          df_trn=None, df_val=None) -> Tuple[int, float]:
    """Fine-tune ``model`` on ``train_dataset`` as a causal language model.

    Args:
        args: Settings object with the hyper-parameters and run options
            (batch size, learning rate, logging/saving intervals, ...).
            ``args.train_batch_size`` is set here, and ``args.num_train_epochs``
            is overwritten when ``args.max_steps`` is positive.
        train_dataset: Dataset yielding 1-D tensors of token ids.
        model: Model to train; it is called as ``model(inputs, labels=labels)``.
        tokenizer: Tokenizer matching the model; it supplies the padding id
            and is saved next to every checkpoint.
        df_trn: Training dataframe, only forwarded to ``evaluate``.
        df_val: Validation dataframe, required for evaluation during training.

    Returns:
        ``(global_step, average_loss)`` where ``average_loss`` is the summed
        loss divided by ``global_step``, or ``0.0`` when no optimizer step ran.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    # evaluate() needs the dataframes to build its dataset. Without them the
    # in-training evaluation is skipped instead of failing in the middle of a run.
    evaluate_in_loop = args.local_rank == -1 and args.evaluate_during_training
    if evaluate_in_loop and df_val is None:
        logger.warning("evaluate_during_training is set but no validation dataframe was given; skipping it.")
        evaluate_in_loop = False

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Pad every example of the batch to the length of the longest one.
        # NOTE(review): without a pad token the padding value is 0, and the
        # padded positions are also used as labels below — confirm this is
        # acceptable for the loss.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )
    print("Batch size: ", args.train_batch_size)
    print("Data Loader length: ", len(train_dataloader))

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # Set global_step to the global_step of the last saved checkpoint,
            # taken from the number at the end of the model path.
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The path does not end in a step number: start from scratch.
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            # Causal language modelling: the labels are the inputs themselves.
            inputs, labels = (batch, batch)
            # Batches longer than 1024 tokens are skipped.
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if evaluate_in_loop:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, df_trn, df_val)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() returns the rate set by the latest scheduler.step().
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                print("Batch shape: ", batch.shape)
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # No optimizer step ran (for example an empty dataloader): report 0.0
    # instead of dividing by zero.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss

Let's create our evaluation function.

In [22]:
# Evaluation of some model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    """Compute the perplexity of ``model`` on the validation dataframe.

    Args:
        args: Settings object; ``output_dir``, ``per_gpu_eval_batch_size``,
            ``n_gpu`` and ``device`` are read and ``eval_batch_size`` is set.
        model: Model to evaluate; it is called as ``model(inputs, labels=labels)``.
        tokenizer: Tokenizer used to build the evaluation dataset.
        df_trn: Training dataframe (only forwarded to the dataset loader).
        df_val: Validation dataframe the metric is computed on.
        prefix: Optional sub-directory of ``args.output_dir`` for the results file.

    Returns:
        ``{"perplexity": tensor}``, the exponential of the mean batch loss.
        The value is NaN when no batch could be evaluated.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Pad every example of the batch to the length of the longest one.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    # NOTE(review): drop_last=True discards a final incomplete batch, so those
    # examples do not contribute to the metric — confirm this is intended.
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Evaluation
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # Causal language modelling: the labels are the inputs themselves.
        inputs, labels = (batch, batch)
        # Same length limit as the training loop: over-long batches are skipped.
        if inputs.shape[1] > 1024:
            continue
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps > 0:
        eval_loss = eval_loss / nb_eval_steps
    else:
        # Nothing was evaluated: report NaN instead of dividing by zero.
        logger.warning("No evaluation batch was processed; perplexity is undefined.")
        eval_loss = float("nan")
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # A non-empty prefix adds a sub-directory that may not exist yet.
    os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result


Lastly, initialize our arguments.

In [23]:
class Args:
    """Plain container for the training and evaluation settings.

    Further attributes (for example ``n_gpu`` and ``device``) are attached to
    the instance at run time.
    """

    def __init__(self):
        # Output location, model identifiers and cache folder.
        self.output_dir = 'GokcenazGPT-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-large'
        self.config_name = 'microsoft/DialoGPT-large'
        self.tokenizer_name = 'microsoft/DialoGPT-large'
        self.cache_dir = 'cached'
        self.block_size = 512

        # Which phases to run.
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False

        # Batch sizes and optimisation hyper-parameters.
        self.per_gpu_train_batch_size = 2
        self.per_gpu_eval_batch_size = 2
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1  # a value <= 0 means: train for num_train_epochs
        self.warmup_steps = 0

        # Logging and checkpointing intervals, counted in optimizer steps.
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None  # None keeps every checkpoint
        self.eval_all_checkpoints = False

        # Run options.
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1  # -1 means: no distributed training
        self.fp16 = False
        self.fp16_opt_level = 'O1'

    def __repr__(self):
        # Show every setting, so that logging the object is informative
        # instead of printing only its memory address.
        fields = ", ".join("{}={!r}".format(key, value) for key, value in vars(self).items())
        return "{}({})".format(type(self).__name__, fields)


args = Args()

Now, we need to combine all of our methods to get ready for training! 👩🏽‍💻

In [24]:
# Main runner
def main(df_trn, df_val):
    """Train the model on ``df_trn`` and evaluate it on ``df_val``.

    Settings come from a fresh ``Args()`` instance. Depending on its flags
    the function fine-tunes the pretrained model, saves model and tokenizer
    to ``args.output_dir`` and evaluates the saved checkpoint(s).

    Args:
        df_trn: Training dataframe.
        df_val: Validation dataframe.

    Returns:
        Dict of evaluation metrics; each key is the metric name followed by
        ``_`` and the checkpoint step (empty for a single checkpoint). The
        dict is empty when evaluation is disabled.

    Raises:
        ValueError: If ``should_continue`` is set but no checkpoint exists,
            or if the output directory is non-empty and may not be overwritten.
    """
    args = Args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            # Resume from the most recent checkpoint.
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
            os.path.exists(args.output_dir)
            and os.listdir(args.output_dir)
            and args.do_train
            and not args.overwrite_output_dir
            and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training.
    # Fall back to the CPU when CUDA is unavailable or disabled via no_cuda.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)
        print(train_dataset)
        print("Train Dataset Length: ", len(train_dataset))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForCausalLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            # Every directory below output_dir that holds a weights file.
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            # Suffix every metric with the checkpoint step so results do not collide.
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

And... Here we go!💫

In [25]:
# Run training and evaluation; the returned metrics dict is shown as the cell output.
main(trn_df, val_df)



<__main__.ConversationDataset object at 0x7f11ec26f0a0>
Train Dataset Length:  568
Batch size:  2
Data Loader length:  284




Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/284 [00:00<?, ?it/s]

Iteration:   0%|          | 0/284 [00:00<?, ?it/s]

Iteration:   0%|          | 0/284 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/71 [00:00<?, ?it/s]

{'perplexity_': tensor(1.4166)}

## 5 - Try Your Model 🎯

In [None]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-large')
model = AutoModelForCausalLM.from_pretrained('GokcenazGPT-small')

# Let's chat for 5 lines
for step in range(5):
    # Encode the new user input FOLLOWED by the eos_token and return a PyTorch
    # tensor. The eos_token marks the end of a turn, which is the layout
    # construct_convo produces for training (every utterance ends with eos).
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # Append the new user input tokens to the chat history
    # (there is no history yet on the first turn).
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # Generate a response while limiting the total chat history to 1000 tokens
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.1,
    )

    # Pretty print only the newly generated tokens, i.e. the bot's reply
    print("GokcenazBot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0]).replace("<|endoftext|>", "")))

## 6 - Save Your Model To Hugging Face Hub! ⚡️

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook before pushing the model.
notebook_login()

Don't forget to change your model's name 🤩

In [None]:
# Upload the model and its tokenizer to the Hub under the same repository name.
model.push_to_hub('GokcenazGPT-small-v1')
tokenizer.push_to_hub('GokcenazGPT-small-v1')