Dataset building

In [1]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [2]:
import pandas as pd
from typing import Tuple, List
from torch.utils.data import Dataset
import torch
import sklearn.model_selection as skm
import re
import pandas as pd
import torch
import pickle
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from sklearn.metrics import f1_score

In [3]:
class YTCommentsDataset(Dataset):
    """Torch dataset of tokenized "video title -> comment" prompts.

    Every (title, comment) pair is rendered into one prompt string wrapped in
    ``<startoftext>`` / ``<endoftext>`` markers and tokenized eagerly in the
    constructor, so indexing afterwards is a plain list lookup.

    Args:
        video_title_list: iterable of video titles.
        comments_list: iterable of comments, aligned with ``video_title_list``.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding="max_length")`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: length every sample is truncated / padded to.
    """

    def __init__(self, video_title_list, comments_list, tokenizer, max_length=1024):
        # parallel per-sample storage
        self.input_ids = []
        self.attn_masks = []
        self.comments = []

        for title, text in zip(video_title_list, comments_list):
            # the request to the model followed by the expected answer
            sample_text = (
                f"<startoftext>The YouTube video named '{title}' may have the following comment:"
                f" {text}<endoftext>"
            )

            encoded = tokenizer(
                sample_text,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )

            self.input_ids.append(torch.tensor(encoded['input_ids']))
            self.attn_masks.append(torch.tensor(encoded['attention_mask']))
            self.comments.append(text)

    def __len__(self):
        """Number of stored samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, raw_comment)`` for sample ``idx``."""
        token_ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        raw_comment = self.comments[idx]
        return token_ids, mask, raw_comment


def build_dataset(path_to_video_info_file: str = "/content/drive/MyDrive/comments-generator/USvideos.csv",
                  path_to_video_comments_file: str = "/content/drive/MyDrive/comments-generator/UScomments.csv"
                  ) -> pd.DataFrame:
    """Load the video-info and comments CSVs and join them into one frame.

    Both files are read with the same tolerant CSV settings (malformed lines
    are skipped). Unused metadata columns are dropped, comments are
    inner-joined to their video on ``video_id``, exact duplicate rows are
    removed, and finally the join key itself is dropped.

    Args:
        path_to_video_info_file: CSV with at least ``video_id``, ``title`` and
            the metadata columns dropped below.
        path_to_video_comments_file: CSV with at least ``video_id``,
            ``comment_text``, ``likes`` and ``replies``.

    Returns:
        The merged frame without ``video_id``. If the CSVs contain only the
        columns referenced here, that leaves ``comment_text`` and ``title``.

    Raises:
        KeyError: if one of the columns to drop is missing from a file.
    """
    # identical parsing options for both files
    read_csv_options = dict(sep=',',
                            quotechar='"',
                            skipinitialspace=True,
                            on_bad_lines='skip',
                            header=0)

    # video info
    df_video_info = pd.read_csv(path_to_video_info_file, **read_csv_options)
    df_video_info = df_video_info.drop(
        columns=["channel_title", "category_id", "tags", "views", "likes", "dislikes", "comment_total",
                 "thumbnail_link", "date"])
    # tags could be left as a feature

    # video comments
    df_comments = pd.read_csv(path_to_video_comments_file, **read_csv_options)
    df_comments = df_comments.drop(columns=["likes", "replies"])

    # joining comments with their video
    dataframe = df_comments.merge(df_video_info, on="video_id", how="inner").drop_duplicates()
    # DataFrame.drop returns a new frame, so the result has to be assigned;
    # previously it was discarded and 'video_id' leaked into the output.
    dataframe = dataframe.drop(columns=["video_id"])

    return dataframe


def train_eval_test_dataset_split(dataframe: pd.DataFrame, train_ratio: float = 0.8
                                  ) -> Tuple[List[str], List[str], List[str], List[str], List[str], List[str]]:
    """Split titles and comments into train / eval / test lists, preserving row order.

    The split is done in two steps without shuffling: first ``train_ratio`` of
    the rows go to train, then the remainder is split 75% / 25% into eval and
    test. With the default ``train_ratio=0.8`` this gives roughly
    train = 0.8, eval = 0.15, test = 0.05; for other ratios the eval/test
    shares scale with the remainder.

    Args:
        dataframe: frame with ``'title'`` and ``'comment_text'`` columns.
        train_ratio: fraction of rows assigned to the training split.

    Returns:
        ``(X_train, y_train, X_eval, y_eval, X_test, y_test)`` where the ``X``
        lists hold titles and the ``y`` lists hold the matching comments.
    """
    X_train, X_eval_test, y_train, y_eval_test = \
        skm.train_test_split(dataframe['title'].tolist(),
                             dataframe['comment_text'].tolist(),
                             shuffle=False,
                             train_size=train_ratio
                             )

    # a quarter of the held-out part becomes the test set, the rest is eval
    X_eval, X_test, y_eval, y_test = \
        skm.train_test_split(X_eval_test,
                             y_eval_test,
                             shuffle=False,
                             test_size=0.25
                             )

    return X_train, y_train, X_eval, y_eval, X_test, y_test

Model utils

In [4]:
def build_model(gpt2_type: str = 'gpt2'):
    """Load a pretrained GPT-2 tokenizer and LM-head model ready for fine-tuning.

    The tokenizer is given explicit BOS / EOS / PAD tokens. BOS and EOS are
    spelled exactly like the ``<startoftext>`` / ``<endoftext>`` markers that
    the prompts in this notebook embed, so they are encoded as single special
    tokens. The model's embedding matrix is resized to the enlarged vocabulary.

    Args:
        gpt2_type: name or path of the pretrained GPT-2 checkpoint.

    Returns:
        ``(device, tokenizer, model)`` where ``device`` is ``'cuda'`` when
        available and ``'cpu'`` otherwise, and ``model`` already lives on it.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type,
                                              bos_token='<startoftext>',
                                              eos_token='<endoftext>',
                                              pad_token='<pad>')
    model = GPT2LMHeadModel.from_pretrained(gpt2_type).to(device)
    # the added special tokens need embedding rows of their own
    model.resize_token_embeddings(len(tokenizer))
    return device, tokenizer, model


def _collate_lm_batch(batch):
    """Stack dataset samples into a causal-LM batch with padding excluded from the loss."""
    input_ids = torch.stack([sample[0] for sample in batch])
    attention_mask = torch.stack([sample[1] for sample in batch])
    # It is a text generation model that uses the prompt itself as the label.
    # Label positions set to -100 are ignored by the loss, so padded positions
    # (attention_mask == 0) no longer contribute to it.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels}


def train_model(model, train_dataset, eval_dataset):
    """Fine-tune ``model`` with the Hugging Face ``Trainer`` and return the trainer.

    Args:
        model: the language model to fine-tune.
        train_dataset: dataset whose items are tuples starting with
            ``(input_ids, attention_mask, ...)`` tensors of equal length.
        eval_dataset: dataset of the same item layout, evaluated every epoch.

    Returns:
        The ``Trainer`` instance after ``train()`` has finished.
    """
    # creating training arguments
    training_args = TrainingArguments(output_dir='/content/drive/MyDrive/comments-generator/trained-model',
                                      num_train_epochs=2,
                                      logging_steps=10,
                                      load_best_model_at_end=True,
                                      save_strategy="epoch",
                                      evaluation_strategy="epoch",
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      weight_decay=0.01,
                                      logging_dir='/content/drive/MyDrive/comments-generator/logs'
                                      )
    # start training
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      data_collator=_collate_lm_batch)
    trainer.train()
    return trainer


def score_model(model, tokenizer, device, X_test, y_test):
    """Generate a comment for every test title and compare it with the original.

    For each title the same request prompt that was used during training is
    built, the model continues it greedily, and the text after the last
    ``"comment: "`` in the decoded output is taken as the predicted comment
    (``"None"`` when nothing matches). The macro F1 score between original
    and predicted comments is printed.

    NOTE(review): ``f1_score`` treats every distinct comment string as its own
    class, so this is effectively an exact-match metric — confirm that this is
    the intended way to score free-form text.

    Args:
        model: fine-tuned language model exposing ``eval()`` and ``generate()``.
        tokenizer: tokenizer matching ``model``.
        device: device the prompt tensors are moved to.
        X_test: iterable of video titles.
        y_test: iterable of reference comments, aligned with ``X_test``.

    Returns:
        ``pandas.DataFrame`` with columns ``video_title``, ``original_comment``
        and ``predicted_comment``, one row per test sample.
    """
    # set the model to eval mode
    model.eval()

    original_comments, predicted_comments, video_titles = [], [], []

    # iter over all test data
    for video_title, comment in zip(X_test, y_test):
        # create request to model (the same as request that was used in training)
        req_text = f"<startoftext>The YouTube video named '{video_title}' may have the following comment:"
        tokenized_req_text = tokenizer(req_text, return_tensors="pt").input_ids.to(device)

        # Greedy decoding: the sampling-only knobs (top_k / top_p / temperature)
        # are not passed because they have no meaning with do_sample=False,
        # and exactly one sequence is requested.
        sample_outputs = model.generate(tokenized_req_text,
                                        do_sample=False,
                                        max_length=512,
                                        num_return_sequences=1
                                        )

        # decode predicted tokens into text
        predicted_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
        # extract the predicted comment; '.' does not cross newlines, so only
        # the remainder of the matching line is captured
        matches = re.findall("comment: (.*)", predicted_text)
        predicted_comment = matches[-1] if matches else "None"

        # append results
        original_comments.append(comment)
        predicted_comments.append(predicted_comment)
        video_titles.append(video_title)

    # transform result into dataframe
    df = pd.DataFrame({'video_title': video_titles,
                       'original_comment': original_comments,
                       'predicted_comment': predicted_comments})

    # report the macro F1 score over exact comment strings
    print(f1_score(original_comments, predicted_comments, average='macro'))

    return df



def save_model(model: torch.nn.Module, path_to_save_file: str = 'saved-model.pklz'):
    """Pickle ``model`` to ``path_to_save_file``.

    The file is opened in a ``with`` block so the handle is flushed and closed
    deterministically, even if pickling fails.

    NOTE: a pickle file can execute arbitrary code when loaded; only load
    files produced by this function from trusted locations.

    Args:
        model: the object to serialize.
        path_to_save_file: destination path; an existing file is overwritten.
    """
    with open(path_to_save_file, 'wb') as save_file:
        pickle.dump(model, save_file)

Main

In [5]:
def fine_tune_gpt2_model():
    """Run the fine-tuning pipeline end to end.

    Seeds torch, builds the GPT-2 tokenizer and model, loads and splits the
    dataset, wraps the train and eval splits into ``YTCommentsDataset``
    objects and trains the model. Scoring on the test split (``score_model``)
    and saving (``save_model``) are not run here.
    """
    # reproducibility
    torch.manual_seed(42)

    # model and tokenizer
    _device, gpt2_tokenizer, gpt2_model = build_model('gpt2')

    # data: titles are the inputs, comments are the targets
    comments_frame = build_dataset()
    (titles_train, comments_train,
     titles_eval, comments_eval,
     _titles_test, _comments_test) = train_eval_test_dataset_split(comments_frame)

    # training
    train_model(gpt2_model,
                YTCommentsDataset(titles_train, comments_train, gpt2_tokenizer),
                YTCommentsDataset(titles_eval, comments_eval, gpt2_tokenizer))

In [6]:
# Run the whole pipeline: build the model, build and split the dataset, then train.
fine_tune_gpt2_model()

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  df_comments = pd.read_csv(path_to_video_comments_file,


ValueError: ignored