In [None]:
!pip install datasets transformers numpy
import torch, datasets, transformers
import pandas as pd

# set up Google Drive access
from google.colab import drive
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT)

FILE = "Your Directory Here/subfolder/discord.csv" # replace with the path (inside your My Drive) of the CSV file you created in preprocess.py
# Build the CSV path from the mount point so that reading it does not depend
# on the notebook's current working directory.
df = pd.read_csv(MOUNT_POINT + '/My Drive/' + FILE)
dataset = datasets.Dataset.from_pandas(df)
# if you have a lot of data, or not a lot of time, you can do something like:
# dataset = datasets.Dataset.from_pandas(df.sample(2000))
# replace 2000 with something that works on your hardware.

In [None]:
# create a train-test split with 10% of data used for testing.
# a fixed seed makes the shuffle (and therefore the split) identical on every
# run, so evaluation results stay comparable between runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

Next comes the tokenizer, which splits text into the tokens the model works with. Here is a simplified example of tokenization:

"I see the Apple store but I don't see any apples."

becomes

`"I", "see", "the", "Apple", "store", "but", "I", "do", "_n't", "see", "any", "apple", "_s", "."`

In [None]:
# Load the tokenizer published with the 'gpt2-medium' checkpoint
# (the same checkpoint name is used when the model is loaded further down).
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2-medium')

def tokenize_conversation(csv_row):
  """Tokenize the 'Conversation' column with the module-level tokenizer.

  This is applied through `dataset.map(..., batched=True)`, so despite its
  name `csv_row` holds a batch of rows rather than a single one.
  Truncation is switched on in the tokenizer call.

  Returns whatever the tokenizer returns for the batch.
  """
  conversations = csv_row['Conversation']
  encoded = tokenizer(conversations, truncation=True)
  return encoded
# Tokenize every split in batches. All original columns are removed, so only
# the values returned by tokenize_conversation remain in the result.
tokenized_dataset = dataset.map(tokenize_conversation, batched=True, remove_columns=dataset['train'].column_names)

In [None]:
# create blocks of tokens for training
block_size = 256
def group_texts(examples):
  """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

  Args:
    examples: mapping from feature name (must include "input_ids") to a list
      of per-example token lists, as supplied by a batched `map` call.

  Returns:
    A dict with the same keys, where each value is a list of chunks that are
    exactly `block_size` items long, plus a "labels" entry that is a copy of
    the "input_ids" chunk list. Items left over after the last full block are
    dropped.
  """
  # Flatten each feature in a single linear pass. (`sum(lists, [])` builds a
  # new, ever-growing list at every step, which is quadratic in the number of
  # tokens.)
  concatenated_examples = {
    k: [token for sequence in examples[k] for token in sequence]
    for k in examples.keys()
  }
  # All features are cut at the length of the first one, rounded down to a
  # whole number of blocks.
  total_length = len(concatenated_examples[list(examples.keys())[0]])
  total_length = (total_length // block_size) * block_size
  result = {
    k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
    for k, t in concatenated_examples.items()
  }
  # Labels start out identical to the inputs.
  result["labels"] = result["input_ids"].copy()
  return result

# Re-chunk the tokenized data into block_size-long training examples.
# batched=True hands group_texts many examples per call, so it can join them
# together before cutting them into blocks.
lm_dataset = tokenized_dataset.map(group_texts, batched=True)

In [None]:
# data collator for padding and data preparation before we start training
from transformers import DataCollatorForLanguageModeling
# Reuse the end-of-sequence token as the padding token
# (presumably the tokenizer has no padding token of its own - TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
# mlm=False turns off masked-language-modelling; the model loaded below is a causal LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# load the GPT model: the pretrained 'gpt2-medium' checkpoint with a causal-LM head.
# this is the same checkpoint name the tokenizer was loaded from.
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained('gpt2-medium')
# you can also try gpt2-large or gpt2-xl if you have the hardware for it.
# those will need a pretty big GPU! Google Colab can only go up to gpt2-medium.

In [None]:
# training the model
import inspect
from transformers import TrainingArguments, Trainer

# The "evaluate once per epoch" option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Use whichever
# keyword the installed version accepts, so this cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

training_args = TrainingArguments(
  output_dir="dialogue-model", # checkpoints are written to this directory
  learning_rate=2e-5,
  weight_decay=0.01,
  num_train_epochs=1, # this is how many times we go through the entire dataset. try 2 if you have a lot of time.
  per_device_train_batch_size=4,
  per_device_eval_batch_size=8,
  **{eval_strategy_key: "epoch"} # run evaluation at the end of every epoch
)

torch.cuda.empty_cache() # get the GPU ready for training
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=lm_dataset['train'],
  eval_dataset=lm_dataset['test'],
  data_collator=data_collator
)

trainer.train() # this will take a while! about 15-20 minutes for me on Colab.

In [None]:
# text generation pipeline to see the trained model in action
from transformers import pipeline
# `model` is an already-loaded model object rather than a checkpoint name, so
# the pipeline has no name to look a tokenizer up from: pass ours explicitly.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

def generate_messages(prompt='', num=10, max_length=128):
  """Generate `num` continuations of `prompt` and print each one.

  Args:
    prompt: text handed to the generation pipeline as the starting point.
    num: how many sequences to generate (forwarded as `num_return_sequences`).
    max_length: forwarded as `max_new_tokens`, i.e. the limit on newly
      generated tokens per sequence.

  Returns:
    None. Each generated text is printed after a separator line of 20 dashes.
  """
  # Several different sequences for one prompt require sampling, so request it
  # explicitly instead of relying on the model's default generation settings.
  outputs = generator(prompt, num_return_sequences=num, max_new_tokens=max_length, do_sample=True)
  for output in outputs:
    print("-"*20)
    print(output['generated_text'])

In [None]:
# final call. choose the prompt and parameters as desired:
# `num` is the number of messages to generate, and `max_length` is passed on
# as the limit of newly generated tokens per message.
# an example prompt if you want a conversation on a particular topic:
# A: what do you think about large language models?
generate_messages(": ", num=2, max_length=256)