In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset

## Important Step:
Please set `filename` in the cell below to the name of the file you want to use for training (do not include the `.csv` extension).

In [2]:
# Name of the training file, without the '.csv' extension.
# Edit this value to train on a different cleaned data file.
filename = 'MDN_20K_v2'


# Read the cleaned CSV that matches `filename` into a DataFrame
csv_file = f'../3. Cleaned Data/{filename}.csv'
df = pd.read_csv(csv_file)

In [3]:
# Preview the first five rows to sanity-check the loaded columns
df.head(5)

Unnamed: 0.1,Unnamed: 0,original,corrected,original_par,corrected_par,word_count_original,word_count_corrected,word_count_diff,word_count_ratio
0,2086,"My dog was die and I lose my job, my bussness ...","My dog died and I lost my job, my business fai...",,,18,16,-2,0.888889
1,32396,I don't kown why my aesthetic standard is so s...,I don't know why my aesthetic standard is so s...,,,10,10,0,1.0
2,84278,The technical usually in the end of the song.,I usually show my technic at the end of the song.,,,9,11,2,1.222222
3,58304,I ever be the Asian Games Volunteer.,I have ever been the Asian Games Volunteer.,,,7,8,1,1.142857
4,76587,I want to change it!,I want to change this!,,,5,5,0,1.0


In [4]:
# Normalise the target column name to 'corrected'.
# rename() ignores keys that are not present, so a file that already uses
# 'corrected' (and has no 'corrected_fs' column) passes through unchanged.
# Rebinding instead of inplace=True keeps the step explicit and chainable.
df = df.rename(columns={'corrected_fs': 'corrected'})

# Fail early with a clear message if the columns read by the tokenisation
# step are missing, instead of erroring later inside Dataset.map.
missing_columns = {'original', 'corrected'} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Training file is missing required column(s): {sorted(missing_columns)}"
    )

In [5]:
# Hold out 10% of the rows for validation; the fixed random_state makes the split repeatable
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Wrap both pandas frames as Hugging Face Dataset objects (train first, then validation)
train_dataset, val_dataset = map(Dataset.from_pandas, (train_df, val_df))

In [6]:
# Write the held-out validation rows next to the cleaned data, without the index column
val_file = f'./../3. Cleaned Data/{filename}_val.csv'
val_df.to_csv(val_file, index=False)

In [7]:
# Choose the pretrained checkpoint to fine-tune
model_name = "gpt2"

In [8]:
# Pick the training device: CUDA if present, otherwise Apple MPS, otherwise the CPU
if torch.cuda.is_available():
    dev = "cuda:0"
elif torch.backends.mps.is_available():
    dev = "mps:0"
else:
    dev = "cpu"

# Report which backend was selected
print({
    "cuda:0": "This model will run on CUDA",
    "mps:0": "This model will run on MPS",
    "cpu": "This model will run on CPU",
}[dev])

device = torch.device(dev)

print(device)

This model will run on CUDA
cuda:0


In [9]:
# Fetch the pretrained tokenizer and configuration for the chosen checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)

# Instantiate the language-model head with that configuration, then move it to the training device
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)
model = model.to(device)

In [10]:
# Use the end-of-sequence token as the padding token, so the tokenizer has a
# token to pad with when sequences are padded to a common length
tokenizer.pad_token = tokenizer.eos_token

# Build "input: ... output: ..." training strings and tokenize them
def tokenize_function(examples):
    """Format a batch of sentence pairs as prompts and tokenize them.

    Args:
        examples: Mapping with parallel sequences under the keys
            "original" and "corrected".

    Returns:
        Whatever the module-level ``tokenizer`` returns for the formatted
        strings, called with padding, truncation to 512 tokens and
        PyTorch tensors requested.
    """
    prompts = []
    for source_text, target_text in zip(examples["original"], examples["corrected"]):
        # One string per pair; nothing is appended after the corrected text
        prompts.append(f"input: {source_text} output: {target_text}")
    return tokenizer(
        prompts,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding=True,
    )

In [11]:
# Tokenize both splits in batches (train first, then validation) and drop the raw text columns
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True, remove_columns=['original', 'corrected'])
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [12]:
# Hyper-parameters and bookkeeping settings for the Trainer run
training_args = TrainingArguments(
    # Output locations
    output_dir="output",
    overwrite_output_dir=True,
    logging_dir="logs",
    # Training length, warm-up and batch sizes
    num_train_epochs=3,
    warmup_steps=200,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Step-based intervals for logging, evaluation and saving
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
)

In [13]:
# Shifted next-token cross-entropy loss for a causal language model
def custom_loss_function(outputs, labels):
    """Compute the next-token cross-entropy loss from model logits.

    The logits at position ``i`` are scored against the label at position
    ``i + 1``, so the last logit and the first label are dropped.

    Note: this does not by itself restrict the loss to the "output" part of
    a prompt. Every position contributes unless its label equals -100, the
    default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``; callers that
    want to skip prompt tokens must set those labels to -100.

    Args:
        outputs: Object exposing a ``logits`` tensor indexed as
            ``[..., sequence, vocab]``.
        labels: Integer tensor of target token ids indexed as
            ``[..., sequence]``, aligned with ``outputs.logits``.

    Returns:
        Scalar tensor: the mean cross-entropy over non-ignored positions.
    """
    shift_logits = outputs.logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    # CrossEntropyLoss treats dim 1 as the class dimension, so passing the
    # (batch, seq, vocab) logits directly fails or scores the wrong axis.
    # Flatten to (N, vocab) logits and (N,) targets first.
    flat_logits = shift_logits.view(-1, shift_logits.size(-1))
    flat_labels = shift_labels.view(-1)
    loss = torch.nn.CrossEntropyLoss()(flat_logits, flat_labels)
    return loss

In [14]:
# Define a custom Trainer class that inherits from the original Trainer
class CustomTrainer(Trainer):
    """Trainer that computes the causal-LM loss itself from the model logits.

    The loss is the shifted next-token cross-entropy: logits at position
    ``i`` are scored against the label at position ``i + 1``. Positions whose
    label is -100 (the default ``ignore_index`` of ``CrossEntropyLoss``) are
    skipped.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the shifted cross-entropy loss for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch dictionary. Its "labels" entry is removed before
                the forward pass and used as the loss target.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with newer
                ``Trainer`` versions, which pass this keyword when calling
                ``compute_loss``. It is unused here because the loss is
                already a mean over the non-ignored tokens.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` if
            ``return_outputs`` is true.
        """
        # Take the labels out so the model does not compute its own loss
        labels = inputs.pop("labels")

        # Forward pass without labels; only the logits are needed
        outputs = model(**inputs)
        logits = outputs.logits

        # Drop the first label and the last logit so each logit is scored
        # against the following token, then flatten to (N,) and (N, vocab)
        shift_labels = labels[..., 1:].reshape(-1)
        shift_logits = logits[..., :-1, :].reshape(-1, logits.size(-1))

        loss = torch.nn.CrossEntropyLoss()(shift_logits, shift_labels)

        if return_outputs:
            return loss, outputs

        # Otherwise, just return the loss
        return loss

# Build the trainer from the custom subclass defined above
trainer = CustomTrainer(
    # Collation is handled by DataCollatorForLanguageModeling;
    # mlm=False because this is not masked language modeling
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [15]:
# Show the model architecture followed by the selected device, one per line
print(model, device, sep="\n")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
cuda:0


In [16]:
# Run fine-tuning with the trainer configured above; as the cell's last expression,
# the value returned by train() is shown as the cell output
trainer.train()



  0%|          | 0/1689 [00:00<?, ?it/s]

{'loss': 3.0427, 'learning_rate': 2.5e-05, 'epoch': 0.18}
{'loss': 2.5509, 'learning_rate': 5e-05, 'epoch': 0.36}
{'loss': 2.4852, 'learning_rate': 4.664204163868368e-05, 'epoch': 0.53}
{'loss': 2.4492, 'learning_rate': 4.3284083277367364e-05, 'epoch': 0.71}
{'loss': 2.4355, 'learning_rate': 3.9926124916051045e-05, 'epoch': 0.89}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.3500618934631348, 'eval_runtime': 5.0896, 'eval_samples_per_second': 392.959, 'eval_steps_per_second': 12.378, 'epoch': 0.89}
{'loss': 2.3621, 'learning_rate': 3.6568166554734725e-05, 'epoch': 1.07}
{'loss': 2.3002, 'learning_rate': 3.32102081934184e-05, 'epoch': 1.24}
{'loss': 2.271, 'learning_rate': 2.9852249832102087e-05, 'epoch': 1.42}
{'loss': 2.2643, 'learning_rate': 2.6494291470785764e-05, 'epoch': 1.6}
{'loss': 2.3108, 'learning_rate': 2.3136333109469445e-05, 'epoch': 1.78}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.317484140396118, 'eval_runtime': 4.9403, 'eval_samples_per_second': 404.83, 'eval_steps_per_second': 12.752, 'epoch': 1.78}
{'loss': 2.2656, 'learning_rate': 1.9778374748153125e-05, 'epoch': 1.95}
{'loss': 2.2205, 'learning_rate': 1.6420416386836803e-05, 'epoch': 2.13}
{'loss': 2.2013, 'learning_rate': 1.3062458025520485e-05, 'epoch': 2.31}
{'loss': 2.1893, 'learning_rate': 9.704499664204164e-06, 'epoch': 2.49}
{'loss': 2.199, 'learning_rate': 6.346541302887844e-06, 'epoch': 2.66}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.3089771270751953, 'eval_runtime': 4.9892, 'eval_samples_per_second': 400.866, 'eval_steps_per_second': 12.627, 'epoch': 2.66}
{'loss': 2.1755, 'learning_rate': 2.988582941571525e-06, 'epoch': 2.84}
{'train_runtime': 629.2269, 'train_samples_per_second': 85.82, 'train_steps_per_second': 2.684, 'train_loss': 2.3493096016792943, 'epoch': 3.0}


TrainOutput(global_step=1689, training_loss=2.3493096016792943, metrics={'train_runtime': 629.2269, 'train_samples_per_second': 85.82, 'train_steps_per_second': 2.684, 'train_loss': 2.3493096016792943, 'epoch': 3.0})

In [17]:
# Persist the fine-tuned weights and the tokenizer files into the same folder
output_dir = f"../7. Models/GPT2_{filename}/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Then compress with this command: tar czvf trained_model.tar.gz trained_model/
# Upload to git/drive


('../7. Models/GPT2_MDN_20K_v2/tokenizer_config.json',
 '../7. Models/GPT2_MDN_20K_v2/special_tokens_map.json',
 '../7. Models/GPT2_MDN_20K_v2/vocab.json',
 '../7. Models/GPT2_MDN_20K_v2/merges.txt',
 '../7. Models/GPT2_MDN_20K_v2/added_tokens.json')

In [18]:
# Reload the fine-tuned tokenizer and model from the folder they were saved to
output_dir = f"../7. Models/GPT2_{filename}/"
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)
