In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset

## Important Step:
Please change the filename in the cell below to the file you want to use for training (this should not include the `.csv` extension).

In [23]:
# Name of the cleaned training file, without the ".csv" extension.
# Edit this value to train on a different file.
filename = '40K'

# Build the path to the cleaned CSV and read it into a DataFrame
csv_file = f'./../3. Cleaned Data/{filename}.csv'
df = pd.read_csv(csv_file)

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,original,word_count_original,corrected_fs,word_count_corrected_fs,word_count_diff,word_count_ratio,o_pos_tags,o_num_verbs,o_num_nouns,...,c_num_modal,num_verbs_diff,num_nouns_diff,num_adjs_diff,num_adv_diff,num_pronoun_diff,num_preposition_diff,num_conjunction_diff,num_article_diff,num_modal_diff
0,539741,"Since I have begun to live in London, I have b...",16,"Since I have begun living in London, I have be...",15,-1,0.9375,"[('Since', 'IN'), ('I', 'PRP'), ('have', 'VBP'...",7,2,...,0,-1,1,0,0,0,0,0,0,0
1,805208,"If she heard my description of her, she would ...",12,"If she had heard my description of her, she wo...",13,1,1.083333,"[('If', 'IN'), ('she', 'PRP'), ('heard', 'VBD'...",2,2,...,1,1,0,0,0,0,0,0,0,0
2,553823,I never can do.,4,I could never do it.,5,1,1.25,"[('I', 'PRP'), ('never', 'RB'), ('can', 'MD'),...",1,0,...,1,0,0,0,0,1,0,0,0,0
3,792625,In the class I learned how to write paragraph ...,10,In the class I learned how to write paragraphs.,9,-1,0.9,"[('In', 'IN'), ('the', 'DT'), ('class', 'NN'),...",2,2,...,0,0,0,-1,0,0,0,0,0,0
4,686799,I think imitating is important thing to learn ...,9,I think imitating is important to learning Eng...,8,-1,0.888889,"[('I', 'PRP'), ('think', 'VBP'), ('imitating',...",4,2,...,0,0,-1,0,0,0,0,0,0,0


In [4]:
# Give the corrected-sentence column the shorter name 'corrected'
df = df.rename(columns={'corrected_fs': 'corrected'})

In [5]:
# Hold out 10% of the rows for validation (fixed seed so the split is repeatable)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Wrap each pandas split in a Hugging Face Dataset
train_dataset, val_dataset = [
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
]

In [6]:
# Choose the pretrained checkpoint to fine-tune; this name is passed to the
# `from_pretrained` calls that load the tokenizer, config and model.
model_name = 'gpt2'

In [7]:
# Pick the training device: CUDA first, then Apple MPS, otherwise the CPU
if torch.cuda.is_available():
    dev, backend_label = "cuda:0", "CUDA"
elif torch.backends.mps.is_available():
    dev, backend_label = "mps:0", "MPS"
else:
    dev, backend_label = "cpu", "CPU"

print(f"This model will run on {backend_label}")
device = torch.device(dev)

print(device)

This model will run on CUDA
cuda:0


In [8]:
# Fetch the pretrained tokenizer, configuration and weights for `model_name`
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

# Place the model on the selected training device
model = model.to(device)

In [9]:
# Reuse the end-of-sequence token as the padding token so that padded batches
# can be built (the stock GPT-2 tokenizer is understood to ship without a pad
# token — TODO confirm if a different checkpoint is used).
tokenizer.pad_token = tokenizer.eos_token

# Function to tokenize and format input-output pairs
def tokenize_function(examples):
    """Tokenize a batch of sentence pairs as "input: ... output: ..." prompts.

    Args:
        examples: Batch mapping with "original" and "corrected" columns of
            equal length, as supplied by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the formatted prompts, truncated to at most
        512 tokens per example and left unpadded.
    """
    inputs = [
        f"input: {orig} output: {corr}"
        for orig, corr in zip(examples["original"], examples["corrected"])
    ]
    # Deliberately no padding and no tensor conversion here: padding inside a
    # batched `map` stretches every row to the longest row of its chunk, and the
    # values are stored as plain lists anyway. Padding is left to the data
    # collator, which pads each training batch to its own longest sequence.
    return tokenizer(inputs, truncation=True, max_length=512)

In [10]:
# Tokenize both splits in batches and drop the raw text columns afterwards
train_dataset, val_dataset = [
    split.map(tokenize_function, batched=True, remove_columns=['original', 'corrected'])
    for split in (train_dataset, val_dataset)
]

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [11]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same 500-step schedule; matching schedules
    # are required for `load_best_model_at_end`.
    eval_steps=500,
    save_steps=500,
    warmup_steps=200,
    logging_dir="logs",
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # rename it if the installed version rejects `evaluation_strategy`.
    evaluation_strategy="steps",
    logging_steps=100,
    # Validation loss can start rising before training ends, so restore the
    # checkpoint with the lowest eval loss rather than keeping the last weights.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Keep only the best and the most recent checkpoint to bound disk usage.
    save_total_limit=2,
)

In [12]:
# Next-token cross-entropy loss for a causal language model.
# It only "focuses" on particular tokens if the caller marks every other label
# position with -100, which CrossEntropyLoss ignores by default.
def custom_loss_function(outputs, labels):
    """Compute the shifted next-token cross-entropy loss.

    Args:
        outputs: Model output exposing `.logits`, whose last two dimensions are
            (sequence, vocabulary).
        labels: Integer tensor of target token ids with the same leading
            dimensions as the logits, minus the vocabulary dimension.
            Positions set to -100 do not contribute to the loss.

    Returns:
        Scalar tensor holding the mean loss over the non-ignored positions.
    """
    vocab_size = outputs.logits.size(-1)
    # Position t predicts token t+1: drop the last logit and the first label.
    # Flatten to (N, vocab) and (N,) because CrossEntropyLoss treats dim 1 as
    # the class dimension and would otherwise reject these shapes.
    shift_logits = outputs.logits[..., :-1, :].reshape(-1, vocab_size)
    shift_labels = labels[..., 1:].reshape(-1)
    loss = torch.nn.CrossEntropyLoss()(shift_logits, shift_labels)
    return loss

In [13]:
# Define a custom Trainer class that inherits from the original Trainer
class CustomTrainer(Trainer):
    """Trainer that computes the shifted next-token cross-entropy loss itself."""

    # Override the compute_loss method to use a custom loss function
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on a batch and return its causal-LM loss.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dictionary; must contain "labels", which is removed
                from the dictionary before the forward pass.
            return_outputs: When true, also return the raw model outputs.
            num_items_in_batch: Accepted so the override stays callable by
                Trainer versions that pass this keyword; it is not used and
                the loss is always a mean over non-ignored positions.

        Returns:
            The scalar loss, or a `(loss, outputs)` tuple when
            `return_outputs` is true.
        """
        # Take the labels out so the model does not compute its own loss
        labels = inputs.pop("labels")

        outputs = model(**inputs)
        logits = outputs.logits

        # Position t predicts token t+1: drop the last logit and the first
        # label, then flatten to (N, vocab) and (N,) for CrossEntropyLoss.
        shift_logits = logits[..., :-1, :].reshape(-1, logits.size(-1))
        shift_labels = labels[..., 1:].reshape(-1)

        # Label positions equal to -100 are skipped
        loss = torch.nn.CrossEntropyLoss(ignore_index=-100)(shift_logits, shift_labels)

        if return_outputs:
            return loss, outputs

        # Otherwise, just return the loss
        return loss

# Collator for causal language modelling: mlm=False turns off masked-LM masking
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire the model, arguments, datasets and collator into the custom trainer
trainer = CustomTrainer(
    data_collator=causal_lm_collator,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

In [14]:
# Show the model architecture followed by the device in use
print(model, device, sep="\n")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
cuda:0


In [15]:
# Train the model
# Runs the fine-tuning loop configured through `training_args`; as the last
# expression of the cell, the returned training summary is displayed.
trainer.train()



  0%|          | 0/6750 [00:00<?, ?it/s]

{'loss': 3.1382, 'learning_rate': 2.5e-05, 'epoch': 0.04}
{'loss': 2.5698, 'learning_rate': 5e-05, 'epoch': 0.09}
{'loss': 2.5266, 'learning_rate': 4.923664122137405e-05, 'epoch': 0.13}
{'loss': 2.4917, 'learning_rate': 4.847328244274809e-05, 'epoch': 0.18}
{'loss': 2.4935, 'learning_rate': 4.7709923664122144e-05, 'epoch': 0.22}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.3650121688842773, 'eval_runtime': 2.9561, 'eval_samples_per_second': 676.575, 'eval_steps_per_second': 84.572, 'epoch': 0.22}
{'loss': 2.4111, 'learning_rate': 4.694656488549619e-05, 'epoch': 0.27}
{'loss': 2.4188, 'learning_rate': 4.618320610687023e-05, 'epoch': 0.31}
{'loss': 2.4324, 'learning_rate': 4.541984732824428e-05, 'epoch': 0.36}
{'loss': 2.4366, 'learning_rate': 4.465648854961832e-05, 'epoch': 0.4}
{'loss': 2.4007, 'learning_rate': 4.389312977099237e-05, 'epoch': 0.44}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.3255860805511475, 'eval_runtime': 2.9581, 'eval_samples_per_second': 676.104, 'eval_steps_per_second': 84.513, 'epoch': 0.44}
{'loss': 2.388, 'learning_rate': 4.312977099236641e-05, 'epoch': 0.49}
{'loss': 2.3941, 'learning_rate': 4.236641221374046e-05, 'epoch': 0.53}
{'loss': 2.3798, 'learning_rate': 4.160305343511451e-05, 'epoch': 0.58}
{'loss': 2.4025, 'learning_rate': 4.0839694656488554e-05, 'epoch': 0.62}
{'loss': 2.331, 'learning_rate': 4.00763358778626e-05, 'epoch': 0.67}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.293044328689575, 'eval_runtime': 2.9301, 'eval_samples_per_second': 682.569, 'eval_steps_per_second': 85.321, 'epoch': 0.67}
{'loss': 2.3312, 'learning_rate': 3.9312977099236644e-05, 'epoch': 0.71}
{'loss': 2.3359, 'learning_rate': 3.854961832061069e-05, 'epoch': 0.76}
{'loss': 2.3214, 'learning_rate': 3.778625954198473e-05, 'epoch': 0.8}
{'loss': 2.3077, 'learning_rate': 3.702290076335878e-05, 'epoch': 0.84}
{'loss': 2.3597, 'learning_rate': 3.625954198473282e-05, 'epoch': 0.89}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.2830474376678467, 'eval_runtime': 2.9374, 'eval_samples_per_second': 680.866, 'eval_steps_per_second': 85.108, 'epoch': 0.89}
{'loss': 2.3186, 'learning_rate': 3.549618320610687e-05, 'epoch': 0.93}
{'loss': 2.3074, 'learning_rate': 3.473282442748092e-05, 'epoch': 0.98}
{'loss': 2.2553, 'learning_rate': 3.3969465648854964e-05, 'epoch': 1.02}
{'loss': 2.0927, 'learning_rate': 3.320610687022901e-05, 'epoch': 1.07}
{'loss': 2.1058, 'learning_rate': 3.2442748091603054e-05, 'epoch': 1.11}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.285987377166748, 'eval_runtime': 2.9736, 'eval_samples_per_second': 672.596, 'eval_steps_per_second': 84.075, 'epoch': 1.11}
{'loss': 2.1393, 'learning_rate': 3.16793893129771e-05, 'epoch': 1.16}
{'loss': 2.1397, 'learning_rate': 3.091603053435115e-05, 'epoch': 1.2}
{'loss': 2.1267, 'learning_rate': 3.0152671755725192e-05, 'epoch': 1.24}
{'loss': 2.1414, 'learning_rate': 2.9389312977099237e-05, 'epoch': 1.29}
{'loss': 2.1671, 'learning_rate': 2.862595419847328e-05, 'epoch': 1.33}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.2715320587158203, 'eval_runtime': 2.9461, 'eval_samples_per_second': 678.871, 'eval_steps_per_second': 84.859, 'epoch': 1.33}
{'loss': 2.1311, 'learning_rate': 2.7862595419847333e-05, 'epoch': 1.38}
{'loss': 2.1323, 'learning_rate': 2.7099236641221375e-05, 'epoch': 1.42}
{'loss': 2.1054, 'learning_rate': 2.633587786259542e-05, 'epoch': 1.47}
{'loss': 2.152, 'learning_rate': 2.5572519083969464e-05, 'epoch': 1.51}
{'loss': 2.1499, 'learning_rate': 2.4809160305343512e-05, 'epoch': 1.56}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.2694404125213623, 'eval_runtime': 2.9689, 'eval_samples_per_second': 673.649, 'eval_steps_per_second': 84.206, 'epoch': 1.56}
{'loss': 2.1065, 'learning_rate': 2.404580152671756e-05, 'epoch': 1.6}
{'loss': 2.128, 'learning_rate': 2.3282442748091605e-05, 'epoch': 1.64}
{'loss': 2.1369, 'learning_rate': 2.2519083969465647e-05, 'epoch': 1.69}
{'loss': 2.1528, 'learning_rate': 2.1755725190839695e-05, 'epoch': 1.73}
{'loss': 2.1486, 'learning_rate': 2.099236641221374e-05, 'epoch': 1.78}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.264956474304199, 'eval_runtime': 2.92, 'eval_samples_per_second': 684.932, 'eval_steps_per_second': 85.617, 'epoch': 1.78}
{'loss': 2.1221, 'learning_rate': 2.0229007633587788e-05, 'epoch': 1.82}
{'loss': 2.0999, 'learning_rate': 1.9465648854961833e-05, 'epoch': 1.87}
{'loss': 2.1195, 'learning_rate': 1.8702290076335878e-05, 'epoch': 1.91}
{'loss': 2.1016, 'learning_rate': 1.7938931297709923e-05, 'epoch': 1.96}
{'loss': 2.1447, 'learning_rate': 1.717557251908397e-05, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.261399507522583, 'eval_runtime': 2.9322, 'eval_samples_per_second': 682.089, 'eval_steps_per_second': 85.261, 'epoch': 2.0}
{'loss': 2.0226, 'learning_rate': 1.6412213740458016e-05, 'epoch': 2.04}
{'loss': 2.0335, 'learning_rate': 1.5648854961832064e-05, 'epoch': 2.09}
{'loss': 1.9968, 'learning_rate': 1.4885496183206107e-05, 'epoch': 2.13}
{'loss': 1.9707, 'learning_rate': 1.4122137404580155e-05, 'epoch': 2.18}
{'loss': 2.0311, 'learning_rate': 1.3358778625954198e-05, 'epoch': 2.22}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.2775356769561768, 'eval_runtime': 2.9464, 'eval_samples_per_second': 678.802, 'eval_steps_per_second': 84.85, 'epoch': 2.22}
{'loss': 1.9978, 'learning_rate': 1.2595419847328243e-05, 'epoch': 2.27}
{'loss': 1.9913, 'learning_rate': 1.1832061068702292e-05, 'epoch': 2.31}
{'loss': 1.9719, 'learning_rate': 1.1068702290076336e-05, 'epoch': 2.36}
{'loss': 2.0133, 'learning_rate': 1.0305343511450383e-05, 'epoch': 2.4}
{'loss': 2.0168, 'learning_rate': 9.541984732824428e-06, 'epoch': 2.44}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.276672840118408, 'eval_runtime': 2.9495, 'eval_samples_per_second': 678.091, 'eval_steps_per_second': 84.761, 'epoch': 2.44}
{'loss': 2.0195, 'learning_rate': 8.778625954198473e-06, 'epoch': 2.49}
{'loss': 2.0133, 'learning_rate': 8.015267175572519e-06, 'epoch': 2.53}
{'loss': 2.0059, 'learning_rate': 7.251908396946565e-06, 'epoch': 2.58}
{'loss': 1.97, 'learning_rate': 6.4885496183206104e-06, 'epoch': 2.62}
{'loss': 1.9781, 'learning_rate': 5.725190839694657e-06, 'epoch': 2.67}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.2761130332946777, 'eval_runtime': 2.9775, 'eval_samples_per_second': 671.713, 'eval_steps_per_second': 83.964, 'epoch': 2.67}
{'loss': 1.9875, 'learning_rate': 4.961832061068703e-06, 'epoch': 2.71}
{'loss': 1.9956, 'learning_rate': 4.198473282442748e-06, 'epoch': 2.76}
{'loss': 1.9862, 'learning_rate': 3.4351145038167944e-06, 'epoch': 2.8}
{'loss': 1.9942, 'learning_rate': 2.6717557251908397e-06, 'epoch': 2.84}
{'loss': 1.9888, 'learning_rate': 1.908396946564886e-06, 'epoch': 2.89}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 2.2746191024780273, 'eval_runtime': 2.928, 'eval_samples_per_second': 683.071, 'eval_steps_per_second': 85.384, 'epoch': 2.89}
{'loss': 1.9911, 'learning_rate': 1.1450381679389313e-06, 'epoch': 2.93}
{'loss': 2.0007, 'learning_rate': 3.816793893129771e-07, 'epoch': 2.98}
{'train_runtime': 464.6042, 'train_samples_per_second': 116.228, 'train_steps_per_second': 14.528, 'train_loss': 2.1862977984393086, 'epoch': 3.0}


TrainOutput(global_step=6750, training_loss=2.1862977984393086, metrics={'train_runtime': 464.6042, 'train_samples_per_second': 116.228, 'train_steps_per_second': 14.528, 'train_loss': 2.1862977984393086, 'epoch': 3.0})

In [24]:
# Write the fine-tuned weights and the tokenizer files into a per-dataset folder.
# Optional follow-up: archive that folder (e.g. with `tar czvf`) and upload it
# to git/drive.
output_dir = f"../7. Models/{filename}/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('../7. Models/40K/tokenizer_config.json',
 '../7. Models/40K/special_tokens_map.json',
 '../7. Models/40K/vocab.json',
 '../7. Models/40K/merges.txt',
 '../7. Models/40K/added_tokens.json')

In [21]:
# Reload the fine-tuned model and tokenizer from the per-dataset folder
# NOTE(review): the reloaded model is not moved to `device` here — confirm
# whether later cells expect it on the GPU.
output_dir = f"../7. Models/{filename}/"
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)
