In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset

## Important Step:
Please change the filename below to the name of the file you want to use for training (do not include the `.csv` extension).

In [19]:
# Name of the cleaned training file, without the ".csv" extension.
# Change this value to train on a different dataset.
filename = '80K'

# Read the cleaned dataset from the "3. Cleaned Data" directory one level up
csv_file = f'./../3. Cleaned Data/{filename}.csv'
df = pd.read_csv(csv_file)

In [20]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,original,word_count_original,corrected_fs,word_count_corrected_fs,word_count_diff,word_count_ratio,o_pos_tags,o_num_verbs,o_num_nouns,...,c_num_modal,num_verbs_diff,num_nouns_diff,num_adjs_diff,num_adv_diff,num_pronoun_diff,num_preposition_diff,num_conjunction_diff,num_article_diff,num_modal_diff
0,539741,"Since I have begun to live in London, I have b...",16,"Since I have begun living in London, I have be...",15,-1,0.9375,"[('Since', 'IN'), ('I', 'PRP'), ('have', 'VBP'...",7,2,...,0,-1,1,0,0,0,0,0,0,0
1,805208,"If she heard my description of her, she would ...",12,"If she had heard my description of her, she wo...",13,1,1.083333,"[('If', 'IN'), ('she', 'PRP'), ('heard', 'VBD'...",2,2,...,1,1,0,0,0,0,0,0,0,0
2,553823,I never can do.,4,I could never do it.,5,1,1.25,"[('I', 'PRP'), ('never', 'RB'), ('can', 'MD'),...",1,0,...,1,0,0,0,0,1,0,0,0,0
3,792625,In the class I learned how to write paragraph ...,10,In the class I learned how to write paragraphs.,9,-1,0.9,"[('In', 'IN'), ('the', 'DT'), ('class', 'NN'),...",2,2,...,0,0,0,-1,0,0,0,0,0,0
4,686799,I think imitating is important thing to learn ...,9,I think imitating is important to learning Eng...,8,-1,0.888889,"[('I', 'PRP'), ('think', 'VBP'), ('imitating',...",4,2,...,0,0,-1,0,0,0,0,0,0,0


In [21]:
# Rename the target column to the name the tokenization step expects.
# Reassigning (instead of inplace=True) keeps the cell idempotent and avoids
# mutating the frame that the preview cell above already displayed.
df = df.rename(columns={'corrected_fs': 'corrected'})

# DataFrame.rename silently ignores keys that are not present, so fail early
# if the two text columns used for training are missing.
missing_columns = {'original', 'corrected'} - set(df.columns)
if missing_columns:
    raise KeyError(f"Training data is missing required columns: {sorted(missing_columns)}")

In [22]:
# Split the dataset into train and validation sets (fixed seed for reproducibility)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert the splits to Hugging Face Dataset instances.
# Only the text pair is needed for training, so the other feature columns are
# left out of the Arrow tables. preserve_index=False stops the shuffled pandas
# index from being stored as an extra "__index_level_0__" column.
text_columns = ['original', 'corrected']
train_dataset = Dataset.from_pandas(train_df[text_columns], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[text_columns], preserve_index=False)

In [23]:
# Choose the pretrained checkpoint to fine-tune
model_name = "gpt2"

In [24]:
# Pick the training device, preferring CUDA, then Apple MPS, then CPU
if torch.cuda.is_available():
    dev, backend_msg = "cuda:0", "This model will run on CUDA"
elif torch.backends.mps.is_available():
    dev, backend_msg = "mps:0", "This model will run on MPS"
else:
    dev, backend_msg = "cpu", "This model will run on CPU"
print(backend_msg)

# Wrap the device string in a torch.device for use with .to(...)
device = torch.device(dev)

print(device)

This model will run on CUDA
cuda:0


In [25]:
# Fetch the pretrained tokenizer and configuration for the chosen checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)

# Load the language-model weights, then move them to the selected device
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)
model = model.to(device)

In [26]:
# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of sentence pairs as "input: ... output: ..." prompts.

    Args:
        examples: Batch mapping with "original" and "corrected" sequences,
            as supplied by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the formatted prompts, truncated to at most
        512 tokens and left unpadded.
    """
    inputs = [
        f"input: {orig} output: {corr}"
        for orig, corr in zip(examples["original"], examples["corrected"])
    ]
    # No padding or tensor conversion here: padding inside map() pads every
    # sequence to the longest one in the whole map batch, and Dataset.map
    # stores plain lists anyway. The data collator passed to the Trainer pads
    # each training batch to its own longest sequence instead.
    return tokenizer(inputs, truncation=True, max_length=512)

In [None]:
# Replace the raw text columns with token ids in both splits
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True, remove_columns=['original', 'corrected'])
    for split in (train_dataset, val_dataset)
)

In [28]:
# Hyperparameters and bookkeeping options for the Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="output",
    overwrite_output_dir=True,
    logging_dir="logs",
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=200,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint every 500 steps, log every 100
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
)

In [29]:
# Next-token (causal language modelling) cross-entropy over all non-ignored positions
def custom_loss_function(outputs, labels):
    """Compute the shifted next-token cross-entropy loss.

    Args:
        outputs: Model output object exposing `.logits` with shape
            (..., seq_len, vocab_size).
        labels: Integer tensor of target token ids with shape (..., seq_len).
            Positions equal to -100 are skipped, which is the default
            `ignore_index` of `torch.nn.CrossEntropyLoss`.

    Returns:
        A scalar tensor holding the mean loss over the non-ignored positions.
    """
    logits = outputs.logits

    # The logits at position t predict the token at position t + 1, so drop
    # the last logit and the first label to align them.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    # CrossEntropyLoss expects the class dimension second, so flatten to
    # (num_tokens, vocab_size) logits and (num_tokens,) targets.
    loss = torch.nn.CrossEntropyLoss()(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
    )
    return loss

In [30]:
# Define a custom Trainer class that inherits from the original Trainer
class CustomTrainer(Trainer):
    """Trainer that computes the shifted next-token cross-entropy loss itself."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the causal language-modelling loss for one batch.

        Args:
            model: The model being trained; called with the batch inputs.
            inputs: Batch dictionary containing a "labels" entry plus the
                model inputs. The "labels" entry is removed from it.
            return_outputs: When true, return `(loss, outputs)` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass this keyword to `compute_loss`; it is not
                used by this implementation.

        Returns:
            The scalar loss, or a `(loss, outputs)` tuple when
            `return_outputs` is true.
        """
        # Take the labels out so the model does not compute its own loss
        labels = inputs.pop("labels")

        # Forward pass on the remaining inputs
        outputs = model(**inputs)
        logits = outputs.logits

        # The logits at position t predict the token at position t + 1:
        # drop the first label and the last logit, then flatten both so the
        # loss sees (num_tokens, vocab_size) logits and (num_tokens,) targets.
        shift_labels = labels[..., 1:].reshape(-1)
        shift_logits = logits[..., :-1, :].reshape(-1, logits.size(-1))

        # Targets equal to -100 are skipped (CrossEntropyLoss default ignore_index)
        loss = torch.nn.CrossEntropyLoss()(shift_logits, shift_labels)

        if return_outputs:
            return loss, outputs

        # Otherwise, just return the loss
        return loss

# Collate batches for causal language modelling (mlm=False disables masked-LM)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Build the trainer that uses the overridden compute_loss
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [31]:
# Show the model architecture followed by the device in use
print(model, device, sep="\n")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
cuda:0


In [None]:
# Train the model: runs the fine-tuning loop with the training arguments,
# datasets and data collator given to the trainer above
trainer.train()

In [33]:
# Write the fine-tuned weights and the tokenizer files to a per-dataset folder
output_dir = f"../7. Models/{filename}/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# To share the model, compress the saved directory, e.g.:
#   tar czvf trained_model.tar.gz trained_model/
# then upload the archive to git/drive


('../7. Models/80K/tokenizer_config.json',
 '../7. Models/80K/special_tokens_map.json',
 '../7. Models/80K/vocab.json',
 '../7. Models/80K/merges.txt',
 '../7. Models/80K/added_tokens.json')

In [34]:
# Reload the fine-tuned model and its tokenizer from the saved folder
output_dir = f"../7. Models/{filename}/"
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)
