In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset

In [29]:
# Read the cleaned sentence-pair CSV into a DataFrame
csv_file = './../3. Cleaned Data/Japanese_to_English_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=csv_file)

In [30]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,original,word_count_original,corrected_fs,word_count_corrected_fs,word_count_diff,word_count_ratio
0,0,I will appreciate it if you correct my sentences.,9,I would appreciate it if you could correct my ...,10,1,1.111111
1,1,It's been getting colder these days here in Ja...,9,It's been getting colder lately here in Japan.,8,-1,0.888889
2,2,The summer weather in Japan is not agreeable t...,16,I find Japan's summer weather disagreeable bec...,13,-3,0.8125
3,3,"So, as the winter is coming, I'm getting to fe...",11,"So, as the winter is coming, I'm starting to f...",11,0,1.0
4,4,It is the very exciting season.,6,It is a very exciting season.,6,0,1.0


In [31]:
# Rename 'corrected_fs' to 'corrected'. Reassign the result instead of using
# inplace=True, so the frame loaded earlier is not mutated behind other references.
df = df.rename(columns={'corrected_fs': 'corrected'})

In [32]:
# Hold out 10% of the rows for validation (fixed seed keeps the split reproducible)
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Wrap each split in a Hugging Face Dataset
train_dataset, val_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df)
)

In [33]:
# Choose the pretrained checkpoint to load
model_name = "gpt2"

In [35]:
# Pick the training device: prefer CUDA, then MPS, and fall back to CPU
if torch.cuda.is_available():
    dev, banner = "cuda:0", "This model will run on CUDA"
elif torch.backends.mps.is_available():
    dev, banner = "mps:0", "This model will run on MPS"
else:
    dev, banner = "cpu", "This model will run on CPU"
print(banner)
device = torch.device(dev)

print(device)

cuda:0


In [36]:
# Fetch the pretrained tokenizer, config and LM-head model for `model_name`
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)
# Move the model onto the device selected earlier
model = model.to(device)

In [37]:
# Reuse the end-of-sequence token as the padding token so that sequences of
# different lengths can be padded into a batch.
tokenizer.pad_token = tokenizer.eos_token

# Function to tokenize and format input-output pairs
def tokenize_function(examples):
    """Tokenize a batch of (original, corrected) sentence pairs.

    Each pair is rendered as ``"input: <original> output: <corrected>"`` and the
    resulting strings are tokenized in one call.

    Args:
        examples: Batch mapping with ``"original"`` and ``"corrected"`` entries,
            each an equally long sequence of strings (the shape passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    prompts = [
        f"input: {orig} output: {corr}"
        for orig, corr in zip(examples["original"], examples["corrected"])
    ]
    # No padding or return_tensors here: the mapped result is stored as plain
    # lists anyway, and padding every row to the longest of the whole map batch
    # only inflates the dataset. The data collator pads each training batch.
    return tokenizer(prompts, truncation=True, max_length=512)

In [38]:
# Tokenize both splits with identical settings, dropping the raw text columns
map_kwargs = dict(batched=True, remove_columns=['original', 'corrected'])
train_dataset = train_dataset.map(tokenize_function, **map_kwargs)
val_dataset = val_dataset.map(tokenize_function, **map_kwargs)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [None]:
# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Output locations
    output_dir="output",
    overwrite_output_dir=True,
    logging_dir="logs",
    # Run length, batch sizes and warmup
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=200,
    # Step-based cadence for logging, evaluation and checkpointing
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
)

In [None]:
# Next-token cross-entropy loss; label positions set to -100 are excluded
def custom_loss_function(outputs, labels):
    """Compute the shifted (next-token) cross-entropy loss.

    Args:
        outputs: Model output exposing ``.logits`` with shape
            ``(..., seq_len, vocab_size)``.
        labels: Integer tensor of target token ids with shape ``(..., seq_len)``.
            Positions equal to -100 (the ``CrossEntropyLoss`` default
            ``ignore_index``) do not contribute, so callers can mask tokens
            they do not want to train on.

    Returns:
        Scalar tensor: mean cross-entropy over the non-ignored positions.
    """
    logits = outputs.logits
    vocab_size = logits.size(-1)
    # The logit at position t predicts the token at t + 1, so drop the last
    # logit and the first label to align them.
    # CrossEntropyLoss expects (N, C) logits with (N,) targets; without the
    # flattening it would treat the sequence axis as the class axis.
    shift_logits = logits[..., :-1, :].reshape(-1, vocab_size)
    shift_labels = labels[..., 1:].reshape(-1)
    loss = torch.nn.CrossEntropyLoss()(shift_logits, shift_labels)
    return loss

In [None]:
# Define a custom Trainer class that inherits from the original Trainer
class CustomTrainer(Trainer):
    """Trainer whose loss is the shifted next-token cross-entropy of the logits."""

    # Override the compute_loss method to use a custom loss function
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the next-token cross-entropy loss for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch dictionary. Its ``"labels"`` entry is removed (the
                dictionary is mutated) before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so the override stays callable from
                Trainer versions that pass this keyword; it is not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        # Get the labels from the inputs dictionary and remove them from the inputs
        labels = inputs.pop("labels")

        # Get the model outputs by passing the inputs to the model
        outputs = model(**inputs)

        # Extract the logits from the model outputs
        logits = outputs.logits

        # Drop the first label and the last logit so that the logit at position t
        # is scored against the token at t + 1, then flatten both for the loss.
        shift_labels = labels[..., 1:].reshape(-1)
        shift_logits = logits[..., :-1, :].reshape(-1, logits.size(-1))

        loss = torch.nn.CrossEntropyLoss()(shift_logits, shift_labels)

        if return_outputs:
            return loss, outputs

        # Otherwise, just return the loss
        return loss

# Collation is delegated to DataCollatorForLanguageModeling; mlm=False because
# this is not masked language modeling.
lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Build the custom Trainer from the model, arguments, datasets and collator
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lm_collator,
)

In [None]:
# Show the model architecture, then the selected device
for shown in (model, device):
    print(shown)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
cuda:0


In [None]:
# Train the model
# Starts the Trainer's training run using the arguments, datasets and collator
# it was constructed with.
trainer.train()

  0%|          | 101/133179 [00:14<4:28:15,  8.27it/s]

{'loss': 3.0855, 'learning_rate': 2.5e-05, 'epoch': 0.0}


  0%|          | 200/133179 [00:35<11:28:11,  3.22it/s]

{'loss': 2.5259, 'learning_rate': 5e-05, 'epoch': 0.0}


  0%|          | 300/133179 [00:57<9:10:24,  4.02it/s] 

{'loss': 2.4758, 'learning_rate': 4.996240007820784e-05, 'epoch': 0.01}


  0%|          | 401/133179 [01:21<9:06:07,  4.05it/s] 

{'loss': 2.46, 'learning_rate': 4.992480015641568e-05, 'epoch': 0.01}


  0%|          | 500/133179 [01:44<10:28:56,  3.52it/s]

{'loss': 2.4056, 'learning_rate': 4.988720023462352e-05, 'epoch': 0.01}


                                                       
  0%|          | 500/133179 [04:58<10:28:56,  3.52it/s]

{'eval_loss': 2.338057279586792, 'eval_runtime': 193.97, 'eval_samples_per_second': 406.867, 'eval_steps_per_second': 25.432, 'epoch': 0.01}


  0%|          | 601/133179 [05:26<9:35:05,  3.84it/s]   

{'loss': 2.4009, 'learning_rate': 4.984960031283135e-05, 'epoch': 0.01}


  1%|          | 700/133179 [05:52<12:15:18,  3.00it/s]

{'loss': 2.3564, 'learning_rate': 4.981200039103919e-05, 'epoch': 0.02}


  1%|          | 800/133179 [06:18<12:01:28,  3.06it/s]

{'loss': 2.3457, 'learning_rate': 4.9774400469247026e-05, 'epoch': 0.02}


  1%|          | 900/133179 [06:45<12:36:10,  2.92it/s]

{'loss': 2.3542, 'learning_rate': 4.9736800547454864e-05, 'epoch': 0.02}


  1%|          | 1000/133179 [07:14<11:53:15,  3.09it/s]

{'loss': 2.3719, 'learning_rate': 4.96992006256627e-05, 'epoch': 0.02}




KeyboardInterrupt: 



In [None]:
# Write the model and then the tokenizer into the same directory
output_dir = "trained_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

# Then compress with this command: tar czvf trained_model.tar.gz trained_model/
# Upload to git/drive


In [None]:
# Reload the model and its tokenizer from the saved directory
output_dir = "trained_model"
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path=output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=output_dir)
