In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext.data.metrics import bleu_score

from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random

from tqdm.auto import tqdm

In [3]:
data_file_open = open('/content/data_f.txt', 'r', encoding='UTF-8')
label_file_open = open('/content/label_f.txt', 'r', encoding='UTF-8')
data_lines=data_file_open.readlines()
label_lines = label_file_open.readlines()

input_dataset = []
output_dataset = []
dataset = []


for item in data_lines:
    input_dataset.append(item.strip())
for item in label_lines:
    output_dataset.append(item.strip())

In [4]:
print(len(input_dataset))
print(len(output_dataset))

42535
42535


In [5]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)

tokenizer.pad_token = tokenizer.eos_token

In [6]:
dataset = []
for i in range(len(input_dataset)):
    data = input_dataset[i]+' = '+output_dataset[i]
    #if(len(data)>200):
    #    continue
    #if(len(data)<20):
    #    continue
    dataset.append(data)

In [7]:
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2)

In [8]:
print(len(train_dataset))
print(len(val_dataset))

34028
8507


In [9]:
tokenized_dataset_train = []

tokenized_dataset_val = []


for data in train_dataset:
    tokenized_dataset_train.append(tokenizer.encode(data))

for data in val_dataset:
    tokenized_dataset_val.append(tokenizer.encode(data))

In [10]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
print(model.named_parameters())

<generator object Module.named_parameters at 0x7f3adb92f3d0>


In [12]:
for name, param in model.named_parameters():
    if name.startswith("transformer.h.9"):
        param.requires_grad = True
    elif name.startswith("transformer.h.10"):
        param.requires_grad = True
    elif name.startswith("transformer.h.11"):
        param.requires_grad = True
    elif name.startswith("transformer.ln_f"):
        param.requires_grad = True
    else:
        param.requires_grad = False

In [13]:
for name, param in model.named_parameters():
    if name.startswith("transformer.ln_f"):
        print(name)
        print(param.requires_grad)

transformer.ln_f.weight
True
transformer.ln_f.bias
True


In [14]:
training_args = TrainingArguments(
          output_dir='/content/train_2',
          learning_rate=8e-5,
          per_device_train_batch_size=8,
          per_device_eval_batch_size=8,
          num_train_epochs=5,
          evaluation_strategy = "epoch",
          save_strategy='epoch'
      )

In [15]:
trainer = Trainer(
          model=model,
          args=training_args,
          train_dataset=tokenized_dataset_train,
          eval_dataset=tokenized_dataset_val,
          data_collator=data_collator)

In [16]:
trainer.train()

***** Running training *****
  Num examples = 34028
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 21270
  Number of trainable parameters = 21265152


Epoch,Training Loss,Validation Loss
1,3.2822,3.078888
2,3.1653,3.010662
3,3.065,2.985982
4,3.0348,2.972035
5,2.9631,2.969222


***** Running Evaluation *****
  Num examples = 8507
  Batch size = 8
Saving model checkpoint to /content/train_2/checkpoint-4254
Configuration saved in /content/train_2/checkpoint-4254/config.json
Model weights saved in /content/train_2/checkpoint-4254/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8507
  Batch size = 8
Saving model checkpoint to /content/train_2/checkpoint-8508
Configuration saved in /content/train_2/checkpoint-8508/config.json
Model weights saved in /content/train_2/checkpoint-8508/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8507
  Batch size = 8
Saving model checkpoint to /content/train_2/checkpoint-12762
Configuration saved in /content/train_2/checkpoint-12762/config.json
Model weights saved in /content/train_2/checkpoint-12762/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8507
  Batch size = 8
Saving model checkpoint to /content/train_2/checkpoint-17016
Configuration saved in /content/train_2/checkpoint

TrainOutput(global_step=21270, training_loss=3.1426242327992773, metrics={'train_runtime': 1844.4508, 'train_samples_per_second': 92.244, 'train_steps_per_second': 11.532, 'total_flos': 4490748582912000.0, 'train_loss': 3.1426242327992773, 'epoch': 5.0})

In [17]:
trainer.save_model()

Saving model checkpoint to /content/train_2
Configuration saved in /content/train_2/config.json
Model weights saved in /content/train_2/pytorch_model.bin


In [18]:
model = GPT2LMHeadModel.from_pretrained('/content/train_2')

loading configuration file /content/train_2/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_siz

In [38]:
train_dataset[100]

"Yes, bloody cloth, I'll keep you, because I wanted you to be this color. = Yea, bloody cloth, I'll keep thee, for I wish'd Thou shouldst be colour'd thus."

In [20]:
val_dataset[99]

'Freedom! = Freedom!'

In [21]:
input = tokenizer.encode(input_dataset[0]+' = ')

In [22]:
input = tokenizer.encode('Hello, how are you? = ')

output = model.generate(input_ids=torch.tensor([input]),max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

'Hello, how are you? =  How now, how fares you?  and so fare thee?  Is this how? sayest thou? what are thy powers?  Is this'

In [23]:
input = tokenizer.encode('You were either ignorant of it or, seeing it, why were you so childishly friendly? = ')

output = model.generate(input_ids=torch.tensor([input]),max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

'You were either ignorant of it or, seeing it, why were you so childishly friendly? =  Or were you, either unaware, or, seeing it, Why be so unkind to the young?  Or, knowing it, how do you'

In [24]:
input = tokenizer.encode('I will always follow your instructions. = ')

output = model.generate(input_ids=torch.tensor([input]),max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

'I will always follow your instructions. = .  " = I will always follow thy instructions.  \'Faithful!  \'By thy office, my lord.  \'By thy will'

In [30]:
input = tokenizer.encode('We shall leave tomorrow. = ')

output = model.generate(input_ids=torch.tensor([input]),max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

'We shall leave tomorrow. =  Tomorrow shall we depart.  Next day shall we yet be gone. = Tomorrow tomorrow shall we now be departed.   Tomorrow shall we tomorrow be'

In [31]:
input = tokenizer.encode('Let\'s see, will his finger catch fire? = ')

output = model.generate(input_ids=torch.tensor([input]),max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

"Let's see, will his finger catch fire? =  'Twill show him fire, to the finger of his master?  'Tis his finger.  'Let us observe, though his finger"

In [42]:
input = tokenizer.encode('Yes, bloody cloth, I\'ll keep you, because I wanted you to be this color. = ')

output = model.generate(input_ids=torch.tensor([input]),max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

"Yes, bloody cloth, I'll keep you, because I wanted you to be this color. =  'Do, bloody cloth, For I had intended thee to be this color, for there I would be.  'Thou must be damned with"

In [33]:
from google.colab import drive

import shutil

drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
shutil.copy("/content/train_2/config.json","/content/drive/MyDrive/ECE1786_Project/Models/GPT2_4Layers_8e-5/")
shutil.copy("/content/train_2/pytorch_model.bin","/content/drive/MyDrive/ECE1786_Project/Models/GPT2_4Layers_8e-5/")
shutil.copy("/content/train_2/training_args.bin","/content/drive/MyDrive/ECE1786_Project/Models/GPT2_4Layers_8e-5/")

'/content/drive/MyDrive/ECE1786_Project/Models/GPT2_4Layers_8e-5/training_args.bin'