In [None]:
!pip install transformers
!pip install wandb --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 14.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 59.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 68.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.13.5-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 14.7 MB

In [None]:
# Authenticate the Weights & Biases CLI; it prompts for an API key and appends it to the netrc file.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext.data.metrics import bleu_score

from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random

from tqdm.auto import tqdm

import wandb

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Set Hyperparameters

In [None]:
# Hyperparameters for this run; the same dict is also passed to wandb.init as the run config.
config = {
    'num_frozen_layers': 7,   # forwarded to create_frozen_model
    'learning_rate': 8e-5,
    'num_epochs': 3,
    'batch_size': 8,          # used for both the train and eval per-device batch size
    'model': 'gpt2',          # checkpoint name given to from_pretrained
}

### Import Model

In [None]:
# Load the tokenizer for the configured checkpoint and reuse its EOS token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained(config['model'])
tokenizer.pad_token = tokenizer.eos_token

# Load the language model, telling it to pad with the EOS token id as well.
model = GPT2LMHeadModel.from_pretrained(
    config['model'],
    pad_token_id=tokenizer.eos_token_id,
)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [None]:
# Move the model to the device selected earlier ('cuda' when available, otherwise 'cpu').
model = model.to(device)

### Freeze Layers

In [None]:
def create_frozen_model(model, num_frozen_layers):
    """Freeze all parameters of ``model`` except the tail of a fixed layer list.

    The model is modified in place (``requires_grad`` is set on every parameter)
    and nothing is returned.

    The candidate list is ``transformer.h.1`` ... ``transformer.h.11`` followed by
    ``transformer.ln_f`` (12 entries). The first ``num_frozen_layers`` entries stay
    frozen; the remaining entries are made trainable. Parameters that do not belong
    to any listed entry are always frozen.

    NOTE(review): ``transformer.h.0`` is not in the list, so it is frozen for every
    value of ``num_frozen_layers`` -- confirm this is intentional.

    Args:
        model: object exposing ``named_parameters()`` whose parameter names use the
            ``transformer.h.<i>`` / ``transformer.ln_f`` prefixes.
        num_frozen_layers: how many leading entries of the list to keep frozen;
            must be between 0 and 12 inclusive.

    Raises:
        ValueError: if ``num_frozen_layers`` is negative or larger than the list.
            (Previously an oversized value only printed a message and left the
            model untouched, and a negative value silently sliced from the end.)
    """
    layers = ['transformer.h.%d' % index for index in range(1, 12)] + ['transformer.ln_f']

    if not 0 <= num_frozen_layers <= len(layers):
        raise ValueError(
            "Number of layers to freeze should be between 0 and the number of "
            "layers in the model ({}), got {}".format(len(layers), num_frozen_layers)
        )

    trainable = tuple(layers[num_frozen_layers:])
    # Match on a '.' boundary so e.g. 'transformer.h.1' can never claim 'transformer.h.10'.
    trainable_prefixes = tuple(layer + '.' for layer in trainable)

    # Single pass: a parameter is trainable only if it lives under a kept layer.
    for name, param in model.named_parameters():
        param.requires_grad = name in trainable or name.startswith(trainable_prefixes)

    return

In [None]:
# Freeze the model in place, using the layer count from the hyperparameter config.
create_frozen_model(model, num_frozen_layers=config['num_frozen_layers'])

### Create Dataset

In [None]:
# Read the two corpus files. Context managers close each handle as soon as its
# lines are read (the handles were previously left open for the whole session).
with open('/content/data_f.txt', 'r', encoding='UTF-8') as data_file_open:
    data_lines = data_file_open.readlines()
with open('/content/label_f.txt', 'r', encoding='UTF-8') as label_file_open:
    label_lines = label_file_open.readlines()

# Strip surrounding whitespace (including the trailing newline) from every line.
input_dataset = [item.strip() for item in data_lines]
output_dataset = [item.strip() for item in label_lines]
dataset = []

In [None]:
# Show the number of input lines and label lines, one per output line.
print(len(input_dataset), len(output_dataset), sep='\n')

42535
42535


In [None]:
# Build one training string per line pair in the form "<input> = <output>".
# The two lists must line up one-to-one; fail loudly instead of truncating or
# raising an IndexError halfway through when they do not.
if len(input_dataset) != len(output_dataset):
    raise ValueError(
        "input_dataset has {} lines but output_dataset has {}".format(
            len(input_dataset), len(output_dataset)
        )
    )
dataset = [source + ' = ' + target for source, target in zip(input_dataset, output_dataset)]

In [None]:
# Hold out 20% of the pairs for validation. A fixed random_state makes the split
# (and therefore the indexed examples and validation loss) reproducible across runs.
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

In [None]:
# Show the size of the training split and the validation split, one per output line.
print(len(train_dataset), len(val_dataset), sep='\n')

34028
8507


In [None]:
# Encode every "<input> = <output>" string of each split with the tokenizer;
# each entry becomes whatever tokenizer.encode returns for that string.
tokenized_dataset_train = [tokenizer.encode(text) for text in train_dataset]
tokenized_dataset_val = [tokenizer.encode(text) for text in val_dataset]

In [None]:
# Collator handed to the Trainer to batch the tokenized examples.
# mlm=False: presumably selects the non-masked (causal) language-modelling setup -- see the collator docs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Start a Weights & Biases run for this project and record the hyperparameter dict with it.
wandb.init(project="Antiquator_Trial", config=config)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrohan-ajwani[0m ([33mantiquator[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Trainer settings: hyperparameters come from `config`; evaluation and checkpointing
# both happen once per epoch, and metrics are reported to Weights & Biases.
training_args = TrainingArguments(
    output_dir='/content/train_2',
    num_train_epochs=config['num_epochs'],
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'],
    evaluation_strategy="epoch",
    save_strategy='epoch',
    report_to="wandb",
)

In [None]:
# Wire the (partially frozen) model, the tokenized splits and the collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 34028
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12762
  Number of trainable parameters = 28353024
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
1,4.6192,4.452034
2,4.5461,4.382207
3,4.5,4.364325


***** Running Evaluation *****
  Num examples = 8507
  Batch size = 8
Saving model checkpoint to /content/train_2/checkpoint-4254
Configuration saved in /content/train_2/checkpoint-4254/config.json
Model weights saved in /content/train_2/checkpoint-4254/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8507
  Batch size = 8
Saving model checkpoint to /content/train_2/checkpoint-8508
Configuration saved in /content/train_2/checkpoint-8508/config.json
Model weights saved in /content/train_2/checkpoint-8508/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8507
  Batch size = 8
Saving model checkpoint to /content/train_2/checkpoint-12762
Configuration saved in /content/train_2/checkpoint-12762/config.json
Model weights saved in /content/train_2/checkpoint-12762/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=12762, training_loss=4.579776488320428, metrics={'train_runtime': 1206.6907, 'train_samples_per_second': 84.598, 'train_steps_per_second': 10.576, 'total_flos': 2701763693568000.0, 'train_loss': 4.579776488320428, 'epoch': 3.0})

In [None]:
# Write the final model to the Trainer's output directory (/content/train_2).
trainer.save_model()

Saving model checkpoint to /content/train_2
Configuration saved in /content/train_2/config.json
Model weights saved in /content/train_2/pytorch_model.bin


In [None]:
# Reload the fine-tuned model from the directory it was saved to.
# NOTE(review): the reloaded model is not moved to `device`; the generation cells feed it default (CPU) tensors.
model = GPT2LMHeadModel.from_pretrained('/content/train_2')

loading configuration file /content/train_2/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_siz

In [None]:
# Spot-check one training string; it has the form "<input> = <output>".
train_dataset[10]

'Will he travel onwards, or go back to France? = will he travel higher, or return again into France?'

In [None]:
# Spot-check one validation string; it has the form "<input> = <output>".
val_dataset[99]

"I'll go and find him. = I’ll go seek him."

In [None]:
# Encode the first source sentence as a prompt. The prompt stops right after '=' because
# the training strings are "<input> = <output>" and GPT-2's BPE attaches the space to the
# following word; a trailing ' ' would add a stray token. Named prompt_ids so the builtin
# `input` is not shadowed.
prompt_ids = tokenizer.encode(input_dataset[0] + ' =')

In [None]:
# Prompt ends at '=' (no trailing space): training strings are "<input> = <output>" and
# GPT-2's BPE attaches the space to the next word, so a trailing ' ' is a stray token.
# Named prompt_ids so the builtin `input` is not shadowed.
prompt_ids = tokenizer.encode('Hello, how are you? =')

# Sample up to 30 new tokens for the single prompt (batch of one).
output = model.generate(input_ids=torch.tensor([prompt_ids]), max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

'Hello, how are you? = ____ Your eyes glisten at your hands, gliding through space. = ~~ You have a strange feeling, but my hands are still there'

In [None]:
# Prompt ends at '=' (no trailing space): training strings are "<input> = <output>" and
# GPT-2's BPE attaches the space to the next word, so a trailing ' ' is a stray token.
# Named prompt_ids so the builtin `input` is not shadowed.
prompt_ids = tokenizer.encode('I give you both many and hearty thanks. =')

# Sample up to 30 new tokens for the single prompt (batch of one).
output = model.generate(input_ids=torch.tensor([prompt_ids]), max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

'I give you both many and hearty thanks. = __________________<|endoftext|>'

In [None]:
# Prompt ends at '=' (no trailing space): training strings are "<input> = <output>" and
# GPT-2's BPE attaches the space to the next word, so a trailing ' ' is a stray token.
# Named prompt_ids so the builtin `input` is not shadowed.
prompt_ids = tokenizer.encode('I will always follow your instructions. =')

# Sample up to 30 new tokens for the single prompt (batch of one).
output = model.generate(input_ids=torch.tensor([prompt_ids]), max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

'I will always follow your instructions. = *********I will always follow your instructions. = *********\n\nRAW Paste Data\n\n= *********I will always follow your instructions'

In [None]:
# Prompt ends at '=' (no trailing space): training strings are "<input> = <output>" and
# GPT-2's BPE attaches the space to the next word, so a trailing ' ' is a stray token.
# Named prompt_ids so the builtin `input` is not shadowed.
prompt_ids = tokenizer.encode('We shall leave tomorrow. =')

# Sample up to 30 new tokens for the single prompt (batch of one).
output = model.generate(input_ids=torch.tensor([prompt_ids]), max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

'We shall leave tomorrow. = ~~~I am going on my own. ^_^\n\nRAW Paste Data\n\n[S01E03] [Loudspeakers'

In [None]:
# Prompt ends at '=' (no trailing space): training strings are "<input> = <output>" and
# GPT-2's BPE attaches the space to the next word, so a trailing ' ' is a stray token.
# Named prompt_ids so the builtin `input` is not shadowed.
prompt_ids = tokenizer.encode('Let\'s see, will his finger catch fire? =')

# Sample up to 30 new tokens for the single prompt (batch of one).
output = model.generate(input_ids=torch.tensor([prompt_ids]), max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

"Let's see, will his finger catch fire? = \xa0The second he throws it, something goes wrong and the second he touches it, it's just a small ball to get on you. His index"

In [None]:
# Prompt ends at '=' (no trailing space): training strings are "<input> = <output>" and
# GPT-2's BPE attaches the space to the next word, so a trailing ' ' is a stray token.
# Named prompt_ids so the builtin `input` is not shadowed.
prompt_ids = tokenizer.encode('Yes, bloody cloth, I\'ll keep you, because I wanted you to be this color. =')

# Sample up to 30 new tokens for the single prompt (batch of one).
output = model.generate(input_ids=torch.tensor([prompt_ids]), max_new_tokens=30, do_sample=True)

tokenizer.decode(output[0])

"Yes, bloody cloth, I'll keep you, because I wanted you to be this color. = __________________\n\nI'll be right here. This is going to be a great week, but this isn't for the last one. = "

In [None]:
import shutil

from google.colab import drive

# Mount Google Drive under /content/drive so the trained model files can be copied there.
drive.mount('/content/drive')

MessageError: ignored

In [None]:
# Copy the saved config, weights and training arguments from the training output
# directory into the project folder on the mounted Drive.
shutil.copy(
    "/content/train_2/config.json",
    "/content/drive/MyDrive/ECE1786_Project/Models/GPT2_4Layers_8e-5/",
)
shutil.copy(
    "/content/train_2/pytorch_model.bin",
    "/content/drive/MyDrive/ECE1786_Project/Models/GPT2_4Layers_8e-5/",
)
shutil.copy(
    "/content/train_2/training_args.bin",
    "/content/drive/MyDrive/ECE1786_Project/Models/GPT2_4Layers_8e-5/",
)