## This code allows freezing some layers

#### We tried freezing some layers, but the unfrozen model works best, so train with num_frozen_layers=0

In [None]:
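# W&B credentials are deliberately not stored in this notebook: authenticate through the `wandb login` prompt below or the WANDB_API_KEY environment variable. The key that was previously pasted here should be revoked.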

In [None]:
# Install the Hugging Face Transformers and Weights & Biases packages into the runtime.
# NOTE(review): versions are not pinned, so a fresh run may resolve different releases than the captured log shows.
!pip install transformers
!pip install wandb --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 13.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 58.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 46.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.13.5-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 1

### Login to save output to Weights & Biases

In [None]:
# Interactive login: prompts for the W&B API key (see the captured prompt below) instead of reading it from the notebook.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext.data.metrics import bleu_score

from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random

from tqdm.auto import tqdm

import wandb

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Set Hyperparameters

In [None]:
# Hyperparameters for this run; the same dict is also passed to wandb.init as the run config.
config = {
    'num_frozen_layers': 0,    # forwarded to create_frozen_model
    'learning_rate': 5e-5,
    'num_epochs': 5,
    'batch_size': 8,           # used for both the train and the eval per-device batch size
    'model': 'gpt2-medium',    # config['model'] can be any model from the GPT2 family
}

### Import Model
#### This works for both GPT2 and GPT2 medium

In [None]:
# Load the tokenizer first and reuse its EOS token as the padding token;
# the model is given the same EOS id as its pad_token_id.
tokenizer = GPT2Tokenizer.from_pretrained(config['model'])
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(
    config['model'],
    pad_token_id=tokenizer.eos_token_id,
)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

In [None]:
# Move the model to the device selected earlier (GPU if available, else CPU).
model = model.to(device)

In [None]:
# Print the qualified name of every parameter tensor; these names are what the
# freezing logic matches against.
for param_name, _param in model.named_parameters():
    print(param_name)

transformer.wte.weight
transformer.wpe.weight
transformer.h.0.ln_1.weight
transformer.h.0.ln_1.bias
transformer.h.0.attn.c_attn.weight
transformer.h.0.attn.c_attn.bias
transformer.h.0.attn.c_proj.weight
transformer.h.0.attn.c_proj.bias
transformer.h.0.ln_2.weight
transformer.h.0.ln_2.bias
transformer.h.0.mlp.c_fc.weight
transformer.h.0.mlp.c_fc.bias
transformer.h.0.mlp.c_proj.weight
transformer.h.0.mlp.c_proj.bias
transformer.h.1.ln_1.weight
transformer.h.1.ln_1.bias
transformer.h.1.attn.c_attn.weight
transformer.h.1.attn.c_attn.bias
transformer.h.1.attn.c_proj.weight
transformer.h.1.attn.c_proj.bias
transformer.h.1.ln_2.weight
transformer.h.1.ln_2.bias
transformer.h.1.mlp.c_fc.weight
transformer.h.1.mlp.c_fc.bias
transformer.h.1.mlp.c_proj.weight
transformer.h.1.mlp.c_proj.bias
transformer.h.2.ln_1.weight
transformer.h.2.ln_1.bias
transformer.h.2.attn.c_attn.weight
transformer.h.2.attn.c_attn.bias
transformer.h.2.attn.c_proj.weight
transformer.h.2.attn.c_proj.bias
transformer.h.2.ln_2

### Freeze Layers

In [None]:
def create_frozen_model(model, num_frozen_layers):
    """Freeze the first ``num_frozen_layers`` transformer blocks of ``model`` in place.

    The number of blocks is discovered from the model's own parameter names
    (``transformer.h.<index>.*``), so the function is not tied to a 12-block
    model. Block indices are compared as integers, which avoids the prefix
    collision between e.g. ``transformer.h.1`` and ``transformer.h.10``.

    Freezing policy:
      * ``num_frozen_layers == 0``: every parameter is trainable.
      * ``num_frozen_layers == k > 0``: the token/position embeddings
        (``transformer.wte`` / ``transformer.wpe``) and blocks ``0 .. k-1``
        are frozen; all remaining parameters are trainable.

    The resulting ``requires_grad`` flags do not depend on their previous
    values, so the function can be re-run with a different count.

    Args:
        model: Object whose ``named_parameters()`` yields ``(name, parameter)``
            pairs; each parameter must expose a writable ``requires_grad``.
        num_frozen_layers: Number of leading transformer blocks to freeze.

    Returns:
        None. ``model`` is modified in place.

    Raises:
        ValueError: If ``num_frozen_layers`` is negative or larger than the
            number of transformer blocks found in ``model``.
    """
    block_prefix = 'transformer.h.'
    embedding_prefixes = ('transformer.wte.', 'transformer.wpe.')

    def block_index(param_name):
        # Integer block index encoded in the name, or None for parameters
        # that are not part of the transformer block stack.
        if not param_name.startswith(block_prefix):
            return None
        index_text = param_name[len(block_prefix):].split('.', 1)[0]
        return int(index_text) if index_text.isdigit() else None

    # Materialise once: the pairs are needed for counting and for updating.
    named_params = list(model.named_parameters())

    block_ids = {block_index(name) for name, _ in named_params}
    block_ids.discard(None)
    num_blocks = len(block_ids)

    if not 0 <= num_frozen_layers <= num_blocks:
        # Fail loudly: silently leaving the model untouched would let a
        # mis-configured run train with the wrong set of parameters.
        raise ValueError(
            "Number of layers to freeze should be between 0 and the number of "
            "layers in the model ({}), got {}".format(num_blocks, num_frozen_layers)
        )

    for name, param in named_params:
        index = block_index(name)
        if num_frozen_layers == 0:
            # Nothing to freeze: the whole model stays trainable.
            param.requires_grad = True
        elif index is not None:
            # Blocks below the cut-off are frozen, the rest are trainable.
            param.requires_grad = index >= num_frozen_layers
        else:
            # Outside the block stack only the embeddings are frozen.
            param.requires_grad = not name.startswith(embedding_prefixes)

    return None

In [None]:
# Apply the freezing setting from config['num_frozen_layers'] to the loaded model (the model is modified in place).
create_frozen_model(model, num_frozen_layers=config['num_frozen_layers'])

In [None]:
# Sanity check: print the trainable flag of every parameter in transformer block 1.
# The trailing dot in the prefix keeps blocks 10, 11, ... out of the listing.
for name, param in model.named_parameters():
    if name.startswith('transformer.h.1.'):
        print(name)
        print(param.requires_grad)

transformer.h.1.ln_1.weight
True
transformer.h.1.ln_1.bias
True
transformer.h.1.attn.c_attn.weight
True
transformer.h.1.attn.c_attn.bias
True
transformer.h.1.attn.c_proj.weight
True
transformer.h.1.attn.c_proj.bias
True
transformer.h.1.ln_2.weight
True
transformer.h.1.ln_2.bias
True
transformer.h.1.mlp.c_fc.weight
True
transformer.h.1.mlp.c_fc.bias
True
transformer.h.1.mlp.c_proj.weight
True
transformer.h.1.mlp.c_proj.bias
True
transformer.h.10.ln_1.weight
True
transformer.h.10.ln_1.bias
True
transformer.h.10.attn.c_attn.weight
True
transformer.h.10.attn.c_attn.bias
True
transformer.h.10.attn.c_proj.weight
True
transformer.h.10.attn.c_proj.bias
True
transformer.h.10.ln_2.weight
True
transformer.h.10.ln_2.bias
True
transformer.h.10.mlp.c_fc.weight
True
transformer.h.10.mlp.c_fc.bias
True
transformer.h.10.mlp.c_proj.weight
True
transformer.h.10.mlp.c_proj.bias
True
transformer.h.11.ln_1.weight
True
transformer.h.11.ln_1.bias
True
transformer.h.11.attn.c_attn.weight
True
transformer.h.11.

### Create Dataset

In [None]:
# Read the input and label files; the context manager closes both handles
# as soon as their lines have been loaded.
with open('/content/data_f.txt', 'r', encoding='UTF-8') as data_file_open, \
        open('/content/label_f.txt', 'r', encoding='UTF-8') as label_file_open:
    data_lines = data_file_open.readlines()
    label_lines = label_file_open.readlines()

# Strip the trailing newline and any surrounding whitespace from every line.
input_dataset = [item.strip() for item in data_lines]
output_dataset = [item.strip() for item in label_lines]
dataset = []

In [None]:
# The two counts should match: inputs and labels are paired by line index.
print(len(input_dataset), len(output_dataset), sep='\n')

42535
42535


In [None]:
# Join every input with the label at the same index as "<input> = <label>".
dataset = [
    input_dataset[idx] + ' = ' + output_dataset[idx]
    for idx in range(len(input_dataset))
]

In [None]:
# Hold out 20% of the examples for validation. The split is seeded so that
# re-running the notebook reproduces the same train/validation partition;
# add a 'seed' entry to config to use a different one.
train_dataset, val_dataset = train_test_split(
    dataset,
    test_size=0.2,
    random_state=config.get('seed', 42),
)

In [None]:
# Sizes of the training and validation splits, one per line.
print(len(train_dataset), len(val_dataset), sep='\n')

34028
8507


In [None]:
# Encode every example into a list of token ids (no padding or truncation
# arguments are passed to the tokenizer here).
tokenized_dataset_train = [tokenizer.encode(example) for example in train_dataset]

tokenized_dataset_val = [tokenizer.encode(example) for example in val_dataset]

In [None]:
# Batch collator for language modelling; mlm=False selects the non-masked (causal) setting.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

### Initialize W&B to save logs

In [None]:
# Human-readable run identifier built from the key hyperparameters;
# reused for the W&B run name and the saved weights file.
run_name = (
    f"{config['model']}_{config['num_frozen_layers']}frozen_"
    f"{config['learning_rate']}lr_{config['num_epochs']}epochs"
)

In [None]:
# Display the generated run name for a quick visual check.
run_name

'gpt2_0frozen_5e-05lr_5epochs'

In [None]:
# Start the W&B run with its display name set at creation time, rather than
# renaming the run afterwards and calling wandb.run.save() with no arguments.
# The trailing semicolon keeps the returned run object out of the cell output.
wandb.init(
  project="Antiquator_Trial_new",
  name=run_name,
  config=config
);

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrohan-ajwani[0m ([33mantiquator[0m). Use [1m`wandb login --relogin`[0m to force relogin




True

In [None]:
# Confirm the name attached to the active W&B run.
wandb.run.name

'gpt2_10frozen_5e-05lr_5epochs'

### Create Trainer

In [None]:
# Trainer settings: evaluate every 1000 steps and write a checkpoint once per epoch.
# report_to is not passed, so the library's default logging integrations are used.
training_args = TrainingArguments(
    output_dir='/content/train_2',
    num_train_epochs=config['num_epochs'],
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'],
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy='epoch',
)

In [None]:
# Wire the model, tokenised splits and collator into the Hugging Face Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
)

### Training the model

In [None]:
# Run fine-tuning with the Trainer configured in the previous section.
trainer.train()

***** Running training *****
  Num examples = 34028
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 21270
  Number of trainable parameters = 354823168
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
# Save the current model weights and configuration; the captured log shows they land in /content/train_2.
trainer.save_model()

Saving model checkpoint to /content/train_2
Configuration saved in /content/train_2/config.json
Model weights saved in /content/train_2/pytorch_model.bin


In [None]:
# Reload the fine-tuned weights from the Trainer's output directory rather than
# repeating that path as a second string literal.
model = GPT2LMHeadModel.from_pretrained(training_args.output_dir)

loading configuration file /content/train_2/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_siz

### Saving Model to Drive

In [None]:
# Colab-specific: mount Google Drive under /content/drive so files can be copied out of the runtime.
from google.colab import drive

import shutil

drive.mount('/content/drive')

In [None]:
# Local path of the weights file, named after the run.
save_name = f'/content/{run_name}.pt'

In [None]:
# Display the target path for a quick visual check.
save_name

In [None]:
# Serialise only the model's state dict (its parameter tensors) to the local path.
torch.save(model.state_dict(), save_name)

In [None]:
# Copy the weights file to the project folder on the mounted Drive.
# NOTE(review): if the Models directory does not exist, shutil.copy treats the destination as a file name — confirm the folder is present.
shutil.copy(save_name,"/content/drive/MyDrive/ECE1786_Project/Models")