<a href="https://colab.research.google.com/github/gagan3012/keytotext/blob/master/Notebooks/GPTModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import datetime
import time
import random

import pandas as pd
import numpy as np
import re

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from tqdm import tqdm

!pip install transformers

from transformers import (
	AdamW,
	GPT2LMHeadModel,
	GPT2Tokenizer, 
	GPT2Config,
	get_linear_schedule_with_warmup
)

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 6.3MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 22.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 35.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=257cc2ad040

In [2]:
!git clone https://github.com/gagan3012/keytotext

Cloning into 'keytotext'...
remote: Enumerating objects: 123, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 123 (delta 54), reused 55 (delta 16), pack-reused 0[K
Receiving objects: 100% (123/123), 2.53 MiB | 11.03 MiB/s, done.
Resolving deltas: 100% (54/54), done.


In [3]:
class BOTHData(Dataset):
	def __init__(self, tokenizer, max_source_length,
		max_target_length, type_path):
		super().__init__()
		self.tokenizer = tokenizer
		self.max_source_length = max_source_length
		self.max_target_length = max_target_length
		self.input_ids = []
		self.attn_masks = []
		self._build(type_path)
	
	def __len__(self):
		return len(self.input_ids)
	
	def __getitem__(self, index):

		return self.input_ids[index], self.attn_masks[index]

	def _build(self, type_path):
		if type_path == 'train':
			#df1 = pd.read_csv('/content/keytotext/data/dart_train.csv')
			df = pd.read_csv('/content/keytotext/data/webnlg_train.csv')
		elif type_path == 'eval':
			#df = pd.read_csv('/content/keytotext/data/dart_dev.csv')
			df = pd.read_csv('/content/keytotext/data/webnlg_dev.csv')
		else:
			#df = pd.read_csv('/content/keytotext/data/dart_test.csv')
			df = pd.read_csv('/content/keytotext/data/webnlg_test.csv')
		
	#	df = pd.concat([df1, df2])
		# n = 1 
		# df = df.head(int(len(df)*(n/100)))

		for index, row in df.iterrows():
				line = row['input_text']
				target = row['target_text']
				encodings = self.tokenizer('<|startoftext|>'+ line + ' = ' + target + '<|endoftext|>', truncation=True, max_length=self.max_source_length, padding="max_length")
				self.input_ids.append(torch.tensor(encodings['input_ids']))
				self.attn_masks.append(torch.tensor(encodings['attention_mask']))

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') 

print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.



The max model length is 1024 for this model, although the actual embedding size for GPT small is 768
The beginning of sequence token <|startoftext|> token has the id 50257
The end of sequence token <|endoftext|> has the id 50256
The padding token <|pad|> has the id 50258


In [5]:
batch_size = 4
train_dataset = BOTHData(tokenizer, 512, 512, 'train')
val_dataset = BOTHData(tokenizer, 512, 512, 'val')
test_dataset = BOTHData(tokenizer, 512, 512, 'test')

In [6]:
train_dataloader = DataLoader(
			train_dataset,  # The training samples.
			sampler = RandomSampler(train_dataset), # Select batches randomly
			batch_size = batch_size # Trains with this batch size.
		)

validation_dataloader = DataLoader(
			val_dataset, # The validation samples.
			sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
			batch_size = batch_size # Evaluate with this batch size.
		)
test_dataloader = DataLoader(
			test_dataset, # The validation samples.
			sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
			batch_size = batch_size # Evaluate with this batch size.
		)

In [7]:
configuration = GPT2Config.from_pretrained('distilgpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("distilgpt2", config=configuration)

model.resize_token_embeddings(len(tokenizer))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=352833716.0, style=ProgressStyle(descri…




Embedding(50259, 768)

In [8]:
device = torch.device("cuda:0")
model.cuda()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [9]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [10]:
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
sample_every = 1000

In [11]:
optimizer = AdamW(model.parameters(),
				  lr = learning_rate,
				  eps = epsilon
				)

In [12]:
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
											num_warmup_steps = warmup_steps, 
											num_training_steps = total_steps)
total_steps

44285

In [13]:
def format_time(elapsed):
	return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [14]:
total_t0 = time.time()

training_stats = []

model = model.to(device)

In [15]:
!pip install wandb

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/33/ae/79374d2b875e638090600eaa2a423479865b7590c53fb78e8ccf6a64acb1/wandb-0.10.22-py2.py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 5.2MB/s 
[?25hCollecting sentry-sdk>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/f3/92/5a33be64990ba815364a8f2dd9e6f51de60d23dfddafb4f1fc5577d4dc64/sentry_sdk-1.0.0-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 35.7MB/s 
[?25hCollecting GitPython>=1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/a6/99/98019716955ba243657daedd1de8f3a88ca1f5b75057c38e959db22fb87b/GitPython-3.1.14-py3-none-any.whl (159kB)
[K     |████████████████████████████████| 163kB 31.9MB/s 
Collecting configparser>=3.8.1
  Downloading https://files.pythonhosted.org/packages/fd/01/ff260a18caaf4457eb028c96eeb405c4a230ca06c8ec9c1379f813caa52e/configparser-5.0.2-py3-none-any.whl
Collecting shortuuid>=0.5.0
  Download

In [16]:
%env WANDB_PROJECT = keytotext
import wandb

wandb.init(project="keytotext")

for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in tqdm(enumerate(train_dataloader)):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                    labels=b_labels,
                    attention_mask=b_masks,
                    token_type_ids=None
                    )

        loss = outputs[0]

        batch_loss = loss.item()

        total_train_loss += batch_loss
        wandb.log({'Train Loss': batch_loss,'Epoch':epoch_i})
# Get sample every x batches.
        if step % sample_every == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss,
                                                                         elapsed))

            model.eval()

            sample_outputs = model.generate(
                                            bos_token_id=random.randint(1, 30000),
                                            do_sample=True,
                                            top_k=50,
                                            max_length=200,
                                            top_p=0.95,
                                            num_return_sequences=1)
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

# Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

# Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

# ========================================
#               Validation
# ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

# Evaluate data for one epoch
    for batch in tqdm(validation_dataloader):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                        #                            token_type_ids=None,
                        attention_mask=b_masks,
                        labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss
        wandb.log({'Eval Loss': batch_loss,'Epoch':epoch_i})

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

# Record all statistics from this epoch.
    training_stats.append(
    {
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Training Time': training_time,
        'Validation Time': validation_time
    })


env: WANDB_PROJECT=keytotext


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


0it [00:00, ?it/s]


Training...


185it [01:34,  1.95it/s]

KeyboardInterrupt: ignored

In [None]:
!mkdir model
torch.save(model.state_dict(),'model/pytorch_model.bin')