In [1]:
#uncomment this if you are not using our department puffer
import os
os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'

In [2]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F
from tqdm.auto import tqdm
import random, math, time
from torch.autograd import Variable
import operator

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cuda:0


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_tokenizer():
    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    ntokens = tokenizer.vocab_size
    PAD_TOKEN = '<pad>'
    ntokens += tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
    PAD_TOKEN_ID = tokenizer.encode(PAD_TOKEN)[0]
    BOS_TOKEN_ID = tokenizer.encode(tokenizer.bos_token)[0]

    return tokenizer, ntokens, PAD_TOKEN_ID, PAD_TOKEN, BOS_TOKEN_ID

In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2TokenizerFast

tokenizer, ntokens, PAD_TOKEN_ID, PAD_TOKEN, BOS_TOKEN_ID = load_tokenizer()

# gpt2 model
GPT2 = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
gpts_modules = list(GPT2.children())

backbone = nn.Sequential(*gpts_modules[:-1])
model = nn.Sequential(*gpts_modules[-1:])

backbone = backbone.eval()

In [5]:
import os
from io import open
import torch
import json
from glob import glob
import numpy as np
import utils
import pandas as pd
from tqdm import tqdm

In [6]:
class Wikitext_Dataset:
    def __init__(self, path):
        self.train = os.path.join(path, 'train/train.txt')
        self.valid = os.path.join(path, 'valid/valid.txt')
        self.test  = os.path.join(path, 'test/test.txt')

    def build_corpus(self, path):
        files = open(path,'r')
        lines = []
        for line in files:
            line = line.strip().lower()
            # line = line.decode("ascii","ignore")
            if len(line) == 0:
                continue
            lines.append(line)
        return lines
path_files = './data/wikitext-2-add10b'
corpus = Wikitext_Dataset(path_files)
train_dataset = corpus.build_corpus(corpus.train)
valid_dataset = corpus.build_corpus(corpus.valid)
test_dataset  = corpus.build_corpus(corpus.test)

In [7]:
from datasets import Dataset
from datasets import DatasetDict
import pandas as pd

raw_datasets_train = Dataset.from_pandas(pd.DataFrame(data = {'text': train_dataset}))
raw_datasets_valid = Dataset.from_pandas(pd.DataFrame(data = {'text': valid_dataset}))
raw_datasets_test  = Dataset.from_pandas(pd.DataFrame(data = {'text': test_dataset}))
#remove .shuffle if you want to train the whole dataset....

raw_datasets = DatasetDict(
    {
        'train':raw_datasets_train,
        'validation':raw_datasets_valid,
        'test':raw_datasets_test
    }
)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 23777
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2891
    })
})

In [8]:
def tokenize_function(example):
    outputs =  tokenizer(example['text'], truncation=True, padding='max_length')
    input_batch = []
    for input_ids in outputs["input_ids"]:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = raw_datasets.map(
    tokenize_function, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

                                                                    

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 23777
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 2891
    })
})

In [9]:
len(tokenized_datasets['train']['input_ids'][1]) #longest token

1024

In [10]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig
context_length  = 1024
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = GPT2LMHeadModel(config)

In [11]:
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.4M parameters


In [12]:
from torch.utils.data.dataloader import DataLoader
batch_size = 8
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=batch_size, shuffle=True)
eval_dataloader  = DataLoader(tokenized_datasets["validation"], batch_size=batch_size)
test_dataloader  = DataLoader(tokenized_datasets["test"], batch_size=batch_size)

In [13]:
for i in train_dataloader:
    i['input_ids']
    print(i['input_ids'].shape)
    break
for i in eval_dataloader:
    print(i['input_ids'].shape)
    break
for i in test_dataloader:
    print(i['input_ids'].shape)
    break

torch.Size([8, 1024])
torch.Size([8, 1024])
torch.Size([8, 1024])


In [14]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [15]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

In [16]:
from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
model = model.to(device)

In [17]:
from tqdm.auto import tqdm
num_update_steps_per_epoch = len(eval_dataloader)

def evaluate():
    model.eval()
    losses = []
    progress_bar = tqdm(range(num_update_steps_per_epoch))
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            inputs = batch["input_ids"].to(device)
            labels = batch["input_ids"].to(device)
            outputs = model(inputs, labels=labels)
            outputs.loss = outputs.loss.reshape(1)
            progress_bar.update(1)
        losses.append(accelerator.gather(outputs.loss))        
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [18]:
def train():
    progress_bar = tqdm(range(num_training_steps))
    gradient_accumulation_steps = 8
    model.to(device)
    eval_steps = 2
    model.train()
    completed_steps = 0
    for epoch in range(num_train_epochs):
        total_loss = 0
        for step, batch in enumerate(train_dataloader):
            inputs = batch['input_ids'].to(device)
            labels = batch['input_ids'].to(device)
    
            optimizer.zero_grad()
            outputs = model(inputs, labels=labels)
            logits = outputs.logits      
            loss = outputs.loss
            loss = loss / gradient_accumulation_steps
            # print(loss)
            accelerator.backward(loss) #instance of optimize.backward()
    
            if step % gradient_accumulation_steps == 0:
                accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                completed_steps += 1
            
            if (step % (eval_steps * gradient_accumulation_steps)) == 0:
                eval_loss, perplexity = evaluate()
                accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
                model.train()
                
            progress_bar.update(1)
            total_loss += loss.item()
    
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{epochs} - Average Loss: {avg_loss:.4f}')

In [19]:
evaluate()

100%|██████████| 308/308 [01:07<00:00,  4.58it/s]


(10.193436622619629, 26727.189453125)

In [20]:
train()

  0%|          | 0/2973 [00:00<?, ?it/s]
  0%|          | 0/308 [00:00<?, ?it/s][A
  0%|          | 1/308 [00:00<00:54,  5.67it/s][A
  1%|          | 2/308 [00:00<01:01,  4.94it/s][A
  1%|          | 3/308 [00:00<01:03,  4.77it/s][A
  1%|▏         | 4/308 [00:00<01:04,  4.68it/s][A
  2%|▏         | 5/308 [00:01<01:05,  4.63it/s][A
  2%|▏         | 6/308 [00:01<01:05,  4.60it/s][A
  2%|▏         | 7/308 [00:01<01:05,  4.58it/s][A
  3%|▎         | 8/308 [00:01<01:05,  4.57it/s][A
  3%|▎         | 9/308 [00:01<01:05,  4.56it/s][A
  3%|▎         | 10/308 [00:02<01:05,  4.56it/s][A
  4%|▎         | 11/308 [00:02<01:05,  4.55it/s][A
  4%|▍         | 12/308 [00:02<01:05,  4.55it/s][A
  4%|▍         | 13/308 [00:02<01:04,  4.55it/s][A
  5%|▍         | 14/308 [00:03<01:04,  4.55it/s][A
  5%|▍         | 15/308 [00:03<01:04,  4.55it/s][A
  5%|▌         | 16/308 [00:03<01:04,  4.55it/s][A
  6%|▌         | 17/308 [00:03<01:03,  4.55it/s][A
  6%|▌         | 18/308 [00:03<01:03,  4.