In [1]:
import torch.optim as optim
from torch.utils.data import DataLoader
import torch
import pandas as pd, numpy as np
from tqdm import tqdm
import argparse
import os
from model_generator import Fluency
from parameters import Parameters
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# --- Training configuration ---
train_test_split = 0.8  # fraction of the messages assigned to the training split
batch_size = 32         # samples per forward/backward pass of the training loader
n_epochs = 5
learning_rate = 2e-5
# Number of batches processed between optimizer steps, derived from a target of
# 32 samples per step. Clamped to at least 1: without the clamp a batch_size
# above 32 would give 0 and the training loop's `% optim_every` would raise
# ZeroDivisionError.
optim_every = max(1, 32 // batch_size)
# Project configuration object; later cells read params.dataset_filename and
# params.device from it.
params = Parameters()

In [11]:
# Scratch check of which objects can be used in a `with` statement.
x = torch.no_grad()
# The object returned by torch.no_grad() is accepted by `with`, so this prints 5.
with x:
    print(5)
# A plain int is not a context manager: this `with` raises (see the error below).
with 1:
    print(5)

5


AttributeError: __enter__

In [2]:
# Load the GPT-2 language-model checkpoint and its tokenizer from local disk.
# The directory defaults to the path used on the original development machine;
# set the FLUENCY_CKPT_DIR environment variable to load the same artifacts from
# another location without editing the notebook.
fluency_ckpt_dir = os.environ.get(
    'FLUENCY_CKPT_DIR',
    'C:/Users/Colton/OneDrive/School/Thesis/Adfenix/play/models/2022-04-11/fluency',
)
model = GPT2LMHeadModel.from_pretrained(fluency_ckpt_dir + '/model_14')
tokenizer = GPT2Tokenizer.from_pretrained(fluency_ckpt_dir + '/tokenizer')

In [3]:
texts = ['PositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPositionPosition', 'This wonderful home has 7 bedrooms, 3 bathrooms, a three car garage, and is located near wonderful schools and local parks','Spacious 3 bedroom, 2 bathroom home with beautiful East River views and oversized balcony with direct access from both living room and primary bedroom. Large kitchen with pass-through to living room.']

In [8]:
def score(batch_msgs):
    """Return one score per message, derived from the model's token loss.

    The messages are tokenized with ``preprocess_input`` and run through the
    module-level ``model`` with gradients disabled. A per-token cross-entropy
    is computed against the target ids (targets equal to -1 are ignored),
    summed per message and divided by the number of non-padding input tokens.

    Args:
        batch_msgs: the messages to score, as accepted by ``preprocess_input``.

    Returns:
        A tensor with one entry per message, equal to
        ``(10.0 - mean_token_loss) / 10.0``.
    """
    with torch.no_grad():
        summ_inp, summ_out = preprocess_input(batch_msgs)
        target_ids = summ_out.input_ids
        logits = model(**summ_inp).logits

        # reduction='none' keeps one loss value per token so that the
        # averaging below can be done per message rather than per batch.
        token_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction='none')
        flat_loss = token_loss_fn(logits.view(-1, len(tokenizer)), target_ids.view(-1))
        per_token_loss = flat_loss.view(target_ids.shape)

        # Count the real (non-padding) input tokens of each message.
        is_real_token = summ_inp.input_ids.ne(tokenizer.pad_token_id)
        n_real_tokens = torch.sum(is_real_token, dim=1).to('cpu')

        mean_loss = torch.sum(per_token_loss, dim=1) / n_real_tokens
        return (10.0 - mean_loss) / 10.0
    
def preprocess_input(batch_msgs):
    """Tokenize messages into model inputs and loss targets.

    Every message is encoded twice with the module-level ``tokenizer``: once
    prefixed with the BOS token (the model input) and once suffixed with the
    EOS token (the target). Both encodings use ``padding=True``, truncation
    and ``max_length=40``, and are placed on the CPU.

    Args:
        batch_msgs: iterable of message strings.

    Returns:
        ``(inputs, outputs)`` tokenizer encodings. In ``outputs['input_ids']``
        every padding id is replaced by -1, and any row that contains no EOS
        id has EOS written into its last position.
    """
    inputs = tokenizer([tokenizer.bos_token + msg for msg in batch_msgs], return_tensors='pt', padding=True, truncation=True, max_length=40).to('cpu')
    outputs = tokenizer([msg + tokenizer.eos_token for msg in batch_msgs], return_tensors='pt', padding=True, truncation=True, max_length=40).to('cpu')

    target_ids = outputs['input_ids']
    # Mask every padding position in a single in-place tensor operation
    # instead of visiting each token from Python.
    target_ids[target_ids == tokenizer.pad_token_id] = -1
    # Rows with no EOS id left (checked after padding was masked) get EOS
    # forced into their final position.
    has_eos = (target_ids == tokenizer.eos_token_id).any(dim=1)
    target_ids[~has_eos, -1] = tokenizer.eos_token_id
    return inputs, outputs

In [9]:
# Score the sample texts; the result holds one value per entry of `texts`.
score(texts)

tensor([0.6252, 0.7058, 0.6966])

In [2]:
# Learning-rate schedule settings passed to MultiStepLR: the learning rate is
# multiplied by `gamma` at each epoch index listed in `milestones`.
milestones = list(range(3, n_epochs))
gamma = 0.8

params = Parameters()

# Load the dataset and keep only the entries of `msg_clean` that are strings.
df = pd.read_csv(params.dataset_filename, low_memory=False)
msgs = [msg for msg in df.msg_clean.tolist() if isinstance(msg, str)]

N = len(msgs)
N_train = int(N * train_test_split)
N_val = N - N_train
# Seed the split so that the train/validation partition is the same on every
# run; an unseeded split makes losses from different runs incomparable.
split_seed = 0
d_train, d_val = torch.utils.data.random_split(
    msgs,
    [N_train, N_val],
    generator=torch.Generator().manual_seed(split_seed),
)

fluency = Fluency()

class Dataset:
    """Map-style dataset over messages pre-tokenized by ``fluency``.

    All messages are tokenized once at construction time; indexing returns the
    tensors of a single example moved to ``params.device``.
    """

    def __init__(self, msgs):
        """Tokenize ``msgs`` with ``fluency.preprocess_input`` and keep the id tensors."""
        encoded_in, encoded_out = fluency.preprocess_input(msgs)
        self.input_ids = encoded_in.input_ids
        self.attn_mask = encoded_in.attention_mask
        self.outputs = encoded_out.input_ids

    def __getitem__(self, i):
        """Return ``(input_ids, attention_mask, target_ids)`` for example ``i``."""
        fields = (self.input_ids, self.attn_mask, self.outputs)
        return tuple(field[i].to(params.device) for field in fields)

    def __len__(self):
        """Number of examples, taken from the target id tensor."""
        return len(self.outputs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded default fluency model, tokenizer


In [3]:
# Tokenize both splits up front, then wrap them in data loaders.
print('Creating train dataset...',end='')
dataset_train = Dataset(list(d_train))
print('done\nCreating val dataset...',end='')
dataset_val = Dataset(list(d_val))
print('done\nCreating dataloaders...',end='')
dl_train = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True, drop_last=True)
# Validation is not shuffled: combined with drop_last=True, shuffling would
# drop a different remainder of examples each epoch, so validation losses
# would not be computed on the same examples from one epoch to the next.
dl_val = DataLoader(dataset=dataset_val, batch_size=32, shuffle=False, drop_last=True)
print('done')

# Targets equal to -1 are excluded from the loss.
crit = torch.nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.AdamW(fluency.model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma)

# Disabled: creation of the output directories and writing of the run
# parameters, tokenizer and loss-log header.
# if not os.path.isdir(params.model_today_dir):
#     os.mkdir(params.model_today_dir)
# if not os.path.isdir(params.base_fluency_dir):
#     os.mkdir(params.base_fluency_dir)

# params.write_params(params.base_fluency_dir)
# lines = ['\nlearning rate = {}\t milestones = {}\tgamma = {}'.format(learning_rate, milestones, gamma)]
# params.write_params(params.base_fluency_dir, lines)
# fluency.tokenizer.save_pretrained(params.fluency_tokenizer_dir)
# lines = ['\nepoch,train_loss,val_loss']
# params.write_params(params.base_fluency_dir, lines)

Creating train dataset...done
Creating val dataset...done
Creating dataloaders...

In [4]:
# Decode token ids 50257 and 50258 to see which tokens they correspond to.
fluency.tokenizer.decode([50257, 50258])

'<|startoftext|> <|pad|>'

In [None]:
# Training loop: one pass over the training loader and one over the validation
# loader per epoch, printing the mean per-batch loss of each.
for i_epoch in range(n_epochs):
    total_train_loss = 0
    total_val_loss = 0

    fluency.model.train()
    for ib, batch in enumerate(tqdm(dl_train, desc='Train, epoch #{} - LR={}'.format(i_epoch, optimizer.param_groups[0]["lr"]))):
        input_ids, attn_mask, outputs = batch
        res = fluency.model(input_ids=input_ids, attention_mask=attn_mask)
        logits = res.logits
        loss = crit(logits.view(-1,len(fluency.tokenizer)), outputs.view(-1))
        # Gradients accumulate across batches until the next optimizer step.
        loss.backward()
        total_train_loss += loss.item()
        # The parentheses matter: `%` binds tighter than `+`, so
        # `ib + 1 % optim_every` means `ib + (1 % optim_every)`. With
        # optim_every == 1 that is `ib + 0`, which equals 0 only for the first
        # batch, so the optimizer would step once per epoch instead of once
        # every `optim_every` batches.
        if (ib + 1) % optim_every == 0:
            optimizer.step()
            optimizer.zero_grad()

    # The learning-rate schedule advances once per epoch.
    scheduler.step()
    avg_train_loss = total_train_loss / len(dl_train)

    fluency.model.eval()
    with torch.no_grad():
        for ib, batch in enumerate(tqdm(dl_val, desc='Val, epoch #{}'.format(i_epoch))):
            input_ids, attn_mask, outputs = batch
            res = fluency.model(input_ids=input_ids, attention_mask=attn_mask)
            logits = res.logits
            loss = crit(logits.view(-1, len(fluency.tokenizer)), outputs.view(-1))
            total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(dl_val)

    # One CSV-style record per epoch: epoch index, train loss, validation loss.
    lines = ['\n'+','.join([str(item) for item in [i_epoch, avg_train_loss, avg_val_loss]])]
    # params.write_params(params.base_fluency_dir, lines)
    print(lines)
    # fluency.model.save_pretrained(params.fluency_model_dir.format(i_epoch))

In [None]:
for ib, batch in enumerate(dl_train):
    input_ids, attn_mask, outputs = batch
    for inpid in input_ids:
        if inpid[0] ==