In [1]:
import json
import torch
import torch.nn as nn
import random
from tqdm import tqdm

from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

train = json.load(open('../data/train.json'))
val = json.load(open('../data/val.json'))
test = json.load(open('../data/test.json'))

In [3]:
train[1]

{'id': '13728867',
 'summary': 'Olivia and Olivier are voting for liberals in this election. ',
 'dialogue': 'Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great'}

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [5]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def batchify(data, n):
    len_dict = {}
    for item in data:
        length = item.shape[1]
        try:
            len_dict[length].append(item)
        except:
            len_dict[length] = [item]

    batch_chunks = []
    for k in len_dict.keys():
        vectors = len_dict[k]
        batch_chunks += chunks(vectors, n)

    batches = []
    for chunk in batch_chunks:
        inputs = torch.stack([item[0] for item in chunk])
        batches.append((inputs))

    return batches

In [6]:
def create_text_from_summary_and_dialogue(summary, dialogue):
    text = f"""
A partial summary of the conversation is:
{summary}

With the dialogue being:
{dialogue}
    """.strip()
    
    return text.replace('\r\n', '\n')

In [7]:
_limit = 1024
train_data = []
total_skipped = 0
for item in train:
    text = create_text_from_summary_and_dialogue(item["summary"], item["dialogue"])
    tokens = tokenizer.encode(text, return_tensors='pt')
    if tokens.shape[1] > _limit:
        tokens = tokens[:, :_limit]
    train_data.append(tokens)
    
print(f'Skipped {total_skipped} out of {len(train)}')

Token indices sequence length is longer than the specified maximum sequence length for this model (1111 > 1024). Running this sequence through the model will result in indexing errors


Skipped 0 out of 14732


In [8]:
_limit = 1024
dev_data = []
total_skipped = 0
for item in val:
    text = create_text_from_summary_and_dialogue(item["summary"], item["dialogue"])
    tokens = tokenizer.encode(text, return_tensors='pt')
    if tokens.shape[1] > _limit:
        tokens = tokens[:, :_limit]
    dev_data.append(tokens)
    
print(f'Skipped {total_skipped} out of {len(val)}')

Skipped 0 out of 818


In [9]:
def train(train_model, batches, optimizer, criterion):
    train_model.train()
    total_loss = 0.
    for i, batch in tqdm(enumerate(batches), total=len(batches)):
        model.train()
        inputs = batch
        optimizer.zero_grad()
        loss = train_model(inputs.cuda(), labels=inputs.cuda())[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(train_model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(batches)

def test(test_model, batches):
    test_model.eval()
    total_loss = 0.
    for i, batch in tqdm(enumerate(batches), total=len(batches)):
        test_model.eval()
        inputs = batch
        loss = test_model(inputs.cuda(), labels=inputs.cuda())[0]
        total_loss += loss.item()

    return total_loss / len(batches)

In [10]:
train_batches = batchify(train_data, 1)
dev_batches = batchify(dev_data, 1)

In [11]:
from torch.optim.lr_scheduler import StepLR

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

model.cuda()

random.shuffle(train_batches)
scheduler = StepLR(optimizer, step_size=2, gamma=0.8)
for epoch in range(10):
    random.shuffle(train_batches)
    loss = train(model, train_batches, optimizer, criterion)
    print('Epoch:', epoch, 'Loss:', loss)
    print('Dev loss:', test(model, dev_batches))
    torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict()},
                'save_small' + str(epoch))
    scheduler.step()

100%|█████████████████████████████████████| 14732/14732 [14:10<00:00, 17.32it/s]


Epoch: 0 Loss: 2.396097652887217


100%|█████████████████████████████████████████| 818/818 [00:09<00:00, 90.66it/s]


Dev loss: 2.2844983887264374


100%|█████████████████████████████████████| 14732/14732 [14:15<00:00, 17.21it/s]


Epoch: 1 Loss: 2.2025161141647516


100%|█████████████████████████████████████████| 818/818 [00:08<00:00, 95.47it/s]


Dev loss: 2.2554801596114573


100%|█████████████████████████████████████| 14732/14732 [14:08<00:00, 17.36it/s]


Epoch: 2 Loss: 2.075270324018932


100%|█████████████████████████████████████████| 818/818 [00:08<00:00, 95.38it/s]


Dev loss: 2.2475088891598296


 29%|███████████                           | 4274/14732 [04:07<10:04, 17.29it/s]


KeyboardInterrupt: 

# Testing

In [None]:
import numpy as np

max_probs = 5

def generate_answer_and_get_confidence(model, prompt):
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 50
    tokens_length = tokens.shape[1]
    if tokens_length + _length > 1024:
        return ''
    generated_entropy = 0
    while tokens.shape[-1] < tokens_length + _length:
        new_token = model(tokens.cuda())
        probs = torch.softmax(new_token.logits[:, -1, :], dim=-1)
        probs_and_indices = [(p, index) for index, p in enumerate(probs[0].cpu().detach())]
        probs_and_indices = sorted(probs_and_indices, key=lambda x: -x[0])
        probs = [item[0] for item in probs_and_indices[:max_probs]]
        generated_entropy -= np.dot(probs, np.log(probs))
        tokens = torch.cat([tokens, torch.tensor([[torch.argmax(new_token.logits[:, -1, :])]])], dim=-1)
        last_token = tokens[:, -1]
        last_output = tokenizer.decode(last_token, skip_special_tokens=True)
        if last_output == '\n':
            break
        
    print(tokens.shape)
    print(tokens_length)
    generated_output = tokens[:, tokens_length:]
    print(generated_output.shape)
    output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    end = output.find('\n')
    return output[:end].strip(), generated_entropy

In [None]:
checkpoint = torch.load('save_small' + str(2))
model.load_state_dict(checkpoint['model_state_dict'])
_ = model.cuda()

In [None]:
def generate_answer_with_typical_decoding(model, tokenizer, prompt):
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 50
    tokens_length = tokens.shape[1]
    if tokens_length + _length > 1024:
        return ''
    generated_entropy = 0
    while tokens.shape[-1] < tokens_length + _length:
        new_tokens = model(tokens.cuda())
        normalized = torch.nn.functional.log_softmax(new_tokens.logits, dim=-1)
        p = torch.exp(normalized)
        entropy = -(normalized * p).nansum(-1, keepdim=True)
        shifted_scores = torch.abs(normalized + entropy)
        pred_ids = torch.argmin(shifted_scores, dim=-1)
        last_token = pred_ids[:, -1].cpu().detach()
        tokens = torch.cat([tokens, torch.tensor([[last_token]])], dim=-1)
        last_output = tokenizer.decode(last_token, skip_special_tokens=True)
        if last_output == '\n':
            break
        
    generated_output = tokens[:, tokens_length:]
    output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    end = output.find('\n')
    return output[:end].replace('A: ', '').strip()

In [None]:
def generate_answer_greedy(model, tokenizer, prompt, max_length=50):
    tokens = tokenizer.encode(prompt, return_tensors="pt")
    tokens_length = tokens.shape[1]
    if tokens_length + max_length > 1024:
        return ""

    while tokens.shape[-1] < tokens_length + max_length:
        new_tokens = model(tokens.cuda())
        pred_ids = torch.argmax(new_tokens.logits, dim=-1)
        last_token = pred_ids[:, -1].cpu().detach()
        tokens = torch.cat([tokens, torch.tensor([[last_token]])], dim=-1)
        last_output = tokenizer.decode(last_token, skip_special_tokens=True)
        if last_output == "\n":
            break

    generated_output = tokens[:, tokens_length:]
    output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    end = output.find("\n")
    return output[:end].replace("A: ", "").strip()


In [None]:
summary = "The user asks how the weather is in London. The bot replies 'The chances of raining is 1% today; 0% chances of snow'."
dialogue = """
User: Is it going to snow today?
Bot: 
""".strip()

prompt = create_text_from_summary_and_dialogue(summary, dialogue)
generate_answer_with_typical_decoding(model, tokenizer, prompt)

In [None]:
summary = """
Alberto is a customer. Alberto ordered pizza at Dominos one hour ago.
Alberto wants to know where his pizza is. Alberto is calling Dominos to know where his orders are.
John works at Dominos. John says that his pizza is almost ready. It will be at Alberto's home in 10 minutes.
""".strip().replace('\n', ' ')

dialogue = """
Alberto: Hello, where is my pizza?
John: 
""".strip()

prompt = create_text_from_summary_and_dialogue(summary, dialogue)
print(generate_answer_greedy(model, tokenizer, prompt))

In [None]:
summary = """
Alberto is a customer. Alberto ordered pizza at Dominos one hour ago.
Alberto wants to know where his pizza is. Alberto is calling Dominos to know where his orders are.
John works at Dominos. John says that his pizza is almost ready. It will be at Alberto's home in 10 minutes.
""".strip().replace('\n', ' ')

dialogue = """
John: Hello, I am John. How can I help you?
""".strip()

prompt = create_text_from_summary_and_dialogue(summary, dialogue)
print(generate_answer_greedy(model, tokenizer, prompt))

In [None]:
print(dialogue)
while True:
    user_input = input()
    dialogue += "\nAlberto: " + user_input + "\nJohn: "
    prompt = create_text_from_summary_and_dialogue(summary, dialogue)
    answer = generate_answer_greedy(model, tokenizer, prompt)
    print(answer)
    dialogue += answer
    