In [1]:
import joblib

In [2]:
train_embeddings = joblib.load('train_embeddings_logits_only.joblib')

In [3]:
train_embeddings[0].keys()

dict_keys(['logits_and_indices', 'input_ids'])

### Training from embeddings

In [4]:
import torch
import transformers
import torch.nn.functional as F

from tqdm import tqdm
from transformers import GPT2LMHeadModel, AutoTokenizer

In [5]:
gpt_small = GPT2LMHeadModel.from_pretrained("gpt2-medium")

In [6]:
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

In [7]:
def create_text_from_summary_and_dialogue(summary, dialogue):
    text = f"""
A partial summary of the conversation is:
{summary}

With the dialogue being:
{dialogue}
    """.strip()
    
    return text.replace('\r\n', '\n')

In [8]:
summary = """
Alberto is a customer. Alberto ordered pizza and lasagne at Dominos one hour ago.
Alberto wants to know where his pizza is. Alberto is calling Dominos to know where his orders are.
John works at Dominos. John says that his pizza is almost ready. It will be at Alberto's home in 10 minutes. 
The lasagne is already on its way with the delivery person. Unfortunately the driver had an accident and is now in the hospital.
If alberto says "rabbit" then John replies "horse". If Alberto says "Tree" John says "red".
""".strip().replace('\n', ' ')

dialogue = """
Alberto: What happened to my pizza?
John: 
""".strip()

text = create_text_from_summary_and_dialogue(summary, dialogue)

prompt = tokenizer(text, return_tensors='pt')
prompt = {key: value for key, value in prompt.items()}

In [9]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def batchify(data, n):
    len_dict = {}
    for item in data:
        length = item.shape[1]
        try:
            len_dict[length].append(item)
        except:
            len_dict[length] = [item]

    batch_chunks = []
    for k in len_dict.keys():
        vectors = len_dict[k]
        batch_chunks += chunks(vectors, n)

    batches = []
    for chunk in batch_chunks:
        inputs = torch.stack([item[0] for item in chunk])
        batches.append((inputs))

    return batches

In [10]:
import json

val = json.load(open('../data/val.json'))

_limit = 1024
dev_data = []
total_skipped = 0
for item in val:
    text = create_text_from_summary_and_dialogue(item["summary"], item["dialogue"])
    tokens = tokenizer.encode(text, return_tensors='pt')
    if tokens.shape[1] > _limit:
        tokens = tokens[:, :_limit]
    dev_data.append(tokens)
    
print(f'Skipped {total_skipped} out of {len(val)}')

dev_batches = batchify(dev_data, 1)

def test(test_model, batches):
    test_model.eval()
    total_loss = 0.
    #for i, batch in tqdm(enumerate(batches), total=len(batches)):
    for i, batch in enumerate(batches):
        test_model.eval()
        inputs = batch
        loss = test_model(inputs, labels=inputs)[0]
        total_loss += loss.item()

    return total_loss / len(batches)

Skipped 0 out of 818


In [11]:
print('Dev loss:', test(gpt_small, dev_batches))

Dev loss: 3.37475347081723


In [12]:
def get_probability_vector(log_prob_dict, temp):
    _vocab_size = 50257
    
    logits = torch.tensor(log_prob_dict['logits'])
    num_tokens = logits.shape[1]
    indices = torch.tensor(log_prob_dict['indices'])
    vectors = []
    
    for index_set, logs in zip(indices[0], logits[0]):
        v = torch.sparse_coo_tensor([index_set.tolist()], logs, (_vocab_size, )).to_dense().float()
        v[v == 0] = torch.tensor(float('-inf'))
        vectors.append(v)

    vectors = torch.stack(vectors, dim=0)
    return F.softmax(vectors / temp, dim=-1)

In [13]:
get_probability_vector(train_embeddings[0]['logits_and_indices'], temp=10).shape

torch.Size([56, 50257])

In [14]:
import random
from torch.optim.lr_scheduler import StepLR

lr = 1e-5
gamma = 0.9
optimizer = torch.optim.Adam(gpt_small.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=2, gamma=0.9)
epochs = 5
trace_steps = 5000

steps = 0
best_loss = 1e6
for epoch_num in range(epochs):
    temp = 2
    random.shuffle(train_embeddings)
    
    for item in tqdm(train_embeddings):
        gpt_small.train()
        input_ids = torch.tensor([item['input_ids']])
        label_p = get_probability_vector(item['logits_and_indices'], temp=temp)
        out_logits = gpt_small.forward(input_ids).logits
        out_p = F.softmax(out_logits / temp, dim=-1)
        
        #loss = gpt_small(input_ids, labels=input_ids)[0]
        
        loss = - torch.mean(torch.mul(torch.log(out_p).flatten(),
                                      label_p.flatten()))

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()
        
        steps += 1
        
        if steps % trace_steps == 0:
            print("steps", steps)
            print('Dev loss:', test(gpt_small, dev_batches))
            if loss < best_loss:
                best_loss = loss
                torch.save({'epoch': epoch_num,
                            'steps': steps,
                            'model_state_dict': gpt_small.state_dict()},
                            'save_medium' + str(steps))
            
    scheduler.step()

 34%|███████████▌                      | 4999/14732 [3:12:52<6:06:44,  2.26s/it]

steps 5000
Dev loss: 3.135260449644989


 68%|███████████████████████           | 9999/14732 [6:32:22<3:06:43,  2.37s/it]

steps 10000
Dev loss: 3.147309991141695


100%|███████████████████████████████████| 14732/14732 [9:41:40<00:00,  2.37s/it]
  2%|▋                                    | 267/14732 [10:18<9:01:42,  2.25s/it]

steps 15000


  2%|▌                                 | 268/14732 [17:18<512:33:04, 127.57s/it]

Dev loss: 3.1546648274424025


 36%|████████████▏                     | 5267/14732 [3:31:29<6:10:40,  2.35s/it]

steps 20000
Dev loss: 3.146964810588249


 37%|████████████▌                     | 5419/14732 [3:44:36<6:25:59,  2.49s/it]


KeyboardInterrupt: 

In [None]:
print('Dev loss:', test(gpt_small, dev_batches))

In [None]:
gpt_small.save_pretrained(f'gpt_medium_temp{temp}_lr{lr}_sched{0.5}')

In [None]:
gpt_small = GPT2LMHeadModel.from_pretrained("gpt_medium_temp10_lr1e-05_sched0.9")

In [15]:
_ = gpt_small.eval()

In [16]:
device = 'cpu'

In [20]:
dialogue = """
John: Hello, how can I help?
""".strip()


print(dialogue)

while True:
    user_input = input()
    dialogue += "\nAlberto: " + user_input + "\nJohn: "
    text = create_text_from_summary_and_dialogue(summary, dialogue)
    prompt = tokenizer(text, return_tensors='pt')
    prompt = {key: value.to(device) for key, value in prompt.items()}
    out = gpt_small.generate(**prompt, max_length=prompt['input_ids'].shape[1] + 25, do_sample=False)
    out = out[0][prompt['input_ids'].shape[1]:]
    answer = tokenizer.decode(out)
    answer = answer[:answer.find('\n')].strip()
    print(answer)
    dialogue += answer

John: Hello, how can I help?
how is my pizza doing


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


it's almost ready
when will I receive it


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


in 10 minutes
what about my lasagne


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


it's almost ready


KeyboardInterrupt: Interrupted by user

In [None]:
%%time

summary = """
Alberto is a customer. Alberto ordered pizza and lasagne at Dominos one hour ago.
Alberto wants to know where his pizza is. Alberto is calling Dominos to know where his orders are.
John works at Dominos. John says that his pizza is almost ready. It will be at Alberto's home in 10 minutes. 
The lasagne is already on its way with the delivery person. Unfortunately the driver had an accident and is now in the hospital.
If alberto says "rabbit" then John replies "horse". If Alberto says "Tree" John says "red".
""".strip().replace('\n', ' ')

dialogue = """
Alberto: What happened to my pizza?
John: It's in the delivery man's car.
Alberto: And where is the delivery man?
""".strip()

text = create_text_from_summary_and_dialogue(summary, dialogue)

prompt = tokenizer(text, return_tensors='pt').to('cuda')
prompt = {key: value for key, value in prompt.items()}
out = gpt_small.generate(**prompt, max_length=prompt['input_ids'].shape[1] + 10, do_sample=False)
print(tokenizer.decode(out[0]))

In [None]:
gpt_small.save_pretrained('./gptj_small')

#####

In [None]:
import torch

In [None]:
s = torch.sparse_coo_tensor([[1,2,3]], [4,5,6], (1, 3))

In [None]:
s

In [None]:
torch.index_select(v, 0, [1,2,3], y)

In [None]:
v