In [1]:
import joblib

In [2]:
train_embeddings = joblib.load('train_embeddings_logits_only.joblib')

In [3]:
train_embeddings[0].keys()

dict_keys(['logits_and_indices', 'input_ids'])

### Training from embeddings

In [4]:
import torch
import transformers
import torch.nn.functional as F

from tqdm import tqdm
from transformers import GPT2LMHeadModel, AutoTokenizer

In [20]:
gpt_small = GPT2LMHeadModel.from_pretrained("gpt2")

In [6]:
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

In [7]:
def create_text_from_summary_and_dialogue(summary, dialogue):
    text = f"""
A partial summary of the conversation is:
{summary}

With the dialogue being:
{dialogue}
    """.strip()
    
    return text.replace('\r\n', '\n')

In [8]:
summary = """
Alberto is a customer. Alberto ordered pizza and lasagne at Dominos one hour ago.
Alberto wants to know where his pizza is. Alberto is calling Dominos to know where his orders are.
John works at Dominos. John says that his pizza is almost ready. It will be at Alberto's home in 10 minutes. 
The lasagne is already on its way with the delivery person. Unfortunately the driver had an accident and is now in the hospital.
If alberto says "rabbit" then John replies "horse". If Alberto says "Tree" John says "red".
""".strip().replace('\n', ' ')

dialogue = """
Alberto: What happened to my pizza?
John: 
""".strip()

text = create_text_from_summary_and_dialogue(summary, dialogue)

prompt = tokenizer(text, return_tensors='pt').to('cuda')
prompt = {key: value for key, value in prompt.items()}

In [9]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def batchify(data, n):
    len_dict = {}
    for item in data:
        length = item.shape[1]
        try:
            len_dict[length].append(item)
        except:
            len_dict[length] = [item]

    batch_chunks = []
    for k in len_dict.keys():
        vectors = len_dict[k]
        batch_chunks += chunks(vectors, n)

    batches = []
    for chunk in batch_chunks:
        inputs = torch.stack([item[0] for item in chunk])
        batches.append((inputs))

    return batches

In [10]:
import json

val = json.load(open('../data/val.json'))

_limit = 1024
dev_data = []
total_skipped = 0
for item in val:
    text = create_text_from_summary_and_dialogue(item["summary"], item["dialogue"])
    tokens = tokenizer.encode(text, return_tensors='pt')
    if tokens.shape[1] > _limit:
        tokens = tokens[:, :_limit]
    dev_data.append(tokens)
    
print(f'Skipped {total_skipped} out of {len(val)}')

dev_batches = batchify(dev_data, 1)

def test(test_model, batches):
    test_model.eval()
    total_loss = 0.
    #for i, batch in tqdm(enumerate(batches), total=len(batches)):
    for i, batch in enumerate(batches):
        test_model.eval()
        inputs = batch
        loss = test_model(inputs.cuda(), labels=inputs.cuda())[0]
        total_loss += loss.item()

    return total_loss / len(batches)

Skipped 0 out of 818


In [21]:
_ = gpt_small.cuda()

In [12]:
print('Dev loss:', test(gpt_small, dev_batches))

Dev loss: 4.009833007103657


In [13]:
def get_probability_vector(log_prob_dict, temp):
    _vocab_size = 50257
    
    logits = torch.tensor(log_prob_dict['logits'])
    num_tokens = logits.shape[1]
    indices = torch.tensor(log_prob_dict['indices'])
    vectors = []
    
    for index_set, logs in zip(indices[0], logits[0]):
        v = torch.sparse_coo_tensor([index_set.tolist()], logs, (_vocab_size, )).to_dense().float()
        v[v == 0] = torch.tensor(float('-inf'))
        vectors.append(v)

    vectors = torch.stack(vectors, dim=0)
    return F.softmax(vectors / temp, dim=-1)

In [14]:
get_probability_vector(train_embeddings[0]['logits_and_indices'], temp=10).shape

torch.Size([56, 50257])

In [15]:
import random
from torch.optim.lr_scheduler import StepLR

lr = 3e-5
optimizer = torch.optim.Adam(gpt_small.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=2, gamma=0.5)
epochs = 5

steps = 0
best_model = None
best_loss = 1e6
for epoch_num in range(epochs):
    gpt_small.train()
    temp = 30
    random.shuffle(train_embeddings)
    
    for item in tqdm(train_embeddings):
        input_ids = torch.tensor([item['input_ids']]).cuda()
        label_p = get_probability_vector(item['logits_and_indices'], temp=temp).cuda()
        out_logits = gpt_small.forward(input_ids).logits
        out_p = F.softmax(out_logits / temp, dim=-1)
        
        loss = gpt_small(input_ids, labels=input_ids)[0]
        
        loss -=  temp * temp * torch.mean(torch.mul(torch.log(out_p).flatten(),
                                          label_p.flatten()))

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()
        
        steps += 1
        
        if steps % 2000 == 0:
            print("steps", steps)
            print('Dev loss:', test(gpt_small, dev_batches))
            if loss < best_loss:
                best_loss = loss
                best_model = gpt_small
            
    scheduler.step()

 14%|█████▏                                | 1998/14732 [03:49<26:16,  8.08it/s]

steps 2000


 14%|████▉                               | 2001/14732 [03:58<5:35:25,  1.58s/it]

Dev loss: 2.431924162546405


 27%|██████████▎                           | 3999/14732 [07:36<17:54,  9.99it/s]

steps 4000


 27%|█████████▊                          | 4002/14732 [07:44<3:20:20,  1.12s/it]

Dev loss: 2.398879644777489


 41%|███████████████▍                      | 5999/14732 [11:29<16:52,  8.62it/s]

steps 6000


 41%|██████████████▋                     | 6000/14732 [11:38<6:02:19,  2.49s/it]

Dev loss: 2.3750277270897677


 54%|████████████████████▋                 | 7999/14732 [15:11<11:33,  9.70it/s]

steps 8000


 54%|███████████████████▌                | 8000/14732 [15:19<4:03:58,  2.17s/it]

Dev loss: 2.3613543733407933


 68%|█████████████████████████▊            | 9999/14732 [18:46<08:31,  9.25it/s]

steps 10000


 68%|███████████████████████▊           | 10002/14732 [18:54<1:50:40,  1.40s/it]

Dev loss: 2.3492186436734745


 81%|██████████████████████████████▏      | 11999/14732 [22:21<04:52,  9.34it/s]

steps 12000


 81%|████████████████████████████▌      | 12001/14732 [22:29<1:20:09,  1.76s/it]

Dev loss: 2.340343264612417


 95%|███████████████████████████████████▏ | 13999/14732 [25:56<01:18,  9.31it/s]

steps 14000


 95%|███████████████████████████████████▏ | 14001/14732 [26:04<20:16,  1.66s/it]

Dev loss: 2.34194017622465


100%|█████████████████████████████████████| 14732/14732 [27:19<00:00,  8.98it/s]
  9%|███▎                                  | 1267/14732 [02:16<24:38,  9.11it/s]

steps 16000


  9%|███                                 | 1269/14732 [02:24<6:31:21,  1.74s/it]

Dev loss: 2.3437416345099655


 22%|████████▍                             | 3267/14732 [05:53<20:21,  9.38it/s]

steps 18000


 22%|███████▉                            | 3269/14732 [06:02<5:05:50,  1.60s/it]

Dev loss: 2.3482801039819323


 36%|█████████████▌                        | 5267/14732 [09:39<19:42,  8.01it/s]

steps 20000


 36%|████████████▊                       | 5268/14732 [09:48<7:11:29,  2.74s/it]

Dev loss: 2.3470166568942643


 49%|██████████████████▋                   | 7266/14732 [13:19<13:06,  9.49it/s]

steps 22000


 49%|█████████████████▊                  | 7269/14732 [13:27<2:56:32,  1.42s/it]

Dev loss: 2.350205917813084


 63%|███████████████████████▉              | 9267/14732 [16:56<09:39,  9.43it/s]

steps 24000


 63%|██████████████████████▋             | 9268/14732 [17:05<3:23:26,  2.23s/it]

Dev loss: 2.34581764417639


 76%|████████████████████████████▎        | 11267/14732 [20:31<06:10,  9.34it/s]

steps 26000


 76%|██████████████████████████▊        | 11268/14732 [20:39<2:15:57,  2.36s/it]

Dev loss: 2.3399405475059174


 90%|█████████████████████████████████▎   | 13267/14732 [24:06<02:32,  9.59it/s]

steps 28000


 90%|█████████████████████████████████▎   | 13270/14732 [24:14<30:53,  1.27s/it]

Dev loss: 2.3343154505295973


100%|█████████████████████████████████████| 14732/14732 [26:45<00:00,  9.18it/s]
  4%|█▍                                     | 535/14732 [00:56<25:29,  9.28it/s]

steps 30000


  4%|█▎                                   | 537/14732 [01:04<6:42:28,  1.70s/it]

Dev loss: 2.3687014776511996


 17%|██████▌                               | 2534/14732 [04:30<21:03,  9.65it/s]

steps 32000


 17%|██████▏                             | 2536/14732 [04:38<5:40:31,  1.68s/it]

Dev loss: 2.426964402927455


 31%|███████████▋                          | 4535/14732 [08:16<18:32,  9.17it/s]

steps 34000


 31%|███████████                         | 4537/14732 [08:25<4:34:03,  1.61s/it]

Dev loss: 2.4412637476816155


 35%|█████████████▎                        | 5139/14732 [09:32<17:47,  8.98it/s]


KeyboardInterrupt: 

In [None]:
print('Dev loss:', test(best_model, dev_batches))

In [32]:
best_model.save_pretrained(f'gpt_small_temp{temp}_lr{lr}')

NameError: name 'best_model' is not defined

In [41]:
gpt_small = GPT2LMHeadModel.from_pretrained("gpt_small_temp30_lr1e-05")
_ = gpt_small.cuda()

In [42]:
_ = gpt_small.eval()

In [43]:
device = 'cuda'

In [48]:
dialogue = """
John: Hello, how can I help?
""".strip()


print(dialogue)

while True:
    user_input = input()
    dialogue += "\nAlberto: " + user_input + "\nJohn: "
    text = create_text_from_summary_and_dialogue(summary, dialogue)
    prompt = tokenizer(text, return_tensors='pt')
    prompt = {key: value.to(device) for key, value in prompt.items()}
    out = gpt_small.generate(**prompt, max_length=prompt['input_ids'].shape[1] + 25, do_sample=False)
    out = out[0][prompt['input_ids'].shape[1]:]
    answer = tokenizer.decode(out)
    answer = answer[:answer.find('\n')].strip()
    print(answer)
    dialogue += answer

John: Hello, how can I help?
Tree


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Ok, I'm in the office now.


KeyboardInterrupt: Interrupted by user

In [None]:
%%time

summary = """
Alberto is a customer. Alberto ordered pizza and lasagne at Dominos one hour ago.
Alberto wants to know where his pizza is. Alberto is calling Dominos to know where his orders are.
John works at Dominos. John says that his pizza is almost ready. It will be at Alberto's home in 10 minutes. 
The lasagne is already on its way with the delivery person. Unfortunately the driver had an accident and is now in the hospital.
If alberto says "rabbit" then John replies "horse". If Alberto says "Tree" John says "red".
""".strip().replace('\n', ' ')

dialogue = """
Alberto: What happened to my pizza?
John: It's in the delivery man's car.
Alberto: And where is the delivery man?
""".strip()

text = create_text_from_summary_and_dialogue(summary, dialogue)

prompt = tokenizer(text, return_tensors='pt').to('cuda')
prompt = {key: value for key, value in prompt.items()}
out = gpt_small.generate(**prompt, max_length=prompt['input_ids'].shape[1] + 10, do_sample=False)
print(tokenizer.decode(out[0]))

In [None]:
gpt_small.save_pretrained('./gptj_small')

#####

In [None]:
import torch

In [None]:
s = torch.sparse_coo_tensor([[1,2,3]], [4,5,6], (1, 3))

In [None]:
s

In [None]:
torch.index_select(v, 0, [1,2,3], y)

In [None]:
v