In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import torch
import pickle
import numpy as np
from tqdm import tqdm
from torch import nn, optim
from torch.utils.data import DataLoader
from sentence_transformers import CrossEncoder

import load_data
from load_data import GenderDataset, gender_data_collate_fn
from models.encoder_t5 import EncoderT5

In [3]:
# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


In [4]:
# Hyper parameters

# Number of examples per batch; passed as `batch_size` to every DataLoader in this notebook.
encoder_batch_size = 8

In [5]:
# Load the blog corpus from ./data/blog.json.
# The encoding is pinned to UTF-8 (the JSON default) so the read does not depend on
# the platform's locale encoding (e.g. cp1252 on Windows).
with open(os.path.join(os.curdir, "data", "blog.json"), "r", encoding="utf-8") as file:
    json_data = json.load(file)
docs = json_data['docs'][1:]  # I don't want to see the first document

In [6]:
# 70% / 15% / remainder split sizes for train / validation / test.
num_docs = len(docs)
train_fraction, val_fraction = 0.7, 0.15
num_train_docs = int(num_docs * train_fraction)
num_val_docs = int(num_docs * val_fraction)
# The test split absorbs whatever the two truncations above left over.
num_test_docs = num_docs - (num_train_docs + num_val_docs)
print(num_train_docs, num_val_docs, num_test_docs)

13773 2951 2952


In [7]:
# Contiguous, non-overlapping slices in document order: train, then val, then test.
val_end = num_train_docs + num_val_docs
train_docs = docs[:num_train_docs]
val_docs = docs[num_train_docs:val_end]
test_docs = docs[val_end:]

In [8]:
# Build the three GenderDataset splits, caching them as pickles under ./data so
# later runs can skip construction.
data_dir = os.path.join(os.curdir, "data")
split_docs = {"train": train_docs, "val": val_docs, "test": test_docs}
pickle_paths = {
    name: os.path.join(data_dir, f"{name}.pickle") for name in split_docs
}

# Only trust the cache when *every* split is on disk; checking train.pickle alone
# would crash with FileNotFoundError if val/test pickles were missing.
# NOTE(review): the cache is not invalidated when blog.json or the split sizes
# change -- delete the pickles manually after changing either.
load_from_pickled = all(os.path.exists(path) for path in pickle_paths.values())

datasets = {}
if load_from_pickled:
    # SECURITY: pickle.load can execute arbitrary code; only load pickles that
    # were written by this notebook.
    for name, path in pickle_paths.items():
        with open(path, "rb") as f:
            datasets[name] = pickle.load(f)
else:
    # Build all splits first, then persist them, so a failure during
    # construction does not leave a partially written cache behind.
    for name, split in split_docs.items():
        datasets[name] = GenderDataset(split)
    for name, path in pickle_paths.items():
        with open(path, "wb") as f:
            pickle.dump(datasets[name], f)

train_dataset = datasets["train"]
val_dataset = datasets["val"]
test_dataset = datasets["test"]

print(load_from_pickled)

True


In [9]:
# All three loaders share the same configuration; only the dataset differs.
# NOTE(review): shuffle=True is also applied to the val/test loaders -- confirm
# this is intended, since evaluation order is usually kept fixed.
loader_kwargs = dict(
    batch_size=encoder_batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=gender_data_collate_fn,
)

train_dataloader = DataLoader(train_dataset, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, **loader_kwargs)

In [10]:
# Peek at the first batch yielded by the test loader; enumerate pairs it with its index (0).
next(enumerate(test_dataloader))

(0,
 (tensor([[ 2057,  1014,  2013,  ...,  1015, 13571, 11838],
          [ 2209,  2176, 10235,  ...,  2164,  2101, 13018],
          [ 2004,  2026,  5462,  ...,  2208,  1009,  8296],
          ...,
          [26864, 15478,  1015,  ...,  2194,  2130,  2004],
          [ 6207, 11386,  7511,  ...,  1014, 10400, 11608],
          [ 1015,  2002,  2000,  ...,  1016,  1016,  1016]], dtype=torch.int32),
  tensor([128, 128, 128, 128, 128, 128, 128, 128], dtype=torch.int32),
  tensor([0, 1, 1, 1, 0, 1, 1, 1])))

In [11]:
# Sample on how to initialize an encoder

# The encoder's vocabulary size is taken from the tokenizer exposed by load_data.
vocab_size = load_data.tokenizer.vocab_size
encoder_model = EncoderT5(vocab_size=vocab_size)
encoder_model = encoder_model.to(device)

Some weights of the model checkpoint at google/t5-v1_1-small were not used when initializing T5EncoderModel: ['decoder.block.4.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.1.layer.0.SelfAttention.q.weight', 'decoder.block.2.layer.0.SelfAttention.o.weight', 'decoder.block.2.layer.1.EncDecAttention.o.weight', 'decoder.block.5.layer.1.layer_norm.weight', 'decoder.block.2.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.layer_norm.weight', 'decoder.block.0.layer.0.SelfAttention.v.weight', 'decoder.final_layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.k.weight', 'decoder.block.3.layer.1.EncDecAttention.q.weight', 'decoder.block.2.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.3.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.4.layer.1.EncDecAttention.q.weight', 'decoder.block.5.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.1.EncDecAttention.o.weight', 'decoder.block.7.layer.0.SelfAttention.q.weigh

In [12]:
# Sample on how to forward on the encoder

# Take the first training batch; the collate function yields (ids, lengths, labels).
batch, first_batch = next(enumerate(train_dataloader))
src_ids, src_len, tgt = first_batch

# Only the tensors used on the model's device are moved; src_len stays where it is.
src_ids, tgt = src_ids.to(device), tgt.to(device)
logits = encoder_model(src_ids)
print(logits.size())

torch.Size([8, 128, 30527])


In [13]:
from sentence_transformers import CrossEncoder
# Example CrossEncoder usage, kept for reference (`model` is not defined in this notebook):
# scores = model.predict([('Sent A1', 'Sent B1'), ('Sent A2', 'Sent B2')])
# print(scores)

In [14]:
# Show the first sequence of the sampled batch after passing its ids through the dataset's idx2str.
print(train_dataset.idx2str(src_ids[0]))

moved on to other guys already - lrb - amazing how fast she can get over me - rrb -. and well the other few friends - lrb - at least i called them my friends - - i do n ' t really know if they would ever do the same - rrb - are either moved back home - lrb - melissa o. - rrb -, moved to denver - lrb - megan b., aaron, and cindy - rrb -, or i just do n ' t know how to ask them to talk to me or hang out with me - lrb - brandon, jeremy, nathan,


In [65]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model and put its underlying transformer on `device`
# in inference mode (eval() disables dropout and similar train-only behavior).
st_model = SentenceTransformer('all-mpnet-base-v2')
st_transformer = st_model[0]
st_transformer.auto_model = st_transformer.auto_model.to(device)
st_transformer.auto_model.eval()
#rst_logits=logits
def calc_similarity(src_ids, rst_logits):
    """Cosine similarity between source token ids and soft token predictions.

    Both inputs are mapped into the word-embedding space of the global
    ``st_model`` transformer, wrapped with its CLS/SEP embeddings, encoded, and
    compared. The logits path uses a softmax-weighted mix of embedding rows
    instead of a hard token lookup, so it stays differentiable w.r.t. the logits.

    Args:
        src_ids: (batch, seq) integer tensor of token ids that index
            ``st_model``'s word-embedding matrix.
        rst_logits: (batch, seq, vocab) tensor of unnormalized scores, where
            ``vocab`` must match the number of rows of that embedding matrix.

    Returns:
        (batch,) tensor holding the dot product of the L2-normalized encodings
        of the two inputs.

    Relies on the notebook globals ``st_model`` and ``device``.
    """
    encoder = st_model[0].auto_model
    vocab_emb = encoder.embeddings.word_embeddings.weight
    batch_size = src_ids.shape[0]

    # One column of CLS ids and one of SEP ids, used for both the id path and
    # the embedding path below.
    cls_ids = torch.full((batch_size, 1), st_model.tokenizer.cls_token_id, dtype=torch.long).to(device)
    sep_ids = torch.full((batch_size, 1), st_model.tokenizer.sep_token_id, dtype=torch.long).to(device)
    cls_emb = vocab_emb[cls_ids].to(device)
    sep_emb = vocab_emb[sep_ids].to(device)

    # Source side: hard lookup of [CLS] + tokens + [SEP].
    src_full = torch.cat([cls_ids, src_ids, sep_ids], dim=1)
    src_embed = vocab_emb[src_full].to(device)

    # Target side: expected embedding under the softmax distribution, then the
    # same CLS/SEP framing.
    token_probs = torch.softmax(rst_logits, dim=-1)
    tgt_embed = torch.cat([cls_emb, token_probs @ vocab_emb.to(device), sep_emb], dim=1)

    # NOTE(review): index 1 of the model output is presumably the pooled output,
    # not the mean pooling SentenceTransformer normally applies -- confirm this
    # is the intended sentence representation.
    src_encode = encoder.forward(inputs_embeds=src_embed)[1]
    tgt_encode = encoder.forward(inputs_embeds=tgt_embed)[1]

    # Unit-normalize so the row-wise dot product is a cosine similarity.
    src_unit = torch.nn.functional.normalize(src_encode, p=2, dim=1)
    tgt_unit = torch.nn.functional.normalize(tgt_encode, p=2, dim=1)
    return (src_unit * tgt_unit).sum(dim=1)

In [51]:
# No gradients are needed for this inspection, so skip building the autograd graph.
with torch.no_grad():
    similarity = calc_similarity(src_ids, logits)
print(similarity)

tensor([0.0574, 0.0633, 0.2124, 0.0368, 0.2187, 0.2120, 0.2563, 0.0633],
       device='cuda:0')


In [1]:
# sanity check: the similarity should be ~1 for the same sentence
#
# Build logits whose softmax is (numerically) one-hot on the source token at
# every position. The gap between the true token and the rest must be large:
# with a ~30k-token vocabulary a gap of only 10 leaves roughly 0.42 of the
# probability mass on the true token (1 / (1 + 30526 * e^-10)), so the "same
# sentence" input would actually be a blend of all embeddings.
OFF_TOKEN_LOGIT = -1e4
new_logits = torch.full(logits.shape, OFF_TOKEN_LOGIT).to(device)
# Set the logit of each position's source token to 0 in one vectorized scatter
# (index must be int64 and shaped (batch, seq, 1) to address the vocab axis).
new_logits.scatter_(dim=2, index=src_ids.long().unsqueeze(-1), value=0.0)

NameError: name 'torch' is not defined

In [2]:
# Evaluate the sanity-check logits without tracking gradients.
with torch.no_grad():
    sanity_similarity = calc_similarity(src_ids, new_logits)
print(sanity_similarity)

NameError: name 'torch' is not defined