In [49]:
import json
from transformers import BertModel, BertTokenizer, GPT2LMHeadModel, GPT2Tokenizer
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [52]:
# Load the tokenizer and encoder used for the embedding-based (AM) score.
#
# The load warning for this checkpoint reports that 'bert.pooler.dense.weight'
# and 'bert.pooler.dense.bias' are missing from it and get newly initialised,
# while the embedding cells of this notebook read `pooler_output`. Seeding
# torch right before loading pins the RNG state that initialisation draws
# from, so the embeddings do not change from one kernel restart to the next.
# NOTE(review): the pooler is still untrained even when seeded; consider
# pooling `last_hidden_state` for responses AND references together instead.
AM_CHECKPOINT_DIR = 'embedding_models/dstc6_am/'
AM_SEED = 0

torch.manual_seed(AM_SEED)
bert_tokenizer = BertTokenizer.from_pretrained(AM_CHECKPOINT_DIR)
bert_model = BertModel.from_pretrained(AM_CHECKPOINT_DIR)

Some weights of BertModel were not initialized from the model checkpoint at embedding_models/dstc6_am/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
# Put the BERT encoder into evaluation mode before extracting embeddings
# (this is what makes the Dropout layers listed in the module repr inactive).
# eval() is the last expression of the cell, so the module it returns is
# rendered as the cell output.
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [2]:
# Read the human-evaluation file and split it into parallel lists: one entry
# per evaluated item, in file order.
# The context manager closes the file handle once parsing is done, and the
# explicit encoding avoids depending on the platform's default codec.
with open('human_evaluation_data/dstc6_eval.json', 'r', encoding='utf-8') as eval_file:
    dstc6_data = json.load(eval_file)

response_list = [item['response'] for item in dstc6_data]
reference_list = [item['reference'] for item in dstc6_data]
# Each item stores its relevance annotations under item['annotations']['relevance'];
# they are averaged into a single human score per item.
human_relevance_scores = [np.mean(item['annotations']['relevance']) for item in dstc6_data]

Some weights of BertModel were not initialized from the model checkpoint at embedding_models/dstc6_am/ and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Embed every response with the BERT encoder, one text per forward pass, and
# keep the pooled output as a 1-D NumPy vector.
#
# Inputs are truncated to the number of position embeddings the model has;
# without this, a text that tokenizes to more positions than the model
# supports makes the forward pass fail. Shorter inputs are unaffected.
bert_max_tokens = bert_model.config.max_position_embeddings

response_embedding_list = []
with torch.no_grad():  # inference only; entered once instead of once per item
    for response_text in tqdm(response_list):
        inputs = bert_tokenizer(response_text, return_tensors="pt",
                                truncation=True, max_length=bert_max_tokens)
        outputs = bert_model(**inputs)
        # NOTE(review): the checkpoint load warning lists the pooler weights
        # as newly initialised, so pooler_output goes through an untrained
        # layer. Left as is so it stays comparable with the reference
        # embeddings; change both cells together if this is revisited.
        pooler_output = outputs.pooler_output
        # Batch size is 1, so squeeze() drops the leading batch axis.
        response_embedding_list.append(pooler_output.numpy().squeeze())

In [8]:
# Embed every reference with the BERT encoder, one text per forward pass, and
# keep the pooled output as a 1-D NumPy vector.
#
# Inputs are truncated to the number of position embeddings the model has;
# without this, a text that tokenizes to more positions than the model
# supports makes the forward pass fail. Shorter inputs are unaffected.
bert_max_tokens = bert_model.config.max_position_embeddings

reference_embedding_list = []
with torch.no_grad():  # inference only; entered once instead of once per item
    for reference_text in tqdm(reference_list):
        inputs = bert_tokenizer(reference_text, return_tensors="pt",
                                truncation=True, max_length=bert_max_tokens)
        outputs = bert_model(**inputs)
        # NOTE(review): the checkpoint load warning lists the pooler weights
        # as newly initialised, so pooler_output goes through an untrained
        # layer. Left as is so it stays comparable with the response
        # embeddings; change both cells together if this is revisited.
        pooler_output = outputs.pooler_output
        # Batch size is 1, so squeeze() drops the leading batch axis.
        reference_embedding_list.append(pooler_output.numpy().squeeze())

In [None]:
# AM score: cosine similarity between each reference embedding and the
# response embedding at the same index.
#
# Only the paired (i, i) similarities are needed, so they are computed
# row by row instead of building the full N x N similarity matrix and
# reading its diagonal; that keeps time and memory linear in N.
reference_matrix = np.stack(reference_embedding_list)
response_matrix = np.stack(response_embedding_list)

reference_norms = np.linalg.norm(reference_matrix, axis=1)
response_norms = np.linalg.norm(response_matrix, axis=1)
# An all-zero embedding would otherwise divide by zero; using a norm of 1
# gives such rows a similarity of 0.
reference_norms[reference_norms == 0] = 1.0
response_norms[response_norms == 0] = 1.0

# einsum('ij,ij->i') is the per-row dot product. It raises if the two
# matrices do not have the same shape, i.e. if the lists are misaligned.
am_scores = (np.einsum('ij,ij->i', reference_matrix, response_matrix)
             / (reference_norms * response_norms))

In [50]:
# Load the tokenizer and language model used for the fluency (FM) score;
# both come from the same local checkpoint directory.
fm_checkpoint_dir = 'language_models/dstc6_fm'
gpt2_tokenizer, gpt2_model = (
    GPT2Tokenizer.from_pretrained(fm_checkpoint_dir),
    GPT2LMHeadModel.from_pretrained(fm_checkpoint_dir),
)

In [54]:
# Put the GPT-2 language model into evaluation mode before scoring
# (this is what makes the Dropout layers listed in the module repr inactive).
# eval() is the last expression of the cell, so the module it returns is
# rendered as the cell output.
gpt2_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

In [None]:
# Language-model score of every response: the negated loss GPT-2 returns
# when the token sequence is used as both input and labels (higher means
# the model finds the text more likely).
#
# The token ids are cut so that, together with the appended id, they never
# exceed the number of positions the model has; longer inputs would
# otherwise make the forward pass fail. Shorter inputs are unaffected.
gpt2_max_tokens = gpt2_model.config.n_positions

response_fm_score_list = []
with torch.no_grad():
    for response_text in tqdm(response_list):
        token_ids = gpt2_tokenizer.encode(str(response_text))[:gpt2_max_tokens - 1]
        # 50256 is appended to every sequence — presumably GPT-2's
        # end-of-text id (last index of the 50257-entry vocabulary); confirm
        # against gpt2_tokenizer.eos_token_id.
        token_ids = token_ids + [50256]
        batch = torch.tensor([token_ids])
        # average -logp
        # NOTE(review): a text that tokenizes to nothing leaves a single
        # token and no next-token target, so the loss is likely NaN — confirm.
        loss = gpt2_model(batch, labels=batch)[0]
        response_fm_score_list.append(-1 * loss.item())

In [None]:
# Language-model score of every reference: the negated loss GPT-2 returns
# when the token sequence is used as both input and labels (higher means
# the model finds the text more likely).
#
# The token ids are cut so that, together with the appended id, they never
# exceed the number of positions the model has; longer inputs would
# otherwise make the forward pass fail. Shorter inputs are unaffected.
gpt2_max_tokens = gpt2_model.config.n_positions

reference_fm_score_list = []
# NOTE(review): these three names are not read anywhere in this cell; they
# are kept only in case another cell relies on them. Remove if unused.
nb_steps, eval_loss, exp_average_loss = 0, 0, None
with torch.no_grad():
    for reference_text in tqdm(reference_list):
        token_ids = gpt2_tokenizer.encode(str(reference_text))[:gpt2_max_tokens - 1]
        # 50256 is appended to every sequence — presumably GPT-2's
        # end-of-text id (last index of the 50257-entry vocabulary); confirm
        # against gpt2_tokenizer.eos_token_id.
        token_ids = token_ids + [50256]
        batch = torch.tensor([token_ids])
        # average -logp
        # NOTE(review): a text that tokenizes to nothing leaves a single
        # token and no next-token target, so the loss is likely NaN — confirm.
        loss = gpt2_model(batch, labels=batch)[0]
        reference_fm_score_list.append(-1 * loss.item())

In [61]:
def compute_fm_score(x, y):
    """Return the larger of ``x`` and ``y`` divided by the smaller.

    The result does not depend on argument order. For two negative inputs
    (such as negated losses) the quotient lies in (0, 1]; for two positive
    inputs it is >= 1.

    Raises:
        ZeroDivisionError: if the smaller value is 0.
    """
    # Start from x and switch to y only on a strict comparison, so ties
    # resolve to x for both the numerator and the denominator.
    numerator = y if y > x else x
    denominator = y if y < x else x
    return numerator / denominator

In [62]:
# Pair each response score with the reference score at the same position and
# turn the pair into a single FM score; pairing stops at the shorter list.
fm_score_list = list(map(compute_fm_score, response_fm_score_list, reference_fm_score_list))

NameError: name 'reference_fm_score_list' is not defined