<a href="https://colab.research.google.com/github/deniskapel/autoskill/blob/main/rpt_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1**. Get the latest Hugging Face Transformers (`pip install` may NOT give you the latest version)

In [None]:
# Clone the Transformers repository and install it from source in editable mode.
!git clone https://github.com/huggingface/transformers.git
%cd transformers
!pip install -e .
# The working directory is left at transformers/src; the relative data paths
# used by later cells ('../../data/...') are resolved from there.
%cd src

**Step 2**. Set up the `DialogRPT` models using their Hugging Face model cards

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Hugging Face model cards of the DialogRPT rankers that get loaded below.
cards = [
    "microsoft/DialogRPT-updown",
    "microsoft/DialogRPT-depth",
    "microsoft/DialogRPT-width",
    "microsoft/DialogRPT-human-vs-rand",
    "microsoft/DialogRPT-human-vs-machine",
]

# One tokenizer and one sequence-classification model per card, in the same
# order as `cards`; every model is moved to `device`.
tokenizers = list(map(AutoTokenizer.from_pretrained, cards))
models = [
    AutoModelForSequenceClassification.from_pretrained(card_name).to(device)
    for card_name in cards
]

## testing

In [None]:
from torch.nn.functional import pad

def score(cxt, hyp, tknzr, ranker):
    """Score a single response against its context with one ranker.

    Args:
        cxt: Context string.
        hyp: Candidate response string.
        tknzr: Tokenizer; ``tknzr.encode(text, return_tensors="pt")`` must
            return a tensor of token ids.
        ranker: ``torch.nn.Module`` called as ``ranker(ids, return_dict=True)``
            whose result exposes ``logits``.

    Returns:
        Tensor with the sigmoid of the ranker's logits.
    """
    # Context and response are joined by the "<|endoftext|>" separator.
    model_input = tknzr.encode(cxt + "<|endoftext|>" + hyp, return_tensors="pt")
    # The ids are created on the CPU; move them to wherever the ranker's
    # weights live so the call also works when the model sits on a GPU.
    model_input = model_input.to(next(ranker.parameters()).device)

    ranker.eval()
    with torch.no_grad():  # inference only, no autograd graph needed
        result = ranker(model_input, return_dict=True)

    return torch.sigmoid(result.logits)

def score_as_batch(cxts, hyps, tknzr, ranker):
    """Score several (context, response) pairs with one ranker in a single pass.

    Args:
        cxts: Iterable of context strings.
        hyps: Iterable of response strings, paired with ``cxts`` via ``zip``.
        tknzr: Tokenizer; ``tknzr.encode(...)`` is called with
            ``padding="max_length"``, ``max_length=64`` and
            ``return_tensors="pt"`` and must return a tensor of token ids.
        ranker: ``torch.nn.Module`` called as ``ranker(ids, return_dict=True)``
            whose result exposes ``logits``.

    Returns:
        Tensor with the sigmoid of the ranker's logits for the whole batch.
    """
    # NOTE(review): no truncation is requested, so a pair that encodes to more
    # than 64 tokens presumably yields a wider row and would break the
    # concatenation below — confirm against the tokenizer in use.
    model_inputs = [
        tknzr.encode(cxt + "<|endoftext|>" + hyp, padding="max_length", max_length=64, return_tensors="pt")
        for cxt, hyp in zip(cxts, hyps)
    ]

    # Stack the encoded rows into one batch and move it to the ranker's device
    # (the ids are created on the CPU, the model may live on a GPU).
    model_input = torch.cat(model_inputs, dim=0).to(next(ranker.parameters()).device)

    ranker.eval()
    with torch.no_grad():  # inference only, no autograd graph needed
        result = ranker(model_input, return_dict=True)

    return torch.sigmoid(result.logits)

**Step 3**. Play!

In the following example, the model predicts that, given the same context "I love NLP!", response B gets more upvotes than response A.

|  | Response of "I love NLP!"  | Score |
| :-----------: | :----------- | :-----------: |
|  A |  Me too! | 0.111|
|  B |  Here’s a free textbook (URL) in case anyone needs it. | 0.613|


In [None]:
cxt = "I love NLP!"

hyp_A = "Me too!"
hyp_B = "Here’s a free textbook (URL) in case anyone needs it."
hyp_C = "I studied it in college"

# `tokenizers` and `models` follow the order of `cards`: index 0 belongs to
# "microsoft/DialogRPT-updown", index 1 to "microsoft/DialogRPT-depth".
# NOTE(review): this cell used to reference tokenizer1/model1 and
# tokenizer2/model2, which no visible cell defines (NameError on a fresh
# kernel). Mapping them to the first two cards is an assumption — confirm
# which rankers were intended.
print(score_as_batch([cxt, cxt, cxt], [hyp_A, hyp_B, hyp_C], tokenizers[0], models[0]))
print(score_as_batch([cxt, cxt, cxt], [hyp_A, hyp_B, hyp_C], tokenizers[1], models[1]))

tensor([[0.1252],
        [0.6395],
        [0.2652]])
tensor([[0.5964],
        [0.8397],
        [0.9092]])


In [None]:
from torch.nn.functional import pad

def score(cxt, hyp, tknzr, ranker):
    """Score a single response against its context with one ranker.

    Args:
        cxt: Context string.
        hyp: Candidate response string.
        tknzr: Tokenizer; ``tknzr.encode(text, return_tensors="pt")`` must
            return a tensor of token ids.
        ranker: ``torch.nn.Module`` called as ``ranker(ids, return_dict=True)``
            whose result exposes ``logits``.

    Returns:
        Tensor with the sigmoid of the ranker's logits.
    """
    # Context and response are joined by the "<|endoftext|>" separator.
    model_input = tknzr.encode(cxt + "<|endoftext|>" + hyp, return_tensors="pt")
    # The ids are created on the CPU; move them to wherever the ranker's
    # weights live so the call also works when the model sits on a GPU.
    model_input = model_input.to(next(ranker.parameters()).device)

    # Pure inference: switch to eval mode and skip building an autograd graph.
    ranker.eval()
    with torch.no_grad():
        result = ranker(model_input, return_dict=True)

    return torch.sigmoid(result.logits)

def score_as_batch(cxts, hyps, tknzr, ranker):
    """Score several (context, response) pairs with one ranker in a single pass.

    Args:
        cxts: Iterable of context strings.
        hyps: Iterable of response strings, paired with ``cxts`` via ``zip``.
        tknzr: Tokenizer; ``tknzr.encode(...)`` is called with
            ``padding="max_length"``, ``max_length=64`` and
            ``return_tensors="pt"`` and must return a tensor of token ids.
        ranker: ``torch.nn.Module`` called as ``ranker(ids, return_dict=True)``
            whose result exposes ``logits``.

    Returns:
        Tensor with the sigmoid of the ranker's logits for the whole batch.
    """
    # NOTE(review): no truncation is requested, so a pair that encodes to more
    # than 64 tokens presumably yields a wider row and would break the
    # concatenation below — confirm against the tokenizer in use.
    model_inputs = [
        tknzr.encode(cxt + "<|endoftext|>" + hyp, padding="max_length", max_length=64, return_tensors="pt")
        for cxt, hyp in zip(cxts, hyps)
    ]

    # Stack the encoded rows into one batch and move it to the ranker's device
    # (the ids are created on the CPU, the model may live on a GPU).
    model_input = torch.cat(model_inputs, dim=0).to(next(ranker.parameters()).device)

    ranker.eval()
    with torch.no_grad():  # inference only, no autograd graph needed
        result = ranker(model_input, return_dict=True)

    return torch.sigmoid(result.logits)

## Data

In [None]:
import json
from tqdm.notebook import tqdm

# Read the merged predictions from a JSON file; the path is relative to the
# current working directory.
with open('../../data/merged_preds.json', mode='r', encoding='utf8') as f:
    predictions = json.loads(f.read())

In [None]:
def score(cxt, hyp, tknzr, ranker):
    """Score a single response against its context and return a plain float.

    Args:
        cxt: Context string.
        hyp: Candidate response string.
        tknzr: Tokenizer; ``tknzr.encode(text, return_tensors="pt")`` must
            return a tensor of token ids.
        ranker: ``torch.nn.Module`` called as ``ranker(ids, return_dict=True)``
            whose result exposes ``logits`` holding a single value.

    Returns:
        The sigmoid of the ranker's single logit, as a Python float.
    """
    # Context and response are joined by the "<|endoftext|>" separator.
    model_input = tknzr.encode(cxt + "<|endoftext|>" + hyp, return_tensors="pt")
    # Send the ids to the device the ranker's weights are on.
    model_input = model_input.to(next(ranker.parameters()).device)

    # Pure inference: eval mode and no autograd graph, so scoring many samples
    # does not pay for gradient tracking.
    ranker.eval()
    with torch.no_grad():
        result = ranker(model_input, return_dict=True)

    # .item() copies the single value to the host as a Python float.
    return torch.sigmoid(result.logits).squeeze().item()

scores = []  # NOTE(review): not referenced by any visible cell (results are collected in `scored_preds`) — confirm it is still needed

In [None]:
# Every entry of `cards` starts with this prefix; the remainder ("updown",
# "depth", ...) is used as the ranker's key in the output.
CARD_PREFIX = "microsoft/DialogRPT-"

# One dict per sample: {'context': str, 'preds': ..., 'scores': {ranker: {algo: score}}}
scored_preds = []

for sample in tqdm(predictions):
    # Replace every ' __eou__ ' marker in the context with ' [SEP] '.
    ctx = sample['context'].replace(' __eou__ ', ' [SEP] ')
    # Mapping iterated below as (algo, hyp) pairs.
    preds = sample['preds']

    scored = {'context': ctx,
              'preds': preds,
              'scores': {}
              }

    for name, tokenizer, model in zip(cards, tokenizers, models):
        # Strip the shared card prefix explicitly instead of slicing by a magic offset.
        name = name[len(CARD_PREFIX):] if name.startswith(CARD_PREFIX) else name
        scored['scores'][name] = {}
        # Hypothesis text -> rounded score for this (sample, ranker) pair, so a
        # response produced by several algorithms is only scored once.
        evaluated = {}
        for algo, hyp in preds.items():
            if hyp not in evaluated:
                evaluated[hyp] = round(score(ctx, hyp, tokenizer, model), 3)
            scored['scores'][name][algo] = evaluated[hyp]

    scored_preds.append(scored)

  0%|          | 0/5206 [00:00<?, ?it/s]

In [None]:
# Show the scores of the first sample (a dict keyed by ranker name, then by algorithm).
# A single value can be read with e.g. scored_preds[0]['scores']['depth']['algo_convert']
# (assuming that algorithm key is present in the data).
scored_preds[0]['scores']

In [None]:
# Write the scored predictions as indented JSON, leaving non-ASCII characters
# unescaped; the path is relative to the current working directory.
with open('../../data/scored_merged_preds.json', mode="w", encoding="UTF-8") as f:
    json.dump(scored_preds, fp=f, indent=2, ensure_ascii=False)

asd