In [1]:
import warnings
warnings.filterwarnings('ignore')


def read_json_file(file_path):
    """
    Reads a JSON file with multiple JSON objects (one per line) and returns the data as a list of dictionaries.
    
    Args:
        file_path (str): The path to the JSON file.
    
    Returns:
        list: A list of dictionaries containing the data from the JSON file.
    """
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    json_obj = json.loads(line)
                    data.append(json_obj)
                except json.JSONDecodeError as e:
                    print(f"JSONDecodeError in line: {line.strip()}")
                    print(f"Error message: {e}")
        return data
    except FileNotFoundError:
        print(f"The file at {file_path} does not exist.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [41]:
def extract_data_without_tag(prompt):
    prompts = prompt['prompt']

    last_utter = f"{prompt['last_speaker']} :" 

    dialogue = prompts + last_utter
    
    return dialogue, prompt['last_speaker'], prompt['answer']
    

def extract_data_with_tag(prompt):
    prompts = prompt['prompt']

    last_utter = f"{prompt['last_speaker']} : ({prompt['gold_tag']})" 

    dialogue = prompts + last_utter
    
    return dialogue, prompt['last_speaker'], prompt['gold_tag'], prompt['answer']

def distinct_ngrams(sentences, n):
    """
    Calculate the distinct-n metric for a list of sentences.
    
    Args:
        sentences (list of str): The list of sentences generated by the model.
        n (int): The n-gram length.

    Returns:
        float: The distinct-n score.
    """
    ngrams = Counter()
    total_ngrams = 0

    for sentence in sentences:
        tokens = sentence.split()
        sentence_ngrams = zip(*[tokens[i:] for i in range(n)])
        ngrams.update(sentence_ngrams)
        total_ngrams += len(tokens) - n + 1
    
    return len(ngrams) / total_ngrams if total_ngrams > 0 else 0


def get_ppl(text ,model, tokenizer):
    encodings = tokenizer(text, return_tensors="pt")

    max_length = model.config.max_position_embeddings
    stride = 512
    seq_len = encodings.input_ids.size(1)
    
    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100
    
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
    
            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1. 
            neg_log_likelihood = outputs.loss
    
        nlls.append(neg_log_likelihood)
    
        prev_end_loc = end_loc
        if end_loc == seq_len:
            break
    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl    


In [53]:
import os,sys

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

import re
import torch
import evaluate
from evaluate import load
current_dir = os.getcwd()
episode_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(episode_dir)

from utils.model_utils import get_peft_checkpoint, generate, get_peft_checkpoint_
from tqdm import tqdm


from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer)

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

from collections import Counter
from accelerate import Accelerator


def evaluation_chat_system(num, prompt , model, tokenizer, device, bert_eval,rough_eval,bleu_eval):

    
    if num == 2:
        print("This is a wo tag evaluation")
        print(prompt)
        input__, person, utterance,  = extract_data_without_tag(prompt)
        print(f'Last word -> {person} : "{utterance}"')
        
        input_ = tokenizer(input__, return_tensors = 'pt').to(device)     
        output = generate(model,tokenizer,
                                      input_,
                                      num_beams=1,
                                      num_return_sequences=1,
                                      max_new_tokens=100)


        
        response = output.replace(input__, '')
        response = response.split("\n")[0]
        print(f"prediction : {response}")
        print(f"Real answer : {utterance}")

        reference = [utterance.split()]
        candidate = response.split()

            
        output_list = [response.strip()]
        last_utter_list = [utterance.strip()]
        #evalation
        bert_score = bert_eval.compute(predictions=output_list, references=last_utter_list, lang="en")
        rouge_score = rouge_eval.compute(predictions=output_list, references=last_utter_list)

        ## bleu

        weights_unigram = (1, 0, 0, 0)
        bleu_unigram = sentence_bleu(reference, candidate, weights=weights_unigram, smoothing_function=SmoothingFunction().method1)

        weights_bigram = (0.5, 0.5, 0, 0)
        bleu_bigram = sentence_bleu(reference, candidate, weights=weights_bigram, smoothing_function=SmoothingFunction().method1)

        ### ppl
        ppl = get_ppl(response, model, tokenizer)
        
        print(f"Bert Score : {bert_score}")
        print(f"Rouge Score : {rouge_score}")
        print(f"bleu 1/2 : {bleu_unigram} {bleu_bigram}")
        print(f"ppl : {ppl}")
        return bert_score, rouge_score, bleu_unigram, bleu_bigram, response ,ppl
    
    
    if num == 3:
        print("This is with tag evaluation")
        print(prompt)
        input__ , person, trait, utterance = extract_data_tag(prompt)
        print(f'Last word -> {person} : ({trait}) "{utterance}"')
        input_ = tokenizer(input__, return_tensors = 'pt').to(device)
     
        output = generate(model,tokenizer,
                                      input_,
                                      num_beams=1,
                                      num_return_sequences=1,
                                      max_new_tokens=100)


        print(input__)

        print(output)

        
        response = output.replace(input__, '')
        print(response)
        
        response = response.split("\n")[0]
        print(response)

        assert False
        print(f"prediction : {response}")
        print(f"Real answer : {utterance}")
        
            
        output_list = [response.strip()]
        last_utter_list = [utterance.strip()]
        bert_score = bert_eval.compute(predictions=output_list, references=last_utter_list, lang="en")
        rouge_score = rouge_eval.compute(predictions=output_list, references=last_utter_list)

        reference = [utterance.split()]
        candidate = response.split()

        
        weights_unigram = (1, 0, 0, 0)
        bleu_unigram = sentence_bleu(reference, candidate, weights=weights_unigram, smoothing_function=SmoothingFunction().method1)

        weights_bigram = (0.5, 0.5, 0, 0)
        bleu_bigram = sentence_bleu(reference, candidate, weights=weights_bigram, smoothing_function=SmoothingFunction().method1)
        
        ppl = get_ppl(response, model, tokenizer)
        infer = utterance
        print(f"Bert Score : {bert_score}")
        print(f"Rouge Score : {rouge_score}")
        print(f"bleu Score : {bleu_score}")
        print(f"ppl : {ppl}")
        
        return bert_score, rouge_score, bleu_score, infer, ppl

In [6]:
path = '/home/chanho/Model/SHARE/Refactorizing/result/model_save/gemma with tag'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model, tokenizer = get_peft_checkpoint(path, device)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
import json

In [32]:
path = '/home/chanho/Model/SHARE/Refactorizing/result/dataset/test_without_tag.json'
json_data = read_json_file(path)

In [33]:
json_data[0]

{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLOLA: Hello, Mr. Neff. It's me.\nNEFF: Something the matter?\nLOLA: I've been waiting for you.\nNEFF: For me? What for?\nLOLA: I thought you could let me ride with you, if you're going my way.\n\n",
 'answer': "Which way would that be? Oh, sure. Vermont and Franklin. North- west corner, wasn't it? Be glad to, Miss Dietrichson.",
 'gold_tag': 'NEFF is familiar with the local geographic area , NEFF references specific streets',
 'last_speaker': 'NEFF'}

In [54]:
bert = []
rough = []
bleu_1_list = []
bleu_2_list = []
infer = []
ppl_list = []

In [55]:
bertscore_eval = load("bertscore")
rouge_eval = evaluate.load('rouge')
bleu_eval = evaluate.load("bleu")

for prompt in json_data:
    bert_score, rough_score, bleu_1, bleu_2, infer_sentence, ppl = evaluation_chat_system(2, prompt ,model, tokenizer,device, bertscore_eval, rouge_eval, bleu_eval)
    bert.append(bert_score)
    rough.append(rough_score)
    bleu_1_list.append(bleu_1)
    bleu_2_list.append(bleu_2)
    infer.append(infer_sentence)
    ppl_list.append(ppl)

This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLOLA: Hello, Mr. Neff. It's me.\nNEFF: Something the matter?\nLOLA: I've been waiting for you.\nNEFF: For me? What for?\nLOLA: I thought you could let me ride with you, if you're going my way.\n\n", 'answer': "Which way would that be? Oh, sure. Vermont and Franklin. North- west corner, wasn't it? Be glad to, Miss Dietrichson.", 'gold_tag': 'NEFF is familiar with the local geographic area , NEFF references specific streets', 'last_speaker': 'NEFF'}
Last word -> NEFF : "Which way would that be? Oh, sure. Vermont and Franklin. North- west corner, wasn't it? Be glad to, Miss Dietrichson."
prediction :  Okay!
Real answer : Which way would that be? Oh, sure. Vermont and Franklin. North- west corner, wasn't it? Be glad to, Miss Dietrichson.

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8356287479400635], 'recall': [0.7825762033462524], 'f1': [0.808232843875885], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 11553.6015625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNEFF: Roller skating, eh? You like roller skating?\nLOLA: I can take it or leave it.\nNEFF: Only tonight you're leaving it?\nLOLA: Yes, I am. You see, Mr. Neff, I'm having a very tough time at home. My father doesn't understand me and Phyllis hates me.\nNEFF: That does sound tough, all right.\nLOLA: That's why I have to lie sometimes.\nNEFF: You mean it's not Vermont and Franklin.\nLOLA: It's Vermont and Franklin all right. Only it's not An

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8692246675491333], 'recall': [0.8588073253631592], 'f1': [0.8639845848083496], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07692307692307691, 'rouge2': 0.0, 'rougeL': 0.07692307692307691, 'rougeLsum': 0.07692307692307691}
bleu 1/2 : 0 0
ppl : 16.772769927978516
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLOLA: Hello, Mr. Neff.\nNEFF: Hello.\nLOLA: Lola Dietrichson. Don't you remember me?\nNEFF: Yes. Of course.\nLOLA: Could I talk to you, just for a few minutes? Somewhere where we can be alone?\n\n", 'answer': 'Sure. Come on into my office.', 'gold_tag': 'NEFF has an office , NEFF implies a professional occupation', 'last_speaker': 'NEFF'}
Last word -> NEFF : "Sure. Come on into my office

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8639878034591675], 'recall': [0.8563694953918457], 'f1': [0.8601617813110352], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 395.3274230957031
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNEFF: Is it something to do with -- what happened?\nLOLA: Yes, Mr. Neff. It's about my father's death.\n\n", 'answer': "I'm terribly sorry, Miss Dietrichson.", 'gold_tag': "NEFF is aware of Lola's father's death , NEFF is empathetic about it", 'last_speaker': 'NEFF'}
Last word -> NEFF : "I'm terribly sorry, Miss Dietrichson."
prediction :  My father was murdered by some bad man on his way to work this morning.
Real answer : I'm terribl

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8274996280670166], 'recall': [0.840857744216919], 'f1': [0.8341252207756042], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 35.40941619873047
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNEFF: Why are you crying? You won't tell me?\nLOLA: Of course I will, Walter. I wouldn't tell anybody else but you. It's about Nino.\nNEFF: Zachetti? What about him?\nLOLA: They killed my father together. He and Phyllis. He helped her do it. I know he did.\nNEFF: What makes you say that?\nLOLA: I've been following him. He's at her house, night after night. It was Phyllis and him all the time. Maybe he was going with me just for a blind.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8634580373764038], 'recall': [0.8603291511535645], 'f1': [0.8618907332420349], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1100.4786376953125
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROSE: Our son just called me a bitch.\n\n', 'answer': "You're not a bitch.", 'gold_tag': 'EDDIE is a supportive husband , EDDIE is a supportive father', 'last_speaker': 'EDDIE'}
Last word -> EDDIE : "You're not a bitch."
prediction :  No. Your mother said she hates you too, for being such a bitch! What are you gonna do?
Real answer : You're not a bitch.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8449645638465881], 'recall': [0.8823566436767578], 'f1': [0.8632559180259705], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2608695652173913, 'rouge2': 0.09523809523809523, 'rougeL': 0.2608695652173913, 'rougeLsum': 0.2608695652173913}
bleu 1/2 : 0.05555555555555554 0.018077538151554672
ppl : 44.035003662109375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROSE: So let me get this straight. No airline will claim ownership of the engine. So we have to wait for the FAA to decide who fixes my roof. Fuck that. We're taking the money out of savings.\n\n", 'answer': '(quoting Rod Serling) You are entering a new dimension of sight and sound...', 'gold_tag': 'EDDIE is familiar with the work of Rod 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.828396201133728], 'recall': [0.8411673903465271], 'f1': [0.8347329497337341], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06666666666666667, 'rouge2': 0.0, 'rougeL': 0.06666666666666667, 'rougeLsum': 0.06666666666666667}
bleu 1/2 : 0.0625 0.02041241452319315
ppl : 32.58978271484375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROSE: He's too old to be behaving this way.\n\n", 'answer': 'Oh, I say we buy him a moped.', 'gold_tag': 'EDDIE suggests buying a moped , EDDIE has a pragmatic, solution-oriented attitude', 'last_speaker': 'EDDIE'}
Last word -> EDDIE : "Oh, I say we buy him a moped."
prediction :  [Chuckling] Oh, man!
Real answer : Oh, I say we buy him a moped.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8216872215270996], 'recall': [0.8512065410614014], 'f1': [0.836186408996582], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.18181818181818182, 'rouge2': 0.0, 'rougeL': 0.18181818181818182, 'rougeLsum': 0.18181818181818182}
bleu 1/2 : 0.06295853427918728 0.024383735476492117
ppl : 109.24520111083984
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWERNER: You've been here since you were a baby -- you know nothing of America --\nJULIUS: I know it is the cradle of democracy and the land of the free; besides, I speak twelve languages -- I'm sure I can get a job.\nWERNER: They're a simple people; rather primitive, not like us.\n\n", 'answer': 'My brother will look after me.', 'gold_tag': 'JULIUS bel

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8454692363739014], 'recall': [0.8642604351043701], 'f1': [0.8547616004943848], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 80.92160034179688
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJULIUS: It's your only copy --\nWERNER: -- My need for it is only sentimental -- yours may be practical --\nJULIUS: Thank you -- I'll miss you.\n\n", 'answer': "I'll miss you.", 'gold_tag': 'Everyday Language', 'last_speaker': 'WERNER'}
Last word -> WERNER : "I'll miss you."
prediction :  I'll see you around -- 1459036736103.
Real answer : I'll miss you.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8243613243103027], 'recall': [0.9198824167251587], 'f1': [0.8695063591003418], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.6, 'rouge2': 0.25, 'rougeL': 0.6, 'rougeLsum': 0.6}
bleu 1/2 : 0.16666666666666669 0.05773502691896258
ppl : 34.73563003540039
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWERNER: And...?\nJULIUS: Well, if a woman loved my brother enough to marry him, she might have a sister who'd feel the same way about me. That often happens with twins, you know.\nWERNER: Julius -- please -- don't get your hopes up. Your brother doesn't even know you exist.\n\n", 'answer': "Of course he does. I've always known there was something missing in my life! And deep down, I'm sure Vincent f

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8289239406585693], 'recall': [0.8258674144744873], 'f1': [0.8273928761482239], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05, 'rouge2': 0.0, 'rougeL': 0.05, 'rougeLsum': 0.05}
bleu 1/2 : 0.012040891469099537 0.004038637775190581
ppl : 125.05367279052734
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBESS: Whatever is the matter with you? Your behaviour is so out of the ordinary.\n\n', 'answer': 'I just feel like keeping to myself today...', 'gold_tag': 'GEORGIANA is introverted , GEORGIANA prefers solitude at times', 'last_speaker': 'GEORGIANA'}
Last word -> GEORGIANA : "I just feel like keeping to myself today..."
prediction :  Your mother. I just need my mother. I can barely feel her whe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8610793948173523], 'recall': [0.8483717441558838], 'f1': [0.8546782732009888], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.23076923076923078, 'rouge2': 0.08333333333333333, 'rougeL': 0.23076923076923078, 'rougeLsum': 0.23076923076923078}
bleu 1/2 : 0.16666666666666669 0.0990147542976674
ppl : 37.88048553466797
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGEORGIANA: I may not have the authority to remove you from this house, but I can at least order you out of my room.\nBESS: Won't you please let me explain?\nGEORGIANA: There is nothing to explain. I trusted you, I made you my confidante, and you repaid me by stealing what is mine.\nBESS: This is my only chance of ever seeing my children a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8393616676330566], 'recall': [0.8305653929710388], 'f1': [0.8349403142929077], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 261.1163635253906
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGEORGIANA: Thank you for coming.\nBESS: I couldn’t not be with you.\nGEORGIANA: I count it a triumph we have become friends again when fate has been so intent on keeping us rivals.\nBESS: So do I. The Duke is for my boys only. You are for me.\nGEORGIANA: Bess, how will I do this?\n\n', 'answer': 'For Charlotte, for Harryo, for little G, for Hart...', 'gold_tag': "BESS has children she refers to as 'my boys' , GEORGIANA is a mother to C

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8678241968154907], 'recall': [0.7937864661216736], 'f1': [0.8291558623313904], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 45.04303741455078
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDR. SHIRLEY: What are you doing?\nLIP: I gotta take a leak.\nDR. SHIRLEY: Here? Now?\n\n', 'answer': 'What, you want me to piss my pants?', 'gold_tag': 'Everyday Language', 'last_speaker': 'LIP'}
Last word -> LIP : "What, you want me to piss my pants?"
prediction :  Well I want to piss.
Real answer : What, you want me to piss my pants?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.909238338470459], 'recall': [0.8633615970611572], 'f1': [0.8857063055038452], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.4615384615384615, 'rouge2': 0.18181818181818182, 'rougeL': 0.4615384615384615, 'rougeLsum': 0.4615384615384615}
bleu 1/2 : 0.21952465443761057 0.05488116360940265
ppl : 940.7593383789062
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLIP: What do you want?\n\n', 'answer': 'I’m fine. Thank you.', 'gold_tag': 'Everyday Language', 'last_speaker': 'DR. SHIRLEY'}
Last word -> DR. SHIRLEY : "I’m fine. Thank you."
prediction :  I want to become a great researcher
Real answer : I’m fine. Thank you.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8606492877006531], 'recall': [0.8667268753051758], 'f1': [0.8636773824691772], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 77.50735473632812
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLIP: This can’t be it. Says right here......cozy as your own home... This place looks like my ass.\nDR. SHIRLEY: This is the place.\n\n', 'answer': 'If you need anything, I’ll be up the street at the Easton Inn. So...see you tomorrow.', 'gold_tag': 'LIP offers to be available for DR. SHIRLEY if needed , LIP will be at the Easton Inn up the street , LIP plans to see DR. SHIRLEY the followi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8062158226966858], 'recall': [0.8271626830101013], 'f1': [0.8165549635887146], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16, 'rouge2': 0.0, 'rougeL': 0.16, 'rougeLsum': 0.16}
bleu 1/2 : 0.07898658475130411 0.01907707262543283
ppl : 41.819915771484375
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLIP: What the hell’s wrong with you?! You go in there alone?\nDR. SHIRLEY: I apologize for putting you in that position, Tony.\nLIP: I don’t understand you, Doc, honest to God. Why couldn’t you just drink here--you got a whole bottle?\nDR. SHIRLEY: I needed some air.\nLIP: Air?! Don’t you know where you are?\nDR. SHIRLEY: Does the geography really matter?\nLIP: What?\nDR. SHIRLEY: If I walked int

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8701955676078796], 'recall': [0.8537177443504333], 'f1': [0.8618779182434082], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16216216216216214, 'rouge2': 0.0, 'rougeL': 0.10810810810810811, 'rougeLsum': 0.10810810810810811}
bleu 1/2 : 0.055156056411537216 0.018013892590990767
ppl : 13.426088333129883
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLIP: “Betty bought... a bit of buttah... but she found the buttah bittah...”\nDR. SHIRLEY: Not buttah... butter. Say the “er.”\nLIP: Er.\nDR. SHIRLEY: “So Betty bought a bit of better butter to make the bitter butter better...”\nLIP: “So Betty bit a buttah...”\nDR. SHIRLEY: Don’t be lazy--enunciate. “So Betty bought a bit of better butter...”\nLIP: “

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8696684837341309], 'recall': [0.8654886484146118], 'f1': [0.8675735592842102], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 43.65284729003906
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDR. SHIRLEY: They were wrong for the way they treated me, and you rewarded them.\nLIP: I was hired to make sure you get from one show to the next. How I do it shouldn’t matter to you.\nDR. SHIRLEY: I just wish you hadn’t paid them off.\nLIP: I did what I had to do. You know, if this got out it would kill your career.\nDR. SHIRLEY: Okay, Tony, quit your phony altruism and concern for my career.\nLIP: What the hell does that mean?\nDR. S

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8305857181549072], 'recall': [0.864684522151947], 'f1': [0.8472921252250671], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 70.15714263916016
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLIP: Should I try to find you some make- up or somethin’ before the show?\nDR. SHIRLEY: I’m fine.\nLIP: You sure?\n\n', 'answer': 'I said I’m fine.', 'gold_tag': 'DR. SHIRLEY expresses that he is fine even without makeup before a show', 'last_speaker': 'DR. SHIRLEY'}
Last word -> DR. SHIRLEY : "I said I’m fine."
prediction :  No, you don’t have to. You always sound so sad to us.
Real answer : I said I’m fine.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8503289222717285], 'recall': [0.8884605169296265], 'f1': [0.8689765930175781], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 43.695186614990234
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLIP: So, how’d you learn how to play so good, Doc?\nDR. SHIRLEY: My mother. She taught me how to play on an old spinet. Soon as I could walk, we’d travel around the Florida panhandle and I’d put on little shows in parishes and halls. A man who had seen me play arranged for me to study at the Leningrad Conservatory of Music.\nLIP: So that’s where they taught you all them songs you play?\nDR. SHIRLEY: Actually, I was trained to play cla

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8349109888076782], 'recall': [0.8323906064033508], 'f1': [0.8336488604545593], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10526315789473682, 'rouge2': 0.0, 'rougeL': 0.10526315789473682, 'rougeLsum': 0.10526315789473682}
bleu 1/2 : 0 0
ppl : 62.833003997802734
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDR. SHIRLEY: I need sleep.\nLIP: Okay, I’ll pull over at the next place we see and I’ll sneak you into my room.\nDR. SHIRLEY: No. No. I refuse to stay at an establishment that doesn’t want me.\n\n', 'answer': 'Okay.', 'gold_tag': 'Everyday Language', 'last_speaker': 'LIP'}
Last word -> LIP : "Okay."
prediction :  Okay, okay, I’m going to get you a place to sleep. I’ll get it for you some

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8229926824569702], 'recall': [0.933280348777771], 'f1': [0.8746737241744995], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07407407407407407, 'rouge2': 0.0, 'rougeL': 0.07407407407407407, 'rougeLsum': 0.07407407407407407}
bleu 1/2 : 0 0
ppl : 16.316057205200195
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDR. SHIRLEY: If we leave right now, I think you can make it.\nLIP: Make what?\nDR. SHIRLEY: Christmas Eve.\nLIP: Don’t ever flash a wad of cash in a\n\n', 'answer': 'I knew you had a gun!', 'gold_tag': "DR. SHIRLEY is aware of LIP's concealed weapon , LIP carries a gun", 'last_speaker': 'DR. SHIRLEY'}
Last word -> DR. SHIRLEY : "I knew you had a gun!"
prediction :  Well, they do. You don’

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8422359228134155], 'recall': [0.8752570152282715], 'f1': [0.8584290146827698], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352942, 'rouge2': 0.0, 'rougeL': 0.11764705882352942, 'rougeLsum': 0.11764705882352942}
bleu 1/2 : 0 0
ppl : 44.16279983520508
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLIP: Doc, my eyes are stingin’, I might need to pull over.\n\n', 'answer': 'Keep going as long as you can,', 'gold_tag': 'Everyday Language', 'last_speaker': 'DR. SHIRLEY'}
Last word -> DR. SHIRLEY : "Keep going as long as you can,"
prediction :  Can we pull into the next entrance?
Real answer : Keep going as long as you can,


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8655704259872437], 'recall': [0.8120710849761963], 'f1': [0.8379676938056946], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14285714285714285, 'rouge2': 0.0, 'rougeL': 0.14285714285714285, 'rougeLsum': 0.14285714285714285}
bleu 1/2 : 0 0
ppl : 247.1802978515625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLIP: That’s it. I’m pulling us into the next motel.\nDR. SHIRLEY: Keep going, Tony, you can make it.\n\n', 'answer': 'I can’t keep my eyes open, Doc--I’m gettin’ hytnotized by the snow. I think my brain’s gonna explode.', 'gold_tag': 'LIP is willing to assert his boundaries and needs , LIP is extremely tired and struggling to stay awake', 'last_speaker': 'LIP'}
Last word -> LIP : "I can’t

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8872486352920532], 'recall': [0.8187161087989807], 'f1': [0.8516057729721069], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07142857142857144, 'rouge2': 0.0, 'rougeL': 0.07142857142857144, 'rougeLsum': 0.07142857142857144}
bleu 1/2 : 0 0
ppl : 55.29771423339844
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPAMELA: We need to get in there.\n\n', 'answer': "I'm working on it.", 'gold_tag': 'Everyday Language', 'last_speaker': 'CRONIN'}
Last word -> CRONIN : "I'm working on it."
prediction :  Yes.
Real answer : I'm working on it.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9478527307510376], 'recall': [0.8724187612533569], 'f1': [0.9085727334022522], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 2505.07666015625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCRONIN: Looks like he's been detained.\nPAMELA: Who's going? Us?\nCRONIN: There's only a Consulate, they sent a field officer out half an hour ago --\n\n", 'answer': "Then get a number, they need to know who they're dealing with.", 'gold_tag': 'PAMELA wants to acquire a number, suggesting there is an immediate need or urgency to identify the person they are dealing with', 'last_speaker': 'PAMELA'}
Last word -> PAMELA : "Then get a numbe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8590992093086243], 'recall': [0.8458447456359863], 'f1': [0.852420449256897], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1111111111111111, 'rouge2': 0.0, 'rougeL': 0.1111111111111111, 'rougeLsum': 0.1111111111111111}
bleu 1/2 : 0 0
ppl : 107.91692352294922
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCRONIN: -- Kurt's reopening all the wyfi and sat\n\n", 'answer': '-- uplink all relevant files to Kim -- -- and I want them to contact anyone who had anything to do with Treadstone --', 'gold_tag': 'PAMELA holds a senior position , PAMELA can command the uplinking of files , PAMELA is likely in intelligence or law enforcement , PAMELA can order others to contact individuals related to "Tread

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8149758577346802], 'recall': [0.8029505014419556], 'f1': [0.8089184761047363], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08333333333333334, 'rouge2': 0.0, 'rougeL': 0.08333333333333334, 'rougeLsum': 0.08333333333333334}
bleu 1/2 : 0.0033689734995427335 0.000869865217169542
ppl : 57.68941879272461
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCRONIN: The room he checked into was across the hall -- why, why would he come here?\nPAMELA: He must've had a reason. That's how they were trained.\n\n", 'answer': 'He went out the window in here...', 'gold_tag': 'CRONIN is a problem solver or investigative type', 'last_speaker': 'CRONIN'}
Last word -> CRONIN : "He went out the window in here..."
pr

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8354390859603882], 'recall': [0.8728935718536377], 'f1': [0.8537557721138], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13333333333333333, 'rouge2': 0.0, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0.047619047619047616 0.01543033499620919
ppl : 21.879737854003906
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCRONIN: You're sure?\nPAMELA: What? The tapes?\n\n", 'answer': 'Hold on... Yep. And Abbott just direct dialed Moscow', 'gold_tag': "CRONIN is in a job that deals with tapes and important direct calls , CRONIN's job is possibly in a government agency or high-level corporate position", 'last_speaker': 'CRONIN'}
Last word -> CRONIN : "Hold on... Yep. And Ab

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8210157752037048], 'recall': [0.8310750722885132], 'f1': [0.8260148167610168], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 20.600881576538086
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWALTON: You are now an Erewhon inmate -- a citizen of nowhere. Human rights zealots, the Geneva convention and the P.C. police have no authority here. You have no right... When I say your ass belongs to me -- I mean it. Bend over. Step into them. Don't sniff 'em, you perv. Just step into them.\nARCHER: They're too tight.\n\n", 'answer': "So's a noose. Now keep your mouth shut. The prison's one big magnetic field. The boots'll tell us 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8583807945251465], 'recall': [0.839551568031311], 'f1': [0.848861813545227], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.21538461538461537, 'rouge2': 0.06349206349206349, 'rougeL': 0.12307692307692307, 'rougeLsum': 0.12307692307692307}
bleu 1/2 : 0.016125861504125143 0.005256387755182872
ppl : 10.961220741271973
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nARCHER: What? He started it! When I get out of here --\nWALTON: You'll what?\nARCHER: I'm going to have you fired.\n\n", 'answer': "That's two strikes, Dubov. One more and you know where you're going. Back to your 'suites' -- or no", 'gold_tag': 'WALTON holds a position of authority, possibly a supervisor or manager , WALTON has the pow

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8727052807807922], 'recall': [0.8095386624336243], 'f1': [0.8399360775947571], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07692307692307691, 'rouge2': 0.0, 'rougeL': 0.07692307692307691, 'rougeLsum': 0.07692307692307691}
bleu 1/2 : 0 0
ppl : 95.32534790039062
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWALTON: Better be nice, Castor. You could get mighty lonely now that Pollux is gone.\nARCHER: Pollux is -- what?\nWALTON: Archer cut him a deal for turning state's evidence. He's been released...\nARCHER: Walton, you have to listen to me -- right now!\n\n", 'answer': "Or what? You'll have me fired? You're confined until I say otherwise...", 'gold_tag': 'WALTON is responsible for the confi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8055114150047302], 'recall': [0.7982341051101685], 'f1': [0.8018562197685242], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 385.25579833984375
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWALTON: There!\n\n', 'answer': "Let's go!", 'gold_tag': 'Everyday Language', 'last_speaker': 'ARCHER'}
Last word -> ARCHER : "Let's go!"
prediction :  Who?
Real answer : Let's go!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8889866471290588], 'recall': [0.8338420391082764], 'f1': [0.8605318069458008], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 13011.365234375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: How did the visit go?\nKYLE: OK. He fell asleep watching TV. So he lives there now?\nMIKE: Yeah. Your grandfather is in the things that don't make sense.\nKYLE: Are you a friend of his or something?\nMIKE: I'm his guardian.\nKYLE: What's that mean?\nMIKE: It means I make sure he's well taken care of.\n\n", 'answer': "That's cool.", 'gold_tag': 'Everyday Language', 'last_speaker': 'KYLE'}
Last word -> KYLE : "That's cool."
predictio

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8829588294029236], 'recall': [0.8830289244651794], 'f1': [0.8829938769340515], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.28571428571428575, 'rouge2': 0.0, 'rougeL': 0.28571428571428575, 'rougeLsum': 0.28571428571428575}
bleu 1/2 : 0 0
ppl : 136.19683837890625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: Here's fifty bucks in case you need it.\nKYLE: I have money.\nMIKE: Actually it's only forty-three but take it. You never know.\nKYLE: I have enough money. But thanks.\nMIKE: Well, good luck, Kyle.\n\n", 'answer': 'Yeah. You too, Mike.', 'gold_tag': 'Everyday Language', 'last_speaker': 'KYLE'}
Last word -> KYLE : "Yeah. You too, Mike."
prediction :  Do you want to go?
Real answer :

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8295918107032776], 'recall': [0.8688901662826538], 'f1': [0.8487863540649414], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.22222222222222224, 'rouge2': 0.0, 'rougeL': 0.22222222222222224, 'rougeLsum': 0.22222222222222224}
bleu 1/2 : 0 0
ppl : 66.48255157470703
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: So what happened?\nKYLE: I decided not to go home.\nMIKE: And then you decided you'd break into your Grandfather's house?\nKYLE: You mind if I have a quick smoke?\n\n", 'answer': 'Yeah, I do. Get in the car.', 'gold_tag': "MIKE disapproves of KYLE's smoking habit , MIKE is the responsible and authoritative figure", 'last_speaker': 'MIKE'}
Last word -> MIKE : "Yeah, I do. Get in the 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8508206605911255], 'recall': [0.8730840682983398], 'f1': [0.8618085980415344], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352941, 'rouge2': 0.0, 'rougeL': 0.11764705882352941, 'rougeLsum': 0.11764705882352941}
bleu 1/2 : 0.10000000000000002 0.03333333333333335
ppl : 114.26062774658203
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: You have fun?\nKYLE: Yeah.\nMIKE: When's the last time you wrestled?\nKYLE: A couple years ago.\nMIKE: Why'd you stop?\nKYLE: Just felt like it.\nMIKE: So let me ask you something, Kyle. How good are you?\n\n", 'answer': "I'm pretty good.", 'gold_tag': 'KYLE believes he is pretty good at wrestling', 'last_speaker': 'KYLE'}
Last word -> KYLE : "I'

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8766563534736633], 'recall': [0.9040913581848145], 'f1': [0.8901625871658325], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14285714285714288, 'rouge2': 0.0, 'rougeL': 0.14285714285714288, 'rougeLsum': 0.14285714285714288}
bleu 1/2 : 0 0
ppl : 100.11956787109375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: Were you outside?\nKYLE: Yeah. I went for a run.\nMIKE: It's pretty early. You do this every morning?\nKYLE: In season, yeah. I like to get my run in early and then drill some moves for an hour before school.\nMIKE: Oh. Wow.\n\n", 'answer': "Yeah...so I'm gonna do it. See you. 5/15/10 - FULL SHOOTING SCRIPT - WIN WIN 58", 'gold_tag': 'KYLE has an intense morning routine', 'last_spe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8774928450584412], 'recall': [0.8058187961578369], 'f1': [0.8401298522949219], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14285714285714285, 'rouge2': 0.0, 'rougeL': 0.14285714285714285, 'rougeLsum': 0.14285714285714285}
bleu 1/2 : 0 0
ppl : 38.61552429199219
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: Hey, I'm proud of what you did today. That win meant a lot to the guys.\n\n", 'answer': 'Cool. It was good coaching with Stemler.', 'gold_tag': 'KYLE is involved in coaching with Stemler', 'last_speaker': 'KYLE'}
Last word -> KYLE : "Cool. It was good coaching with Stemler."
prediction :  Hey, you know what? That was probably the biggest upset of the year, actually.
Real answer : Co

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8508485555648804], 'recall': [0.8894214630126953], 'f1': [0.8697075247764587], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09523809523809523, 'rouge2': 0.0, 'rougeL': 0.09523809523809523, 'rougeLsum': 0.09523809523809523}
bleu 1/2 : 0.07142857142857141 0.023440361546924773
ppl : 23.027809143066406
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: Let me ask you something. What's it like? Being as good as you are? What's it feel like?\nKYLE: I don't know. I guess it just feels like I'm in control. Of everything. You know what I mean?\nMIKE: Yeah, I do. Must be nice.\n\n", 'answer': 'It is.', 'gold_tag': 'KYLE is good at something', 'last_speaker': 'KYLE'}
Last word -> KYLE : "It is."
pred

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8437908291816711], 'recall': [0.9028236865997314], 'f1': [0.872309684753418], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0625, 'rouge2': 0.0, 'rougeL': 0.0625, 'rougeLsum': 0.0625}
bleu 1/2 : 0 0
ppl : 24.88302230834961
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: How you doing?\nKYLE: I'm sorry.\nMIKE: Hey, it's alright.\nKYLE: I don't want to go with her. I don't.\n\n", 'answer': 'OK, pal. OK.', 'gold_tag': 'MIKE is supportive', 'last_speaker': 'MIKE'}
Last word -> MIKE : "OK, pal. OK."
prediction :  She is like, really nice and sweet and I just. Like, I don't know what it is, I really like her.
Real answer : OK, pal. OK.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8071742057800293], 'recall': [0.8689056634902954], 'f1': [0.8369031548500061], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 13.99208927154541
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSHERMAN: Sure. Yes. Go ahead.\nMARTIN: So. Let's see. Can you tell us if your car was in use the night this happened?\nSHERMAN: When exactly was it?\nMARTIN: Tuesday a week ago.\nSHERMAN: I don't know. Let me think. I'd have to figure...\nMARTIN: Anybody else use your car?\nSHERMAN: My wife. Sometimes. And the\nMARTIN: Parking garage.\nSHERMAN: Yes.\nMARTIN: You leave the car with the keys\nSHERMAN: Well... yes.\n\n", 'answer': 'Could 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8700072765350342], 'recall': [0.8777201175689697], 'f1': [0.8738466501235962], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 49.90876007080078
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSHERMAN: The car?\nMARTIN: Yes.\nSHERMAN: Now?\n\n', 'answer': "Soon as we leave here. We could take a look. There's things that's consistent with an incident like this. We don't find those things, we move on down the list. And we're out of your hair.", 'gold_tag': "MARTIN intends to examine the car as soon as they leave their current location , MARTIN is in a profession requiring some form of investigation, likely law enforcement , MA

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8387515544891357], 'recall': [0.8477673530578613], 'f1': [0.8432353734970093], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12987012987012989, 'rouge2': 0.026666666666666665, 'rougeL': 0.07792207792207792, 'rougeLsum': 0.07792207792207792}
bleu 1/2 : 0.11092625221822057 0.05629720488379733
ppl : 12.745979309082031
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSHERMAN: So you want to take a look at the car then.\nMARTIN: Yeah. We don't have a description of a driver. So we gotta look for the car. And that means bothering a lot of innocent people. We're sorry about the inconvenience.\n\n", 'answer': "I understand. But if it is a routine, well, I should, I guess I ought to... well, follow the 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8566134572029114], 'recall': [0.8145556449890137], 'f1': [0.8350553512573242], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09756097560975609, 'rouge2': 0.0, 'rougeL': 0.048780487804878044, 'rougeLsum': 0.048780487804878044}
bleu 1/2 : 0.0006055109490751632 0.00021408044908687507
ppl : 98.21095275878906
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGRAHAM: Breakfast?\nCLIVE: Java. Straight up.\nGRAHAM: Any solids?\nCLIVE: No, let\'s punch through. Miles make the eggs taste sweeter.\nGRAHAM: I love eggs!\nCLIVE: I hear that. Now, get that "�Joe\' on the go and jump in the Chewie seat. I need your map skills.\nGRAHAM: Good job I pre-pared. It\'s a shame I don\'t have some controls on this sid

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8536180853843689], 'recall': [0.8586918115615845], 'f1': [0.8561474680900574], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07407407407407407, 'rouge2': 0.0, 'rougeL': 0.07407407407407407, 'rougeLsum': 0.07407407407407407}
bleu 1/2 : 0.07667036788577694 0.025323389070344468
ppl : 54.317222595214844
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCLIVE: Amazing, isn't it? There are 295,734,134 people in this country and we haven't seen a soul for a hundred miles. Where is everybody?\nGRAHAM: What about that old woman on crutches we saw hitchhiking?\nCLIVE: Apart from her. I suppose that's why they established the Dreamland facility out here.\nGRAHAM: Far from prying eyes.\nCLIVE: Did you know 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8360790014266968], 'recall': [0.9178653955459595], 'f1': [0.8750653266906738], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 10.740745544433594
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGRAHAM: What's the matter?\nCLIVE: Nothing. How we doing?\nGRAHAM: We should be just about there.\nCLIVE: I've found it.\nGRAHAM: Where?\n\n", 'answer': 'THERE!!!', 'gold_tag': 'Everyday Language', 'last_speaker': 'CLIVE'}
Last word -> CLIVE : "THERE!!!"
prediction :  Oh! I think I know the address. You must be tired and hungry .
Real answer : THERE!!!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8087658286094666], 'recall': [0.8728052973747253], 'f1': [0.839566171169281], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 47.13725280761719
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCLIVE: It's amazing, isn't it? We're actually here.\nGRAHAM: Area 51.\nCLIVE: Dreamland.\nGRAHAM: The Groom Lake Military Test Facility.\nCLIVE: MacArthur's Black Mile.\nGRAHAM: Is that a real one?\nCLIVE: No, I made it up myself.\nGRAHAM: I like it.\nCLIVE: Be great if we saw something.\nGRAHAM: Yes, imagine if we got buzzed up by a craft.\nCLIVE: That would be incredible. It just\nGRAHAM: What would you do if they actually landed?\nCL

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8387784957885742], 'recall': [0.8411428928375244], 'f1': [0.8399590253829956], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 23.8873233795166
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGRAHAM: Perhaps it's not them. Perhaps whoever it was turned doff.\nCLIVE: Of course it's them. We're dead. It's like Deliverance. They're going to rape us and break our arms.\nGRAHAM: I don't want my arms broken!\nCLIVE: There's only one thing for it.\nGRAHAM: What?\nCLIVE: I'm gonna have to fly blind. Probably best not to do that. I couldn't really see anything. Oh God, oh God, oh God.\n

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8462984561920166], 'recall': [0.8409245014190674], 'f1': [0.8436028957366943], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352942, 'rouge2': 0.0, 'rougeL': 0.11764705882352942, 'rougeLsum': 0.11764705882352942}
bleu 1/2 : 0 0
ppl : 86.13249969482422
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGRAHAM: Now's not really the time, Sausage.\nCLIVE: I must have it.\nGRAHAM: It's very cheap. Are you sure you don't want to save up and get a proper on?\n\n", 'answer': "Oh, come on, we've got time.", 'gold_tag': 'CLIVE does not save for something better', 'last_speaker': 'CLIVE'}
Last word -> CLIVE : "Oh, come on, we've got time."
prediction :  I don't have the patience.
Real answer : O

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8758091926574707], 'recall': [0.8523832559585571], 'f1': [0.863937497138977], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 27.87188148498535
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGRAHAM: It's gone!\n\n", 'answer': 'Where did it go?', 'gold_tag': 'Everyday Language', 'last_speaker': 'CLIVE'}
Last word -> CLIVE : "Where did it go?"
prediction :  [sighs] Good. Good, we've got about...
Real answer : Where did it go?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8269937038421631], 'recall': [0.8226573467254639], 'f1': [0.824819803237915], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 80.49722290039062
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBILL: No, you listen to me. You don't understand, Gene. Don't lie, just give the family what it wants... No... No, you don't understand.\nROSE: Why me ?... Hi !... I'm Rose.\nBILL: Hi, Rose !... I't nice to run into me like this !... I'm Bill Capa...\nROSE: I got no insurance... She gets out of her car. I know it's against the law and everything. Don't bust my chops. I'll bring the money to you, if you just get an estimate.\nBILL: Got a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8324453830718994], 'recall': [0.879741907119751], 'f1': [0.8554404377937317], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0588235294117647, 'rouge2': 0.0, 'rougeL': 0.0588235294117647, 'rougeLsum': 0.0588235294117647}
bleu 1/2 : 0.03571428571428572 0.011501092655705905
ppl : 17.261947631835938
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROSE: Hello !\nBILL: Who's there ?\nROSE: Hi !... Remember me ?... Fender-bender !...\nBILL: There she is, a little angel, dancing on the head of a pin.\nROSE: So... did you get that estimate ?\nBILL: No.\nROSE: Waow !... Nice place you got here. It's a little cold... but it's kind of tasteful, right ?\nBILL: And wet !\nROSE: You too, I guess.\nBILL: Cold

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8060929179191589], 'recall': [0.8344497680664062], 'f1': [0.8200262188911438], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 74.67772674560547
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROSE: Not even a shade of pink ?... Can't tell me if it's smeared or not ?\nBILL: I can't tell you if your eyes are bloodshed.\nROSE: Really !... That's sad !... You know what ?... In respect for your infirmity, I'm gonna give up wearing lipstick. Close-up on Rose putting her lipstick in her glass of water. There is something about me that I bet you find a little strange. Right ?\nBILL: What is that ?\nROSE: Well, I haven't asked you w

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8220106363296509], 'recall': [0.8200147747993469], 'f1': [0.821011483669281], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 44.50728225708008
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBILL: Yes, but that was part of my plan. I mean, I wanted to.\nROSE: Oh, I need a taxi. Taxi !\nBILL: No, hey, hey. Wait, wait, wait. What are you being so tough for ?\nROSE: I'm just beginning to think that maybe I'Il...\nBILL: I am thinking you should just let me take you home and forget about this stupid taxi.\nROSE: Rose walks to her waiting cab. Well, I'm thinkin' you should stay exactly where you are, 'cause...\nBILL: Give me your

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8340535759925842], 'recall': [0.8187589645385742], 'f1': [0.8263355493545532], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12121212121212122, 'rouge2': 0.0, 'rougeL': 0.09090909090909091, 'rougeLsum': 0.09090909090909091}
bleu 1/2 : 0.075 0.013867504905630728
ppl : 17.773876190185547
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROSE: It's red, poor thing.\nBILL: Thank you.\nROSE: I was thinking. Maybe I should see a shrink.\nBILL: I can recommend someone.\nROSE: I feel better already.\nBILL: God, I missed you.\n\n", 'answer': "This time you won't miss.", 'gold_tag': 'Everyday Language', 'last_speaker': 'ROSE'}
Last word -> ROSE : "This time you won't miss."
prediction :  He's a good guy. 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8359932899475098], 'recall': [0.839889407157898], 'f1': [0.8379367589950562], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.21052631578947367, 'rouge2': 0.0, 'rougeL': 0.10526315789473684, 'rougeLsum': 0.10526315789473684}
bleu 1/2 : 0.0909090909090909 0.03015113445777637
ppl : 16.316486358642578
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROSE: I want you to get dressed up... Get dressed up... All right ?\nBILL: Right now ?\n\n', 'answer': 'Uh-huh.', 'gold_tag': 'Everyday Language', 'last_speaker': 'ROSE'}
Last word -> ROSE : "Uh-huh."
prediction :  You're going to the theater. All right? Get dressed up, all right, all right. Do it, I'm waiting. Okay, all right, go on then.
Real answer : 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8249920606613159], 'recall': [0.7956313490867615], 'f1': [0.8100457787513733], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 16.157852172851562
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROSE: You're not eating. Don't you like my food?\nBILL: I'd like your food five inches to the left.\nROSE: Okay.\nBILL: Yeah, okay.\n\n", 'answer': "I think it's ready.", 'gold_tag': 'ROSE is the one cooking , ROSE may be adept at cooking or be the designated cook among the two', 'last_speaker': 'ROSE'}
Last word -> ROSE : "I think it's ready."
prediction :  But, okay, Bill, I mean I'll make your salad that I've spent an hour making. 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.818966269493103], 'recall': [0.8693430423736572], 'f1': [0.8434031009674072], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06060606060606061, 'rouge2': 0.0, 'rougeL': 0.06060606060606061, 'rougeLsum': 0.06060606060606061}
bleu 1/2 : 0.03846153846153846 0.012403473458920851
ppl : 31.316980361938477
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROSE: Hey.\nBILL: How did you get in here ?\nROSE: Last time I was here, I stole the key. It was by the door.\nBILL: What about the alarm ?\nROSE: Oh ! I never thought about that.\nBILL: It didn't go off ?\nROSE: No. Um, did I make a mistake ? You have someone in the car ?\nBILL: No, goddamn it, there isn't someone in the car ! People are getting kille

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8858352899551392], 'recall': [0.8371662497520447], 'f1': [0.8608134388923645], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10810810810810811, 'rouge2': 0.05714285714285715, 'rougeL': 0.10810810810810811, 'rougeLsum': 0.10810810810810811}
bleu 1/2 : 0.0002927199051977936 0.00010688619673840353
ppl : 79.89852142333984
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBILL: Voice over.\nROSE: Voice ironically disappointed.\nBILL: Oh, our, our tank fleet is crippled.\nROSE: Yeah ?\nBILL: It's goin' away.\nROSE: How's the submarine fleet ?\nBILL: Hey, hey, hey, hey, please, please. I gotta get some sleep.\nROSE: They kiss. Oh, you're chicken.\nBILL: I gotta be up...\nROSE: You're chicken.\nBILL: I 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8263660669326782], 'recall': [0.843869686126709], 'f1': [0.8350261449813843], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09523809523809525, 'rouge2': 0.0, 'rougeL': 0.09523809523809525, 'rougeLsum': 0.09523809523809525}
bleu 1/2 : 0 0
ppl : 23.770479202270508
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBILL: One of my patients was killed last night. He was murdered.\nROSE: She seems very disturbed by the news. What ?\nBILL: The painter. Casey.\nROSE: She has tears in her eyes. Were you there ?\nBILL: A little bit after.\nROSE: What did you see ?\nBILL: You don't want to know. About a half hour before group. I'm gonna get a shower. You gonna be okay out here ?\n\n", 'answer': 'Yeah.', 'g

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8828050494194031], 'recall': [0.9474420547485352], 'f1': [0.9139821529388428], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 87.44342803955078
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBILL: Rose.\nROSE: She jumps. Oh ! Oh !\n\n', 'answer': 'Rose ! Rose !', 'gold_tag': 'Everyday Language', 'last_speaker': 'BILL'}
Last word -> BILL : "Rose ! Rose !"
prediction :  Rose, what kind of idiot do you think Rose (rose) is to jump (jump), right in the air. I'll take you to Rose Hospital. If you try to commit suicide again, you just walk right to the nurse at the reception and tell her you want to die.
Real answer : Rose ! Ros

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8064841032028198], 'recall': [0.854041576385498], 'f1': [0.8295817971229553], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07843137254901959, 'rouge2': 0.04081632653061225, 'rougeL': 0.07843137254901959, 'rougeLsum': 0.07843137254901959}
bleu 1/2 : 0.041666666666666664 0.00941554471443387
ppl : 30.20322036743164
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMO: Hey. You're creasing the pants.\nSHADES: Tough guy.\nMO: Shut up.\nSHADES: That's why you're still in the corps, isn't it?\nMO: Why?\nSHADES: You tell me. No, really. Tell me why.\nMO: The money's good. I get around. I see what's left of the world.\nSHADES: I get around, too, and at least I'm still in one piece.\nMO: Yeah, until one 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8250527381896973], 'recall': [0.8165053725242615], 'f1': [0.820756733417511], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.048780487804878044, 'rouge2': 0.0, 'rougeL': 0.048780487804878044, 'rougeLsum': 0.048780487804878044}
bleu 1/2 : 0 0
ppl : 28.86924934387207
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMO: Jesus... Oh, Jesus... Jill. Come on...come on...!\nSHADES: Ayuh... Hey, uh...wow, um...it's cool, but...right now it's, uh, kinda difficult... Do you know what time it is?\nMO: Listen to me! This is serious! Jill's in danger and you've gotta help her!\nSHADES: Oh, Jeez. Oh...Christ. It had to be tonight.\nMO: What's wrong?\nSHADES: My heart feels like an alligator.\nMO: What?! Shade

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8402701020240784], 'recall': [0.8348913192749023], 'f1': [0.8375720977783203], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12244897959183672, 'rouge2': 0.0, 'rougeL': 0.08163265306122448, 'rougeLsum': 0.08163265306122448}
bleu 1/2 : 0.05604671793449092 0.01297229449750861
ppl : 31.023096084594727
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMO: Hey! Hey! Listen to me! Come back! You've got to do this for me! You've got to go down to the apartment and get Jill out of there right\nSHADES: Yeah, yeah. Okay.\nMO: Go get Jill. Bring her to your place and lock the door. Got that?\nSHADES: What's going on? You in trouble with the cops again or --\nMO: I'll tell you later! Just do it! Now!\nSHADE

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8079285621643066], 'recall': [0.8502340316772461], 'f1': [0.8285415768623352], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 82.17268371582031
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTAFT: Eddie Shitface! Where you been, boy? Eddie, I don't think I introduced you to my new partner, Michael Hernandez...\n\n", 'answer': 'Pleased to meet you, Mr. Shitface...', 'gold_tag': 'HERNANDEZ is a new partner to TAFT , HERNANDEZ has a formal tone indicating a level of education or professionalism', 'last_speaker': 'HERNANDEZ'}
Last word -> HERNANDEZ : "Pleased to meet you, Mr. Shitface..."
prediction :  I'm sorry, sir.
Real ans

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8852166533470154], 'recall': [0.8502141833305359], 'f1': [0.8673624396324158], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 38.490203857421875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTAFT: This is our collar? Who is he?\n\n', 'answer': 'Some scumbag...', 'gold_tag': "HERNANDEZ has a negative view of people they consider 'scumbags' , This negative view is likely due to their job", 'last_speaker': 'HERNANDEZ'}
Last word -> HERNANDEZ : "Some scumbag..."
prediction :  It’s me; it used to be my mother’s; she didn’t have children.
Real answer : Some scumbag...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8127333521842957], 'recall': [0.8171452283859253], 'f1': [0.8149333000183105], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 25.277511596679688
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTAFT: Who are you, man? Who the fuck are you?\n\n', 'answer': "I'm me.", 'gold_tag': 'HERNANDEZ indicates his identity only as "me"', 'last_speaker': 'HERNANDEZ'}
Last word -> HERNANDEZ : "I'm me."
prediction :  I’m your best friend Jack. I’m the only one who has your back.
Real answer : I'm me.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8531115651130676], 'recall': [0.9214187264442444], 'f1': [0.8859504461288452], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2222222222222222, 'rouge2': 0.125, 'rougeL': 0.2222222222222222, 'rougeLsum': 0.2222222222222222}
bleu 1/2 : 0 0
ppl : 16.391372680664062
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLATESHA: Tell me she has an off switch --\nSCOTT: We can only unlock the emergency override from the main terminal, and that's assuming she lets us in.\nLATESHA: What're we supposed to do, say pretty\n\n", 'answer': "We can't shut her down, but maybe we can get her to shut herself down. Help me with this As they tug at the hatch --", 'gold_tag': 'SCOTT is responsible for problem-solving',

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8548156023025513], 'recall': [0.8466219305992126], 'f1': [0.8506990671157837], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.04651162790697675, 'rouge2': 0.0, 'rougeL': 0.04651162790697675, 'rougeLsum': 0.04651162790697675}
bleu 1/2 : 0.03345383928243689 0.010925978157206149
ppl : 21.93385887145996
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSCOTT: This controls her primary cooling system... if we cut the circulation, the temp in her tank goes up.\nLATESHA: So we boil her brain.\nSCOTT: She'll have to shut herself down to keep from overheating.\nLATESHA: Can't she just drain the. water?\n\n", 'answer': 'Not anymore.', 'gold_tag': 'SCOTT demonstrates technical expertise', 'last_speaker': 'S

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.838405966758728], 'recall': [0.8741185665130615], 'f1': [0.8558899164199829], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 20.65447998046875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSCOTT: So...\nLATESHA: So...\nSCOTT: Hey :you think I could have your... you KNOW\n\n', 'answer': "That's my address. I'm not using a cell phone anymore. Pick me up at eight.", 'gold_tag': 'LATESHA has decided not to use a cell phone anymore , LATESHA will be available for SCOTT to pick her up at eight', 'last_speaker': 'LATESHA'}
Last word -> LATESHA : "That's my address. I'm not using a cell phone anymore. Pick me up at eight."
predic

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9067261815071106], 'recall': [0.8449791669845581], 'f1': [0.8747643232345581], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 6502.19970703125
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRON: You can have our old room. I’m in the basement.\nJOHN: What, you live here now?\n\n', 'answer': 'We’re in a recession.', 'gold_tag': 'RON refers to the recession', 'last_speaker': 'RON'}
Last word -> RON : "We’re in a recession."
prediction :  I'm still here.
Real answer : We’re in a recession.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9067401885986328], 'recall': [0.8988308906555176], 'f1': [0.9027681946754456], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 33.58161544799805
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRON: She is going to be okay, right?\nJOHN: Yeah. Better then ever.\n\n', 'answer': 'Tell Dad I’ll take the job.', 'gold_tag': 'RON is going to take a job , RON has an upcoming occupational commitment', 'last_speaker': 'RON'}
Last word -> RON : "Tell Dad I’ll take the job."
prediction :  Why you acting so hostile?
Real answer : Tell Dad I’ll take the job.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8421616554260254], 'recall': [0.8447511196136475], 'f1': [0.8434544205665588], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 715.8826904296875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRON: John. John.\n\n', 'answer': 'What? What is it?', 'gold_tag': 'Everyday Language', 'last_speaker': 'JOHN'}
Last word -> JOHN : "What? What is it?"
prediction :  Yeah?
Real answer : What? What is it?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9347659349441528], 'recall': [0.8829267024993896], 'f1': [0.9081071019172668], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 67594.546875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRON: Everyone got on their seat belts?\nJOHN: Yeah. Just go. Go, go!\nRON: Shit. I never did get that fixed.\n\n', 'answer': 'Oh my God.', 'gold_tag': 'Everyday Language', 'last_speaker': 'JOHN'}
Last word -> JOHN : "Oh my God."
prediction :  (2015 Nissan Pathfinder King Sized SUV (9 Seat) with a 3.5 Liter V6 Engine)
Real answer : Oh my God.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7838701009750366], 'recall': [0.8128703832626343], 'f1': [0.7981069087982178], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 19.529016494750977
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMALCOLM: That's one fine frame. A fine frame it is. How much does a fine frame like that cost, you think?\nANNA: I've never told you... but you sound a little like Dr. Seuss when you're drunk.\nMALCOLM: Anna, I'm serious. Serious I am, Anna.\nANNA: Mahogany. I'd say that cost at least a couple hundred. Maybe three.\nMALCOLM: Three? We should hock it. Buy a C.D. rack for the bedroom.\nANNA: Do you know how important this is? This is bi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8280274271965027], 'recall': [0.8502469062805176], 'f1': [0.8389900922775269], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09195402298850575, 'rouge2': 0.0, 'rougeL': 0.0689655172413793, 'rougeLsum': 0.0689655172413793}
bleu 1/2 : 0.06779661016949153 0.010811600584038359
ppl : 11.974944114685059
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMALCOLM: Anna, I've been so lost. I need my best friend.\nANNA: I miss you.\nMALCOLM: I miss you.\nANNA: Why, Malcolm?\nMALCOLM: What, Anna? What did I do? What's made you so sad?\nANNA: Why did you leave me?\n\n", 'answer': "I didn't leave you. No...", 'gold_tag': 'Everyday Language', 'last_speaker': 'MALCOLM'}
Last word -> MALCOLM : "I didn't leave yo

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8684155344963074], 'recall': [0.8494476675987244], 'f1': [0.8588268756866455], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 88265.40625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMALCOLM: ANNA! Don't cry. I think I have to go. I just needed to do a couple of things. And I needed to tell you something.\nANNA: Tell me.\nMALCOLM: You were never second... Ever. You sleep now, Anna. Everything will be different in the morning.\nANNA: Goodnight, Malcolm.\n\n", 'answer': "Goodnight, sweetheart. ...I think I've had too much to drink. I just have to say, this day today has been one very special day... I wish we all could stay

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8573594093322754], 'recall': [0.8253467082977295], 'f1': [0.841048538684845], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.18518518518518517, 'rouge2': 0.037735849056603765, 'rougeL': 0.11111111111111109, 'rougeLsum': 0.11111111111111109}
bleu 1/2 : 0.0358083566649828 0.012204379773959488
ppl : 10.754672050476074
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMONA: Why did you join a support group of unattractive, militant lesbians when you have me?\nZOE: Because you\'re really not that supportive.\nMONA: Sure I am.\nZOE: You just told me that if I breast- feed "my girls" will end up looking like empty tube socks.\nMONA: I\'m just being honest. And don\'t even get me started on what childbir

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8615032434463501], 'recall': [0.8467578291893005], 'f1': [0.8540669083595276], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13793103448275862, 'rouge2': 0.0, 'rougeL': 0.13793103448275862, 'rougeLsum': 0.13793103448275862}
bleu 1/2 : 0.013680833103983134 0.0047391796051938035
ppl : 54.48143005371094
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nZOE: What if I\'m pregnant?\nMONA: Then you\'ll get fat and hormonal and I\'ll say, "I told you so."\nZOE: But what if this could go somewhere with Stan? Like, really go somewhere.\nMONA: Then it will work out perfectly.\nZOE: What do you mean?\nMONA: Because just when you really like him, you\'ll break up because you\'re pregnant and then you never 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8846467137336731], 'recall': [0.8880783319473267], 'f1': [0.8863592147827148], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.22222222222222224, 'rouge2': 0.0, 'rougeL': 0.22222222222222224, 'rougeLsum': 0.22222222222222224}
bleu 1/2 : 0 0
ppl : 41.70455551147461
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMONA: Why are you even working out? This is the one time in your life you're allowed to let it go.\nZOE: Because look at me. I just keep\nMONA: Well, yeah. That's kind of how it goes.\nZOE: I'm afraid I'm going to scare him away. I'm like one of those giant bouncy castles. Soon I'm going to take over the whole back yard.\nMONA: Come on. He loves you for you.\nZOE: I'm just getting kind of

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8386309146881104], 'recall': [0.8457162380218506], 'f1': [0.8421586751937866], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06060606060606061, 'rouge2': 0.0, 'rougeL': 0.06060606060606061, 'rougeLsum': 0.06060606060606061}
bleu 1/2 : 0.041666666666666664 0.013459547551454133
ppl : 14.775688171386719
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLESLIE: We'll be right down.\nALEC: June first!'\nLESLIE: Honey, I just got promoted. I'm going to have a lot more responsibility. I'm sure it's\nALEC: Looking at my big picture agenda, I want to be playing softball with my firstborn in about four years.\nLESLIE: Do you realize that over half the marriages today end in divorce.\nALEC: And the other h

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8638664484024048], 'recall': [0.8711125254631042], 'f1': [0.8674744367599487], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08, 'rouge2': 0.0, 'rougeL': 0.08, 'rougeLsum': 0.08}
bleu 1/2 : 0.07142857142857141 0.023440361546924773
ppl : 59.11177062988281
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nALEC: Okay. Okay. Like my mom always says, 'Don't get a hard-on'...\nLESLIE: And like my mom always says... 'Marriage makes two people one. But you never know which one.'\n\n", 'answer': "Yeah, and like my pop says, 'The way to woman's heart is,to saw her in half.'", 'gold_tag': 'ALEC might have a sarcastic and humorous personality', 'last_speaker': 'ALEC'}
Last word -> ALEC : "Yeah, and like my 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.836189866065979], 'recall': [0.827646017074585], 'f1': [0.8318960070610046], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06666666666666667, 'rouge2': 0.0, 'rougeL': 0.06666666666666667, 'rougeLsum': 0.06666666666666667}
bleu 1/2 : 0.05971094254781577 0.019721875238006125
ppl : 37.02888870239258
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nALEC: Did you forget one of your albums?\n\n', 'answer': "I'm sorry to bother you here. It's Jules. After she left for work this morning, the finance company came and took away all her furniture, her Jeep, everything they could get their hands on... I couldn't reach her so I finally went over to her office. Alec -- she was fired three weeks ago! All this

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8292198181152344], 'recall': [0.7962767481803894], 'f1': [0.8124144077301025], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06711409395973154, 'rouge2': 0.0, 'rougeL': 0.06711409395973154, 'rougeLsum': 0.06711409395973154}
bleu 1/2 : 0.0003643093392122539 5.937515801075783e-05
ppl : 55.18923568725586
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNEIL: Come away with me.\nEADY: What?\nNEIL: New Zealand.\nEADY: When?\nNEIL: Soon.\nEADY: You\'re kidding.\nNEIL: What\'s stopping you? What\'s here?\nEADY: I don\'t know you.\nNEIL: What\'s to know?\nEADY: Are you married?\nNEIL: Why?\nEADY: Way you come and go.\nNEIL: I deal with state officials. County. Weird hours. A lot of traveling, entertain

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8923746943473816], 'recall': [0.8685712814331055], 'f1': [0.8803120851516724], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.15, 'rouge2': 0.0, 'rougeL': 0.15, 'rougeLsum': 0.15}
bleu 1/2 : 0.04035303290174325 0.013179245357734959
ppl : 43.77810287475586
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nEADY: What did you do? Was that you?\nNEIL: It's what I don't do. I don't sell swimming pools. It would have been okay. You would have flown out after. You didn't need to do a thing. Now I'm jammed. We got to go together. Because we get dropped in a place and then I decide: New Zealand or Brazil or Tunisia, whatever. I can't leave a string back here to where I am.\nEADY: What.\nNEIL: I rob and st

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8213638067245483], 'recall': [0.8666649460792542], 'f1': [0.8434064984321594], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.125, 'rouge2': 0.0, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0.02941176470588235 0.009440686400617014
ppl : 10.192008972167969
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNEIL: Eady! !\nEADY: Why you do this to me! What I do to you?!\n\n', 'answer': 'Shut up!', 'gold_tag': 'Everyday Language', 'last_speaker': 'NEIL'}
Last word -> NEIL : "Shut up!"
prediction :  You know it. I am so sorry darling I am just so very very sorry...
Real answer : Shut up!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7895169258117676], 'recall': [0.8439542055130005], 'f1': [0.8158285021781921], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 61.08317565917969
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNEIL: Use these credit cards. They say "Mr. and Mrs." I reserved the car. Buy me some clothes. Here\'s a list. I can\'t go home for my stuff. Can you do it?\nEADY: When\'s it over?\nNEIL: Twenty-two hours. Then we fly out of here...clean.\nEADY: I want you to let me go. When will you let me go?\nNEIL: You can go now. There\'s the door, you want out...\nEADY: No, I have to do what you say. But will you let me leave after?\n\n', 'answer'

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8302193880081177], 'recall': [0.8441498875617981], 'f1': [0.8371266722679138], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.22222222222222224, 'rouge2': 0.0, 'rougeL': 0.11111111111111112, 'rougeLsum': 0.11111111111111112}
bleu 1/2 : 0 0
ppl : 39.07798767089844
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNEIL: Any trouble?\nEADY: No. When's it over?\n\n", 'answer': "Six hours. 8 p.m. I'll buy you lunch. Everything's smooth. We're going to fly away...", 'gold_tag': "NEIL is involved in an operation or job that will be concluded in six hours , NEIL is confident enough to plan ahead for a meal and a future escape , NEIL's job or operation will be over in six hours at 8 p.m.", 'last_speaker':

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8614199161529541], 'recall': [0.8602536916732788], 'f1': [0.860836386680603], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.24242424242424243, 'rouge2': 0.06451612903225808, 'rougeL': 0.24242424242424243, 'rougeLsum': 0.24242424242424243}
bleu 1/2 : 0.19786244288317878 0.037599649154247045
ppl : 16.1672306060791
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNEIL: Leave it running. ...be right back, Eady.\n\n', 'answer': 'Neil...?', 'gold_tag': 'Everyday Language', 'last_speaker': 'EADY'}
Last word -> EADY : "Neil...?"
prediction :  Hey, Neil. Let me know what's going on.
Real answer : Neil...?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8400010466575623], 'recall': [0.8199547529220581], 'f1': [0.8298568725585938], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.19999999999999998, 'rouge2': 0.0, 'rougeL': 0.19999999999999998, 'rougeLsum': 0.19999999999999998}
bleu 1/2 : 0 0
ppl : 36.11728286743164
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJAMES: Do you have an ice pick I can jam into my ears? I can’t listen to this song again!\nCONNELL: No, but I have this. Be cool with that. So, were you on a date the other night?\nJAMES: Um. I wouldn’t call it a date.\nCONNELL: Not into Em?\nJAMES: No, she’s great. But I’m not really looking for a girlfriend. I’m getting over a broken heart.\nCONNELL: You didn’t tell her that, did you?\n

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8642997741699219], 'recall': [0.8327834606170654], 'f1': [0.8482489585876465], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.25, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0.03147926713959365 0.010904738014161919
ppl : 63.490352630615234
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCONNELL: You got one of those little baby joints on you?\nJAMES: I do.\n\n', 'answer': 'I could use that. Let’s go for a ride.', 'gold_tag': 'CONNELL has a habit of using marijuana', 'last_speaker': 'CONNELL'}
Last word -> CONNELL : "I could use that. Let’s go for a ride."
prediction :  Do you need any help holding that joint in place?
Real answer : I could use that

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8353679776191711], 'recall': [0.8717184662818909], 'f1': [0.8531562089920044], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10000000000000002, 'rouge2': 0.0, 'rougeL': 0.10000000000000002, 'rougeLsum': 0.10000000000000002}
bleu 1/2 : 0 0
ppl : 86.10474395751953
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJAMES: Your mom okay?\nCONNELL: My mother’s over three hundred pounds, half-blind from diabetes. She’s also got arthritis and emphysema. And now she can’t control her bowels. Personally, I’d rather be dead.\nJAMES: I’m sorry...\nCONNELL: That’s cool...\nJAMES: It’s Neil Young.\nCONNELL: Oh. Right.\nJAMES: Cortez the Killer.\nCONNELL: Yeh-yeh. ADVENTURELAND (revised 8/5/07)\nJAMES: You got

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8713339567184448], 'recall': [0.8503744602203369], 'f1': [0.8607265949249268], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.39999999999999997, 'rouge2': 0.14285714285714288, 'rougeL': 0.26666666666666666, 'rougeLsum': 0.26666666666666666}
bleu 1/2 : 0.18181818181818182 0.042640143271122095
ppl : 9.886153221130371
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCONNELL: I talked to Em. I know you know... Look, I can’t expect you to be happy about any of this. But will you hear me out? You’re a smart guy. You know that it’s possible to love two people at the same time, right? I love my wife. I really do. I don’t want to hurt her. I fucking despise the idea of her getting hurt. And I don’t think

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7963256239891052], 'recall': [0.9299108982086182], 'f1': [0.8579494953155518], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 249.41468811035156
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTARLETON: General, a message from the commander of the rebel militia.\nCORNWALLIS: It seems our Swamp Fox wants to have a formal parley.\nTARLETON: Are you going to meet with him?\n\n', 'answer': 'Most certainly. Arrange it.', 'gold_tag': 'CORNWALLIS has the authority to command TARLETON to arrange the meeting , CORNWALLIS will soon have a meeting with the commander of the rebel militia , TARLETON has to arrange a meeting between CORN

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8595876693725586], 'recall': [0.8881466388702393], 'f1': [0.8736338019371033], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 70.3725814819336
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCORNWALLIS: From General Clinton in New York... "... your request to move north is denied until you have properly dealt with your militia problem." He underlined, \'militia\'. "You have spent over six months dealing with a six-week problem. It is essential that you quell the militia..." Underlined again. "... insurgency, particularly because of the likely move south of Washington and the inevitable arrival of the French. Militia, as you

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7955217957496643], 'recall': [0.8585647344589233], 'f1': [0.8258418440818787], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.021505376344086023, 'rouge2': 0.0, 'rougeL': 0.021505376344086023, 'rougeLsum': 0.021505376344086023}
bleu 1/2 : 0.011363636363636359 0.003614090617663796
ppl : 11.604601860046387
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCORNWALLIS: Do you see that, Colonel?\n\n', 'answer': "Unless I'm dreaming, I think I see irregulars at their center.", 'gold_tag': 'Everyday Language', 'last_speaker': 'TARLETON'}
Last word -> TARLETON : "Unless I'm dreaming, I think I see irregulars at their center."
prediction :  Yes sir.
Real answer : Unless I'm dreaming, I think I see irregul

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8445848822593689], 'recall': [0.8352676630020142], 'f1': [0.8399004936218262], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1380.511474609375
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSYDNEY: Had a sudden intense craving for a cherry popsicle.\nLUKE: You can check in that freezer.\nSYDNEY: What a relief. I\'m Sydney. You?\nLUKE: Luke.\nSYDNEY: Workin\' late, huh? Luke?\nLUKE: Yeah, four AM. It sucks.\nSYDNEY: Sometimes you guys are just unfathomably stupid.\nLUKE: Excuse me?\nSYDNEY: Why in the world do guys say "it sucks" as if it were a bad thing? I mean, every guy\'s dying to get a blow job, right? So, you think 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8210625648498535], 'recall': [0.8140614032745361], 'f1': [0.8175469636917114], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 13366.4169921875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSYDNEY: What are we doing here?\nLUKE: Didn't want anyone around. So, I did what you said.\nSYDNEY: Amazing, huh?\nLUKE: Weird. Awful.\nSYDNEY: Hmm. Was it difficult making the switch?\nLUKE: No. It was hard gettin' back. I need an owner's manual.\nSYDNEY: A lot of guys could use one of those.\nLUKE: So, what now? Do I take pills or something?\nSYDNEY: No, it doesn't work like that. This may sound odd, but I need to know if you're attra

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8244373202323914], 'recall': [0.8689695596694946], 'f1': [0.8461179137229919], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10526315789473684, 'rouge2': 0.0, 'rougeL': 0.10526315789473684, 'rougeLsum': 0.10526315789473684}
bleu 1/2 : 0 0
ppl : 18.890897750854492
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSYDNEY: Oh, honey. Come in.\nLUKE: You're one too. Why didn't you tell me?\nSYDNEY: I didn't want to confuse the issue.\nLUKE: So, it's true what you said? You really know. I wouldn't change unless I wanted to?\nSYDNEY: Maybe some part of you.\nLUKE: What part? How much of me? I mean, for it to work? Half? More than half? What if it was only a tiny bit, -a thought?\nSYDNEY: I don't know.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8495508432388306], 'recall': [0.836034893989563], 'f1': [0.842738687992096], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 52.47615432739258
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSYDNEY: Do you like this dress?\nLUKE: Sure. Looks great on you. Man, dresses! Even without everything else, dresses are reason enough right there to stay a guy.\nSYDNEY: "Stay a guy?" Let me get this straight. You didn\'t decide to be a woman?\nLUKE: No. Of course not. What?\nSYDNEY: Just testing you. Hold on a moment. There\'s something I forgot...\nLUKE: What\'s that for?\nSYDNEY: You\'ve never done it with another Z. Trust me, you ne

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8287498950958252], 'recall': [0.827670693397522], 'f1': [0.8282099366188049], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 67.0344009399414
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICKY: I was posted here after Berlin.\n\n', 'answer': "Where's Daniels?", 'gold_tag': 'Everyday Language', 'last_speaker': 'BOURNE'}
Last word -> BOURNE : "Where's Daniels?"
prediction :  I was posted here after Berlin.
Real answer : Where's Daniels?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8424702882766724], 'recall': [0.8138279318809509], 'f1': [0.8279014825820923], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 588.6737060546875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICKY: Okay. They know you're here.\nBOURNE: How long do I have?\n\n", 'answer': 'Three minutes...', 'gold_tag': 'Everyday Language', 'last_speaker': 'NICKY'}
Last word -> NICKY : "Three minutes..."
prediction :  10 years.
Real answer : Three minutes...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9464040398597717], 'recall': [0.8847657442092896], 'f1': [0.9145475029945374], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 31.827241897583008
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICKY: 8 o'clock this morning, he wired $100,000. to a numbered bank account in Tangier.\nBOURNE: Okay, Tangier is about 300 miles away, we can make the morning ferry.\n\n", 'answer': 'And then what?', 'gold_tag': 'Everyday Language', 'last_speaker': 'NICKY'}
Last word -> NICKY : "And then what?"
prediction :  There's two things I need first.
Real answer : And then what?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8477382659912109], 'recall': [0.8383026719093323], 'f1': [0.8429940938949585], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 51.05104446411133
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICKY: What are you after, Jason? Why are you looking for Daniels?\nBOURNE: Do you know who this is?\nNICKY: That's Daniels. I don't know who that is.\nBOURNE: I have to get back to the beginning...or I'll never be free of this. I think he started Treadstone. I remember meeting him. I remember the first day... Daniels brought me to him.\nNICKY: What happened?\n\n", 'answer': 'An initiation...I guess...', 'gold_tag': 'BOURNE was involve

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8463375568389893], 'recall': [0.8452102541923523], 'f1': [0.845773458480835], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05714285714285715, 'rouge2': 0.0, 'rougeL': 0.05714285714285715, 'rougeLsum': 0.05714285714285715}
bleu 1/2 : 0 0
ppl : 22.1875057220459
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBOURNE: Now they found Daniels, do you know where he is?\nNICKY: They'll get one of the operatives to terminate him.\n\n", 'answer': 'Find out who.', 'gold_tag': 'BOURNE is determined and assertive, demanding immediate results', 'last_speaker': 'BOURNE'}
Last word -> BOURNE : "Find out who."
prediction :  You've just been in a very real situation, and you coped.
Real answer : Find out who.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8438683748245239], 'recall': [0.8627321720123291], 'f1': [0.8531960248947144], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 31.062721252441406
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBOURNE: Tell him you're going to meet him and that you have a new phone for him.\nNICKY: If you stop Desh they'll just get someone else.\n\n", 'answer': "We're not going to stop him. We're going to follow him. He'll take us right to Daniels.", 'gold_tag': 'BOURNE has knowledge or experience in covert operations', 'last_speaker': 'BOURNE'}
Last word -> BOURNE : "We're not going to stop him. We're going to follow him. He'll take us righ

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8811036944389343], 'recall': [0.8651548027992249], 'f1': [0.8730564117431641], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05555555555555556, 'rouge2': 0.0, 'rougeL': 0.05555555555555556, 'rougeLsum': 0.05555555555555556}
bleu 1/2 : 0 0
ppl : 18.66954231262207
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBOURNE: I was starting to remember who they were...in India...with Marie.\nNICKY: It's just going to lead to more killing, Jason. Are you sure you want that?\n\n", 'answer': "I've killed people and I've tried to apologize for what I've done, for what I am. None of it makes it better... They're going to come for you again. You are going to have to run now.", 'gold_tag': "BOURNE is a killer

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8468618392944336], 'recall': [0.8418890237808228], 'f1': [0.8443681001663208], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.03225806451612904, 'rouge2': 0.0, 'rougeL': 0.03225806451612904, 'rougeLsum': 0.03225806451612904}
bleu 1/2 : 0 0
ppl : 42.00212478637695
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNARRATOR: Napoleon was born at Ajaccio in Corsica on August 15th, 1769. He had not been a healthy baby and his mother, Letizia, lavished him with care and devotion. In middle age, he would write about her from St. Helena.\n\n', 'answer': 'My mother has always loved me. She would do anything for me.', 'gold_tag': "NAPOLEON V.O. had a loving relationship with his mother, Letizia, who lavish

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.829791784286499], 'recall': [0.8838039636611938], 'f1': [0.8559466600418091], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12244897959183675, 'rouge2': 0.0425531914893617, 'rougeL': 0.12244897959183675, 'rougeLsum': 0.12244897959183675}
bleu 1/2 : 0.02777777777777778 0.00890870806374748
ppl : 21.188583374023438
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNARRATOR: His moods at this time were complex and varied.\n\n', 'answer': 'Life is a burden for me. Nothing gives me any pleasure; I find only sadness in everything around me. It is very difficult because the ways of those with whom I live, and probably always shall live, are as different from mine as moonlight is from sunlight.', 'gold_t

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.840959906578064], 'recall': [0.8338808417320251], 'f1': [0.8374053835868835], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08823529411764706, 'rouge2': 0.0, 'rougeL': 0.0588235294117647, 'rougeLsum': 0.0588235294117647}
bleu 1/2 : 0.03888603909688373 0.007284022497350465
ppl : 56.227638244628906
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNARRATOR: He made friends with a family called Columbier, and would later write of his first flirtation with their daughter, Caroline.\n\n', 'answer': 'It will scarcely be considered credible, perhaps, but our whole business consisted in eating cherries together.', 'gold_tag': "NAPOLEON V.O. had a flirtation with the Columbier's daughter, Caroline , NAPO

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8217774629592896], 'recall': [0.8088710308074951], 'f1': [0.8152732253074646], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 11753562.0
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNAPOLEON (V.O.): Soldiers, you are half-naked and badly clothed. The authorities find much fault with you and yet can give nothing. Your patience, your courage are admirable but you are not getting any fame. I will lead you into the must fruitful plains in the world -- rich provinces and great cities shall be your possessions, and then you will have wealth, honor and fame in full measure. ANIMATED MAP\n\n', 'answer': 'With the Italian campaig

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8567442893981934], 'recall': [0.8366542458534241], 'f1': [0.8465800881385803], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1379310344827586, 'rouge2': 0.023529411764705885, 'rougeL': 0.11494252873563218, 'rougeLsum': 0.11494252873563218}
bleu 1/2 : 0.05463304521211505 0.022763768838381267
ppl : 33.86528015136719
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNARRATOR: making political decisions and treaties like a Roman Conqueror, enlarging his role to ruler of Italy. Only his tremendous success and ever increasing popularity prevented the Directory from replacing him.\n\n', 'answer': 'From that moment on, I foresaw what I might be. Already I felt the earth flee beneath me, as if I were bei

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8136071562767029], 'recall': [0.8319121599197388], 'f1': [0.8226578235626221], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.15533980582524273, 'rouge2': 0.019801980198019802, 'rougeL': 0.0970873786407767, 'rougeLsum': 0.0970873786407767}
bleu 1/2 : 0.09459459459459461 0.03599744941394482
ppl : 19.65390396118164
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNARRATOR: Led by the warlike Queen Louisa, and her fashion-minded husband, King Frederich Wilhelm, the Prussians still believed themselves cast in the mold of Frederick the Great, and more than a match for Napoleon. The King had a special collection of 60 splendid uniforms, and was personally involved in the design of all the Prussian arm

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8512433171272278], 'recall': [0.8501676917076111], 'f1': [0.8507051467895508], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.3508771929824562, 'rouge2': 0.10909090909090909, 'rougeL': 0.2105263157894737, 'rougeLsum': 0.2105263157894737}
bleu 1/2 : 0.2727272727272727 0.13055824196677338
ppl : 23.505298614501953
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNARRATOR: On January 1st, 1814, France itself was invaded. Now, with a small army of raw recruits, Napoleon would have to face the powerful combination of England, Russia, Prussia and Austria, operating against him together, for the first time. The balance of numbers had tilted irretrievably against him.\n\n', 'answer': 'A year ago, the who

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8605269193649292], 'recall': [0.8557237386703491], 'f1': [0.8581185936927795], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.17391304347826086, 'rouge2': 0.0, 'rougeL': 0.08695652173913043, 'rougeLsum': 0.08695652173913043}
bleu 1/2 : 0.1481636441363436 0.034922505831373
ppl : 31.662643432617188
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLAROCHE: Once you get the sickness, it takes over your life. I started out just photographing 'em. Now look at me. It'll happen to you. You'll see.\nORLEAN: I don't think so. I'm not prone to --\nLAROCHE: Angraecum sesquipedale! Beauty! God! Darwin wrote about this one. Charles Darwin? Evolution guy? Hello?\n\n", 'answer': 'I know who Darwin is.', 'gold_t

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.845931887626648], 'recall': [0.8576224446296692], 'f1': [0.8517370223999023], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 5.077290058135986
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLAROCHE: See that nectary all the way down there? Darwin hypothesized a moth with a nose twelve inches long to pollinate it. Everyone thought he was a loon. Then, sure enough, they found this moth with a twelve inch proboscis -- proboscis means nose, by the way -- and --\nORLEAN: I know what proboscis means.\nLAROCHE: Every one of these flowers has a specific relationship with the insect that pollinates it. There's an orchid that looks 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8121851682662964], 'recall': [0.8415526151657104], 'f1': [0.8266081213951111], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.15789473684210525, 'rouge2': 0.017857142857142856, 'rougeL': 0.07017543859649124, 'rougeLsum': 0.07017543859649124}
bleu 1/2 : 0.05000000000000001 0.007955572841757302
ppl : 14.921436309814453
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLAROCHE: No shit I'm a fun character. Who's gonna play me?\nORLEAN: I've got to write it first. Someone's gotta write the screenplay. Most things never get made. It's premature to --\n\n", 'answer': 'I think I should play me.', 'gold_tag': 'LAROCHE expresses his desire to play himself , LAROCHE has an interest in acting', 'last_speake

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8598203659057617], 'recall': [0.878412663936615], 'f1': [0.8690170645713806], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 29.473934173583984
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nORLEAN: Laroche, can I ask you a personal question?\n\n', 'answer': "We're not lost.", 'gold_tag': 'Everyday Language', 'last_speaker': 'LAROCHE'}
Last word -> LAROCHE : "We're not lost."
prediction :  It depends on what.
Real answer : We're not lost.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8427140712738037], 'recall': [0.8696659803390503], 'f1': [0.8559778928756714], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 281.83831787109375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLAROCHE: The jewel of the Fakahatchee.\nORLEAN: I still don't get it. I mean, there it is. I can see it's pretty, but --\n\n", 'answer': "You'll get it.", 'gold_tag': 'Everyday Language', 'last_speaker': 'LAROCHE'}
Last word -> LAROCHE : "You'll get it."
prediction :  So if you ever get it, I'll have just given you the world, the richest, most beautiful piece of land in the world.
Real answer : You'll get it.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8289191722869873], 'recall': [0.9103258848190308], 'f1': [0.8677173852920532], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2857142857142857, 'rouge2': 0.07692307692307691, 'rougeL': 0.21428571428571427, 'rougeLsum': 0.21428571428571427}
bleu 1/2 : 0.043478260869565216 0.014058038927888336
ppl : 20.291961669921875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nFRANCES: And I can't understand how it can hurt to be honest, but the more I tried to explain --\nLILLIAN: Don't listen to him, little sister. When you're proud of what you are, you don't refuse the label, understand?\nFRANCES: Yes, Ma.\n\n", 'answer': 'And you... should be proud. You won that contest and made a name for yourself.', 'g

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8625537157058716], 'recall': [0.8715869188308716], 'f1': [0.8670467734336853], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0.10000000000000002 0.022941573387056175
ppl : 27.427261352539062
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLILLIAN: ...and here's the one from Duluth. A war widow with five children. She works in a defense plant and she's very worried about you. I answered her that she shouldn't let worry over you affect her vital work; and that you'd be back on the silver screen in no time. And here's one from nice Mr. Zeiss. He says that...\nFRANCES: Why are these all op

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7803052663803101], 'recall': [0.7773505449295044], 'f1': [0.7788251042366028], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 14.509743690490723
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLILLIAN: Frances, play 'Flow Gently Sweet Afton'.\nFRANCES: Oh Mama, I'm so... tired of that song.\nLILLIAN: Please. I want you to. It would make me so happy. It's just a flow gently sweet Afton day. Life has been so good to me. Why, I have just about everything one could wish... but I still have\nFRANCES: I think I need a little air.\nLILLIAN: What's wrong?\nFRANCES: Nothing. I think I'll just go out for awhile.\nLILLIAN: Where are y

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9259530305862427], 'recall': [0.9107421636581421], 'f1': [0.9182845950126648], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.5, 'rouge2': 0.0, 'rougeL': 0.5, 'rougeLsum': 0.5}
bleu 1/2 : 0 0
ppl : 15194.8623046875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLILLIAN: Oh, just leave those things for now.\nFRANCES: No, Mama, I'll take care of it. I'll wash them in the morning.\nLILLIAN: You know, little sister, I never resented you for refusing to see me in the... the hospital. I knew you had to manage on your own before you could come back.\nFRANCES: Thank you for understanding, Mama.\nLILLIAN: Little sister, I don't want you to feel any rush to get back to work. I want you to rest... for a 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8121739029884338], 'recall': [0.8669917583465576], 'f1': [0.8386880159378052], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 22.679250717163086
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKATIE: You're home.\nOSBOURNE: Hang on to your hat, honey. I have some news. I----\nKATIE: Did you pick up the cheeses?\nOSBOURNE: Huh?\nKATIE: Were they ready? I didn't know you were coming home this early.\nOSBOURNE: The cheeses.\nKATIE: I left a message for you to stop at Todaro's. The Magruders and the Pfarrers are coming over.\nOSBOURNE: The Pfarrers? Ugh. I----what did Kathleen say?\nKATIE: What?\nOSBOURNE: When you left the mes

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8242659568786621], 'recall': [0.8564233779907227], 'f1': [0.8400370478630066], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.034482758620689655, 'rouge2': 0.0, 'rougeL': 0.034482758620689655, 'rougeLsum': 0.034482758620689655}
bleu 1/2 : 0 0
ppl : 16.931522369384766
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKATIE: You quit?!\nOSBOURNE: Uh-huh.\nKATIE: Well----Thank you for letting me know!\nOSBOURNE: I tried to tell you this afternoon.\nKATIE: You tried? You tried? And then---- what, the aphasia kicked in?\nOSBOURNE: Our guests came. We----\nKATIE: Why?! For fuck's sake, Ozzie!\nOSBOURNE: I'm just----I don't know. I got so tired.\nKATIE: You're tired.\nOSBOURNE: Tired of swimming against

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8553128242492676], 'recall': [0.8532665967941284], 'f1': [0.8542885184288025], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07692307692307691, 'rouge2': 0.0, 'rougeL': 0.07692307692307691, 'rougeLsum': 0.07692307692307691}
bleu 1/2 : 0 0
ppl : 52.397682189941406
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nOSBOURNE: There's some clown----a couple of clowns ----somehow got a hold of my memoir----\nKATIE: Your what?\nOSBOURNE: Stole it or----I have no idea how they got it----\nKATIE: Your what?\nOSBOURNE: My memoir, the book I'm writing.\nKATIE: Why in God's name would they think that's worth anything.\n\n", 'answer': "Well they----I... I've no idea how they got it.", 'gold_tag': 'OSBOURNE i

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8416325449943542], 'recall': [0.8301127552986145], 'f1': [0.835832953453064], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1, 'rouge2': 0.0, 'rougeL': 0.1, 'rougeLsum': 0.1}
bleu 1/2 : 0 0
ppl : 8.536019325256348
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCATHY: Excuse me. Can I help you? Who are you?\nRAYMOND: Ma'am, I'm sorry. My name's Raymond Deagan. I'm Otis Deagan's son. I've just been - taking over some of his jobs since he -\nCATHY: You're Otis's son?\nRAYMOND: Yes.\nCATHY: Well I'm - terribly sorry for speaking to you in that manner. I didn't know who was in my yard.\nRAYMOND: No need.\nCATHY: How is your father? I knew he was in the hospital.\nRAYMOND: Yes, I - My father passed

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8508301973342896], 'recall': [0.8946834802627563], 'f1': [0.8722059726715088], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2, 'rouge2': 0.0, 'rougeL': 0.2, 'rougeLsum': 0.2}
bleu 1/2 : 0 0
ppl : 65.53659057617188
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCATHY: Raymond!\nRAYMOND: Mrs. Whitaker...\nCATHY: Hi.\nRAYMOND: Hello.\nCATHY: Well, wouldn't you know it. I just received a call and suddenly everything's changed. Anyway. I just...\n\n", 'answer': 'You changed your mind? Well good!', 'gold_tag': 'RAYMOND is understanding and receptive to changes', 'last_speaker': 'RAYMOND'}
Last word -> RAYMOND : "You changed your mind? Well good!"
prediction :  I am getting married tomorrow!
Real a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8478056788444519], 'recall': [0.8426198959350586], 'f1': [0.8452048301696777], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 82.84739685058594
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCATHY: It's lovely. What is it?\nRAYMOND: It's called a Quaking Aspen. Fairly rare in these parts.\nCATHY: It's beautiful. And you were right. What a perfectly lovely spot.\nRAYMOND: Sometimes a little green, some fresh air, just helps put things back on the shelf. ‘Cause it sure can be a disheveling world out there, every now and again.\nCATHY: I'll say it can. Is that a path?\nRAYMOND: Looks like it.\nCATHY: Oh let's have a peek.\n\n



prediction : 
Real answer : Alright.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.0], 'recall': [0.0], 'f1': [0.0], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : nan
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: Says here they got slips in Honolulu. 325 a month. Utilities included. That's not bad. But I gotta get at least a forty footer. It'll handle rough water better and I'll need the room if I'm gonna live on it.\nRODRIGUEZ: I don't know how you do it.\nDIVINCI: What?\nRODRIGUEZ: How you can think about Hawaii now?\nDIVINCI: My heart's in Hawaii.\nRODRIGUEZ: You never been there. How can your heart be there.\nDIVINCI: You're tellin' me there's no place you'd rather be other than here?\nROD

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8366167545318604], 'recall': [0.8174119591712952], 'f1': [0.8269028663635254], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352941, 'rouge2': 0.0, 'rougeL': 0.11764705882352941, 'rougeLsum': 0.11764705882352941}
bleu 1/2 : 0.07142857142857141 0.023440361546924773
ppl : 42.15943145751953
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRODRIGUEZ: SHIT, FUCK, SHIT, JESUS CHRIST!\nDIVINCI: Calm down!\nRODRIGUEZ: WE'RE DEAD, WE'RE FUCKING DEAD!\nDIVINCI: SHUT UP!\nRODRIGUEZ: FUCK, FUCK, FUCK!\nDIVINCI: STOP IT! YOU HEAR ME, RIGHT NOW, STOP IT!\nRODRIGUEZ: GOD, GOD, THIS IS REALLY FUCKIN' OUT OF CONTROL! LET GO OF ME!\n\n", 'answer': "Shut up and listen to me! They could be watchin' us r

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8578853011131287], 'recall': [0.812690258026123], 'f1': [0.8346764445304871], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14634146341463414, 'rouge2': 0.0, 'rougeL': 0.14634146341463414, 'rougeLsum': 0.14634146341463414}
bleu 1/2 : 0.0013266573081177418 0.0004595675723784975
ppl : 24.727794647216797
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: There's no fuckin' reason to panic.\nRODRIGUEZ: No fuckin' reason to panic? Did you say no fuckin' reason to panic!\nDIVINCI: Stop being an idiot! We're the cops on the case. It's our case. We are going to find the fuckin' killers.\nRODRIGUEZ: WE'RE THE FUCKIN' KILLERS!\n\n", 'answer': 'Since when does that matter?!', 'gold_tag': 'DIVINCI d

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8170747756958008], 'recall': [0.8251473903656006], 'f1': [0.8210912346839905], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.049999999999999996, 'rouge2': 0.0, 'rougeL': 0.049999999999999996, 'rougeLsum': 0.049999999999999996}
bleu 1/2 : 0.03125 0.010040241611281232
ppl : 22.702810287475586
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: All they want is someone to go down for the crime, right? Do we care who goes down for the crime? Fuck no. As long as someone goes down for the crime. It's a slot that's gotta be filled.\nRODRIGUEZ: We killed a cop, doesn't that bother you?\nDIVINCI: Of course it bothers me. What d'you want me to do, turn myself in? Well I'm not. That's the risk we ta

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.85235595703125], 'recall': [0.8540566563606262], 'f1': [0.8532055020332336], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13333333333333333, 'rouge2': 0.0, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0.07692307692307693 0.025318484177091663
ppl : 57.7885856628418
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: He didn't have backup. You believe that?! Jesus what a dick.\nRODRIGUEZ: I don't think we should criticize that particular decision of his.\nDIVINCI: I don't care what the reasons are, you don't play Lone Fuckin' Ranger on a drug buy. You're just askin' for it. That's all I'm sayin'.\nRODRIGUEZ: I don't trust those bastards. They're not waitin' f

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8127821683883667], 'recall': [0.8869646787643433], 'f1': [0.8482546210289001], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 53.87767791748047
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: Is that any way to answer the phone?\nRODRIGUEZ: Divinci, shit! It's quarter to six.\nDIVINCI: I know what time it is, I'm not the one in bed. I need the photos of the scene.\nRODRIGUEZ: Now?\nDIVINCI: I got a suspect. But he can't remember all the details. He needs a little help. And I thought, since you're my partner, you might jump at the opportunity to lend me a fuckin' hand.\nRODRIGUEZ: He doesn't remember last Friday?\n\

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8715328574180603], 'recall': [0.8382003307342529], 'f1': [0.8545415997505188], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 77.7574462890625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRODRIGUEZ: The Dunner gun, you used the fucking Dunner gun?!\nDIVINCI: This is not my fuck up! Our shooting was supposed to be gang-related and unsolved! Only because we tagged a goddamn undercover cop do we need the fucking .44 for evidence! Otherwise, we never woulda needed it. So don't point your finger at me!\nRODRIGUEZ: Alright, alright... But how are we supposed to explain how the same gun was used in two murder cases by two diffe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8407306671142578], 'recall': [0.8390809297561646], 'f1': [0.8399049639701843], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.21621621621621623, 'rouge2': 0.0, 'rougeL': 0.10810810810810811, 'rougeLsum': 0.10810810810810811}
bleu 1/2 : 0.029858242066679905 0.007137485028505504
ppl : 140.4276123046875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRODRIGUEZ: We gotta put the right gun back.\nDIVINCI: We can't put the right gun back! Without that gun, they won't convict Joe.\nRODRIGUEZ: Without that gun they won't convict Clyde!\nDIVINCI: I'm not worried about Clyde, I'm worried about us!\nRODRIGUEZ: Look, Joe's never goin' to trial, right. He signed a goddamn confession! Like you said, this one

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8312664031982422], 'recall': [0.8633898496627808], 'f1': [0.8470236659049988], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.17391304347826086, 'rouge2': 0.0909090909090909, 'rougeL': 0.17391304347826086, 'rougeLsum': 0.17391304347826086}
bleu 1/2 : 0.057142857142857134 0.04099600308453938
ppl : 15.180192947387695
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: Nobody can connect us to the evidence thing... we're clean there. The most important one is this DEA mess anyway. Once that's finished, we're completely in the clear.\nRODRIGUEZ: You know we can't do it anymore.\nDIVINCI: ...yeah, I guess.\nRODRIGUEZ: No, we're through. That's it.\nDIVINCI: Good while it lasted though. A hundre

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8062070608139038], 'recall': [0.8697373867034912], 'f1': [0.8367681503295898], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 13.958742141723633
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: -- when was the last time you saw him? -- Uh-huh -- And you haven't seen him since then?\nRODRIGUEZ: We gotta talk.\nDIVINCI: Just a second. Calm down. I heard. Clyde walked.\nRODRIGUEZ: No! They're takin' Joe to trial. They found him mentally unable to plead so his attorney pleaded not\nDIVINCI: You gotta be kiddin' me.\n\n", 'answer': "Yeah, that's me, I love to make people laugh. You hear the one about the two cops who pop

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8307573795318604], 'recall': [0.8480319976806641], 'f1': [0.8393058180809021], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1111111111111111, 'rouge2': 0.0, 'rougeL': 0.05555555555555555, 'rougeLsum': 0.05555555555555555}
bleu 1/2 : 0.030119421191220207 0.010039807063740072
ppl : 37.83916473388672
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRODRIGUEZ: -- but William McCall shunned the business world and turned his life to medicine, receiving his medical degree from Harvard in 1969. Once again he turned his back on the established norm and spent several years in Africa performing organ transplants for the poor. Jesus Christ, this is who you picked? A goddamn surgeon who performs organ tran

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8366422653198242], 'recall': [0.8568160533905029], 'f1': [0.8466090559959412], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11428571428571428, 'rouge2': 0.0, 'rougeL': 0.11428571428571428, 'rougeLsum': 0.11428571428571428}
bleu 1/2 : 0.05263157894736841 0.017099639201419235
ppl : 51.90889358520508
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: -- look, she's our witness, if she's in custody, somebody better tell me! Anything?\nRODRIGUEZ: Nothin'. Nobody knows nothin'. The D.A. thinks she's gone. D.E.A. doesn't know shit.\nDIVINCI: If she was bein' held by police, we'd know.\nRODRIGUEZ: Unless she said somethin'.\n\n", 'answer': "If she said something we wouldn't be sittin' here. I t

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8520994782447815], 'recall': [0.8234208822250366], 'f1': [0.8375146985054016], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.21621621621621623, 'rouge2': 0.0, 'rougeL': 0.10810810810810811, 'rougeLsum': 0.10810810810810811}
bleu 1/2 : 0.05871331642584224 0.01917568884665115
ppl : 12.603189468383789
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: To justice in all its forms.\nRODRIGUEZ: We're not home free yet.\nDIVINCI: Without Ms Webb, there's no place they can go. We still got the confession, the gun, the ballistics... And I don't give a shit about all this African doctor crap... the man is a drunk and he's been a drunk for seven goddamn years. So lets have a drink for drunks. That'

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8886706829071045], 'recall': [0.8735900521278381], 'f1': [0.881065845489502], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.3, 'rouge2': 0.0, 'rougeL': 0.3, 'rougeLsum': 0.3}
bleu 1/2 : 0.19470019576785122 0.04654224884494861
ppl : 14.718111991882324
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRODRIGUEZ: She didn't talk.\nDIVINCI: Don't hold your breath.\nRODRIGUEZ: You think she'll talk.\nDIVINCI: I know she'll talk.\nRODRIGUEZ: We'll make it look gang related.\nDIVINCI: Problem is we're gonna need another gun.\nRODRIGUEZ: The problem?! Have you forgotten? She's in fucking jail. That's the goddamn problem.\n\n", 'answer': "Since when is being in jail a guaranty of a long life? You just ge

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8349008560180664], 'recall': [0.8303759098052979], 'f1': [0.8326321840286255], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09836065573770493, 'rouge2': 0.0, 'rougeL': 0.09836065573770493, 'rougeLsum': 0.09836065573770493}
bleu 1/2 : 0.06818181818181818 0.012592155012732868
ppl : 16.498876571655273
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: You get the gun?\nRODRIGUEZ: I got it.\nDIVINCI: Everything okay?\nRODRIGUEZ: Are you kidding? We're going to kill a goddamn witness who's in fucking jail because we killed an undercover DEA agent. I'm sorry, but this kinda shit troubles me a little.\nDIVINCI: Take off the dress and get back in the game. We gotta do what we gotta do.\n\n", 'a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8627399802207947], 'recall': [0.8492661714553833], 'f1': [0.8559500575065613], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352941, 'rouge2': 0.0, 'rougeL': 0.11764705882352941, 'rougeLsum': 0.11764705882352941}
bleu 1/2 : 0 0
ppl : 52.61912536621094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRODRIGUEZ: This is the last time, Frank. The last time we kill somebody.\nDIVINCI: Yeah, it's the last time.\nRODRIGUEZ: This'll be eleven, Frank. Eleven is enough.\nDIVINCI: I get the picture.\nRODRIGUEZ: It was all okay until you shot Hudd.\nDIVINCI: What's goin' on here? You mad at me or something?\nRODRIGUEZ: It wasn't my idea to start killing people, Frank. That's all I'm sayin'. We'

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8426551222801208], 'recall': [0.8592042922973633], 'f1': [0.8508492112159729], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08695652173913043, 'rouge2': 0.0, 'rougeL': 0.08695652173913043, 'rougeLsum': 0.08695652173913043}
bleu 1/2 : 0.07142857142857141 0.023440361546924773
ppl : 109.34839630126953
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRODRIGUEZ: What're you doin'?\n\n", 'answer': 'I need some fresh air.', 'gold_tag': 'Everyday Language', 'last_speaker': 'DIVINCI'}
Last word -> DIVINCI : "I need some fresh air."
prediction :  I'm going.
Real answer : I need some fresh air.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9202381372451782], 'recall': [0.8853394389152527], 'f1': [0.9024515748023987], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.25, 'rouge2': 0.0, 'rougeL': 0.25, 'rougeLsum': 0.25}
bleu 1/2 : 0 0
ppl : 101.39031219482422
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRODRIGUEZ: Frank --\nDIVINCI: Talk to me, okay? Just fuckin' talk to me.\nRODRIGUEZ: I don't know... I owe some money. I don't know how I'm gonna pay it...\nDIVINCI: You gambling again?\nRODRIGUEZ: Yeah, yeah... I just couldn't... Frank... let's forget about this. Let's just get the fuck outa here. We got enough money, let's go. We'll get outa the country or something. What the hell're you doin'?!\nDIVINCI: Take off your shirt.\nRO

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8486621379852295], 'recall': [0.8532583713531494], 'f1': [0.8509540557861328], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1333333333333333, 'rouge2': 0.0, 'rougeL': 0.08888888888888889, 'rougeLsum': 0.08888888888888889}
bleu 1/2 : 0 0
ppl : 12.783040046691895
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIVINCI: I'll just feel a whole lot better when she's dead. Then we're off the hook.\nRODRIGUEZ: Right.\nDIVINCI: YOU FUCKIN' RAT!\nRODRIGUEZ: Yeah... yeah... I am, I'm a fuckin' rat... I traded you, Frank, I fuckin' traded you... Simms knows everything. Just fuckin' shoot me, get it the fuck over with! I DESERVE IT! I FUCKING DESERVE TO DIE!\nDIVINCI: You're fuckin' right. ...Get out.\nR

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8800837993621826], 'recall': [0.8154451251029968], 'f1': [0.8465323448181152], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 58.8576774597168
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEAN: It's okay...you're going to be okay.\nRENNIE: I want to go home. I want off this ship.\n\n", 'answer': "Me too. Let's go talk to my Dad.", 'gold_tag': 'SEAN has a familial relationship with the person in authority onboard, his Dad', 'last_speaker': 'SEAN'}
Last word -> SEAN : "Me too. Let's go talk to my Dad."
prediction :  Well, the first thing I have to warn you is they'll likely lock you up first. But no, really. It's better th

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8427921533584595], 'recall': [0.8669753670692444], 'f1': [0.854712724685669], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0784313725490196, 'rouge2': 0.0, 'rougeL': 0.0392156862745098, 'rougeLsum': 0.0392156862745098}
bleu 1/2 : 0.025641025641025644 0.00821440096816907
ppl : 16.249677658081055
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRENNIE: Can he really take us home?\nSEAN: Not completing a voyage is against everything he stands for. But I think I can convince him to call a Coast Guard cutter for you.\nRENNIE: What about you?\n\n', 'answer': "If I go with you, he'll never speak to me again. But I'm never going to live up to his expectations anyway...so maybe it's the right thing to 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8373476266860962], 'recall': [0.842896044254303], 'f1': [0.8401126861572266], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.043478260869565216, 'rougeL': 0.08333333333333333, 'rougeLsum': 0.08333333333333333}
bleu 1/2 : 0.020210089552970725 0.006675172886333876
ppl : 19.83507537841797
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEAN: Rennie...??\nRENNIE: The window... I don't understand what is happening to me...\n\n", 'answer': "It's gonna be okay...", 'gold_tag': 'SEAN is a supportive presence , SEAN is a calming presence', 'last_speaker': 'SEAN'}
Last word -> SEAN : "It's gonna be okay..."
prediction :  You're not alone...
Real answer : It's gonna be okay.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8710317015647888], 'recall': [0.8460773229598999], 'f1': [0.85837322473526], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 56.976959228515625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRENNIE: What is it??\n\n', 'answer': 'The power room...', 'gold_tag': 'Everyday Language', 'last_speaker': 'SEAN'}
Last word -> SEAN : "The power room..."
prediction :  Are you worried for her?
Real answer : The power room...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8267489671707153], 'recall': [0.8287835121154785], 'f1': [0.8277649879455566], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 288.8167724609375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEAN: Rule one, don't panic. Rule two, assess the damage and act accordingly...\nRENNIE: Is the ship going to sink??\nSEAN: I don't know. The emergency lights just kicked in... We'll be okay. I want you to wait by the lifeboats, just in case.\nRENNIE: I'm not going near any lifeboat!\nSEAN: But Rennie...\n\n", 'answer': "I'm not!!", 'gold_tag': 'RENNIE displays fear towards lifeboats , RENNIE shows strong reluctance towards lifeboats',

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8638485670089722], 'recall': [0.8506441712379456], 'f1': [0.8571954965591431], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2222222222222222, 'rouge2': 0.0, 'rougeL': 0.2222222222222222, 'rougeLsum': 0.2222222222222222}
bleu 1/2 : 0 0
ppl : 80.81505584716797
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEAN: Rennie...what'd they do to you??\nRENNIE: Drugs... Then Jason came. He's here, Sean.\n\n", 'answer': 'We have to find the others.', 'gold_tag': 'SEAN takes on a leadership role to find the others , SEAN has a present task to find the others', 'last_speaker': 'SEAN'}
Last word -> SEAN : "We have to find the others."
prediction :  (laughs) What!
Real answer : We have to find the others.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8141390085220337], 'recall': [0.849062442779541], 'f1': [0.8312340974807739], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 301.22821044921875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRENNIE: I was at school when they told me. 'Rennie, we have some very bad news... your parents have been killed in an auto accident.' It seems like everybody I care about ends up...\nSEAN: Not this time.\n\n", 'answer': "Toby... What's wrong, boy?", 'gold_tag': 'RENNIE has a pet named Toby', 'last_speaker': 'RENNIE'}
Last word -> RENNIE : "Toby... What's wrong, boy?"
prediction :  You do this to people.
Real answer : Toby... What's wro

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8510965704917908], 'recall': [0.8316783905029297], 'f1': [0.8412754535675049], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 204.11422729492188
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEAN: Run, Rennie...\n\n', 'answer': "You never got me in the lake, Jason. And you're not going to get me now either.", 'gold_tag': 'RENNIE is a brave individual , RENNIE is not easily intimidated , RENNIE showed a defiant stance against Jason', 'last_speaker': 'RENNIE'}
Last word -> RENNIE : "You never got me in the lake, Jason. And you're not going to get me now either."
prediction :  I can't...
Real answer : You never got me in the

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8331310153007507], 'recall': [0.8181650042533875], 'f1': [0.8255802392959595], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 90.78096008300781
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSAN DE: Your training begins tomorrow. For now, your shr fu will\n\n', 'answer': 'Great, just great.', 'gold_tag': 'Everyday Language', 'last_speaker': 'DREW'}
Last word -> DREW : "Great, just great."
prediction :  Yeah, sure.
Real answer : Great, just great.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9172672629356384], 'recall': [0.8881875276565552], 'f1': [0.9024932980537415], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 310.96142578125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDREW: Yeah, what's up with him?\n\n", 'answer': 'Come with me...', 'gold_tag': 'Everyday Language', 'last_speaker': 'SAN DE'}
Last word -> SAN DE : "Come with me..."
prediction :  Oh, he's fine. You know, he's actually happy today.
Real answer : Come with me...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8240950703620911], 'recall': [0.7974911332130432], 'f1': [0.8105748295783997], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 20.414377212524414
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDREW: I wouldn't have made it through without you. You once asked why I came here, why did you let me in?\nSAN DE: Do you remember the story you heard about the monk who\nDREW: Yes.\nSAN DE: That monk was me.\nDREW: Anything.\nSAN DE: Before you leave for American, can you help at an international tournament in Beijing? The Shaolin temple has an entry, and he will be fighting men from all over the world I'd like you to be there.\nDREW

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.853756308555603], 'recall': [0.8542020916938782], 'f1': [0.8539791107177734], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352941, 'rouge2': 0.0, 'rougeL': 0.11764705882352941, 'rougeLsum': 0.11764705882352941}
bleu 1/2 : 0.12500000000000003 0.04225771273642583
ppl : 71.82957458496094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHENRY: You heard our broadcast.\nJIM: Yes.\nHENRY: We must be a disappointment. You were hoping for a full brigade. An army base, with helicopters and field hospitals.\nJIM: We were hoping for...\nHENRY: The answer to infection. Well, as I said, it's here. Though it may not be quite what you imagined.\nJIM: I didn't know what to imagine, so... We just fe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8672436475753784], 'recall': [0.8559510707855225], 'f1': [0.8615604043006897], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.045454545454545456, 'rouge2': 0.0, 'rougeL': 0.045454545454545456, 'rougeLsum': 0.045454545454545456}
bleu 1/2 : 0.012245642825298194 0.004081880941766065
ppl : 24.91856575012207
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHENRY: And finally... ...meet Mailer. Mailer, Jim. Jim, Mailer. Got infected three days ago. Mitchell managed to knock him out cold. Got a chain around his neck.\nJIM: You're keeping him alive?\n\n", 'answer': "The idea was to learn something about infection. Have him teach me. In a way. He's teaching me he'll never bake bread. Plant crops, raise l

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8594930768013], 'recall': [0.8330743312835693], 'f1': [0.8460775017738342], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10256410256410256, 'rouge2': 0.0, 'rougeL': 0.10256410256410256, 'rougeLsum': 0.10256410256410256}
bleu 1/2 : 0.011626811151332938 0.0026850970192156052
ppl : 14.925926208496094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHENRY: Drink?\nJIM: Look. We're grateful. Very grateful for your protection, and just to have found other people. But if we're going to stay here with you, you're going to have to stop your men from -\nHENRY: Who have you killed?\nJIM: I haven't killed anyone.\nHENRY: Since it began, who have you killed? You wouldn't be alive now if you hadn't killed s

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8446821570396423], 'recall': [0.8206977248191833], 'f1': [0.8325172066688538], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.047619047619047616, 'rouge2': 0.0, 'rougeL': 0.047619047619047616, 'rougeLsum': 0.047619047619047616}
bleu 1/2 : 8.11308885405331e-05 2.721212727773861e-05
ppl : 40.78818893432617
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHANG: I am from a nearby Lamasery. My name is Chang.\nCONWAY: And mine's Conway. You've no idea, sir, how unexpected and very welcome you are. My friends and I - and the lady in the plane - left Baskul night before last for Shanghai, but we suddenly found ourselves travelling in the opposite direction� At the mercy of a mad pilot. We'd be eternal

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8618899583816528], 'recall': [0.834944486618042], 'f1': [0.8482032418251038], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06451612903225806, 'rouge2': 0.0, 'rougeL': 0.06451612903225806, 'rougeLsum': 0.06451612903225806}
bleu 1/2 : 0 0
ppl : 273.0649719238281
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHANG: it. There are over two thousand people in the Valley besides those here in Shangri-La.\nCONWAY: Who and what is Shangri-La? You?\nCHANG: Goodness, no! Oh, yes.\nCONWAY: Who, for instance?\nCHANG: In time you will meet them all. There are some things, my dear Conway, I deeply regret I may not discuss.\nCONWAY: You know, that's the fourth time you've said that today. You should have a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8209395408630371], 'recall': [0.8566822409629822], 'f1': [0.8384301066398621], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0784313725490196, 'rouge2': 0.0, 'rougeL': 0.0784313725490196, 'rougeLsum': 0.0784313725490196}
bleu 1/2 : 0.057142857142857134 0.012964074471043278
ppl : 19.04204750061035
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHANG: At some time in the future you will have the pleasure of meeting her.\n\n', 'answer': 'Some man had better get ready to be very courteous to me.', 'gold_tag': 'CONWAY implies a future situation where he expects to be treated with courtesy', 'last_speaker': 'CONWAY'}
Last word -> CONWAY : "Some man had better get ready to be very courteous to me."


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8462250232696533], 'recall': [0.8436731100082397], 'f1': [0.844947099685669], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 22.22437286376953
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCONWAY: But Mr. Chang, all these things - books, instruments, sculpture - do you mean to say they were all brought in over those mountains by porters?\nCHANG: They were.\nCONWAY: Well, it must have taken� Centuries! Where did you get the money to pay for all those treasures?\n\n', 'answer': 'Of course we have no money as you know it. We do not buy or sell or seek personal fortunes because, well, because there is no uncertain future here

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8393998146057129], 'recall': [0.8055893182754517], 'f1': [0.8221471309661865], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 109.3291015625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCONWAY: That would suit me perfectly. I\'m always broke. How did you pay for them?\nCHANG: Our Valley is very rich in a metal called gold, which fortunately for us is valued very highly in the outside world. So we merely . . .\nCONWAY: �buy and sell?\nCHANG: Buy and - sell? No, no, pardon me, exchange\nCONWAY: I see. Gold for ideas. You know Mr. Chang, there\'s something so simple and naive about all of this that I suspect there has been 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8568048477172852], 'recall': [0.829940915107727], 'f1': [0.8431589603424072], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.136986301369863, 'rouge2': 0.0, 'rougeL': 0.10958904109589042, 'rougeLsum': 0.10958904109589042}
bleu 1/2 : 0.050474129498663846 0.007323556439251554
ppl : 35.675376892089844
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSLOANE: Cool.\n\n', 'answer': 'I think I see my Dad.', 'gold_tag': 'Everyday Language', 'last_speaker': 'CAMERON'}
Last word -> CAMERON : "I think I see my Dad."
prediction :  Let me know what you think -- what it's like to be inside an actual volcano.
Real answer : I think I see my Dad.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8456733226776123], 'recall': [0.8777696490287781], 'f1': [0.8614225387573242], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09090909090909091, 'rouge2': 0.0, 'rougeL': 0.09090909090909091, 'rougeLsum': 0.09090909090909091}
bleu 1/2 : 0.0625 0.02041241452319315
ppl : 31.2995662689209
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCAMERON: What were you in a previous life?\nSLOANE: I'm not sure but I think I know who Ferris was.\nCAMERON: Hannibal.\nSLOANE: From the A-Team?\nCAMERON: No. The guy who rode the elephants into Switzerland.\nSLOANE: I think if he was anybody, he was Magellan. You know, the guy who went around the world. I could see him ignoring popular belief and taking off on some

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8237742781639099], 'recall': [0.8266867399215698], 'f1': [0.8252279758453369], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 102.65941619873047
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSLOANE: The future's worse for a boy, isn't it? A girl can always bail out and have a baby and get some guy to support her.\nCAMERON: That's a pretty grim thought.\nSLOANE: True, but it's an option. No options is worse.\nCAMERON: I don't know what I'm gonna do.\nSLOANE: College.\nCAMERON: Yeah, but to do what?\nSLOANE: What are you interested in?\nCAMERON: Nothing.\nSLOANE: Me either.\n\n", 'answer': 'What do you think Ferris is gonna

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8502337336540222], 'recall': [0.8179473280906677], 'f1': [0.8337781429290771], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 21.94851303100586
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBAINES: Those 80 acres, that cross the stream, what do you think of them?\nSTEWART: On your property?\nBAINES: Yes\nSTEWART: Good, flatish land with reliable water, why? I don't have money. What are you about?\nBAINES: I'd like to make a swap.\nSTEWART: What for?\nBAINES: The piano.\nSTEWART: The piano on the beach? Ada's piano? It's not marshy is it?\nBAINES: No.\nSTEWART: You'd have to organise it up here.\nBAINES: Yes, I thought tha

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8405965566635132], 'recall': [0.8358315825462341], 'f1': [0.8382073044776917], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05714285714285714, 'rouge2': 0.0, 'rougeL': 0.05714285714285714, 'rougeLsum': 0.05714285714285714}
bleu 1/2 : 0 0
ppl : 44.83278274536133
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSTEWART: What do they want it for? They don't cultivate it, burn it back, anything. How do they even know it's theirs? as well mark it out.\nBAINES: Yes, why not.\nSTEWART: Ada says you're doing well with the piano? I'll have to come and hear you play. what do you play.\nBAINES: Nothing just yet\n\n", 'answer': 'No. Well I suppose it takes time.', 'gold_tag': 'Everyday Language', 'last_sp

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8582522869110107], 'recall': [0.8796443939208984], 'f1': [0.8688167333602905], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 24.15262222290039
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSTEWART: Put that away, on the floor. I look at you, at your face. I have had that face in my head hating it. But now I am here seeing it ... it\'s nothing, you blink, you have your mark, you look at me through your eyes, yes. you are even scared of me Look at you! Has Ada ever spoken to you?\nBAINES: You mean in signs?\nSTEWART: No, words. You have never heard words?\nBAINES: No, not words.\nSTEWART: Never thought you heard words? She

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8658664226531982], 'recall': [0.846726655960083], 'f1': [0.8561895489692688], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2631578947368421, 'rouge2': 0.03571428571428571, 'rougeL': 0.17543859649122806, 'rougeLsum': 0.17543859649122806}
bleu 1/2 : 0.07670099255102274 0.023496262401094176
ppl : 56.249122619628906
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICK FURY: We should tell them to go back to sleep?\nAGENT MARIA HILL: If we can't control the Tesseract's energy, there may not be a minimum safe\nNICK FURY: I need you to make sure that PHASE 2\nAGENT MARIA HILL: Sir, is that really a priority right now?\nNICK FURY: Until such time as the world ends, we will act as though it intends to

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8313567638397217], 'recall': [0.8303437232971191], 'f1': [0.8308498859405518], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 75.52047729492188
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nAGENT MARIA HILL: We're at lock, sir.\n\n", 'answer': "Good. Let's vanish.", 'gold_tag': 'Everyday Language', 'last_speaker': 'NICK FURY'}
Last word -> NICK FURY : "Good. Let's vanish."
prediction :  All that's left to do here is check off your log.
Real answer : Good. Let's vanish.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8251474499702454], 'recall': [0.8921920657157898], 'f1': [0.8573610782623291], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.125, 'rouge2': 0.0, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0 0
ppl : 44.64603805541992
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nAGENT MARIA HILL: Sir.\nNICK FURY: Agent Hill?\nAGENT MARIA HILL: Those cards, they were in Coulson's locker, not in his jacket.\nNICK FURY: They needed the push. They found it. Get our communications back up, whatever you have to do. I want eyes on everything.\n\n", 'answer': 'Yes, sir.', 'gold_tag': 'AGENT MARIA HILL is a subordinate to NICK FURY', 'last_speaker': 'AGENT MARIA HILL'}
Last word -> AGENT MARIA HILL : "Yes, sir."


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8743525743484497], 'recall': [0.8408003449440002], 'f1': [0.8572482466697693], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 372.595947265625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICK FURY: They'll come back.\nAGENT MARIA HILL: You really sure about that?\nNICK FURY: I am.\nAGENT MARIA HILL: Why?\n\n", 'answer': "Because we'll need them to.", 'gold_tag': 'NICK FURY indicates a future need for the implied group, suggesting a potential crisis or threat', 'last_speaker': 'NICK FURY'}
Last word -> NICK FURY : "Because we'll need them to."
prediction :  ... Because of what's on the other side of the gate. It's the la

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8523492217063904], 'recall': [0.863284707069397], 'f1': [0.8577821254730225], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14814814814814814, 'rouge2': 0.0, 'rougeL': 0.14814814814814814, 'rougeLsum': 0.14814814814814814}
bleu 1/2 : 0.05000000000000001 0.01622214211307625
ppl : 16.509307861328125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMANRAY: I'm starvin' like Marvin.\nCHEEBA: My world famous, famous world Arroz con pollo will be ready very soon.\nMANRAY: Hurry up, I wanna watch HBO.\nCHEEBA: Did we get our bill yet?\nMANRAY: I guess that will come with the rent, gas, and Con Ed bills, too.\nCHEEBA: Ahh, the luxuries of life.\nMANRAY: Yo, check it. This is good and all that but one d

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9035525918006897], 'recall': [0.9020484089851379], 'f1': [0.9027998447418213], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1818181818181818, 'rouge2': 0.0, 'rougeL': 0.1818181818181818, 'rougeLsum': 0.1818181818181818}
bleu 1/2 : 0 0
ppl : 124.74498748779297
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHEEBA: Not this way. Out through the fire escape.\n\n', 'answer': 'My tap shoes.', 'gold_tag': 'MANRAY is a tap dancer , MANRAY is concerned about his tap shoes', 'last_speaker': 'MANRAY'}
Last word -> MANRAY : "My tap shoes."
prediction :  I’ll look out!
Real answer : My tap shoes.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8156156539916992], 'recall': [0.8802036643028259], 'f1': [0.8466796875], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 216.96746826171875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMANRAY: We ran out without my shoes and the floor. I gotta get my stuff. What about our savings?\n\n', 'answer': 'Are you crazy? The joint is crawling with cops now. You wanna go to Rikers? Go to the hoosegow?', 'gold_tag': 'CHEEBA is risk-avoidant , CHEEBA suggests the immediate danger of encountering the police if they return to their previous location', 'last_speaker': 'CHEEBA'}
Last word -> CHEEBA : "Are you crazy? The joint is crawling

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8582140207290649], 'recall': [0.8239787817001343], 'f1': [0.8407480716705322], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12121212121212122, 'rouge2': 0.0, 'rougeL': 0.12121212121212122, 'rougeLsum': 0.12121212121212122}
bleu 1/2 : 0.0930058576283412 0.021721488589350173
ppl : 10.583438873291016
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nEMILIE: It's not a charade, all this?\nSCHINDLER: A charade? How could it be a charade? There's no way I could have known this before, but there was always something missing. In every business I tried, I see now it wasn't me that was failing, it was this thing, this missing thing. Even if I'd known what it was, there's nothing I could have done about i

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8260458111763], 'recall': [0.9287291765213013], 'f1': [0.8743831515312195], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 26.32219886779785
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSCHINDLER: What?\n\n', 'answer': 'I feel like an old-fashioned couple. It feels good.', 'gold_tag': 'EMILIE enjoys the feeling of being in an old-fashioned couple', 'last_speaker': 'EMILIE'}
Last word -> EMILIE : "I feel like an old-fashioned couple. It feels good."
prediction :  What do you mean?
Real answer : I feel like an old-fashioned couple. It feels good.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8018220663070679], 'recall': [0.821406900882721], 'f1': [0.8114963173866272], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 69.61486053466797
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nEMILIE: Should I stay?\nSCHINDLER: It's a beautiful city.\nEMILIE: Should I stay?\nSCHINDLER: It's up to you. That's not it either.\n\n", 'answer': "No, it's up to you. Promise me no doorman or maitre 'd will presume I am anyone other than Mrs. Schindler... and I'll stay.", 'gold_tag': "EMILIE is the wife of Schindler , EMILIE wants to be recognized as Mrs. Schindler, not by her association with others like doormen or maitre 'd", 'last_

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.885481059551239], 'recall': [0.7968990802764893], 'f1': [0.8388580083847046], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13793103448275862, 'rouge2': 0.0, 'rougeL': 0.13793103448275862, 'rougeLsum': 0.13793103448275862}
bleu 1/2 : 0.0003039606551848387 0.00011772345554331757
ppl : 716.1742553710938
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJENNY: You sure this is your car?\nTEENAGER: My grandmother's rich.\nJENNY: Slow down, slow down.\nTEENAGER: Geez, Mrs. Haydn, we just got going.\nJENNY: I told you I'm looking for someone.\nTEENAGER: In the cars, too?!\nJENNY: I don't know where he is.\nTEENAGER: This is going to be a real drag. I thought you wanted to go fast. They after you?\nJEN

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8592360615730286], 'recall': [0.8588539361953735], 'f1': [0.8590449094772339], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07692307692307691, 'rouge2': 0.0, 'rougeL': 0.07692307692307691, 'rougeLsum': 0.07692307692307691}
bleu 1/2 : 0.01616199464406751 0.0055986791750362004
ppl : 141.38870239257812
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTEENAGER: I'm telling you, you're going to find him at the roadblock, or right after, or not at all.\nJENNY: If he didn't want to go through the roadblock, is there any other way to get to Vegas?\nTEENAGER: Fly. The five-fifty. We already passed it. But it'll take him way out of\nJENNY: Pull over. I've got to get back there.\nTEENAGER: Why wouldn't h

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8725289106369019], 'recall': [0.8435108065605164], 'f1': [0.8577744960784912], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10526315789473685, 'rouge2': 0.0, 'rougeL': 0.10526315789473685, 'rougeLsum': 0.10526315789473685}
bleu 1/2 : 0.11031211282307446 0.03729230060020541
ppl : 57.127586364746094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTEENAGER: Wait. Could I have your autograph?\nJENNY: Sure.\nTEENAGER: I have nothing to do. I'd like to help you.\n\n", 'answer': "Don't worry. I'll be all right.", 'gold_tag': 'JENNY is confident and independent', 'last_speaker': 'JENNY'}
Last word -> JENNY : "Don't worry. I'll be all right."
prediction :  Okay, what do you want to know?
Real answer :

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8223022222518921], 'recall': [0.8526477813720703], 'f1': [0.8372001051902771], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 24.764127731323242
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBARBARA: The markings he's made on my father's arms are from the pollen of the tule plant. The twigs have no significance other than they've been struck by lightning and are consequently appeals to the spirit of lightning. It's all entirely harmless, a religious ceremony, not a medical one.\nBOCK: You don't seriously believe all that mumbo-jumbo will cure him?\nBARBARA: On the other hand, it won't kill him, Doctor.\nBOCK: Okay. Go ahe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8504270315170288], 'recall': [0.8824731111526489], 'f1': [0.8661537766456604], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 17.29676055908203
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBARBARA: Hello. I'd like to arrange an ambulance for one-thirty tomorrow afternoon... Thank you... ...Drummond, first name, Barbara. I'll pay cash... No, you're to pick up my father, Drummond, Edward, at the Manhattan Medical Center, Holly Pavilion, Room Eight-O-Six. It's a stretcher case. I presume you provide the stretcher. He's to be taken to American Airlines, Yes... No... Kennedy Airport, Flight Seven-Two-Nine to Yuma, Arizona. I'

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8387515544891357], 'recall': [0.8284813761711121], 'f1': [0.8335847854614258], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1747572815533981, 'rouge2': 0.0, 'rougeL': 0.13592233009708737, 'rougeLsum': 0.13592233009708737}
bleu 1/2 : 0.13722781426294575 0.01656838158448903
ppl : 18.111154556274414
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBOCK: What the hell am I supposed to say to that, Miss Drummond? I'm sitting here boozing and, all of a sudden, you start telling me some demented story about your father's religious conversion.\nBARBARA: No, no, you miss the point, Doctor. Not my father's conversion -- mine. You see, I had been hitting the acid pretty regularly at that time. I had achi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8261052370071411], 'recall': [0.8104987740516663], 'f1': [0.8182275295257568], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.20338983050847456, 'rouge2': 0.022857142857142854, 'rougeL': 0.11299435028248588, 'rougeLsum': 0.11299435028248588}
bleu 1/2 : 0.034580778647185784 0.0033381078038883074
ppl : 9.901971817016602
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBARBARA: Right on.\nBOCK: When I say impotent, I don't mean merely limp. Disagreeable as it may be for a woman, a man may sometimes lust for other things, something less transient than an erection, some sense of permanent worth. That's what medicine was for me, my reason for being. When I was thirty-four, Miss Drummond, I presented a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8363741636276245], 'recall': [0.8042628765106201], 'f1': [0.8200042843818665], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.20930232558139533, 'rouge2': 0.023809523809523815, 'rougeL': 0.16279069767441862, 'rougeLsum': 0.16279069767441862}
bleu 1/2 : 0.08695652173913045 0.013900960937138312
ppl : 16.836292266845703
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBOCK: You wouldn't be awake.\nBARBARA: What time is it?\nBOCK: I swiped this for you out of the nurses' locker room. I'll make good on your dress. I'm afraid it's torn beyond repair. Buy yourself a new one or, if you like, give me your size and I'll send it on to you. But I want to talk to you about that.\nBARBARA: Talk to me about wh

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8558470606803894], 'recall': [0.8316832184791565], 'f1': [0.8435921669006348], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.21621621621621623, 'rouge2': 0.05714285714285714, 'rougeL': 0.21621621621621623, 'rougeLsum': 0.21621621621621623}
bleu 1/2 : 0.023253622302665873 0.005558678929360516
ppl : 48.95146942138672
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBOCK: Look, you're not going. I love you, and I'm not going to let you go. Come on, let's start putting your father's things back. He's staying here. I'll find an apartment somewhere. I'm staying in a filthy little hotel room. We can't use that.\nBARBARA: I can't make it here, Herb. I'll crack up. I cracked up once already. One week he

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8235486745834351], 'recall': [0.8633490204811096], 'f1': [0.8429792523384094], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 243.73599243164062
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTAYA: Did you always want to be a soldier?\nCHRIS: I wanted to be a cowboy-- but I did that and I felt like I was meant for more.\nTAYA: So you started rescuing girls from bars?\nCHRIS: I rescued that bar from you. Do you like country music?\nTAYA: Only when I'm depressed.\nCHRIS: You want kids?\nTAYA: Someday. My mom says I have a nose for picking the wrong men.\nCHRIS: That's a shitty 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8667235970497131], 'recall': [0.843660831451416], 'f1': [0.855036735534668], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12903225806451615, 'rouge2': 0.0, 'rougeL': 0.06451612903225808, 'rougeLsum': 0.06451612903225808}
bleu 1/2 : 0.06650448426457305 0.02182444817914707
ppl : 20.66063690185547
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHRIS: You're trembling...\nTAYA: I know. I don't...\nCHRIS: We don't have to do this...\n\n", 'answer': 'I want to. I do...', 'gold_tag': 'Everyday Language', 'last_speaker': 'TAYA'}
Last word -> TAYA : "I want to. I do..."
prediction :  I need help.
Real answer : I want to. I do...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9118825197219849], 'recall': [0.873160719871521], 'f1': [0.8921016454696655], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.25, 'rouge2': 0.0, 'rougeL': 0.25, 'rougeLsum': 0.25}
bleu 1/2 : 0.17113903967753066 0.06628186505570068
ppl : 162.07302856445312
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTAYA: I thought you were dead. I thought--\n\n', 'answer': "Shh. I'm know. I'm so sorry.", 'gold_tag': 'Everyday Language', 'last_speaker': 'CHRIS'}
Last word -> CHRIS : "Shh. I'm know. I'm so sorry."
prediction :  You're right, you were. (1)
Real answer : Shh. I'm know. I'm so sorry.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8394232988357544], 'recall': [0.840093731880188], 'f1': [0.8397583365440369], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 51.648128509521484
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHRIS: You're the most beautiful thing I've ever seen.\nTAYA: I have an alien growing inside me-- And there's a strange man in my bedroom.\nCHRIS: Our bedroom.\nTAYA: Why isn't it on your finger?\nCHRIS: If it catches the light...\nTAYA: Your hands feel different.\nCHRIS: They're mine. I swear...\nTAYA: Why am I so fucking nervous?\nCHRIS: I'm nervous too.\nTAYA: No you're not. Don't lie.\nCHRIS: I am... What if that little alien reach

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8447009325027466], 'recall': [0.8527421355247498], 'f1': [0.8487024903297424], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 466.6203308105469
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHRIS: You sabotaged me back there.\nTAYA: What am I supposed to do. You're not talking. You act like it's all okay--\nCHRIS: It is okay. I'm fine.\nTAYA: You're not fine. Your blood pressure--\nCHRIS: Babe, I'm driving down the freeway, it's sunny and 72 degrees. I'm fine. But there are people dying over there and I look around and it's like it's not even happening. It's barely on the news, no one talks about it. No one cares. And if 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8587255477905273], 'recall': [0.7990405559539795], 'f1': [0.8278086185455322], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 28.235713958740234
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTAYA: Marc wrote that letter two weeks ago. Did he say any of that to you? Chris? I want to know what you thought of his letter...\n\n', 'answer': "An AQI informant called in a tip. Biggles had just been shot. We were operating off emotion and-- we walked into an ambush. But that's not what killed him. That letter did. That letter killed Marc. He let go and he paid the price for it. (03.18.14) 92.", 'gold_tag': "CHRIS works in a high-

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8206607103347778], 'recall': [0.8090745210647583], 'f1': [0.8148264288902283], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10714285714285714, 'rouge2': 0.037037037037037035, 'rougeL': 0.07142857142857142, 'rougeLsum': 0.07142857142857142}
bleu 1/2 : 4.175425197561415e-06 1.5246497119199013e-06
ppl : 110.36531829833984
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHRIS: If something ever happened to me-- you'd meet someone else. You'd be alright.\nTAYA: Do you want to die? Is that what it is?\nCHRIS: No.\nTAYA: Then tell me why do you do it. I want to understand.\nCHRIS: I do it for you. To protect you.\nTAYA: No you don't. I'm here. Your family is here. Your children have no father--\nCHR

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8199144601821899], 'recall': [0.8709633350372314], 'f1': [0.8446683287620544], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352941, 'rouge2': 0.0, 'rougeL': 0.11764705882352941, 'rougeLsum': 0.11764705882352941}
bleu 1/2 : 0.06666666666666667 0.02182178902359924
ppl : 26.687694549560547
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nEVAN: Are you okay?\n\n', 'answer': "There it is. Let's go.", 'gold_tag': 'Everyday Language', 'last_speaker': 'TOMMY'}
Last word -> TOMMY : "There it is. Let's go."
prediction :  Yes .
Real answer : There it is. Let's go.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8235659599304199], 'recall': [0.8679409027099609], 'f1': [0.845171332359314], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 24911.173828125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nEVAN: Crockett!!\nTOMMY: Look what you made me do!\nEVAN: What's wrong with you?! Kayleigh! Wake up!\n\n", 'answer': "Why don't you fucking kiss her, Prince Charming!", 'gold_tag': 'TOMMY is aggressive', 'last_speaker': 'TOMMY'}
Last word -> TOMMY : "Why don't you fucking kiss her, Prince Charming!"
prediction :  I need you! I need your love, I need you! Give me a kiss! I need it now!
Real answer : Why don't you fucking kiss her, Prince C

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8541362285614014], 'recall': [0.8332548141479492], 'f1': [0.843566358089447], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14814814814814814, 'rouge2': 0.0, 'rougeL': 0.14814814814814814, 'rougeLsum': 0.14814814814814814}
bleu 1/2 : 0 0
ppl : 13.781238555908203
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTOMMY: Listen to me good, Evan...\n\n', 'answer': "I'll do whatever you want. You don't want me to ever see Kayleigh again, fine. Just let Crockett go. Besides, you kill him now and they'll stick you in juvy for sure. And I know you'd never leave your sister alone with your father. Oh fuck, Lenny, no!", 'gold_tag': "EVAN is willing to compromise and cares about people, expressing concern 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8504585027694702], 'recall': [0.8178673386573792], 'f1': [0.8338445425033569], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10344827586206896, 'rouge2': 0.03571428571428571, 'rougeL': 0.10344827586206896, 'rougeLsum': 0.10344827586206896}
bleu 1/2 : 0.0004559409827772581 9.118819655545164e-05
ppl : 306.1737060546875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTOMMY: You forgot to put the toaster on the edge.\nEVAN: But you, Kayleigh and Lenny like toast. And that's really all that matters now.\n\n", 'answer': "I'll get you changed. Visiting hours are almost up.", 'gold_tag': 'TOMMY indicates that visiting hours are almost up', 'last_speaker': 'TOMMY'}
Last word -> TOMMY : "I'll get you ch



prediction : 
Real answer : I'll get you changed. Visiting hours are almost up.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.0], 'recall': [0.0], 'f1': [0.0], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : nan
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSALIERI: Herr Mozart, what brings you here?\nMOZART: Your Excellency, you requested some specimens of my work. Here they are. I don't have to tell you how much I need your help. I truly appreciate your looking at these. I have pressures on me - financial pressures. As you know, I'm a married man now.\nSALIERI: So you are. How is your pretty wife?\nMOZART: She is well. She is - well, actually, I'm about to become a father! She only told me last night. You are the first to know.\nSALIERI: I'm fl

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.85019850730896], 'recall': [0.8486620783805847], 'f1': [0.8494295477867126], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.23529411764705882, 'rouge2': 0.0, 'rougeL': 0.17647058823529413, 'rougeLsum': 0.17647058823529413}
bleu 1/2 : 0.15144269288692877 0.02175992847656396
ppl : 23.021793365478516
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMOZART: Nine performances! Nine! That's all it's had - and withdrawn.\nSALIERI: I know; it's outrageous. Still, if the public doesn't like one's work one has to accept the fact gracefully.\nMOZART: But what is it they don't like?\nSALIERI: Well, I can speak for the Emperor. You made too many demands on the royal ear. The poor man can't concentrate for mo

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8207274079322815], 'recall': [0.7747199535369873], 'f1': [0.7970603704452515], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 30.863086700439453
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMOZART: What happened? Is it over?\nSALIERI: I'm taking you home. You're not well.\n\n", 'answer': 'No, no. I have to get back. I have -', 'gold_tag': 'MOZART has some health issues , MOZART has an immediate need or obligation to fulfill', 'last_speaker': 'MOZART'}
Last word -> MOZART : "No, no. I have to get back. I have -"
prediction :  Why didn't you tell me?
Real answer : No, no. I have to get back. I have -


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8235921859741211], 'recall': [0.8307212591171265], 'f1': [0.8271414041519165], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 31.980308532714844
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSALIERI: Thank you. Go.\nMOZART: Papa! Papa!\nSALIERI: Come now. Where is your wife?\nMOZART: Not here! She's not well, either. She went to the Spa.\nSALIERI: You mean she's not coming back?\nMOZART: You're so good to me. Truly. Thank you.\nSALIERI: No, please.\nMOZART: I mean to come to my opera. You are the only colleague who did.\nSALIERI: I would never miss anything that you had written. You must know that.\nMOZART: This is only a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8779098391532898], 'recall': [0.8841716647148132], 'f1': [0.8810296058654785], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 64.15898132324219
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMOZART: What happened?\nSALIERI: He said to give you this. And if you finish the work by tomorrow night, he will pay you another hundred ducats.\nMOZART: Another? But that's too soon! Tomorrow night? It's impossible! Did he say a hundred?\nSALIERI: Yes. Can I - could I help you, in any way?\nMOZART: Would you? Actually, you could.\nSALIERI: My dear friend, it would be my greatest pleasure.\nMOZART: But you'd have to swear not to tell a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8508832454681396], 'recall': [0.8880273699760437], 'f1': [0.8690585494041443], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 45.49403381347656
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMOZART: Do you want to rest a bit?\nSALIERI: Oh no. I'm not tired at all.\nMOZART: We'll stop for just a moment. Then we'll do the Lacrimosa.\nSALIERI: I can keep going, I assure you. Shall we try?\nMOZART: Would you stay with me while I sleep a little?\nSALIERI: I'm not leaving you.\nMOZART: I am so ashamed.\nSALIERI: What for?\n\n", 'answer': 'I was foolish. I thought you did not care for my work - or me. Forgive me. Forgive me!', 'g

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8657150864601135], 'recall': [0.8544521331787109], 'f1': [0.8600468039512634], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13793103448275862, 'rouge2': 0.0, 'rougeL': 0.13793103448275862, 'rougeLsum': 0.13793103448275862}
bleu 1/2 : 0.04392955283543867 0.014569784396324328
ppl : 81.90863800048828
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANNIE: I've decided it's impossible to properly say hello in this place\nTOM: A jogger, huh?\nANNIE: I don't jog, Mr. Booker. I run.\nTOM: Lucky for you. The grizzlies around here only go for joggers.\nANNIE: If I can survive rush hour, I figure I can handle grizzlies...\nTOM: You sleeping all right in that house?\nANNIE: I don't sleep all right anywhe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8853646516799927], 'recall': [0.8228732347488403], 'f1': [0.8529759049415588], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 5475.181640625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANNIE: Oh-oh. The answer's no.\nTOM: You haven't heard the question yet. Truth is, you'd be doing me a favor. I got all these eager young colts need riding and poor old Rimrock here is feeling kind of left out...\nANNIE: Poor thing.\nTOM: He'd be grateful, he'd take real good care with you.\nANNIE: Is this how you're going to make me pay my phone bill?\n\n", 'answer': "No, ma'am, I'm afraid that's extra.", 'gold_tag': 'TOM may be the man 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8329216241836548], 'recall': [0.8108799457550049], 'f1': [0.8217529654502869], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14814814814814814, 'rouge2': 0.07999999999999999, 'rougeL': 0.14814814814814814, 'rougeLsum': 0.14814814814814814}
bleu 1/2 : 0.0588235294117647 0.019174124721184262
ppl : 32.719940185546875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTOM: Relax our center... It's just sitting in a bucket.\nANNIE: Yeah, it's been a while, but I... I remember the basic ideas...\nTOM: OK. I'll stop talking then.\nANNIE: Actually, I never rode Western. I'm sorry. Go ahead.\nTOM: Well, he don't know that. Just sit the horse. Good... You have a nice seat.\nANNIE: Thanks.\nTOM: Feel good?\

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8304891586303711], 'recall': [0.8292593359947205], 'f1': [0.82987380027771], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10714285714285714, 'rouge2': 0.0, 'rougeL': 0.10714285714285714, 'rougeLsum': 0.10714285714285714}
bleu 1/2 : 0.01927926228371342 0.004487004106609252
ppl : 30.07366371154785
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANNIE: It's a whole other world you have going on here. It just goes along, doing what it has to. And you're a part of it, you just wake up and, and there you are... And everything that seems like life or death some place else -- doesn't affect any of this one bit.\nTOM: Lift your leg.\nANNIE: How long did you live here with your wife?\nTOM: Five years. 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8000457286834717], 'recall': [0.899368405342102], 'f1': [0.8468045592308044], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.022727272727272728, 'rouge2': 0.0, 'rougeL': 0.022727272727272728, 'rougeLsum': 0.022727272727272728}
bleu 1/2 : 0.012345679012345675 0.00392837100659193
ppl : 16.133989334106445
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTOM: Everything under control?\nANNIE: Not really. I'd forgotten how long it's been since I've done this. And I couldn't get any Parmesan cheese.\nTOM: Just make yourself comfortable.\nANNIE: I am comfortable.\nTOM: Ha, ha... all right, well, uh I guess you can bring out the pasta.\nANNIE: You missed a button.\nTOM: Huh?\n\n", 'answer': "Thank you. 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8673418760299683], 'recall': [0.8173394203186035], 'f1': [0.841598629951477], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07692307692307693, 'rouge2': 0.0, 'rougeL': 0.07692307692307693, 'rougeLsum': 0.07692307692307693}
bleu 1/2 : 0.001153125778821586 0.0004466036937458461
ppl : 53.76933670043945
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTOM: I won't apologize for this. And I won't hide it. Not for anybody.\n\n", 'answer': "I won't ask you to.", 'gold_tag': 'Everyday Language', 'last_speaker': 'ANNIE'}
Last word -> ANNIE : "I won't ask you to."
prediction :  Why would you want to apologize?
Real answer : I won't ask you to.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8342167139053345], 'recall': [0.8641793727874756], 'f1': [0.848933756351471], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.3333333333333333, 'rouge2': 0.0, 'rougeL': 0.3333333333333333, 'rougeLsum': 0.3333333333333333}
bleu 1/2 : 0.16666666666666669 0.05773502691896258
ppl : 109.828369140625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANNIE: Oh, God, what are we going to do? I'm supposed to --\n\n", 'answer': "Ssshhh... Stand still, Annie. Takes what we've got, just for now. Can you do that?", 'gold_tag': 'TOM is calm and comforting , TOM is trying to reassure ANNIE in the middle of a crisis', 'last_speaker': 'TOM'}
Last word -> TOM : "Ssshhh... Stand still, Annie. Takes what we've got, j

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8116825819015503], 'recall': [0.8203054070472717], 'f1': [0.8159711956977844], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352941, 'rouge2': 0.0303030303030303, 'rougeL': 0.08823529411764705, 'rougeLsum': 0.08823529411764705}
bleu 1/2 : 0.04000000000000001 0.009035079029052514
ppl : 17.181310653686523
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANNIE: Show me again.\nTOM: Annie!\n\n', 'answer': 'One more time.', 'gold_tag': 'Everyday Language', 'last_speaker': 'ANNIE'}
Last word -> ANNIE : "One more time."
prediction :  What was that about I had to pay you back?
Real answer : One more time.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8135000467300415], 'recall': [0.8648873567581177], 'f1': [0.838407039642334], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 67.80653381347656
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTOM: I figured, whenever you decided to go, you'd be all set.\nANNIE: How thoughtful of you. And what if I decide not to go? Leave me alone!\nTOM: Annie, please...\nANNIE: NO!... NO!... YOU DON'T DECIDE THIS!... NO!...\nTOM: I don't know any other way, Annie.\nANNIE: Why?\nTOM: You think this is easy for me!? It's just happened this way sometimes. I can't explain it. But I look at you and see Grace and I see Robert, and no matter what y

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8503916263580322], 'recall': [0.8694138526916504], 'f1': [0.8597975373268127], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16949152542372883, 'rouge2': 0.0, 'rougeL': 0.1016949152542373, 'rougeLsum': 0.1016949152542373}
bleu 1/2 : 0.075 0.013867504905630728
ppl : 15.652286529541016
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTOM: Annie?\nANNIE: Can we go for one more ride?\n\n', 'answer': "Sure. I'll saddle them up.", 'gold_tag': 'TOM is capable of saddling horses , TOM is experienced with horse riding', 'last_speaker': 'TOM'}
Last word -> TOM : "Sure. I'll saddle them up."
prediction :  What's going on? Annie, where are you?
Real answer : Sure. I'll saddle them up.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8554632663726807], 'recall': [0.881721019744873], 'f1': [0.8683937191963196], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 24.156024932861328
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTOM: What?\nANNIE: I mean, why did you get\nTOM: I got married because Betsy said we had to break up or get\nANNIE: But when you met her, did you believe she was the only person for you? That in some mystical, I don't even know him. But say something, but\nTOM: Yes, I did.\n\n", 'answer': 'Thank you, Tom. I feel so much', 'gold_tag': 'ANNIE shows sensitivity', 'last_speaker': 'ANNIE'}
Last word -> ANNIE : "Thank you, Tom. I feel so muc

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8346832394599915], 'recall': [0.8435885906219482], 'f1': [0.8391123414039612], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12121212121212123, 'rouge2': 0.0, 'rougeL': 0.12121212121212123, 'rougeLsum': 0.12121212121212123}
bleu 1/2 : 0.04545454545454546 0.014712247158412498
ppl : 12.149164199829102
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBENNY: Get your glove, c'mon.\nSCOTTY: What's the big deal?\n\n", 'answer': 'Nightgame!', 'gold_tag': 'Everyday Language', 'last_speaker': 'BENNY'}
Last word -> BENNY : "Nightgame!"
prediction :  Take it easy, it's not that serious.
Real answer : Nightgame!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8089374303817749], 'recall': [0.8254337310791016], 'f1': [0.817102313041687], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 19.538787841796875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSCOTTY: Benny - wait. It's okay, it was my fault. I'll just take whatever I get. You don't have to do this.\n\n", 'answer': "Yeah I do, Smalls. I have to. ...Follow your heart, kid, and you'll never go wrong.", 'gold_tag': "Benny shows responsibility and leadership , Benny understands the importance of following one's heart , Benny inculcates a sense of wisdom and maturity", 'last_speaker': 'BENNY'}
Last word -> BENNY : "Yeah I do, Sma

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8490782976150513], 'recall': [0.8433539867401123], 'f1': [0.8462064862251282], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1818181818181818, 'rouge2': 0.0, 'rougeL': 0.12121212121212122, 'rougeLsum': 0.12121212121212122}
bleu 1/2 : 0 0
ppl : 48.666011810302734
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBENNY: OOOOH SSSSHIT!\n\n', 'answer': "What're we waitin' for?! Let's go after 'em!", 'gold_tag': 'Everyday Language', 'last_speaker': 'SCOTTY'}
Last word -> SCOTTY : "What're we waitin' for?! Let's go after 'em!"
prediction :  Hey, what's going on?
Real answer : What're we waitin' for?! Let's go after 'em!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8720171451568604], 'recall': [0.8289543390274048], 'f1': [0.8499406576156616], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.26666666666666666, 'rouge2': 0.0, 'rougeL': 0.26666666666666666, 'rougeLsum': 0.26666666666666666}
bleu 1/2 : 0 0
ppl : 16.893192291259766
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSANTEN: What the hell's going on?\nBOWMAN: Single event upsets. All over the board. Latch up. Free flow... We're gonna lose chips. Shut it down!\nSANTEN: Shut it down?\n\n", 'answer': "Now! Everything! SEP, some kind of massive solar flare. Santen's shutting off every system he can get his hands... Proton flux. Multiple event upsets...", 'gold_tag': 'BOWMAN indicates an immediate crisis 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8523086309432983], 'recall': [0.8081971406936646], 'f1': [0.8296669721603394], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 52.606998443603516
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBOWMAN: Radiation alert. Safe area. Go. You, too.\nSANTEN: You need me here on the flight deck.\n\n', 'answer': 'I want one of us in charge back there.', 'gold_tag': 'BOWMAN has a position of authority , SANTEN is a subordinate to BOWMAN', 'last_speaker': 'BOWMAN'}
Last word -> BOWMAN : "I want one of us in charge back there."
prediction :  We'll have to wait. You'll make it?
Real answer : I want one of us in charge back there.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.845815896987915], 'recall': [0.869713306427002], 'f1': [0.8575981259346008], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 40.53257369995117
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSANTEN: Copy. We're okay, but the ship's hit bad.\n\n", 'answer': "... We're dead in the water... I can't run damage assessment. She's together now, but she might come apart at any moment. Change of plans. We're launching now. Suit up.", 'gold_tag': "BOWMAN is in charge and demonstrates leadership and decision-making skills , BOWMAN has knowledge in assessing damage and the state of the ship  , BOWMAN's occupation is related to spaceship

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.853519082069397], 'recall': [0.8414822816848755], 'f1': [0.8474579453468323], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.04878048780487805, 'rouge2': 0.0, 'rougeL': 0.04878048780487805, 'rougeLsum': 0.04878048780487805}
bleu 1/2 : 0 0
ppl : 197.34193420410156
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBOWMAN: You're talking to a class of second-graders from Iowa at 1430.\nSANTEN: It's your turn to do the kiddies.\nBOWMAN: Don't even try. You're up, and you know it. Besides, I outrank you. I could order you to do it.\nSANTEN: ... Just 'cause they promote faster in the Navy...\nBOWMAN: You gotta promote faster when you have the best people... 1430. With a smile on your face.\nSANTEN: Pul

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.813870906829834], 'recall': [0.8505594730377197], 'f1': [0.8318108320236206], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 12.886899948120117
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMcCROSKEY: Grab hold of yourself! You've got to talk them down. You're the only chance they've got!\n\n", 'answer': 'All right, Striker, now you listen to me and you listen close. Flying is no different than riding a bicycle...it just happens to be a lot harder to put baseball cards in the spokes. Now, if you just follow my instructions...', 'gold_tag': 'KRAMER has knowledge and experience in flying , KRAMER possibly has a background a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8373908400535583], 'recall': [0.8170357942581177], 'f1': [0.8270881175994873], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.22222222222222224, 'rouge2': 0.0, 'rougeL': 0.14814814814814817, 'rougeLsum': 0.14814814814814817}
bleu 1/2 : 0.09628119332763099 0.015445644412716347
ppl : 8.315596580505371
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKRAMER: I don't understand it. He should have been in range ten minutes ago.\n\n", 'answer': 'Gunderson, check the radar range. Anything yet?', 'gold_tag': 'McCROSKEY is in charge , McCROSKEY is giving orders to check the radar range , McCROSKEY holds a position of authority in the same field', 'last_speaker': 'McCROSKEY'}
Last word -> McCROSKEY : "Gun

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8603386878967285], 'recall': [0.8380100131034851], 'f1': [0.8490275144577026], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 686.2061157226562
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMcCROSKEY: Two more minutes! They could be miles off course.\n\n', 'answer': "That's impossible. They're on instruments!", 'gold_tag': 'KRAMER has knowledge of aircraft navigation systems', 'last_speaker': 'KRAMER'}
Last word -> KRAMER : "That's impossible. They're on instruments!"
prediction :  Not if their nav systems are working. Those things are very sophisticated.
Real answer : That's impossible. They're on instruments!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8621262311935425], 'recall': [0.8645031452178955], 'f1': [0.8633130788803101], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 132.65782165527344
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMcCROSKEY: Rex, I've decided that the best thing to do is to foam the runway -- let him do a wheels-up landing. It'd be a lot simpler.\nKRAMER: No, the risk of fire is too great. If she starts burning, you write off all those people who can't get out of there on their own power.\nMcCROSKEY: Well that's better than writing them all off? Are you going to play God with a hundred and 38 lives?\nKRAMER: No. A belly landing isn't all that s

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.88358473777771], 'recall': [0.8370242714881897], 'f1': [0.8596745729446411], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1530.1905517578125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMcCROSKEY: Ya know, this would be a tough landing for anyone to make. Maybe, if we hold them off for a bit we'll get a break in the weather.\n\n", 'answer': "All right, but let's wait until they reach the control area.", 'gold_tag': 'KRAMER has a decision-making role in the conversation', 'last_speaker': 'KRAMER'}
Last word -> KRAMER : "All right, but let's wait until they reach the control area."
prediction :  Good thought. Let's do th

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8990435600280762], 'recall': [0.8756997585296631], 'f1': [0.887218177318573], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2222222222222222, 'rouge2': 0.12500000000000003, 'rougeL': 0.2222222222222222, 'rougeLsum': 0.2222222222222222}
bleu 1/2 : 0 0
ppl : 58.88252258300781
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKRAMER: Steve, I want every light you can get poured on that field.\n\n', 'answer': "It's being done right now.", 'gold_tag': "McCROSKEY is carrying out KRAMER's order to illuminate the field", 'last_speaker': 'McCROSKEY'}
Last word -> McCROSKEY : "It's being done right now."
prediction :  The only light we have is on top of the power plant.
Real answer : It's being done right

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8416107296943665], 'recall': [0.8588821887969971], 'f1': [0.8501588106155396], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 57.38011932373047
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMcCROSKEY: It sure is quiet out there.\nKRAMER: Yeah -- too quiet.\n\n', 'answer': 'Looks like I picked the wrong week to quit sniffing glue.', 'gold_tag': 'McCROSKEY recently quit sniffing glue', 'last_speaker': 'McCROSKEY'}
Last word -> McCROSKEY : "Looks like I picked the wrong week to quit sniffing glue."
prediction :  I'm calling in for a new ticket.
Real answer : Looks like I picked the wrong week to quit sniffing glue.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8782116174697876], 'recall': [0.8651597499847412], 'f1': [0.8716368079185486], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10526315789473685, 'rouge2': 0.0, 'rougeL': 0.10526315789473685, 'rougeLsum': 0.10526315789473685}
bleu 1/2 : 0 0
ppl : 52.74391555786133
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKRAMER: Bring it down! Easy!\n\n', 'answer': "Look at all those buttons! Oh that's beautiful! Just beautiful!", 'gold_tag': 'McCROSKEY shows interest and admiration for complex machinery or technology , McCROSKEY is fascinated with the buttons', 'last_speaker': 'McCROSKEY'}
Last word -> McCROSKEY : "Look at all those buttons! Oh that's beautiful! Just beautiful!"
prediction :  Bring it do

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8701502680778503], 'recall': [0.8267884254455566], 'f1': [0.8479152917861938], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 908.8919677734375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKRAMER: Remember your brakes and switches! Get ready to flare it out! You're coming in too fast! Watch your speed!\nMcCROSKEY: He's coming right at us!\n\n", 'answer': "You're coming in too hot! Put down full flaps! Watch your nose!", 'gold_tag': 'KRAMER is knowledgeable about flying or landing a plane', 'last_speaker': 'KRAMER'}
Last word -> KRAMER : "You're coming in too hot! Put down full flaps! Watch your nose!"
prediction :  Take 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8894745111465454], 'recall': [0.8403376936912537], 'f1': [0.8642082214355469], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.17391304347826086, 'rouge2': 0.09523809523809525, 'rougeL': 0.17391304347826086, 'rougeLsum': 0.17391304347826086}
bleu 1/2 : 0.15922918012750872 0.11942188509563156
ppl : 52.28293228149414
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICKY: ...you big fuckin' hick, you. Come here. Come here. Get him up. Come here.\n\n", 'answer': 'Get up.', 'gold_tag': 'Everyday Language', 'last_speaker': 'MARINO'}
Last word -> MARINO : "Get up."
prediction :  This is just a big, stupid, stupid, stupid, stupid ass, that's what it is.
Real answer : Get up.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7896962761878967], 'recall': [0.893949031829834], 'f1': [0.8385949730873108], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 12.519536018371582
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICKY: Come here, come here.\nMARINO: Get up.\n\n', 'answer': 'You go over there right now and you apologize. You better hope he lets you back in.', 'gold_tag': 'NICKY is in a position of power , NICKY is concerned about a present situation where someone needs to be apologized to immediately', 'last_speaker': 'NICKY'}
Last word -> NICKY : "You go over there right now and you apologize. You better hope he lets you back in."
prediction :

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8613494634628296], 'recall': [0.8485850095748901], 'f1': [0.8549196124076843], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1935483870967742, 'rouge2': 0.0, 'rougeL': 0.1935483870967742, 'rougeLsum': 0.1935483870967742}
bleu 1/2 : 0.1153025352864842 0.026755692780938665
ppl : 37.63753890991211
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICKY: Be careful. Hey, come on. Get inside. Can you fuckin' believe that? Look at this motherfucker. I fucked up, Frankie. I fucked up good this time. Should have never started with this fuckin' broad.\nMARINO: Take it easy. What could you do? I mean, she threw herself at you, right?\n\n", 'answer': "I'm in a bad fuckin' spot here. You know that? Bad fuck

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8505644798278809], 'recall': [0.8425242900848389], 'f1': [0.846525251865387], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.18181818181818182, 'rouge2': 0.1, 'rougeL': 0.18181818181818182, 'rougeLsum': 0.18181818181818182}
bleu 1/2 : 0.0669076785648738 0.02261892368524442
ppl : 63.773658752441406
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICKY: Dominick! Oh, Dominick. Oh, Dom. Frankie. Frankie, leave the kid alone. He's still breathin'. He's still breathin'. Leave him alone. Frankie.\nMARINO: All right. Strip him.\n\n", 'answer': "No balls, you got no fuckin' balls. Oh, Dominick. Oh, Dom. Dominick. Dominick. Dominick.", 'gold_tag': 'NICKY is aggressive', 'last_speaker': 'NICKY'}
Last wor

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8313701748847961], 'recall': [0.842035174369812], 'f1': [0.8366686105728149], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 33.05766677856445
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPAULSEN: Well, look, there isn't a ton of information here, but from what there is, I'd say that these two people here -- what are their names? Shanna and Ashley?\nERIN: Right, I guess those are the kids --\nPAULSEN: They've both got some immune system problem. Can't say what from, whether it's viral or genetic or what, but something's wrong. And these guys -- Donna and Peter --\nERIN: Their parents, I'm pretty sure.\n\n", 'answer': "We

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8613742589950562], 'recall': [0.8437464833259583], 'f1': [0.8524692058563232], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08, 'rouge2': 0.0, 'rougeL': 0.08, 'rougeLsum': 0.08}
bleu 1/2 : 0.05704634655917688 0.019133926316149612
ppl : 138.85665893554688
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: ... and when I realized our area's just as bad as Hinkley, I thought maybe my neighbors are all sick too. So I went and asked.\nPAULSEN: You did?\nERIN: Uh-huh. Spent the last few days knocking on doors. And you know what? They're not. I mean, they got problems, but none of this cancer stuff. And their pets are fine. So I don't know -- I just can't shake the feeling that it wasn't no multi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8684422969818115], 'recall': [0.864148736000061], 'f1': [0.8662902116775513], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.17857142857142855, 'rouge2': 0.07407407407407407, 'rougeL': 0.14285714285714285, 'rougeLsum': 0.14285714285714285}
bleu 1/2 : 0.1674742168678578 0.10830051676671404
ppl : 15.951539039611816
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPAULSEN: I gotta say, Erin -- first time I saw you, I did not peg you as the kind to go off and conduct her own epidemiological study.\n\n', 'answer': "Don't go telling anyone. It'll ruin my reputation.", 'gold_tag': 'ERIN values her reputation', 'last_speaker': 'ERIN'}
Last word -> ERIN : "Don't go telling anyone. It'll ruin my reputatio

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8594086170196533], 'recall': [0.883110523223877], 'f1': [0.8710983991622925], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 48.3859977722168
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIRECTOR: What can I do for you, Mr. Welles?\nWELLES: Call me Tom.\nDIRECTOR: Alright, Tom.\nWELLES: What I'd like, very simply, is access to your archive. And, now I understand this isn't something you normally do for private citizens...\nDIRECTOR: There are reasons for the way we do things here.\nWELLES: Absolutely. Of course I'll abide by whatever decision you make, but I'd appreciate if you'll hear me out... Few days ago, I was conta

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8284001350402832], 'recall': [0.8416202068328857], 'f1': [0.8349578380584717], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06666666666666667, 'rouge2': 0.0, 'rougeL': 0.06666666666666667, 'rougeLsum': 0.06666666666666667}
bleu 1/2 : 0.02272727272727273 0.007270084086278819
ppl : 14.498214721679688
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWELLES: Fuck.\n\n', 'answer': "Excuse me... Tom, hold on... Listen, maybe I can help after all. Why don't you come on back in... we'll see what we can do.", 'gold_tag': 'Everyday Language', 'last_speaker': 'DIRECTOR'}
Last word -> DIRECTOR : "Excuse me... Tom, hold on... Listen, maybe I can help after all. Why don't you come on back in... we'll see wh

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8265405893325806], 'recall': [0.8191842436790466], 'f1': [0.822845995426178], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09230769230769231, 'rouge2': 0.031746031746031744, 'rougeL': 0.09230769230769231, 'rougeLsum': 0.09230769230769231}
bleu 1/2 : 0.028571428571428577 0.009166984970282115
ppl : 9.868012428283691
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDIRECTOR: This is it. It's not much. (points at computers) We've got less than five percent on computer and we lose that funding in December. I'll have someone show it to you anyway. Other than that, I'm afraid it's the wet thumb method. Files are mostly by state and year of disappearance. We try to keep the children and adults separat

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8195866346359253], 'recall': [0.8531957864761353], 'f1': [0.8360535502433777], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06521739130434782, 'rouge2': 0.022222222222222223, 'rougeL': 0.06521739130434782, 'rougeLsum': 0.06521739130434782}
bleu 1/2 : 0.01219512195121951 0.00388016830983099
ppl : 14.014321327209473
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSTARCK: The antennae array\'s completely fried, we\'ve got no radio, no laser, no highgain... No one\'s going to be coming to help us.\nMILLER: How much oh-two do we have?\nSTARCK: Oxygen is not the problem.\nMILLER: Carbon dioxide?\nSTARCK: It\'s building up with every breath we take. And the CO2 filters on the Event Horizon are shot.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8595526814460754], 'recall': [0.8512488007545471], 'f1': [0.8553805947303772], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.17777777777777778, 'rouge2': 0.046511627906976744, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0.022466755668570322 0.007394714572768667
ppl : 22.285053253173828
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSTARCK: Miller...\nMILLER: What is it, Starck?\nSTARCK: ...I ran the bio-scan with the DNA/RNA filter. The results were bio-readings of indeterminate origin...\nMILLER: "...bio-readings of indeterminate origin," don\'t you have anything useful to tell me?\nSTARCK: I\'ve got a theory.\nMILLER: Go ahead.\nSTARCK: There was a another su

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8871505260467529], 'recall': [0.8632259368896484], 'f1': [0.875024676322937], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0.1637461506155964 0.05789300674674099
ppl : 446.2840270996094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSTARCK: You've got to listen...\nMILLER: To what? What are you saying? This ship is alive?\nSTARCK: I didn't say that, I said the bio- readings correspond to what happened to you, the ship is reacting to us...\nMILLER: We're hanging on by our fingernails and you're giving me bullshit stories...\nSTARCK: It's not bullshit, it's the only conclusion the data

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.837948739528656], 'recall': [0.8155782222747803], 'f1': [0.826612114906311], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12121212121212122, 'rouge2': 0.0, 'rougeL': 0.12121212121212122, 'rougeLsum': 0.12121212121212122}
bleu 1/2 : 0 0
ppl : 32.803977966308594
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMILLER: Are you cold? You need something to warm you up?\nJACKIE: You could torch the club so I don’t have to do this shit.\nMILLER: Be prepared - it’s not as big a house as they thought. I think the weather kept people home.\n\n', 'answer': 'You sure it’s not the marquee? Reads like Night of the Living Dead.', 'gold_tag': 'JACKIE is a performer', 'last_speaker': 'JACKIE'}
Last word -> JAC

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8573629260063171], 'recall': [0.8365897536277771], 'f1': [0.8468489646911621], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 72.1383285522461
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJACKIE: I’d try it on you but you have no sense of humor.\nMILLER: No, I don’t.\nJACKIE: I have never seen you laugh.\nMILLER: No you haven’t.\nJACKIE: How can that be? You represent comics.\nMILLER: You just answered your own question. Can you imagine how fucking painful my life would be if I had to act like an audience with all my comics? “You laughed more at his joke, than ya did at mine” “You think he’s funny, he’s not funny”. So I 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8073323965072632], 'recall': [0.8322286605834961], 'f1': [0.8195914626121521], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1176470588235294, 'rouge2': 0.02, 'rougeL': 0.09803921568627451, 'rougeLsum': 0.09803921568627451}
bleu 1/2 : 0.07228915662650603 0.029691330061333544
ppl : 13.569295883178711
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJACKIE: I’m hot now. I’m viral. Your father knew how to take advantage of opportunities like this... “Millah the Killah” I used to call him.\n\n', 'answer': 'He loved when you called him that. It killed him when you stopped.', 'gold_tag': 'MILLER\'s father loved being called "Millah the Killah" by JACKIE , Shared memories: JACKIE and MILLER share a me

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8926995992660522], 'recall': [0.8599774837493896], 'f1': [0.8760330677032471], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352941, 'rouge2': 0.0, 'rougeL': 0.11764705882352941, 'rougeLsum': 0.11764705882352941}
bleu 1/2 : 0.026349806140466083 0.009621588806014277
ppl : 135.38475036621094
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJACKIE: What am I a Rent-A-Comic?\nMILLER: She’s a fan who runs a cable network.\nJACKIE: There was a neon sign on her forehead that said “NO. You didn’t see it?\nMILLER: There are new executives out there. You have to cultivate them. They say no this time, you come back with another idea.\nJACKIE: I come back and tap dance some more for these fuckin

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8362516164779663], 'recall': [0.8677763938903809], 'f1': [0.8517223596572876], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 41.52958297729492
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMILLER: Over three million views in eight hours and it keeps climbing. You’re on Kimmel. You’re on Conan. Carol from RAW TV called me. She needs a host for a reality show pilot. They shoot on Monday in NY?\nJACKIE: Monday? Why so fast?\nMILLER: They had D’Angelo signed but he had to drop out last minute.\nJACKIE: Why? He got a better deal?\nMILLER: He’s got pancreatic cancer.\n\n', 'answe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8614246845245361], 'recall': [0.8593284487724304], 'f1': [0.8603752255439758], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06896551724137932, 'rouge2': 0.0, 'rougeL': 0.06896551724137932, 'rougeLsum': 0.06896551724137932}
bleu 1/2 : 0.05555555555555554 0.018077538151554672
ppl : 16.864120483398438
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBB: What is this? What is this crowd here? We're charging admission?\n\n", 'answer': 'Back away... give me some elbow', 'gold_tag': 'TILLEY values personal space , TILLEY is focused on a task, as indicated by the request for some elbow room', 'last_speaker': 'TILLEY'}
Last word -> TILLEY : "Back away... give me some elbow"
prediction :  Yes. You can g

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.841679573059082], 'recall': [0.8128610849380493], 'f1': [0.8270193338394165], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 33.56831741333008
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTILLEY: Yeah. You in there? Yeah. They got my license as well. Sorry to hear it. This is where my car used to be. Stolen? I.R.S. Fucking bandits! Bandits! Thieving sons of bitches!\nBB: You need a ride uptown?\nTILLEY: I could use one.\n\n', 'answer': 'Come on.', 'gold_tag': 'Everyday Language', 'last_speaker': 'BB'}
Last word -> BB : "Come on."
prediction :  Well I don't drive. I would go by myself but I don't want to be caught with no

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8321017026901245], 'recall': [0.8872686624526978], 'f1': [0.8588001728057861], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 17.433137893676758
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTILLEY: Some bullshit commission, huh? Tell me, where's it written it the Constitution that says you can't hustle for money? Where's it written? It ain't like I went into an alley and hit a guy over the head with a brick and stole his money... not like I broke into somebody's house and stole his stuff. All I'm doing is selling... where's the crime in that?\nBB: I don't know what the world's coming to.\n\n", 'answer': "You're telling m

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8293510675430298], 'recall': [0.8698028326034546], 'f1': [0.8490954041481018], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.31250000000000006, 'rouge2': 0.0, 'rougeL': 0.1875, 'rougeLsum': 0.1875}
bleu 1/2 : 0.0588235294117647 0.019174124721184262
ppl : 38.62843322753906
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBB: You know what our big crime is? We're nickel and dime guys. We're small time hustlers. They got us because we're hustling nickels and dimes.\nTILLEY: Nickels and dimes. You got a good point there, BB. You're right on the money with that kind of thinking.\nBB: Gotta find a new business to get into.\nTILLEY: New? Very hard to find something new to get into.\nBB: Maybe... maybe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8461991548538208], 'recall': [0.8450517058372498], 'f1': [0.845625102519989], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0425531914893617, 'rouge2': 0.0, 'rougeL': 0.0425531914893617, 'rougeLsum': 0.0425531914893617}
bleu 1/2 : 0.03125 0.010040241611281232
ppl : 14.798113822937012
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSOL: Stop thinking, Max, just feel. Use your intuition. It's the only way to get into the flow. What did you think of Hamlet?\nMAX: I didn't get to it.\nSOL: It's been a month. You haven't taken a single break.\nMAX: I'm so close, Sol. I'm so close but I just can't grab it.\nSOL: Have you met the new fish my niece bought me? I named her Icarus. After you. My renegade

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8583431839942932], 'recall': [0.8619815707206726], 'f1': [0.8601585626602173], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 3200.84716796875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMAX: Euclid crashed. I lost all my data, my hardware.\nSOL: Your mainframe?\nMAX: Burnt...\nSOL: What happened?\nMAX: I don't know, first I got these horrible picks. Then Euclid spits out some numbers. Never saw anything like it and then it fries. The whole machine just crashed.\nSOL: You have a printout?\nMAX: Of?\nSOL: The picks, the number?\nMAX: I threw it out.\nSOL: What was the number it spit out?\nMAX: I don't know, just a long s

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8614485263824463], 'recall': [0.8427855372428894], 'f1': [0.8520147800445557], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14035087719298245, 'rouge2': 0.07272727272727272, 'rougeL': 0.14035087719298245, 'rougeLsum': 0.14035087719298245}
bleu 1/2 : 0.0019087735547149904 0.0014428971814036012
ppl : 23.849021911621094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSOL: Max?\nMAX: What's going on, Sol!?\n\n", 'answer': "Relax, it's early.", 'gold_tag': 'Everyday Language', 'last_speaker': 'SOL'}
Last word -> SOL : "Relax, it's early."
prediction :  I think something's happening there! Come on! We were just talking!
Real answer : Relax, it's early.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8227525949478149], 'recall': [0.8661372661590576], 'f1': [0.8438876867294312], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.125, 'rouge2': 0.0, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0 0
ppl : 50.953861236572266
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSOL: Now, what\'s up?\nMAX: What\'s the two hundred and sixteen number, Sol?\nSOL: Excuse me?\nMAX: You asked me if I had seen a two hundred and sixteen digit number, right?\nSOL: Oh, you mean the bug. I found it working on Pi.\nMAX: What do you mean by "found it"?\nSOL: What\'s this all about, Max?\nMAX: Well, there\'s these religious Jews who have...\nSOL: Religious Jews?\nMAX: Well, you know, Hassidim. I met one in the coffee

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.821973443031311], 'recall': [0.855934739112854], 'f1': [0.8386104106903076], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 32.175697326660156
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSOL: Listen to me. The Ancient Japanese considered the Go simple and ordered, in fact, the possibilities of game play are endless. They say that no two Go games have ever been alike. Just like snowflakes. So, the Go board actually represents an extremely complex and chaotic universe. That is the truth of our world, Max. pattern.\nMAX: But as a Go game progresses, the possibilities become smaller and smaller. The board does take on order

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8353663682937622], 'recall': [0.8309191465377808], 'f1': [0.833136796951294], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07228915662650602, 'rouge2': 0.0, 'rougeL': 0.07228915662650602, 'rougeLsum': 0.07228915662650602}
bleu 1/2 : 0.014227407158651357 0.003263991054377284
ppl : 29.181791305541992
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSOL: You're early. I was just studying our... What did you do to yourself?\nMAX: You lied to me.\nSOL: I thought you were going to take a break.\nMAX: You found the two sixteen number in Pi, didn't you? You saw it. I saw it, Sol. I don't know what happened, but I know things. The market is going to crash. It's going to crash. It hasn't yet, but I know

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9367514848709106], 'recall': [0.8967422246932983], 'f1': [0.9163103103637695], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1257108.625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nARTIE: Polly, why're y'doin' this?\nPOLLY: I don't know.\nARTIE: Angels? Be reasonable, Polly, it's rubbish. You've said it y'self. You'd be better off spendin' more time with the livin'.\nPOLLY: Meaning?\nARTIE: You know perfectly well what I mean. Your daughter needs you. Instead you sit around holdin' on t'...\nPOLLY: You can't even say his name.\nARTIE: Let me come with y'then. It'll be dark soon.\n\n", 'answer': 'Thank you, I can manage

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8326195478439331], 'recall': [0.8561689853668213], 'f1': [0.844230055809021], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.25, 'rouge2': 0.0, 'rougeL': 0.25, 'rougeLsum': 0.25}
bleu 1/2 : 0 0
ppl : 332.0350646972656
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPOLLY: Artie, did you knock...? What have you got there?\n\n', 'answer': "Nothin'...", 'gold_tag': 'ARTIE is secretive', 'last_speaker': 'ARTIE'}
Last word -> ARTIE : "Nothin'..."
prediction :  Are you calling Artie names? (I don't like the way she said it)
Real answer : Nothin'...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8178901672363281], 'recall': [0.81210857629776], 'f1': [0.8149891495704651], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 58.2064323425293
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPOLLY: Where are you going?\n\n', 'answer': "T'find that reporter.", 'gold_tag': 'ARTIE is planning to find a reporter, which is an immediate action', 'last_speaker': 'ARTIE'}
Last word -> ARTIE : "T'find that reporter."
prediction :  I don't know.
Real answer : T'find that reporter.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8342122435569763], 'recall': [0.8567205667495728], 'f1': [0.8453166484832764], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.25, 'rouge2': 0.0, 'rougeL': 0.25, 'rougeLsum': 0.25}
bleu 1/2 : 0 0
ppl : 21.51224708557129
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPATRICK: Everything seems to be ok.\nMICHELLE: Well, thank you. I appreciate it.\nPATRICK: Please. I didn’t do shit- pardon my French. I almost had him. He just took off so fast... In school, I could do a mile in six-fifty.\nMICHELLE: The other way to think of it is his speed was a testament to how scared he was of you.\nPATRICK: Thank you for salvaging my pride.\nMICHELLE: Anytime.\nPATRICK: Well, if you see anything, hear anything

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.824600338935852], 'recall': [0.8180941343307495], 'f1': [0.8213343620300293], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 7528537.5
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPATRICK: I didn’t want to disturb you but I just got home and saw all your\nMICHELLE: That wouldn’t be good.\nPATRICK: I know it probably sounds like I’m being chicken little or something but it'll amaze you what the winds can do up here.\nMICHELLE: No, I’ve seen it. You should’ve been here in ‘99. It was like the\nPATRICK: I believe it. You’re probably going to need a hand.\nMICHELLE: Yes, probably. Thank you.\nPATRICK: Your mother’s funeral w

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8568159341812134], 'recall': [0.8809070587158203], 'f1': [0.868694543838501], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666669, 'rouge2': 0.0, 'rougeL': 0.16666666666666669, 'rougeLsum': 0.16666666666666669}
bleu 1/2 : 0 0
ppl : 26.625957489013672
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPATRICK: Any symptoms? Dizziness...?\nMICHELLE: Oh, no, I’m fine. I think... Except my leg.\n\n', 'answer': 'You can take off the seat belt now.', 'gold_tag': "PATRICK is in a position to help MICHELLE , PATRICK shows concern for MICHELLE's well-being , PATRICK directs MICHELLE's actions to alleviate her discomfort", 'last_speaker': 'PATRICK'}
Last word -> PATRICK : "You can take off the 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8518460988998413], 'recall': [0.8426775932312012], 'f1': [0.8472370505332947], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14285714285714288, 'rouge2': 0.0, 'rougeL': 0.14285714285714288, 'rougeLsum': 0.14285714285714288}
bleu 1/2 : 0.11942188509563156 0.04136895450425726
ppl : 277.71636962890625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPATRICK: You should probably go to the E.R.\nMICHELLE: I don’t want any entanglements right now. I just want to go home. I’ll see my own doctor.\n\n', 'answer': 'It’s your call.', 'gold_tag': "PATRICK is concerned about MICHELLE's health , PATRICK respects MICHELLE's decisions", 'last_speaker': 'PATRICK'}
Last word -> PATRICK : "It’s your call."
predic

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8514817953109741], 'recall': [0.8635598421096802], 'f1': [0.8574783205986023], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 67.96489715576172
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPATRICK: Looks to me like maybe a hairline fracture. You need real treatment.\nMICHELLE: Why did you do it?\n\n', 'answer': 'It was necessary.', 'gold_tag': 'Everyday Language', 'last_speaker': 'PATRICK'}
Last word -> PATRICK : "It was necessary."
prediction :  I did it because he doesn’t have anyone.
Real answer : It was necessary.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.854155421257019], 'recall': [0.9051665663719177], 'f1': [0.8789215087890625], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 46.03874206542969
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMISS STUBBS: Jenny. Again.\n\n', 'answer': "Isn't it because Mr Rochester's blind?", 'gold_tag': 'JENNY references Mr. Rochester, a character from Jane Eyre , JENNY is knowledgeable about literature , JENNY is well-read or studying literature', 'last_speaker': 'JENNY'}
Last word -> JENNY : "Isn't it because Mr Rochester's blind?"
prediction :  [She sits on a seat. The lights are low] Not a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8401919603347778], 'recall': [0.8140016794204712], 'f1': [0.8268894553184509], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 106.7236557006836
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMISS STUBBS: Jenny, could I have a word?\nJENNY: Of course.(To Hattie and Tina) I'll catch you up.\nMISS STUBBS: You can do anything you want, Jenny. You know that. You're clever and you're pretty... But sometimes those things fight. I'm worried that at the moment clever\nJENNY: What do you mean?\nMISS STUBBS: I couldn't bear it if clever Jenny lost. It's because of people like you that I plough through illiterate essays by Sandra Love

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8077589273452759], 'recall': [0.8494759202003479], 'f1': [0.8280923366546631], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0851063829787234, 'rouge2': 0.0, 'rougeL': 0.0851063829787234, 'rougeLsum': 0.0851063829787234}
bleu 1/2 : 0.025 0.00562543950463012
ppl : 12.522440910339355
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJENNY: This is lovely. But it is. Really. I'd love to live somewhere like this.\nMISS STUBBS: Oh, it's not hard. Go to Oxford and become a teacher and this is what you end up with.\nJENNY: But all these books and pictures.... Yes, but...That's all you need, isn't it? Just a place to...I'm sorry I said those silly things. I didn't understand.\nMISS STUBBS: Let's forget 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8916690349578857], 'recall': [0.8607574105262756], 'f1': [0.8759406208992004], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.3157894736842105, 'rouge2': 0.11764705882352942, 'rougeL': 0.3157894736842105, 'rougeLsum': 0.3157894736842105}
bleu 1/2 : 0.24202205228903967 0.1509272666182457
ppl : 66.5102310180664
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTYLER: Hey. It's Tyler.\nALLY: Hi.\nTYLER: You're going to stand me up. I can feel it in my bones. I'll be devastated.\nALLY: No... I...I was just...\nTYLER: You were just sitting around in your pajamas watching T.V.\n\n", 'answer': "No. I'm not watching T.V. I'm just...getting dressed. And I'm going to meet you at the place at..what time aga

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8791261315345764], 'recall': [0.8070510029792786], 'f1': [0.8415481448173523], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05714285714285715, 'rouge2': 0.0, 'rougeL': 0.05714285714285715, 'rougeLsum': 0.05714285714285715}
bleu 1/2 : 0 0
ppl : 446.43682861328125
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nALLY: Really? I had a nice time.\n\n', 'answer': "You had a nice time or you're having a nice time? Because if you're having one, why rush to end it?", 'gold_tag': "TYLER is inquiring about the current status of ALLY's enjoyment", 'last_speaker': 'TYLER'}
Last word -> TYLER : "You had a nice time or you're having a nice time? Because if you're having one, why rush to end it?"
prediction 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8526056408882141], 'recall': [0.8065581917762756], 'f1': [0.8289429545402527], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 262.03839111328125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTYLER: Did I tell you I'm Caucasian?\nALLY: Apparently. Can't we just stroll while I feed you ice cream with fireworks in the\nTYLER: No. If I stop, he wins.\nALLY: Who, the attendant?\n\n", 'answer': 'No, the frigging Giant Panda.', 'gold_tag': "TYLER's humor is more sarcastic and dry", 'last_speaker': 'TYLER'}
Last word -> TYLER : "No, the frigging Giant Panda."
prediction :  We were eating and drinking, what you were doing.
Real an

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8260430097579956], 'recall': [0.8342980146408081], 'f1': [0.8301500082015991], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 104.5973129272461
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nALLY: I don't know what you're smiling about.\nTYLER: I won.\nALLY: Paying off the attendant is not winning... It's cheating.\nTYLER: I just paid a premium to cut out the middle man. That's innovation. That's free market enterprising. That's what made America great.\nALLY: Aha. By the way, what happened to your eye?\nTYLER: Oh, it's nothing, just a bar fight.\nALLY: How butch.\nTYLER: I know. I'm practically a pirate.\nALLY: Okay, Capt

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9209606051445007], 'recall': [0.9191555976867676], 'f1': [0.9200572371482849], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.3333333333333333, 'rouge2': 0.2, 'rougeL': 0.3333333333333333, 'rougeLsum': 0.3333333333333333}
bleu 1/2 : 0.16666666666666669 0.05773502691896258
ppl : 37.384952545166016
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTYLER: I know. But we have coasters.\nALLY: I don't do coasters till the third date. Is that you?\nTYLER: That's Michael. My brother.\nALLY: He looks like you... or you look like him. Does he play around here?\nTYLER: Not any more.\nALLY: Do you play?\nTYLER: It depends who you ask.\nALLY: If I asked anybody?\nTYLER: They'd say no. I try but it turns out 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8545629978179932], 'recall': [0.8420032262802124], 'f1': [0.8482366800308228], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08, 'rouge2': 0.0, 'rougeL': 0.08, 'rougeLsum': 0.08}
bleu 1/2 : 0 0
ppl : 23.587234497070312
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nALLY: So what kind of grade did you end up with from Vogelstein?\nTYLER: I don't actually get grades. Beat. I'm not technically enrolled. I worked out an auditing thing.\nALLY: Oh. And The Strand is not a significant career choice.\nTYLER: I like the books. You don't have to help me, you know.\nALLY: At the rate you're going, dish washing is going to be your life's vocation. I can say I knew you when. This is the playful, you get m

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8305372595787048], 'recall': [0.7929735779762268], 'f1': [0.8113209009170532], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1081081081081081, 'rouge2': 0.0, 'rougeL': 0.05405405405405405, 'rougeLsum': 0.05405405405405405}
bleu 1/2 : 0 0
ppl : 69.30535125732422
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTYLER: Apologize.\nALLY: Never. Your middle name is Keats? Your parents are pretentious as hell.\nTYLER: Apologize.\nALLY: I'd rather eat monkeys.\nTYLER: I surrender! Oh my God, you're relentless.\n\n", 'answer': 'Victory at all costs.', 'gold_tag': 'ALLY is competitive and displays a relentless pursuit of victory', 'last_speaker': 'ALLY'}
Last word -> ALLY : "Victory at all costs."
predi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8229593634605408], 'recall': [0.8019507527351379], 'f1': [0.812319278717041], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 18.193538665771484
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nALLY: You like?\nTYLER: Let's just skip dinner and I'll tell you how great you look in like eighteen different languages.\nALLY: You just want to skip dinner. I got you a tie.\nTYLER: I was going to make a joke about how I'm not prepared for the commitment of exchanging accessories and how neck wear especially is a slippery slope, but I should probably shut up and say thank you.\n\n", 'answer': "You're learning.", 'gold_tag': 'ALLY has

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8524534702301025], 'recall': [0.8905012607574463], 'f1': [0.8710620403289795], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 192.4957733154297
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nALLY: Where are you going?\nTYLER: I have something I've got to do.\nALLY: Now?\n\n", 'answer': 'Are you capable of taking care of yourself for an hour? Do you need me here every minute, or what?', 'gold_tag': 'Tyler is responsible enough to look after Ally , Tyler has to do something for the next hour', 'last_speaker': 'TYLER'}
Last word -> TYLER : "Are you capable of taking care of yourself for an hour? Do you need me here every minu

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8664014339447021], 'recall': [0.8278465270996094], 'f1': [0.8466852903366089], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 37.00456619262695
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nALLY: He banged an Inuit! She's really nice. And smart. I don't get it. What's going on?\nTYLER: Your dad was here.\nALLY: What?\nTYLER: When I got home.\nALLY: I hate him!\nTYLER: There's more. He knows me. Before I met you. He arrested me and Aidan. He gave me that gash over my eye.\nALLY: I don't understand.\nTYLER: I mouthed off to him. He ... responded. Then Aidan saw you guys together at school. Aidan suggested...I...introduced m

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8983007669448853], 'recall': [0.8637320399284363], 'f1': [0.8806772828102112], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2857142857142857, 'rouge2': 0.16666666666666666, 'rougeL': 0.2857142857142857, 'rougeLsum': 0.2857142857142857}
bleu 1/2 : 0.09196986029286058 0.033582644728665616
ppl : 26.200244903564453
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nALLY: Using the little sister is dirty pool.\nTYLER: Is it working? Beat.\n\n', 'answer': 'Maybe.', 'gold_tag': 'Everyday Language', 'last_speaker': 'ALLY'}
Last word -> ALLY : "Maybe."
prediction :  This is just what I need. The world needs me, so I'll do it. I'll be the one who'll save us from the evil, and bring peace into the world.
R

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8282115459442139], 'recall': [0.9252766370773315], 'f1': [0.8740575909614563], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 10.445451736450195
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTYLER: Hey...Hey.\nALLY: Hmmm.\nTYLER: I'll be back in an hour. I have to stop by my dad's office.\nALLY: Do you want french toast or pancakes for breakfast?\nTYLER: Whichever tastes better a la mode.\nALLY: Say hi to your dad for me.\nTYLER: If I said I love you... what would you say?\nALLY: I love you.\n\n", 'answer': 'I love you too.', 'gold_tag': 'TYLER is in a romantic relationship with ALLY', 'last_speaker': 'TYLER'}
Last word -

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8425724506378174], 'recall': [0.8537909984588623], 'f1': [0.8481446504592896], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1802.8621826171875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: Erin -- hi. Sorry you had to wait. Here, sit down, sit down.\nERIN: Thanks a lot. I tell you, I never thought just standing would take it out of me, but ever since that shithead hit me, it feels like my whole body's put together wrong.\n\n", 'answer': "Jesus, you poor thing. Well, listen, you came to the right place. Whoever did this to you made one hell of a mistake, and you and me, we're gonna make him pay for it. Why don't you 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8445353507995605], 'recall': [0.8460450172424316], 'f1': [0.8452894687652588], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.23529411764705885, 'rouge2': 0.0, 'rougeL': 0.14705882352941177, 'rougeLsum': 0.14705882352941177}
bleu 1/2 : 0.08207738127923497 0.013256692994720331
ppl : 16.50189781188965
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: ... Matthew's six, Katie's four, and Beth's just nine months. ... just wanna be a good mom, a nice person, a decent citizen. Just wanna take good care of my kids. You know?\n\n", 'answer': 'Yeah. I know.', 'gold_tag': 'Everyday Language', 'last_speaker': 'ED'}
Last word -> ED : "Yeah. I know."
prediction :  Yeah. I think just about any mom would 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.841215968132019], 'recall': [0.9316916465759277], 'f1': [0.884145200252533], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2727272727272727, 'rouge2': 0.09999999999999999, 'rougeL': 0.2727272727272727, 'rougeLsum': 0.2727272727272727}
bleu 1/2 : 0.16666666666666669 0.0990147542976674
ppl : 24.079784393310547
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: Open and shut? Open and fucking shut?\nED: If you hadn't used profanity --\nERIN: Oh, please, it was long over by then. God damn, he made me look like some cheap --\nED: I told you the questions might get a little personal --\nERIN: Bullshit. You told me I'd get half a million dollars. You told me I'd be set.\nED: Okay -- let's try and 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8054834604263306], 'recall': [0.8469476699829102], 'f1': [0.8256953358650208], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 131.5106658935547
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: Erin! How's it going?\nERIN: You never called me back. I left messages.\nED: You did? Wow, sorry about that. Listen, Mario's a little not so bright. He seems to think that you said --\nERIN: There's two things I can't stand, Mr. Masry. Being ignored, and being lied to. You did both.\nED: I never lied. I may have miscalculated -- that happens sometimes, but --\nERIN: You said things would be fine, and they're not.\nED: I'm sorry abo

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8852826356887817], 'recall': [0.9011066555976868], 'f1': [0.893124520778656], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 550.0281372070312
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: Mr. Masry?\nED: Yeah?\nERIN: I was wondering -- could you tell me who I'd talk to about maybe getting an advance on my paycheck? Just -- for the weekend.\nED: Jane's the office manager. She handles payroll and petty cash. But she leaves early on Fridays.\nERIN: Oh. Okay. That's okay.\nED: Oh, for Christ's sake ... All I have is hundreds.\nERIN: I don't wanna take your money, Mr. Masry.\n\n", 'answer': "Bullshit, you don't.", 'gold

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8301366567611694], 'recall': [0.802963376045227], 'f1': [0.81632399559021], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 67594.546875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: How long's she been crying like that? ... Well, she's got that tooth coming in -- Give her a cold washcloth to suck on -- I gotta go -- there's a clean one in that bag -- I'll check back in a bit. Sorry. My kid --\nED: Where's Anna?\nERIN: Out to lunch with the girls.\nED: Oh. Huh. Well, look, I got this file I need valued.\nERIN: Oh. Okay.\nED: You do know how to do that, don't you?\nERIN: Yeah. I got it. No problem.\nED: Good. You're a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8550167083740234], 'recall': [0.8573712706565857], 'f1': [0.8561922907829285], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 56.513126373291016
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: What are you doing here?\nED: I got an interesting call this afternoon. It was from a Dr. Frankel.\nERIN: Oh, yeah?\nED: He wanted you to know the legal limit for hexavalent chromium, is .05 parts per million. And that at the rate you mentioned, .58, it could be responsible for the cancers in that family you asked about. The Irvings.\nERIN: Well, that was nice of him. Isn't it funny how some people go out of their way to help pe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8250761032104492], 'recall': [0.9381409883499146], 'f1': [0.8779834508895874], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 209.0410614013672
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: But, PG&E told her about the chromium?\nERIN: They told her something, but it can't have been too specific, cause I talked to her, and she sure didn't think her water was bad.\nED: So what made you think it was?\nERIN: It doesn't take a genius to look at those medical records and think something's wrong.\nED: What medical records?\nERIN: The ones in the box of files. The box of files? The one from your office? Boy, you musta really

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.830849289894104], 'recall': [0.8679695725440979], 'f1': [0.8490038514137268], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 22.724271774291992
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: It's Sinatra's world, we just live in it. Hello? ... Hi, babys. Baby's fine. Yes, I did. I did, too, you just didn't feel it. You think I could leave without kissing my babys? Okay, here you go. Bye-bye ... bye-bye ... no, you. Okay, together. Bye-bye.\n\n", 'answer': 'Um -- you mind pulling over? Just -- for a second?', 'gold_tag': 'ERIN feels uncomfortable , ERIN needs some private space , ERIN requested to pull over', 'last_spea

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8479411005973816], 'recall': [0.7986690998077393], 'f1': [0.822567880153656], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.125, 'rouge2': 0.0, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0.08067401742967989 0.02755542282313655
ppl : 54.664031982421875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: Hunh-uh. Absolutely not.\nERIN: That's crazy -- why not?\n\n", 'answer': "Because I said no. Look -- the only reason PG&E's even talking to us is cause this is a quiet little real estate dispute. We add plaintiffs, and suddenly we're in the middle of a toxic tort -- with a statute problem -- against a massive utility. No, thank you.", 'gold_tag': 'ED is possibly a lawyer or professional inv

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8522688150405884], 'recall': [0.8315106630325317], 'f1': [0.8417617678642273], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06779661016949154, 'rouge2': 0.0, 'rougeL': 0.06779661016949154, 'rougeLsum': 0.06779661016949154}
bleu 1/2 : 0.0003069679258442461 0.00010484950742849092
ppl : 22.15473175048828
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: Okay, so here's what I'll do. I'll go on up to Ted and Rita Daniels -- two of the nicest people you'd ever hope to meet, who spend every single day watching their little girl fight like a dog against this cancer -- I'll tell them we can't help them cause you don't feel like working that hard.\nED: It's not about working hard --\nERIN: Bullshi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8257439136505127], 'recall': [0.819932222366333], 'f1': [0.8228278160095215], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13333333333333333, 'rouge2': 0.0, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0 0
ppl : 281.523193359375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: And this shit is bad news, Mr. Masry. Not only does it attack every organ of the body, it fucks with your DNA, too. That means these people's genes, and the genes of their kids, and the genes of their grandkids --\n\n", 'answer': 'I know how DNA works, Erin --', 'gold_tag': 'ED is knowledgeable about DNA', 'last_speaker': 'ED'}
Last word -> ED : "I know how DNA works, Erin --"
predict

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8423605561256409], 'recall': [0.8323941826820374], 'f1': [0.8373477458953857], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14285714285714288, 'rouge2': 0.0, 'rougeL': 0.14285714285714288, 'rougeLsum': 0.14285714285714288}
bleu 1/2 : 0.12500000000000003 0.04225771273642583
ppl : 166.85792541503906
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: We can get these people. With a little effort, I really think we can nail their asses to the wall.\nED: Oh, you do? With all your legal expertise, you believe that?\n\n', 'answer': "Okay, fine. I don't know shit about shit. But I know the difference -- -- BETWEEN RIGHT AND WRONG!", 'gold_tag': 'ERIN admits her lack of legal expertise', 'last_spea

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8160027861595154], 'recall': [0.8304327130317688], 'f1': [0.8231545686721802], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1818181818181818, 'rouge2': 0.020618556701030927, 'rougeL': 0.10101010101010101, 'rougeLsum': 0.10101010101010101}
bleu 1/2 : 0.03797468354430379 0.006977498959044452
ppl : 15.03950309753418
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: Damn it. How many families we talking about here?\nERIN: Four more. Eleven people. So far.\nED: You think there's more?\nERIN: Well -- I found one document at the water board that had a toxic test well reading from 1967. A hell of a lot of people have lived on that land since then.\nED: This is a whole different ball game, Erin. A m

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8217658996582031], 'recall': [0.8225194215774536], 'f1': [0.822142481803894], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10909090909090907, 'rouge2': 0.018518518518518517, 'rougeL': 0.09090909090909091, 'rougeLsum': 0.09090909090909091}
bleu 1/2 : 0.025 0.017789201674120497
ppl : 20.15934181213379
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: Is that what I think it is?\n\n', 'answer': 'She lived on the plume. You never know.', 'gold_tag': 'ERIN has experience dealing with a case related to the plume area , ED and ERIN have worked on or discussed a case involving a person who lived on the plume', 'last_speaker': 'ERIN'}
Last word -> ERIN : "She lived on the plume. You never know."
pre

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8458194136619568], 'recall': [0.8627781867980957], 'f1': [0.8542146682739258], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 40.8738899230957
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: They used the hex chrom here, in these cooling tanks, as an anti-corrosive. Then they dumped it here, in these six ponds.\nED: I don't remember seeing any ponds up there.\nERIN: They covered 'em over. And not too carefully either, cause you dig one inch under the surface, and the dirt is green as\nED: And that's what caused the contamination?\nERIN: It didn't help, but no. The real problem's on the bottom. See, according to this, 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8133485913276672], 'recall': [0.747628390789032], 'f1': [0.779105007648468], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 378.0323791503906
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: Erin -- lemme tell you something. If I'da put three researchers on this, I wouldn't expect them to dig up all the information you got here. This is some damn good work.\nERIN: Yeah? Then gimme another raise.\nED: Hey, I got a staff to pay, plus rent, plus I haven't billed a minute of my time since I started on this case, so you can quit hitting me up like I'm rich or something.\nERIN: Don't give me that. You're gonna get plenty rich 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8466792106628418], 'recall': [0.8493960499763489], 'f1': [0.8480355143547058], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.19047619047619047, 'rouge2': 0.0, 'rougeL': 0.19047619047619047, 'rougeLsum': 0.19047619047619047}
bleu 1/2 : 0.07142857142857141 0.023440361546924773
ppl : 27.42043685913086
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: Jesus. They look like the Secret Service.\n\n', 'answer': "They're trying to intimidate us. Tell them to wait in the conference room.", 'gold_tag': 'ED is in a position of authority , ED directs ERIN on what to do', 'last_speaker': 'ED'}
Last word -> ED : "They're trying to intimidate us. Tell them to wait in the conference room."
prediction :  T

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8753713369369507], 'recall': [0.8835970163345337], 'f1': [0.8794649839401245], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.23999999999999996, 'rouge2': 0.08695652173913043, 'rougeL': 0.23999999999999996, 'rougeLsum': 0.23999999999999996}
bleu 1/2 : 0.14248453076221212 0.03379317364116882
ppl : 17.1185359954834
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: Hey. A new plaintiff called, wants to meet you. I told him we'd be out there Thursday.\nERIN: D'you get his name? Course not. Jesus, Ed --\nED: He said he'd be at the gas station at six.\n\n", 'answer': 'Boy, this job takes me to some of the best damn places, huh?', 'gold_tag': 'Everyday Language', 'last_speaker': 'ERIN'}
Last word ->

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8224612474441528], 'recall': [0.8169595003128052], 'f1': [0.8197011947631836], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 48.49069595336914
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: Someone's following me.\nED: What? Who?\nERIN: Some guy in a truck -- he waited till I was alone, then he followed me, like, two miles. Jesus, I'm shaking. Get me a beer.\nED: Beer, please. What kind of truck? He's gone. Did you get a license plate? Or a make?\nERIN: No, Ed -- what with me running for my life, I didn't have time to check those things --\nED: I was just asking. Are you all right?\nERIN: Yeah. Yeah, I'm ... fine.\n

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8806174993515015], 'recall': [0.8569605946540833], 'f1': [0.8686279654502869], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0.03949329237565206 0.013489527418631811
ppl : 126.0315933227539
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: Lock the door every which way you can. And if there's anything -- funny sound, whatever -- pound on the wall. I'll come right over.\nERIN: I think you should know -- there's very few people in the world who don't piss me off 90 percent of the time. You're one of 'em.\n\n", 'answer': "I'm triple locking my door, so don't even think of trying to take

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8305742740631104], 'recall': [0.8520792722702026], 'f1': [0.8411893248558044], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2105263157894737, 'rouge2': 0.0, 'rougeL': 0.15789473684210525, 'rougeLsum': 0.15789473684210525}
bleu 1/2 : 0.14285714285714285 0.026726124191242442
ppl : 50.585758209228516
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: I want my fucking money --\nED: I'm sorry, I'm gonna have to put you on hold for just one second here -- Do you mind?\nERIN: Yeah, I mind. You bet your ass I mind.\nED: Oh, Jesus. You wanna tell me what the problem is here, or --\nERIN: It's my paycheck. Which I earned. Which I deserve. Which I shouldn't have to beg for. That fat-ass bitch won't 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8533985018730164], 'recall': [0.8267761468887329], 'f1': [0.8398764133453369], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.17500000000000002, 'rouge2': 0.0, 'rougeL': 0.15, 'rougeLsum': 0.15}
bleu 1/2 : 0.08435117013232406 0.01214795691633062
ppl : 15.870749473571777
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: Believe it or not, I would've been satisfied with just the check.\nED: Well, you go threatening to leave, I can't take any chances. You're the only person around who understands what I'm doing. Things come up, I gotta know I got someone to turn to.\nERIN: What kind of things come up?\nED: Things like the head counsel for PG&E calling me with an offer. 20 million, plus attorne

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8151774406433105], 'recall': [0.8394392728805542], 'f1': [0.8271304965019226], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 13.799365997314453
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: Holy shit. Who do they represent, God?\nED: Don't joke. They might. So do me a favor and behave yourself for once. Ed Masry to see Kurt Potter.\n\n", 'answer': "Scuse me, sir, you got a real nice ass, you know that? Oh, I'm sorry. Was that not what you meant by behaving myself?", 'gold_tag': 'ERIN is a forthright and uninhibited individual , ERIN uses humor and flirtatious comments in formal settings', 'last_speaker': 'ERIN'}
La

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.855133593082428], 'recall': [0.8194774389266968], 'f1': [0.8369258642196655], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11428571428571428, 'rouge2': 0.0, 'rougeL': 0.11428571428571428, 'rougeLsum': 0.11428571428571428}
bleu 1/2 : 0.0493193927883213 0.011624692361541632
ppl : 40.95785903930664
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: She insulted me!\nED: Bullshit. It was a misunderstanding. But instead of handling it politely, instead of treating her with respect --\nERIN: Why the fuck should I respect her?\nED: Because that's how people treat each other!\nERIN: Not in my world.\nED: Gee, I wonder why.\n\n", 'answer': "Hey! You're my ride!", 'gold_tag': 'ERIN is dependent on E

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8502171039581299], 'recall': [0.8629820346832275], 'f1': [0.856552004814148], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 144.30699157714844
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: If you tell me to relax, I\'m gonna kick your fucking head off --\nED: Erin, it\'s just a meeting.\nERIN: People don\'t fly down in their own god damn plane for "just a meeting" --\nED: Look, you said you weren\'t feeling great. I thought you should rest.\nERIN: Bullshit. You\'d drag me off my deathbed if it suited you.\nED: Okay, look. It\'s an important meeting. Kurt thought, if it was just lawyers --\nERIN: Kurt thought? What 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8329549431800842], 'recall': [0.8349733352661133], 'f1': [0.8339629769325256], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.17857142857142858, 'rouge2': 0.0, 'rougeL': 0.14285714285714285, 'rougeLsum': 0.14285714285714285}
bleu 1/2 : 0.14276154730389426 0.02300257517967063
ppl : 26.536556243896484
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: Between 50 and 400 million, definitely?\nED: Uh-huh.\nERIN: And if you had to guess ...\nED: With nothing linking it to the corporate offices yet, I'd say we'll end up on the lower end of that. Still a lot of money.\nERIN: So why would PG&E offer it?\nED: Because. They know the evidence; they know they're gonna lose a jury trial. Maybe they would

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.865763783454895], 'recall': [0.8718118071556091], 'f1': [0.868777334690094], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.23809523809523808, 'rouge2': 0.05, 'rougeL': 0.14285714285714282, 'rougeLsum': 0.14285714285714282}
bleu 1/2 : 0.09405352498784599 0.021640728223007746
ppl : 33.80498123168945
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIN: Morning!\nED: Erin? What are you --\nERIN: You know what, Mr. Potter? I completely forgot your birthday this year. And seeing as how you've been so good to me, I think that is a terrible oversight. So what I been doing over the last few days is I've been putting together a present for you. 635. They all signed. Every single one.\nED: Ho - ly - shi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8716510534286499], 'recall': [0.8187217116355896], 'f1': [0.8443577289581299], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.15384615384615383, 'rouge2': 0.02631578947368421, 'rougeL': 0.12820512820512822, 'rougeLsum': 0.12820512820512822}
bleu 1/2 : 0.010427899912055078 0.0017161210230038036
ppl : 74.28224182128906
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nED: Hey! I don't know what to say.\nERIN: Say you were wrong.\nED: I was wrong.\nERIN: Say you shortchanged me and you shortchanged yourself.\nED: I did. Both.\nERIN: Say you'd be the luckiest son of a bitch on Earth if I didn't up and quit over all this.\nED: The luckiest son of a bitch in the universe, Erin. The luckiest son of a bi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8726068139076233], 'recall': [0.8417760729789734], 'f1': [0.8569141626358032], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0909090909090909, 'rouge2': 0.0, 'rougeL': 0.0909090909090909, 'rougeLsum': 0.0909090909090909}
bleu 1/2 : 0 0
ppl : 75.22005462646484
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKENNY: Yes, sir.\nGREEN: Can I help you?\nKENNY: Yes, sir. I need a room\n\n', 'answer': 'I think we can do that. Come in.', 'gold_tag': 'GREEN is likely an employee or owner of a business , GREEN is possibly associated with a hotel as they are able to provide a room', 'last_speaker': 'GREEN'}
Last word -> GREEN : "I think we can do that. Come in."
prediction :  You are checking in. This is 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8556614518165588], 'recall': [0.8713607788085938], 'f1': [0.8634397983551025], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1, 'rouge2': 0.0, 'rougeL': 0.1, 'rougeLsum': 0.1}
bleu 1/2 : 0.0909090909090909 0.03015113445777637
ppl : 46.25747299194336
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKENNY: Kenneth Fraser. Glad to meet you... My wire's been in the hospital so things might be a little dusty, but You're in music, I guess, aren't you? 'Cause my niece sings most of the day. That won't bother you, will it?\nGREEN: I didn't think it would. She's calling herself L.A. Joan this- week. Last week it was Dakota. Guess if you have to change your name a state's as good as anything.\nKENNY: She

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8194042444229126], 'recall': [0.8377374410629272], 'f1': [0.8284693956375122], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 597.3021850585938
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGREEN: Sure... you're a fine boy, Ken, you\nKENNY: Okay... you get to sleep. I'll see you in the morning. Good night.\n\n", 'answer': 'Good night,', 'gold_tag': "GREEN has a good opinion about KENNY's character", 'last_speaker': 'GREEN'}
Last word -> GREEN : "Good night,"
prediction :  OK, good night.
Real answer : Good night,


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9130120873451233], 'recall': [0.8921158313751221], 'f1': [0.9024430513381958], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.8, 'rouge2': 0.6666666666666666, 'rougeL': 0.8, 'rougeLsum': 0.8}
bleu 1/2 : 0 0
ppl : 253.4626007080078
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nARMAND FL�CHARD: Mademoiselle...\nARABELLA DE CHAVILLE: Tiens, voil� le petit Chopin. A qui ?\nARMAND FL�CHARD: A toi, mademoiselle.\n\n', 'answer': 'Et bien, allons-y, mon vieux.', 'gold_tag': 'Everyday Language', 'last_speaker': 'ARABELLA DE CHAVILLE'}
Last word -> ARABELLA DE CHAVILLE : "Et bien, allons-y, mon vieux."
prediction :  Mademoiselle Arabelle, madame.
Real answer : Et bien, allons-y, mon vieux.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7994507551193237], 'recall': [0.7888250350952148], 'f1': [0.7941023707389832], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 7281.048828125
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nARMAND FL�CHARD: Oh, mon billet.\nARABELLA DE CHAVILLE: Et bien, Tarzan ?\n\n', 'answer': 'Travaillons... mademoiselle, travaillons.', 'gold_tag': 'ARMAND FLÉCHARD is involved in some kind of work or job', 'last_speaker': 'ARMAND FL�CHARD'}
Last word -> ARMAND FL�CHARD : "Travaillons... mademoiselle, travaillons."
prediction :  Est-ce que tu peux prendre mon billet ? Je veux acheter des tickets pour aller dans la gare.
Real answer : Trava

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7902291417121887], 'recall': [0.7852413654327393], 'f1': [0.7877273559570312], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 24.242341995239258
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nARMAND FL�CHARD: Nous allons attaquer aujourd'hui l'allegretto amoroso. Mais � quoi pensez-vous donc ?\nARABELLA DE CHAVILLE: Oh, � autre chose. Oh, c'est pas pr�t de m'arriver. D'ailleurs, je n'aimerais jamais qu'un homme d'action, un homme comme moi. Et � Montpaillard... ... il n'y a que des nouilles !\nARMAND FL�CHARD: Merci beaucoup, mademoiselle. Je suis donc... une nouille.\nARABELLA DE CHAVILLE: Mais vous �tes de Dijon.\nARMAND

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7739219069480896], 'recall': [0.8009418249130249], 'f1': [0.7872000932693481], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.18181818181818182, 'rouge2': 0.12903225806451613, 'rougeL': 0.18181818181818182, 'rougeLsum': 0.18181818181818182}
bleu 1/2 : 0.04545454545454546 0.014712247158412498
ppl : 36.18153381347656
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nARMAND FL�CHARD: Au galop, mademoiselle, au galop !\nARABELLA DE CHAVILLE: Monsieur Fl�chard, comme vous me traitez durement.\nARMAND FL�CHARD: Oh, et tant pis.\nARABELLA DE CHAVILLE: Tant mieux. J'aime quand vous �tes m�chant avec moi.\nARMAND FL�CHARD: Oh !\nARABELLA DE CHAVILLE: Vous �tes un homme, vous.\nARMAND FL�CHARD: Non, je sui

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8623613119125366], 'recall': [0.8138614892959595], 'f1': [0.8374097347259521], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1806.92529296875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nARMAND FL�CHARD: Pourquoi ?\nARABELLA DE CHAVILLE: C'est mon secret. Vous �tes ivre, monsieur Fl�chard.\n\n", 'answer': 'Mais... mais... Arabella... Mademoiselle ! Mais enfin, je... mais...', 'gold_tag': 'Everyday Language', 'last_speaker': 'ARMAND FL�CHARD'}
Last word -> ARMAND FL�CHARD : "Mais... mais... Arabella... Mademoiselle ! Mais enfin, je... mais..."
prediction :  Oh oui. C'est un peu l�t � ma part, mais vous avez perdu vos lun

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7941531538963318], 'recall': [0.7977921366691589], 'f1': [0.7959684729576111], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07692307692307691, 'rouge2': 0.0, 'rougeL': 0.07692307692307691, 'rougeLsum': 0.07692307692307691}
bleu 1/2 : 0 0
ppl : 77.08434295654297
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nARABELLA DE CHAVILLE: C'est faux.\nARMAND FL�CHARD: C'est moi qui ai ross� Parju !\nARABELLA DE CHAVILLE: Vous ? Allons donc !\nARMAND FL�CHARD: Tenez, la preuve !\nARABELLA DE CHAVILLE: Je ne vous crois pas.\nARMAND FL�CHARD: N'emp�che... L'homme qui vous aime dans l'ombre.\nARABELLA DE CHAVILLE: Vous avez lu mes lettres.\nARMAND FL�CHARD: C'est moi qui les ai �crites.\nARABELLA DE CHAVI

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8733465671539307], 'recall': [0.8630298376083374], 'f1': [0.8681575655937195], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.28571428571428575, 'rouge2': 0.0, 'rougeL': 0.28571428571428575, 'rougeLsum': 0.28571428571428575}
bleu 1/2 : 0.25 0.09128709291752769
ppl : 3094.091552734375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nARMAND FL�CHARD: Oh oui, Arabella.\nARABELLA DE CHAVILLE: Vous allez �tre mis en prison ?\nARMAND FL�CHARD: Oui... Comment donc ? En prison ? Mais j'y vais.\nARABELLA DE CHAVILLE: Vous feriez �a pour moi ?\nARMAND FL�CHARD: Oui, Arabella.\nARABELLA DE CHAVILLE: Tout de suite ?\n\n", 'answer': "J'y vais. J'y vais !", 'gold_tag': 'Everyday Language', 'last_speaker': 'AR

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8018232583999634], 'recall': [0.790577232837677], 'f1': [0.7961605191230774], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 19820.935546875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNATHAN: Meditations on a Banana Slug was a delightful read.\nLILA: Thank you so much. I love slugs. All slugs, not just banana slugs.\nNATHAN: As do I.\nLILA: They're so even keel. They forge ahead with slow determination. They don't get distracted or side-tracked. They don't care what they look like. They don't care that people look at\nNATHAN: They don't seem to be especially ego driven, this is true.\nLILA: You've got to respect that.\

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8417949676513672], 'recall': [0.8222072720527649], 'f1': [0.8318858742713928], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 84.66878509521484
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLILA: It looks wonderful.\nNATHAN: You look wonderful. I'm on top of the world tonight, Lila. Work is going splendidly and my personal life is ...\nLILA: Um-mmm. Oh Nathan, this salad is delish...\nNATHAN: My God! The fork! The fork!\nLILA: I'm sorry?\nNATHAN: Tell her, Harold... It's just that... It's nothing. It's just that the outside fork is the\nLILA: Oh, I'm sorry. I'm sorry, Nathan. I never really learned those things.\nNATHAN: 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8448419570922852], 'recall': [0.8903897404670715], 'f1': [0.8670181035995483], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.125, 'rouge2': 0.0, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0 0
ppl : 19.942556381225586
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNATHAN: Darling, did you bring the insect repellent lotion?\nLILA: Yes, darling.\nNATHAN: Oh, and the sun block?\nLILA: Of course.\nNATHAN: What SPF, sweetie? Perfectomundo! We are ready! Say, wouldn\'t it be wonderful to have an insect repellent lotion that also worked as a sun block? Think of all the time one would save.\nLILA: Yes, darling.\nNATHAN: I think I\'ll get Johannsen in chemistry on that. Oh! Did you bring the first

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.852759838104248], 'recall': [0.8667959570884705], 'f1': [0.8597205877304077], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10526315789473682, 'rouge2': 0.0, 'rougeL': 0.10526315789473682, 'rougeLsum': 0.10526315789473682}
bleu 1/2 : 0.07142857142857141 0.023440361546924773
ppl : 30.602941513061523
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLILA: Did you see that?\nNATHAN: What?\nLILA: I don't know. Something.\nNATHAN: A deer?\nLILA: No. Too... upright. Might've been a person.\nNATHAN: It might behoove us to turn back at this point.\nLILA: Come on.\n\n", 'answer': "If it's a person, why should we go see it? It's not like it's nature or anything. It's just a person. Sometimes people who li

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8441567420959473], 'recall': [0.7937849760055542], 'f1': [0.8181962966918945], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.02298850574712644, 'rouge2': 0.0, 'rougeL': 0.02298850574712644, 'rougeLsum': 0.02298850574712644}
bleu 1/2 : 2.4809901599134267e-07 8.771624830659407e-08
ppl : 86.02230072021484
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLILA: Who are you? You don't understand my language, do you? I take it back, you do understand my language. Oh God.\nNATHAN: Oh my God! Is he dead? Please put something on. You'll catch cold. It's cold. What do you suppose he is, a survivalist?\nLILA: I think he's feral.\nNATHAN: Feral? Don't touch him! He might be diseased! He might... My God, rab

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8063539266586304], 'recall': [0.8160065412521362], 'f1': [0.8111515641212463], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 19.714744567871094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNATHAN: ...what is it that makes us human, if not the knowledge that we are indeed human? Think of this poor soul's education as the greatest gift we could bestow upon...\nLILA: All right.\n\n", 'answer': "Great. Grab his feet. We'll throw him in the trunk.", 'gold_tag': 'NATHAN is authoritative, giving instructions to LILA', 'last_speaker': 'NATHAN'}
Last word -> NATHAN : "Great. Grab his feet. We'll throw him in the trunk."
predicti

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8366496562957764], 'recall': [0.8810504674911499], 'f1': [0.8582761883735657], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1176470588235294, 'rouge2': 0.0, 'rougeL': 0.1176470588235294, 'rougeLsum': 0.1176470588235294}
bleu 1/2 : 0.05000000000000001 0.01622214211307625
ppl : 15.391034126281738
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLILA: Are you seeing somebody else, Nathan? I just have to know.\nNATHAN: Of course not.\nLILA: It would just be helpful to know.\nNATHAN: No.\nLILA: Because, you know, you seem so distant. And you work late every night. And we hardly ever have sex, and when we do, it's... I don't know. It feels different.\nNATHAN: I'm just preoccupied.\nLILA: Do you like

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8651353716850281], 'recall': [0.8659545183181763], 'f1': [0.8655447363853455], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0.1637461506155964 0.05789300674674099
ppl : 136.6413116455078
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLILA: Oh! I didn't see you there, sneaky boy! You're like a boy sneaking in...\nNATHAN: ...the back door of a movie theater. Yes, indeed.\nLILA: You remember that from my book? I'm touched! What's wrong?\nNATHAN: Nothing. Hard day. Gonna have a drink.\nLILA: I'll make it. I'm so happy, Nathan! Everything's going to be so great! Scotch on the rocks, right

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8259421586990356], 'recall': [0.8174830675125122], 'f1': [0.8216908574104309], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06153846153846154, 'rouge2': 0.0, 'rougeL': 0.03076923076923077, 'rougeLsum': 0.03076923076923077}
bleu 1/2 : 0.02653819668791812 0.008551978820280986
ppl : 16.471567153930664
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNATHAN: Bravo to you, Puff!\n\n', 'answer': 'That was wonderful!', 'gold_tag': 'Everyday Language', 'last_speaker': 'LILA'}
Last word -> LILA : "That was wonderful!"
prediction :  Good luck, Nathan.
Real answer : That was wonderful!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8900508284568787], 'recall': [0.8685994148254395], 'f1': [0.8791942596435547], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 510.57037353515625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNATHAN: I'm going to go down and check on Puff. See how he's holding up.\nLILA: Should I come with?\nNATHAN: Nah. You just relax. How's the book?\nLILA: Ummm. It's good.\n\n", 'answer': "I won't be long.", 'gold_tag': "NATHAN's temporal information", 'last_speaker': 'NATHAN'}
Last word -> NATHAN : "I won't be long."
prediction :  How's the book?
Real answer : I won't be long.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8187116980552673], 'recall': [0.8269330263137817], 'f1': [0.8228018283843994], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 162.00912475585938
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNATHAN: Hey.\nLILA: You were gone a long time.\nNATHAN: Yeah. Puff and I got into a big, philosophical discussion. He\'s really quite well read, considering he\'s only been literate for a month now. He\'s going to make us famous, Lila.\nLILA: So he\'s doing okay?\nNATHAN: Seemed fine. Quiet evening enjoying\nLILA: That\'s funny because, you know, I just went and picked him up at some flophouse on the lower eastside. He called here whe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8281537890434265], 'recall': [0.8196728229522705], 'f1': [0.8238914012908936], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 20.146892547607422
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTYLER: Next! Oh, no. What do you want?\nUTAH: Shrimp and fries.\nTYLER: I mean, what do you want? What are you doing hanging around here.\nUTAH: I need you to teach me.\nTYLER: Gimme a break. One shrimp and fries to go! Anything to drink?\nUTAH: I'm serious.\nTYLER: I can see that. But forget it. Stick to tennis, or whatever you're good at. Miniature golf. Here, your number's 37.\nUTAH: Well, I'm just gonna go back out there till I ca

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8555049896240234], 'recall': [0.8550550937652588], 'f1': [0.8552799820899963], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0.14285714285714285 0.048795003647426664
ppl : 43.98386764526367
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nUTAH: Nice place.\n\n', 'answer': "He rents it for the summer. Bodhi always gets some slick place and throws it open to every surf burnout around. Most a these guys can't keep a job. When the swell comes, they're gone, they have to ride.", 'gold_tag': 'TYLER is aware that Bodhi rents a slick place every summer , TYLER is familiar with the habits of sur

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8758458495140076], 'recall': [0.8268311023712158], 'f1': [0.8506329655647278], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0425531914893617, 'rouge2': 0.0, 'rougeL': 0.0425531914893617, 'rougeLsum': 0.0425531914893617}
bleu 1/2 : 0.00022275502956896066 7.875579597581101e-05
ppl : 67.85456848144531
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTYLER: You had enough?\nUTAH: Yeah. I just want to sit out here for a minute.\nTYLER: Look at you.\nUTAH: What?\nTYLER: Well, usually you have this sort of intense scowl of concentration, like you're doing this for a school project or something... I don't know, like something's driving you. See, it's gone. If I didn't know better I'd say you looked al

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8584397435188293], 'recall': [0.9019308090209961], 'f1': [0.8796480894088745], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 41.876487731933594
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nUTAH: Ouch.\nTYLER: Betadyne doesn't hurt.\nUTAH: You're kneeling on my hand.\nTYLER: So what'd the other guy look like?\nUTAH: Never saw him, was your basic hit and run.\nTYLER: But you look like you been in a train wreck, how'd he just drive away? Johnny, what is it with you? You have that look again, it's like you're about to tell me something and then you don't... or you can't. What's going on? What? What do you want to tell me?\n

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8461649417877197], 'recall': [0.8627103567123413], 'f1': [0.8543575406074524], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.19047619047619044, 'rouge2': 0.05, 'rougeL': 0.19047619047619044, 'rougeLsum': 0.19047619047619044}
bleu 1/2 : 0.10344827586206899 0.06078306738548309
ppl : 23.089035034179688
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPOLLY: Elsie!\nELSIE: Yes, Mum?\nPOLLY: Elsie, where have you been?\n\n', 'answer': 'In the garden...', 'gold_tag': 'ELSIE was recently in the garden', 'last_speaker': 'ELSIE'}
Last word -> ELSIE : "In the garden..."
prediction :  I am here, mum. This is my family.
Real answer : In the garden...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8544167280197144], 'recall': [0.8288007974624634], 'f1': [0.8414138555526733], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 55.872684478759766
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELSIE: I'm sorry, Joseph. It's all my fault. I didn't mean for any of this to happen...\nPOLLY: What are you doing in here? This is Joseph's room! My Joseph's room!\nELSIE: Mummy, you're hurting me!\n\n", 'answer': "I'm sorry...", 'gold_tag': 'POLLY immediately shows remorse', 'last_speaker': 'POLLY'}
Last word -> POLLY : "I'm sorry..."
prediction :  You must be Joseph. Tell me you are well. Tell me I don't have to cry any more. Tell 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8256708979606628], 'recall': [0.8461835384368896], 'f1': [0.835801362991333], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08333333333333333, 'rouge2': 0.0, 'rougeL': 0.08333333333333333, 'rougeLsum': 0.08333333333333333}
bleu 1/2 : 0 0
ppl : 25.79417610168457
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELSIE: Are they gone?\n\n', 'answer': "On the contrary. They've asked to hear about the fairies.", 'gold_tag': 'Everyday Language', 'last_speaker': 'POLLY'}
Last word -> POLLY : "On the contrary. They've asked to hear about the fairies."
prediction :  No, they will be back later.
Real answer : On the contrary. They've asked to hear about the fairies.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8561221361160278], 'recall': [0.8453834056854248], 'f1': [0.8507188558578491], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352942, 'rouge2': 0.0, 'rougeL': 0.11764705882352942, 'rougeLsum': 0.11764705882352942}
bleu 1/2 : 0 0
ppl : 105.86732482910156
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELSIE: I'm so sorry, Mum. We had to do it this way so you could see them. But they are real...they are.\n\n", 'answer': 'I believe you.', 'gold_tag': "POLLY's trust in her daughter is strong", 'last_speaker': 'POLLY'}
Last word -> POLLY : "I believe you."
prediction :  How are I a hero? You were the one that saw the evidence, so I had to do the dirty work. You did good.
Real answer : I b

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8405120372772217], 'recall': [0.8882297277450562], 'f1': [0.8637123107910156], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14814814814814814, 'rouge2': 0.0, 'rougeL': 0.14814814814814814, 'rougeLsum': 0.14814814814814814}
bleu 1/2 : 0.041666666666666664 0.013459547551454133
ppl : 38.95928955078125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHELEN: Look. We can leave our coats over there. I want to get rid of this.\nJENNY: I'm so sorry.\nHELEN: That's OK. It's nice, isn't it?\nJENNY: It's beautiful. Where did it come from?\nHELEN: Oh, South Ken somewhere. This is...Well, it's good for this sort of concert, isn't it?\nJENNY: Thank you.\nHELEN: We should go shopping together one day, if you

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8234870433807373], 'recall': [0.8087911009788513], 'f1': [0.8160728812217712], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 15.227816581726074
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHELEN: There. You'll do. You can keep it. I can only wear so many every day.\nJENNY: Really? Thank you.\nHELEN: What about tonight? Will you be needing a nightie? Or not?\nJENNY: A nightie? Will we be sharing bedrooms?\nHELEN: You're not sleeping with him?\nJENNY: No. I'm...No.\nHELEN: Good for you.\nJENNY: Really? Do you think so?\nHELEN: You're only sixteen. And you don't want to get into the family way, do you?\nJENNY: Oh, I'd make

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8924636244773865], 'recall': [0.8247506618499756], 'f1': [0.8572720885276794], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1615.0216064453125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHELEN: Oh, it's always like this. There are millions of places I've never seen because I've been stuck in here.\nJENNY: You never get out?\n\n", 'answer': "There's never anywhere to go in the places they stop.", 'gold_tag': 'HELEN is dissatisfied with the places she gets to visit', 'last_speaker': 'HELEN'}
Last word -> HELEN : "There's never anywhere to go in the places they stop."
prediction :  Well, I do. (She gets up form her chair

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8183044195175171], 'recall': [0.8509347438812256], 'f1': [0.8343006372451782], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08, 'rouge2': 0.0, 'rougeL': 0.08, 'rougeLsum': 0.08}
bleu 1/2 : 0.07142857142857141 0.023440361546924773
ppl : 45.28706359863281
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHELEN: They won\'t be long. Either way.\nJENNY: "Either way"?\n\n', 'answer': "Sometimes they find something, sometimes they don't. And when they do find something, we usually have to leave quite quickly. They can be a bit naughty, sometimes. Anyway. It's nice to have company. I'm usually outside on my own.", 'gold_tag': 'HELEN often finds herself outside alone , HELEN appreciates company , HELEN

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8664261102676392], 'recall': [0.8430694341659546], 'f1': [0.8545882105827332], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16, 'rouge2': 0.04166666666666667, 'rougeL': 0.12, 'rougeLsum': 0.12}
bleu 1/2 : 0.004432788512836392 0.0008741576834515867
ppl : 51.96991729736328
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWILLIE: Ready.\n\n', 'answer': 'Jesus.', 'gold_tag': 'Everyday Language', 'last_speaker': 'MARCUS'}
Last word -> MARCUS : "Jesus."
prediction :  OK
Real answer : Jesus.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9400529861450195], 'recall': [0.9699887633323669], 'f1': [0.9547863006591797], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 840594.625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMARCUS: How's it goin'?\nWILLIE: I'm finished when I'm finished.\nMARCUS: I'm goin' downstairs... I need a melon-bailer and a loofah.\nWILLIE: Got it.\n\n", 'answer': "Fuck the loofah, let's go.", 'gold_tag': 'MARCUS speaks in a directive manner', 'last_speaker': 'MARCUS'}
Last word -> MARCUS : "Fuck the loofah, let's go."
prediction :  Let's play a game of "guess the melon". Tell me what kind of melon it is, and I'll name one from the list.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8296997547149658], 'recall': [0.8662850856781006], 'f1': [0.8475977778434753], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1935483870967742, 'rouge2': 0.06896551724137931, 'rougeL': 0.12903225806451613, 'rougeLsum': 0.12903225806451613}
bleu 1/2 : 0.043478260869565216 0.014058038927888336
ppl : 9.734620094299316
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWILLIE: I gotta get a drink on. See ya tomorrow.\nMARCUS: Just don't come in to work stinkin' of booze again.\nWILLIE: Don't worry about me. Get going, you'll be late for your Wizard of Oz Candy Bar Guild thing.\nMARCUS: Lollipop Guild, asshole. Jesus, two year olds flip me shit better'n you.\nWILLIE: You tryin' to say something to me?\

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8546593189239502], 'recall': [0.8616424202919006], 'f1': [0.8581367135047913], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14814814814814814, 'rouge2': 0.0, 'rougeL': 0.14814814814814814, 'rougeLsum': 0.14814814814814814}
bleu 1/2 : 0.06666666666666667 0.02182178902359924
ppl : 29.044635772705078
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMARCUS: That\'s just the kinda shit that\'s gonna get us pinched!\nWILLIE: She said she was eighteen.\nMARCUS: You promised no arcades! You said you\'d only hustle Big & Tall!\nWILLIE: Ah, it\'s like shooting fish in a barrel -- there\'s no sport,\nMARCUS: How many times, you fuck? "The bigger the store, the bigger the take." Well, we can\'t work the b

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8557252287864685], 'recall': [0.805739164352417], 'f1': [0.8299802541732788], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09090909090909093, 'rouge2': 0.0, 'rougeL': 0.09090909090909093, 'rougeLsum': 0.09090909090909093}
bleu 1/2 : 0 0
ppl : 35.26494216918945
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWILLIE: No, I think it's a cop though. You think someone's onto us?\nMARCUS: Is there anything in the room? Anything professional?\nWILLIE: No. Clothes.\n\n", 'answer': 'Just ditch. You got anywhere to sack out for a while?', 'gold_tag': 'WILLIE lives a transient lifestyle, indicated by his need to find a temporary place to stay , WILLIE is in a transient situation, needing to find a tempo

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8485301733016968], 'recall': [0.865058183670044], 'f1': [0.8567144870758057], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08695652173913043, 'rouge2': 0.0, 'rougeL': 0.08695652173913043, 'rougeLsum': 0.08695652173913043}
bleu 1/2 : 0.0909090909090909 0.03015113445777637
ppl : 63.392940521240234
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWILLIE: GOD dammit!\nMARCUS: You tear your ball again?\nWILLIE: No, it's okay.\n\n", 'answer': "Let's do the other thing. There.", 'gold_tag': 'Shared memories - MARCUS and WILLIE seem to have a shared history of WILLIE frequently experiencing a specific injury or medical condition', 'last_speaker': 'MARCUS'}
Last word -> MARCUS : "Let's do the other thi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8652311563491821], 'recall': [0.8899599313735962], 'f1': [0.8774213790893555], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13333333333333333, 'rouge2': 0.0, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0.12500000000000003 0.04225771273642583
ppl : 25.86846351623535
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWILLIE: I think I've turned a corner.\nMARCUS: Yeah? You fucking Petites now?\nWILLIE: No no. No; I beat the crap out of some kids today -- but, you know, for a purpose. It really made me feel pretty good about myself -- like I did something constructive for a change. Accomplished somethin'.\n\n", 'answer': '...You need many years of therapy. Many, many

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7607678771018982], 'recall': [0.7767821550369263], 'f1': [0.7686916589736938], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08695652173913043, 'rouge2': 0.0, 'rougeL': 0.08695652173913043, 'rougeLsum': 0.08695652173913043}
bleu 1/2 : 0 0
ppl : 74.7089614868164
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWILLIE: 'Tendo it is. ...Everything I touch turns to shit\n\n", 'answer': "What are you, drinking Sterno now? 'Cause you're sounding like my Aunt Tilly right before she smeared her own shit on the bedroom walls and we had to lock her up and she spent the rest of her life with a shaved head and eating lunch through a tube up her nose... ...You better be in shape by this evening, fat man. Af

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.844197154045105], 'recall': [0.8091398477554321], 'f1': [0.826296865940094], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13861386138613863, 'rouge2': 0.0202020202020202, 'rougeL': 0.09900990099009901, 'rougeLsum': 0.09900990099009901}
bleu 1/2 : 0.00041312536277772645 9.64852976186419e-05
ppl : 23.27419090270996
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWILLIE: Barbie it is... ...That it?\nMARCUS: Let's go. ...Oh Christ.\n\n", 'answer': "Let's do it.", 'gold_tag': 'Everyday Language', 'last_speaker': 'WILLIE'}
Last word -> WILLIE : "Let's do it."
prediction :  What the hell?
Real answer : Let's do it.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8400229215621948], 'recall': [0.8770046830177307], 'f1': [0.858115553855896], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 247.55857849121094
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICK FURY: How bad is it?\n\n', 'answer': "That's the problem, sir. We don't know.", 'gold_tag': 'Everyday Language', 'last_speaker': 'AGENT PHIL COULSON'}
Last word -> AGENT PHIL COULSON : "That's the problem, sir. We don't know."
prediction :  I don’t know. You need to come back here. (NICK FURY enters the room and sits at the desk) Do you have a flashlight?
Real answer : That's the problem, sir. We don't know.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8223258256912231], 'recall': [0.8614745140075684], 'f1': [0.8414450883865356], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.23529411764705882, 'rouge2': 0.125, 'rougeL': 0.1764705882352941, 'rougeLsum': 0.1764705882352941}
bleu 1/2 : 0.08333333333333333 0.019034674690672018
ppl : 27.70304298400879
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICK FURY: Sound the general call. I want every living soul not working rescue looking for that brief case. Roger that. Coulson, get back to base. This is a LEVEL SEVEN. As of right now, we are at war.\n\n', 'answer': 'What do we do?', 'gold_tag': 'AGENT PHIL COULSON is a member of the team that responds to the LEVEL SEVEN alert , AGENT PHIL COULSON ha

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9083991646766663], 'recall': [0.8896802663803101], 'f1': [0.898942232131958], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.3333333333333333, 'rouge2': 0.0, 'rougeL': 0.3333333333333333, 'rougeLsum': 0.3333333333333333}
bleu 1/2 : 0.18393972058572117 0.082260343798398
ppl : 1000.1011962890625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nAGENT PHIL COULSON: Sorry, boss. They got rabbited.\nNICK FURY: Just stay awake. EYES ON ME!\nAGENT PHIL COULSON: No. I'm clocked out here.\nNICK FURY: Not an option.\n\n", 'answer': "It's okay, boss. This was never going to work... if they didn't have something... to...", 'gold_tag': "AGENT PHIL COULSON indicates he is 'clocked out,' suggesting a deteriorat

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.860478401184082], 'recall': [0.8082534074783325], 'f1': [0.8335486650466919], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0909090909090909, 'rouge2': 0.0, 'rougeL': 0.0909090909090909, 'rougeLsum': 0.0909090909090909}
bleu 1/2 : 0.027067056647322542 0.009569649651041096
ppl : 73.81143951416016
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDREW: This is ridiculous.\n\n', 'answer': 'Back to work.', 'gold_tag': 'Everyday Language', 'last_speaker': 'D.S.'}
Last word -> D.S. : "Back to work."
prediction :  How can you say that? I’ve only seen the photos too – but you can’t show me, or else I’ll know you’re a faker and I’ll have a full and proper hearing about your whole mess; what do you know a

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7918068170547485], 'recall': [0.8609652519226074], 'f1': [0.8249391317367554], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.022988505747126436, 'rouge2': 0.0, 'rougeL': 0.022988505747126436, 'rougeLsum': 0.022988505747126436}
bleu 1/2 : 0.012820512820512822 0.004080443192842739
ppl : 16.68659210205078
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDREW: Cops! Somebody called the cops?!?\n\n', 'answer': "Drew! Come with me to San De's chamber... Just Drew.", 'gold_tag': 'Everyday Language', 'last_speaker': 'D.S.'}
Last word -> D.S. : "Drew! Come with me to San De's chamber... Just Drew."
prediction :  Cops on the roof here.
Real answer : Drew! Come with me to San De's chamber... Just Drew.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8554171323776245], 'recall': [0.8119345903396606], 'f1': [0.8331089019775391], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1782.08447265625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nD.S.: Try to hit me here with your feet.\n\n', 'answer': 'With pleasure.', 'gold_tag': 'Everyday Language', 'last_speaker': 'DREW'}
Last word -> DREW : "With pleasure."
prediction :  You'll get hurt there when I step on you.
Real answer : With pleasure.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8424550294876099], 'recall': [0.8889137506484985], 'f1': [0.8650610446929932], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 82.26988220214844
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDREW: We've got to rebuild this?\n\n", 'answer': 'No. Tear it down.', 'gold_tag': 'Everyday Language', 'last_speaker': 'D.S.'}
Last word -> D.S. : "No. Tear it down."
prediction :  Yes, we will need to rebuild this from the ashes.
Real answer : No. Tear it down.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8860007524490356], 'recall': [0.9091019034385681], 'f1': [0.8974027037620544], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 61.10089111328125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDREW: I can't do it.\n\n", 'answer': "There is no can't, only won't.", 'gold_tag': 'D.S. acts as a mentor or coach , D.S. believes in potential and motivation rather than accepting failure', 'last_speaker': 'D.S.'}
Last word -> D.S. : "There is no can't, only won't."
prediction :  Is there anything I can do to help?
Real answer : There is no can't, only won't.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8087180852890015], 'recall': [0.8155539035797119], 'f1': [0.812121570110321], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.375, 'rouge2': 0.0, 'rougeL': 0.25, 'rougeLsum': 0.25}
bleu 1/2 : 0 0
ppl : 15.485372543334961
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSTACY: There... There's his car. I know he's at work tonight. He hasn't come into Swenson's since he called my house. My mother told him I was still at high school, after I told him I was nineteen. I guess I should tell him I'm fifteen.\nLINDA: Don't you dare, you'll never hear from him again.\nSTACY: Does Doug care that you're seventeen?\nLINDA: Doug sees beyond that stuff to what the person inside is like. That's why I'm marrying

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8501591682434082], 'recall': [0.842739462852478], 'f1': [0.8464330434799194], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.23529411764705882, 'rouge2': 0.0, 'rougeL': 0.23529411764705882, 'rougeLsum': 0.23529411764705882}
bleu 1/2 : 0.0493193927883213 0.017437038542312454
ppl : 103.43024444580078
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLINDA: You've got to get used to working Christmas. People are always screaming and yelling... then they get home and they're all Christmasy.\nSTACY: I think Christmas brings out the worst in people.\nLINDA: I guess Ron hasn't called yet.\nSTACY: Not since November.\nLINDA: Stacy, it doesn't look good for the relationship.\nSTACY: Don't you think it mea

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8621925115585327], 'recall': [0.8444857597351074], 'f1': [0.8532472848892212], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 85.87242126464844
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSTACY: What do you think of that guy who works at the theatre? You know, Mark Ratner.\nLINDA: Oh, come on. What is he? Fifteen?\nSTACY: Sixteen.\n\n', 'answer': 'Just watch out if he pulls up in a van, and then puts on a Led Zeppelin tape.', 'gold_tag': 'LINDA is aware of popular rock band Led Zeppelin , LINDA has a skeptical opinion about Mark, the guy who works at the theatre', 'last_speaker': 'LINDA'}
Last word -> LINDA : "Just watc

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8471578359603882], 'recall': [0.8436551094055176], 'f1': [0.8454028367996216], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05714285714285714, 'rouge2': 0.0, 'rougeL': 0.05714285714285714, 'rougeLsum': 0.05714285714285714}
bleu 1/2 : 0 0
ppl : 23.850788116455078
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSTACY: I really thought he would show up. I waited... and waited... and waited...\nLINDA: That little prick.\nSTACY: Then I called his house, and his mother told me he was in the garage helping his father.\nLINDA: That little prick.\nSTACY: I paid for it and everything.\nLINDA: There goes your stereo for another year. Mike Damone is a no-brain little prick. I'm not letting him get away w

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8248281478881836], 'recall': [0.8566457033157349], 'f1': [0.8404358625411987], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0821917808219178, 'rouge2': 0.0, 'rougeL': 0.0547945205479452, 'rougeLsum': 0.0547945205479452}
bleu 1/2 : 0.05000000000000001 0.009205746178983234
ppl : 18.584896087646484
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: What was that?\nYURI: Oh nothing.\nJOHN: Tell me.\nYURI: No. It is too judgmental.\nJOHN: Tell me what he said.\nYURI: He says why did you send to Russia for a wife. You are not ashamed of it? It's no surprise to want to love.\nJOHN: No. It's not that.\nYURI: Do you believe in love?\nJOHN: I suppose it's... I mean define your terms.\n\n", 'answer':

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8266462087631226], 'recall': [0.7797893285751343], 'f1': [0.8025344014167786], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.031413612565445025, 'rouge2': 0.0, 'rougeL': 0.020942408376963356, 'rougeLsum': 0.020942408376963356}
bleu 1/2 : 1.3841285779600526e-16 3.460321444900132e-17
ppl : 66.05847930908203
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nYURI: John. We can see you hiding. How is bank?\nJOHN: Fine. I thought you were leaving today.\n\n', 'answer': "To be indoors on such a day. It's crime.", 'gold_tag': 'YURI enjoys outdoor activities', 'last_speaker': 'YURI'}
Last word -> YURI : "To be indoors on such a day. It's crime."
prediction :  I'll talk to you later if you behave well. If

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8389608860015869], 'recall': [0.8673297762870789], 'f1': [0.852909505367279], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08333333333333333, 'rouge2': 0.0, 'rougeL': 0.08333333333333333, 'rougeLsum': 0.08333333333333333}
bleu 1/2 : 0 0
ppl : 35.04963684082031
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: Hey.\n\n', 'answer': "He's just having fun. He's maybe too strong you know...", 'gold_tag': 'Everyday Language', 'last_speaker': 'YURI'}
Last word -> YURI : "He's just having fun. He's maybe too strong you know..."
prediction :  Yeah, I'd be there too!
Real answer : He's just having fun. He's maybe too strong you know...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8254172205924988], 'recall': [0.8314566612243652], 'f1': [0.828425943851471], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1111111111111111, 'rouge2': 0.0, 'rougeL': 0.1111111111111111, 'rougeLsum': 0.1111111111111111}
bleu 1/2 : 0 0
ppl : 47.95749282836914
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nYURI: I understand. I'm so sorry\nJOHN: You can stay tonight.\nYURI: I have brought you trouble. Maybe I should have come alone.\n\n", 'answer': 'Good night.', 'gold_tag': 'Everyday Language', 'last_speaker': 'JOHN'}
Last word -> JOHN : "Good night."
prediction :  You've got to be kidding. That makes 480,000 Americans dead.
Real answer : Good night.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8205517530441284], 'recall': [0.8815040588378906], 'f1': [0.8499365448951721], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 15.71968936920166
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: What are you doing?\nYURI: John, I'm sorry. It's my fault.\nJOHN: What's he doing? What the fuck are you doing? Leave her alone.\nYURI: He says sit down. Or he'll cut her. Sit down please.\nJOHN: Tell him to stop and let her go, and we'll talk. Put the fucking kettle down.\nYURI: John.\nJOHN: Put the fucking kettle down. Tell, Yuri, tell him put it down or I'm going to make him.\nYURI: He says you scare him so much he must go to 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8357905149459839], 'recall': [0.871058464050293], 'f1': [0.8530601263046265], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 3032.510498046875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBITSEY: His prints were all over the kitchen, including one on the bag.\nZACK: Half-a-thumb print.\nBITSEY: Okay, half-a-thumb print.\nZACK: Could have touched it before it was a murder weapon.\nBITSEY: Do you fondle your friends' garbage bags?\nZACK: Yeah, I get very touchy around household plastics. 'Hello, everybody -- ooooh, Tupperware.' Chill. I'm just saying the bag could have been out on the counter or something.\nBITSEY: Hey, Za

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8351930975914001], 'recall': [0.7888748049736023], 'f1': [0.8113734126091003], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 5426.2412109375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBITSEY: Try phone records, a net search.\nZACK: Fucking better than watching you work. You think he's telling the truth?\nBITSEY: Don't ask me that. I don't know. There is no truth, only perspectives.\nZACK: Can't say that. If you say 'there is no truth,' you're claiming it's true that there is no truth -- it's a logical contradiction.\nBITSEY: Working on our philosophy merit badge, Zack?\nZACK: I, on the other hand, think Gale's telling

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8603882789611816], 'recall': [0.8630889058113098], 'f1': [0.8617364764213562], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 98.27686309814453
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nZACK: Thirty-three o what?\nBITSEY: Seven. 'Check out the crime scene in Austin.' You made it sound so close. Shit, two and half hours and that damn lig --\n\n", 'answer': 'Bitsey.', 'gold_tag': "Zack knows about Bitsey's assignment , Zack has a calming influence , Zack tries to pacify Bitsey when she gets exasperated", 'last_speaker': 'ZACK'}
Last word -> ZACK : "Bitsey."
prediction :  "You made it sound so close, I can't believe it, 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.821272611618042], 'recall': [0.8422325849533081], 'f1': [0.8316205143928528], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 26.599687576293945
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBITSEY: It was in the case file.\nZACK: Fuck, in her stomach? That's colder than Wisconsin.\nBITSEY: Zack, open your vanity.\nZACK: What?\nBITSEY: The vanity mirror, pull it down. See that truck behind us?\nZACK: Yeah.\nBITSEY: Isn't that the cowboy from the rest area?\nZACK: Same truck.\nBITSEY: Weird coincidence, huh?\n\n", 'answer': "Coincidences are always weird, that's why they're coincidences.", 'gold_tag': 'Everyday Language', '

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8797833323478699], 'recall': [0.8194476366043091], 'f1': [0.8485442996025085], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13333333333333333, 'rouge2': 0.0, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0 0
ppl : 69.47409057617188
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nZACK: Fuck. Hey.\nBITSEY: For God's sake, Zack.\nZACK: You're not a happy camper.\nBITSEY: It's not a happy place.\nZACK: Talked to the Austin prosecutor. Belyeu's a yokel. Prosecutor says he fucked up the penalty phase, says Gale probably would have gotten life on mitigating factors. Gale stuck with him, though, all through appeals, despite major pro bono offers.\nBITSEY: What else?\nZAC

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8181068301200867], 'recall': [0.9099438786506653], 'f1': [0.8615850210189819], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 209.34739685058594
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nZACK: This is mostly an Oklahoma caravan. I got stuck behind them on the way in. Execution's not for another 36 hours. It's gonna be a zoo. There's the P.R. guy.\nBITSEY: What a life, waiting around for someone to die.\nZACK: Fuck, look.\n\n", 'answer': 'This is a little too coincidental.', 'gold_tag': "BITSEY's outlook on life is somewhat cynical and detached", 'last_speaker': 'BITSEY'}
Last word -> BITSEY : "This is a little too coi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8775249123573303], 'recall': [0.8597277402877808], 'f1': [0.8685351014137268], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.3333333333333333, 'rouge2': 0.0, 'rougeL': 0.3333333333333333, 'rougeLsum': 0.3333333333333333}
bleu 1/2 : 0.3274923012311928 0.08187307530779821
ppl : 86.75019836425781
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nZACK: 'Cause I couldn't see the license plate.\nBITSEY: Forget it. What time is it?\nZACK: 6:05. Why don't you have a watch?\n\n", 'answer': "It's a long story.", 'gold_tag': 'BITSEY does not carry a watch , BITSEY has a significant story behind her not wearing one', 'last_speaker': 'BITSEY'}
Last word -> BITSEY : "It's a long story."
prediction :  I don't 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8822544813156128], 'recall': [0.8766582012176514], 'f1': [0.8794474601745605], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 42.09751892089844
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nZACK: What!?\nBITSEY: It's open.\nZACK: So? Maybe the maid forgot.\n\n", 'answer': 'Look!', 'gold_tag': 'Everyday Language', 'last_speaker': 'BITSEY'}
Last word -> BITSEY : "Look!"
prediction :  Yes! We have the right to live like normal people!
Real answer : Look!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8465093374252319], 'recall': [0.9325010776519775], 'f1': [0.8874269723892212], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 50.22345733642578
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nZACK: I'll check the bathroom. Clear in here! Don't touch it! Fuck, Bitsey, there could have been prints.\n\n", 'answer': 'Look.', 'gold_tag': 'Everyday Language', 'last_speaker': 'BITSEY'}
Last word -> BITSEY : "Look."
prediction :  What did I do? How did they get it?
Real answer : Look.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.829226016998291], 'recall': [0.9171041250228882], 'f1': [0.8709539771080017], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 31.9758243560791
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nZACK: Let's say Gale's right. Some sick- fuck-Agatha Christie-wannabe set him up, arranged like the perfect murder. Why send a magazine journalist proof a few hours before he's won? Doesn't make sense.\nBITSEY: No, it's perfect. He knows News Magazine won't give the scoop to the dailies or nightly news, not after having paid for it. Probably guessed we wouldn't call the cops. And, most importantly, knows I'll tell Gale tomorrow.\nZACK: W

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8716758489608765], 'recall': [0.8789691925048828], 'f1': [0.8753073215484619], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 105.39848327636719
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBITSEY: Did you see the cowboy go in?\nZACK: He went in Belyeu's building?\nBITSEY: Into his office. Just saw him.\nZACK: Fuck!\nBITSEY: Follow him. Find out who he is, where he lives -- what he --\nZACK: How are you gonna get to Ellis?\n\n", 'answer': 'A taxi.', 'gold_tag': 'BITSEY suggests taking a taxi, implying she might be in a rush due to the urgency of the situation', 'last_speaker': 'BITSEY'}
Last word -> BITSEY : "A taxi."
pr

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8379076719284058], 'recall': [0.8966264724731445], 'f1': [0.8662731647491455], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 20.046703338623047
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBITSEY: Zack! Wake up! Get up! Did you throw the towel on the floor?\nZACK: What?\nBITSEY: The towel on my bathroom floor. Did you put it there?\nZACK: Yeah, I guess. It's a hotel room. What --\nBITSEY: Do you do that at home?\nZACK: No. Fuck, Bitsey. I'm sorry, it's not like it's --\nBITSEY: Get up.\nZACK: What the fuck's wrong with you?\nBITSEY: Grab the T.V. I want to check something.\nZACK: What?\nBITSEY: We're taking a tour.\nZAC

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8200633525848389], 'recall': [0.8831528425216675], 'f1': [0.8504396080970764], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 745.0380249023438
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nZACK: Hairy.\n\n', 'answer': 'Come on.', 'gold_tag': 'Everyday Language', 'last_speaker': 'BITSEY'}
Last word -> BITSEY : "Come on."
prediction :  Hey look
Real answer : Come on.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8621279001235962], 'recall': [0.9055578708648682], 'f1': [0.8833093643188477], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 11003.0029296875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBITSEY: ...Because of the Berlin thing, Constance knew the police would go whose life was shit anyway. But...\nZACK: She was in love with him.\n\n', 'answer': "I don't know. Something. They were close. She wouldn't want him dead.", 'gold_tag': 'BITSEY engages in thoughtful dialogue about the situation involving Constance and the man she was close with', 'last_speaker': 'BITSEY'}
Last word -> BITSEY : "I don't know. Something. They were 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8006146550178528], 'recall': [0.8259800672531128], 'f1': [0.8130995035171509], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 154.6263885498047
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBITSEY: Remember this thing about Dusty being a bull-horner, going to far?\nZACK: Why he was fired from DeathWatch, and the A.C.L.U.\nBITSEY: Right. Maybe the plan was for Dusty to release the tape after Gale's conviction, after a year or so. You know, force him to dry out, let him play the heroic victim, give him back his dignity. So, Dusty Wright's sitting on this tape, waiting, the only one who knows about it. And maybe good ole Dus

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8510832786560059], 'recall': [0.8512663245201111], 'f1': [0.8511747121810913], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 67.83635711669922
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBITSEY: Don't move from the booth. Call the second you see the truck. Remember, let it ring just once. Then get into the woods --\nZACK: I know. Go.\nBITSEY: Into the woods, Zack.\n\n", 'answer': 'Go!', 'gold_tag': "BITSEY and ZACK seem to have prior experience with this drill, as indicated by BITSEY's instructions and ZACK's quick understanding.", 'last_speaker': 'ZACK'}
Last word -> ZACK : "Go!"
prediction :  Okay, hold on. Can you h

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8132333755493164], 'recall': [0.8317382335662842], 'f1': [0.8223816752433777], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 11.94579029083252
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBITSEY: Everyone! Wake up New York, the warden, the Governor, the goddamn Supreme Court death clerk! How far is it?\nZACK: Took me 30 this afternoon. You've got 26, maybe more.\n\n", 'answer': "I'll make it.", 'gold_tag': "BITSEY's temporal information", 'last_speaker': 'BITSEY'}
Last word -> BITSEY : "I'll make it."
prediction :  Let's go to the Governor. He'll say. "Hey, you're in prison. You're not dead yet." The Supreme Court will 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8215538263320923], 'recall': [0.8822470307350159], 'f1': [0.8508193492889404], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.028985507246376812, 'rouge2': 0.0, 'rougeL': 0.028985507246376812, 'rougeLsum': 0.028985507246376812}
bleu 1/2 : 0 0
ppl : 13.277792930603027
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBEN: You crying?\nELENA: I'm just sad Ben -- I mean it was... you were, but, you know. I just don't know...\nBEN: Whatever that means Elena -- And you complain about me not communicating... I thought it was --\nELENA: No, I didn't mean to sound negative. It was -- But Ben. What is going to happen with us? Have you --\nBEN: You have to bring this up now? What? Did I do something here? 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8542925119400024], 'recall': [0.8361344337463379], 'f1': [0.8451159596443176], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09523809523809525, 'rouge2': 0.0, 'rougeL': 0.09523809523809525, 'rougeLsum': 0.09523809523809525}
bleu 1/2 : 0 0
ppl : 26.758045196533203
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELENA: The turkey in?\n\n', 'answer': 'Stuffed and baking.', 'gold_tag': 'BEN is involved in cooking , Cooking indicates his domestic side', 'last_speaker': 'BEN'}
Last word -> BEN : "Stuffed and baking."
prediction :  Well, I can't seem to find it. Look, man, I didn't buy it.
Real answer : Stuffed and baking.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8334841132164001], 'recall': [0.8516944646835327], 'f1': [0.8424909114837646], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 18.470619201660156
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELENA: Oh lord. You think I'm so dense. And now you want to be seen with your dense wife at the cocktail party. You want to wear that with those pants at all. You want to wear that, and you want me to shake hands with your friends and accord me the respect of talking honestly about this... You don't\nBEN: Sure I do. Do I know what loneliness feels like? Sure I do. I know a lot about it, if that's what you mean.\n\n", 'answer': "Benjam

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8504456877708435], 'recall': [0.8546649217605591], 'f1': [0.8525500893592834], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16, 'rouge2': 0.0, 'rougeL': 0.08, 'rougeLsum': 0.08}
bleu 1/2 : 0.0625 0.02041241452319315
ppl : 9.931099891662598
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROXANNE: Do you miss your home, Captain? Have you someone there?\nWILLARD: No. Not really. I was discharged from the army four years ago. I went home, wasted some time, bought a Mustang Mach 1, drove it a week. Then I re-upped for another tour. No, everything I love is here.\nROXANNE: Then you are like us. What will you do after the war?\nWILLARD: I just follow my footsteps, one at a time, trying to answer the

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8847314119338989], 'recall': [0.8417977094650269], 'f1': [0.8627307415008545], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06451612903225806, 'rouge2': 0.0, 'rougeL': 0.06451612903225806, 'rougeLsum': 0.06451612903225806}
bleu 1/2 : 0.007023973918212732 0.0024331759394766186
ppl : 97.18850708007812
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROXANNE: Are you warm, Captain?\nWILLARD: The river is beautiful.\n\n', 'answer': 'I spend hours watching that river from my bedroom window. It fascinates me.', 'gold_tag': 'ROXANNE spends hours watching the river from her bedroom window , ROXANNE has a contemplative or introspective nature', 'last_speaker': 'ROXANNE'}
Last word -> ROXANNE : "I spend

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8663870096206665], 'recall': [0.8558396697044373], 'f1': [0.8610811233520508], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07692307692307693, 'rouge2': 0.0, 'rougeL': 0.07692307692307693, 'rougeLsum': 0.07692307692307693}
bleu 1/2 : 0 0
ppl : 24.09331512451172
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROXANNE: I will fix you breakfast.\nWILLARD: I'm afraid I won't have time -- I gotta --\nROXANNE: Whe you reach the boat you will find that half your fifty calibre stores -- a case of grenades, a mortar and two M-16's and a case of clips are being transfered to us by your order.\nWILLARD: So that's it.\nROXANNE: You may think what you wish, Captain, but I like you very much.\nWILLARD: Wha

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8492712378501892], 'recall': [0.8654458522796631], 'f1': [0.8572822213172913], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1111111111111111, 'rouge2': 0.0, 'rougeL': 0.1111111111111111, 'rougeLsum': 0.1111111111111111}
bleu 1/2 : 0.037037037037037035 0.01193524790065722
ppl : 22.16252899169922
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMcCALEB: What do you got going tomorrow?\nBUDDY: Same as always. egg.\nMcCALEB: I need a driver for a few days. Ten bucks an hour plus meals.\nBUDDY: What is it, an investigation?\nMcCALEB: I’m looking for a driver, Buddy, not a partner.\nBUDDY: Okay. I’m in. Whose car.\nMcCALEB: We take yours, I pay for gas. We take mine, I sit in the back.\nBUDDY: Right

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8448970913887024], 'recall': [0.8569910526275635], 'f1': [0.8509011268615723], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 88.60088348388672
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMcCALEB: Hey, Buddy! You’re working tomorrow. Six AM.\n\n', 'answer': 'Cool. Aww shit, Terry!', 'gold_tag': 'BUDDY works the early shift', 'last_speaker': 'BUDDY'}
Last word -> BUDDY : "Cool. Aww shit, Terry!"
prediction :  Yes.
Real answer : Cool. Aww shit, Terry!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9706835150718689], 'recall': [0.848886251449585], 'f1': [0.9057084321975708], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 2505.07666015625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBUDDY: Who wants to go to the desert anyhow?\nMcCALEB: You go where the clues lead.\n\n', 'answer': 'Don’t any clues lead to the beach for crying out loud?!', 'gold_tag': 'BUDDY prefers going to the beach', 'last_speaker': 'BUDDY'}
Last word -> BUDDY : "Don’t any clues lead to the beach for crying out loud?!"
prediction :  I got it!
Real answer : Don’t any clues lead to the beach for crying out loud?!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.826324462890625], 'recall': [0.8082166910171509], 'f1': [0.8171703219413757], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 235.66468811035156
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMcCALEB: Shit... What do you figure I owe you, Buddy?\nBUDDY: Um, about two hundred or so.\nMcCALEB: Sounds right.\nBUDDY: You look like shit, Terry. You should take a nap or something.\nMcCALEB: You take a check?\nBUDDY: Sure.\nMcCALEB: I’ll bring it over. Should I make it out to Buddy?\nBUDDY: Jasper. Jasper Noone.\nMcCALEB: I knew the Noone part.\n\n', 'answer': 'I look more like a Buddy than a Jasper, huh? Take a nap!', 'gold_tag':

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8713595867156982], 'recall': [0.842064380645752], 'f1': [0.8564615249633789], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 67.00318145751953
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLUKE: You ask me that counselor's making a mountain out of a molehill...\nJACKIE: I'm worried.\nLUKE: Me too.\nJACKIE: Luke, I need to switch next Friday for Thursday, so why don't you take the weekend... ...that way you'll be there, and I'll pick up Monday, Tuesday, and Wednesday.\nLUKE: Fine. But I wanted to take the kids to work with me on Wednesday so I'll take Wednesday, and you can pick up that Thursday, Friday after soccer, and S

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.829445481300354], 'recall': [0.8292050957679749], 'f1': [0.829325258731842], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 44.946990966796875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJACKIE: You know, you can come inside the house.\nLUKE: Did you see the lawyer? Oh. Well...\nJACKIE: Called him. We set an ap...\nLUKE: Don't do this.\nJACKIE: You're saying, don't make the kids a football, don't put them through a war. But I'm doing this for their well-being.\nLUKE: Partly. But partly, you're mad. You know the kids aren't really in danger. This is about Rachel, and you're right, I'm disappointed in her learning curve, 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8379116058349609], 'recall': [0.8362860679626465], 'f1': [0.8370980024337769], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.03571428571428571, 'rouge2': 0.0, 'rougeL': 0.03571428571428571, 'rougeLsum': 0.03571428571428571}
bleu 1/2 : 0.02702702702702703 0.008664587415167273
ppl : 7.935913562774658
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJACKIE: I thought a phone call was inappropriate. I could have taken you to that restaurant, but it would have been a waste of money. I know. I wouldn't know what to say. If it were you.\nLUKE: We're going to win this.\nJACKIE: Walk in the park. And thanks for the 'we'.\nLUKE: You're not alone in this. You're not alone. Jesus, you're not alone, okay? W

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8845553398132324], 'recall': [0.8957005143165588], 'f1': [0.8900930285453796], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 306.422119140625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHERRICK: You're early, Mr. Rus. Again, pick it up at bar twenty-four. Where were you, Mr. Rus?\nRUS: Sorry.\nHERRICK: And Miss Lubbers, could you find a key closer to the one the rest of us are using? Thank you. People, let's try to start together. Again, from bar twenty- four. Mr. Rus, if you would, please. Let's walk and talk. Mr. Rus, I thought you said you could read music.\n\n", 'answer': "I can, when it has words. This stuff's got

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8485691547393799], 'recall': [0.8373247981071472], 'f1': [0.8429094552993774], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2105263157894737, 'rouge2': 0.0, 'rougeL': 0.15789473684210525, 'rougeLsum': 0.15789473684210525}
bleu 1/2 : 0.10458938416503243 0.024106661926760035
ppl : 22.017595291137695
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHERRICK: E - G - B - D - F... It's easy to remember. Every Good Boy Does Fine. F - A - C - E... Face, that's easy enough. Those are the notes.\nRUS: It's like a code.\n\n", 'answer': "Exactly! And you have to know that code because it's telling you where to come in. Exactly!", 'gold_tag': 'HERRICK has knowledge of reading musical notes', 'last_speaker'

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8799830079078674], 'recall': [0.832019567489624], 'f1': [0.8553294539451599], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09523809523809525, 'rouge2': 0.0, 'rougeL': 0.09523809523809525, 'rougeLsum': 0.09523809523809525}
bleu 1/2 : 0 0
ppl : 196.20436096191406
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHERRICK: Just sing the notes on the page.\nRUS: I sing like shit... sorry, Mr. Herrick. I sing like... you know.\nHERRICK: I'm not auditioning you for the Metropolitan opera. Just sing the notes.\nRUS: Okay.\n\n", 'answer': "That's a sharp. Go on.", 'gold_tag': 'HERRICK is knowledgeable about singing and musical notation', 'last_speaker': 'HERRICK'}
Last word -> HERRICK : "That's a sharp.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9375439286231995], 'recall': [0.8659644722938538], 'f1': [0.9003337621688843], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 6285.04248046875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nRUS: Da-da-da-da-da-de-de-dum.. Herrick isn't really listening. Da-dum-da-dum-da-da-da... Da-da-dum-dum-de. BOOM! BOOM! Da-da-da-daah! Converted to PDF by www.screentalk.org 43. BOOM! Rus laughs and beats out a celebratory riff on the big drum.\nHERRICK: All right, all right! I don't see any notation calling for improvisation, Mr. Rus.\nRUS: I did it! Wow! Hey, Mr. Herrick I can play this shit! Sorry, play this drum.\nHERRICK: Yes, Mr. 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8642776012420654], 'recall': [0.8342937231063843], 'f1': [0.8490209579467773], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 59.06066131591797
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEEBAND: What are you doing up so late?\nKURT: I... I’ve been out.\n\n', 'answer': 'Alcohol? Hardly conducive to good work. You should get to bed.', 'gold_tag': "SEEBAND is concerned about KURT's potential consumption of alcohol and its impact on his work , KURT has been out late at night", 'last_speaker': 'SEEBAND'}
Last word -> SEEBAND : "Alcohol? Hardly conducive to good work. You should get to bed."
prediction :  Have you been out 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.856518030166626], 'recall': [0.8549581170082092], 'f1': [0.8557373881340027], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352942, 'rouge2': 0.0, 'rougeL': 0.11764705882352942, 'rougeLsum': 0.11764705882352942}
bleu 1/2 : 0 0
ppl : 96.48641204833984
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKURT: Professor Seeband, about yesterday; of course I would never...\n\n', 'answer': 'You have a new subject. Me. For the directors’ gallery. (indicates a sketched portrait of Lenin) That style, more or less. We start tomorrow at four in the afternoon. In my office at the hospital.', 'gold_tag': 'SEEBAND is a professor and likely a director , SEEBAND works in an office at a hospital , SEEB

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8412996530532837], 'recall': [0.8320156335830688], 'f1': [0.8366318941116333], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05128205128205128, 'rouge2': 0.0, 'rougeL': 0.05128205128205128, 'rougeLsum': 0.05128205128205128}
bleu 1/2 : 0.00010768563514392188 3.932123432506473e-05
ppl : 336.2634582519531
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKURT: An... abortion?\nSEEBAND: If I do not remove the fetus she will not survive the pregnancy.\nKURT: Here?\nSEEBAND: We have a reputation to maintain.\nKURT: Is it really the only way? Really?\n\n', 'answer': 'Are you doubting my professional ability? Would I put myself through something like this... and my daughter!... if it were not absolutely

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8517872095108032], 'recall': [0.8077895641326904], 'f1': [0.8292051553726196], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14814814814814817, 'rouge2': 0.08, 'rougeL': 0.14814814814814817, 'rougeLsum': 0.14814814814814817}
bleu 1/2 : 0.0004242112671132693 0.0001642963172803238
ppl : 411.0784606933594
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCOP #1: For God's sake, Tom, there's a monster in there! Call for assistance and tell them to bring guns.\n\n", 'answer': 'Keep moving! Will you people', 'gold_tag': 'Everyday Language', 'last_speaker': 'COP #2'}
Last word -> COP #2 : "Keep moving! Will you people"
prediction :  Sir, I just got information about the monster. I am sending an Officer

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8399115800857544], 'recall': [0.8118700385093689], 'f1': [0.8256528377532959], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 134.96055603027344
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCOP #1: There he is!\n\n', 'answer': "It's that Spider-Man!", 'gold_tag': 'Everyday Language', 'last_speaker': 'COP #2'}
Last word -> COP #2 : "It's that Spider-Man!"
prediction :  There's two in there.
Real answer : It's that Spider-Man!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8829960823059082], 'recall': [0.8592596054077148], 'f1': [0.8709661364555359], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.20000000000000004, 'rouge2': 0.0, 'rougeL': 0.20000000000000004, 'rougeLsum': 0.20000000000000004}
bleu 1/2 : 0 0
ppl : 91.77983093261719
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCOP #2: He won't get away this time!\nCOP #1: Hold your fire! He's got a girl.\n\n", 'answer': "Looks like he's got a hostage.", 'gold_tag': 'Everyday Language', 'last_speaker': 'COP #2'}
Last word -> COP #2 : "Looks like he's got a hostage."
prediction :  He's got what?
Real answer : Looks like he's got a hostage.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8668015599250793], 'recall': [0.8787967562675476], 'f1': [0.8727579712867737], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.5454545454545454, 'rouge2': 0.4444444444444444, 'rougeL': 0.5454545454545454, 'rougeLsum': 0.5454545454545454}
bleu 1/2 : 0.12262648039048077 0.04749303163563625
ppl : 123.44308471679688
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWARDEN: I am wet.\nKAREN: If you're looking for my husband, he isn't here.\nWARDEN: And if I'm not looking for him?\nKAREN: He still isn't here. If that does you any good.\nWARDEN: Well, I'm looking for him. Do you know where he is?\nKAREN: I haven't the slightest idea. Perhaps he's in town. I guess it was `in town' the way you put it, wasn

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8746405839920044], 'recall': [0.8361554741859436], 'f1': [0.8549651503562927], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.043478260869565216, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0.09799524229446188 0.05831883409863456
ppl : 16.078439712524414
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKAREN: You're taking an awful chance, you know. My maid is liable to be home any time.\nWARDEN: No she won't. Thursday's her day off.\nKAREN: You think of everything, don't you, Sergeant?\nWARDEN: I try. In my position you have to.\nKAREN: Are these really important?\nWARDEN: Yes. But not important they get signed today. Tomorrow's okay. I got carbons of those ba

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8271586894989014], 'recall': [0.8435958623886108], 'f1': [0.8352963924407959], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08695652173913043, 'rouge2': 0.0, 'rougeL': 0.08695652173913043, 'rougeLsum': 0.08695652173913043}
bleu 1/2 : 0 0
ppl : 29.227231979370117
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKAREN: Why, hello. I didn't think you were coming.\nWARDEN: Why not? I'm not late.\nKAREN: No, I guess you're not. But then I came a little early. I must have been overanxious. You weren't overanxious though, were you? You got here right on the dot.\nWARDEN: Maybe I'd of been early too only I stopped to get a drink.\nKAREN: You certainly chose a savory spot for our meeting.\nWARDEN: Woul

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8299362063407898], 'recall': [0.8576216697692871], 'f1': [0.843551754951477], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1935483870967742, 'rouge2': 0.0, 'rougeL': 0.12903225806451615, 'rougeLsum': 0.12903225806451615}
bleu 1/2 : 0 0
ppl : 24.956462860107422
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKAREN: I hope the ocean's ice cold. I hope\nWARDEN: Just so long's you melt afterwards.\n\n", 'answer': 'I love the way you laugh.', 'gold_tag': 'KAREN values humor , WARDEN displays a witty nature', 'last_speaker': 'KAREN'}
Last word -> KAREN : "I love the way you laugh."
prediction :  Cool
Real answer : I love the way you laugh.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8460530638694763], 'recall': [0.8548836708068848], 'f1': [0.8504454493522644], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 13912490.0
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKAREN: Nobody ever kissed me that way... not really...\nWARDEN: Nobody?\nKAREN: No. Nobody.\nWARDEN: Not even one? Out of all the many men you've been kissed by?\nKAREN: Well, that will take some figuring. How many men do you think there've been?\nWARDEN: I wouldn't know. Can't you even make me a rough estimate?\nKAREN: Not without an adding machine. Do you have your adding machine with you?\nWARDEN: No, I forgot to bring it.\nKAREN: Then I g

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8414702415466309], 'recall': [0.8558729290962219], 'f1': [0.848610520362854], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2222222222222222, 'rouge2': 0.038461538461538464, 'rougeL': 0.2222222222222222, 'rougeLsum': 0.2222222222222222}
bleu 1/2 : 0.12500000000000003 0.020080483222562468
ppl : 21.557994842529297
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKAREN: All right. I've never told it to anyone before. But I think now is the time. I'll tell you the whole bloody messy thing. You can take it back to the barracks with you. I'd been married to Captain Dana E. Holmes two years. Only he was a First Lieutenant then. Back at Fort Bliss. We lived right on a little lake where we could fish an

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8344097137451172], 'recall': [0.8666850328445435], 'f1': [0.8502411842346191], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2909090909090909, 'rouge2': 0.0, 'rougeL': 0.2545454545454545, 'rougeLsum': 0.2545454545454545}
bleu 1/2 : 0.14705882352941177 0.021110016546037454
ppl : 11.972183227539062
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKAREN: You just love to shock people, don't you?! Where on earth did you learn to dance like that?\n\n", 'answer': 'Believe it or not -- Chicago, Illinois.', 'gold_tag': 'WARDEN learned to dance in Chicago, Illinois', 'last_speaker': 'WARDEN'}
Last word -> WARDEN : "Believe it or not -- Chicago, Illinois."
prediction :  It was a bit hard going at first. 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8275041580200195], 'recall': [0.8252904415130615], 'f1': [0.8263958096504211], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0392156862745098, 'rouge2': 0.0, 'rougeL': 0.0392156862745098, 'rougeLsum': 0.0392156862745098}
bleu 1/2 : 0.022222222222222227 0.007106690545187016
ppl : 17.031314849853516
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDOCTOR EINSTEIN: Hey, Johnny, down here, what do you think I find ?\nJONATHAN: What ?\nDOCTOR EINSTEIN: The Panama Canal. And it just fits Mister Spenalzo. See the hole he's digging. Four feet wide, six feet long. He just fits ! You'd think he knew we were bringing Mister Spenalzo along. That's hospitality.\nJONATHAN: Rather a good joke on my aunts. The

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.824856162071228], 'recall': [0.8358984589576721], 'f1': [0.830340564250946], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 801.1834716796875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDOCTOR EINSTEIN: Hey, Johnny, Johnny, come quick !\nJONATHAN: What's the matter ?\nDOCTOR EINSTEIN: You know that hole in the cellar ?\nJONATHAN: Yes.\n\n", 'answer': 'Well, we got an ace in the hole.', 'gold_tag': 'Everyday Language', 'last_speaker': 'DOCTOR EINSTEIN'}
Last word -> DOCTOR EINSTEIN : "Well, we got an ace in the hole."
prediction :  The time is now.
Real answer : Well, we got an ace in the hole.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8561171889305115], 'recall': [0.8550366759300232], 'f1': [0.8555766344070435], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 84.87808990478516
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDOCTOR EINSTEIN: This is all fixed up nice now. Nice and smooth like a lake. Oh, Johnny, bed feels good already. You know, we didn't get any sleep for forty-eight hours.\nJONATHAN: You're forgetting, Doctor.\n\n", 'answer': 'What ?', 'gold_tag': 'Everyday Language', 'last_speaker': 'DOCTOR EINSTEIN'}
Last word -> DOCTOR EINSTEIN : "What ?"
prediction :  Right. We don't have to move, now d

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8216826915740967], 'recall': [0.8118427991867065], 'f1': [0.8167331218719482], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 25.2706241607666
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJONATHAN: My brother, Mortimer. I just heard him upstairs.\nDOCTOR EINSTEIN: No, no, no ! I am tired. You forget, I got to operate on your face tomorrow.\nJONATHAN: (voice over. We just see his shadow on the wall behind You are going to operate tomorrow, Doctor. But tonight we are taking care of Mortimer.\nDOCTOR EINSTEIN: No, but Johnny, not tonight ! I'm sleepy. We'll do it tomorrow. Or the next day.\nJONATHAN: Look at me, Doctor. You

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8523707985877991], 'recall': [0.8428862690925598], 'f1': [0.8476019501686096], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1923076923076923, 'rouge2': 0.07999999999999999, 'rougeL': 0.15384615384615383, 'rougeLsum': 0.15384615384615383}
bleu 1/2 : 0.09337718753880407 0.05507086076385581
ppl : 58.355525970458984
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nD.J.: His name is Ferris Mueller.\n\n', 'answer': 'Bueller. Ferris Bueller.', 'gold_tag': 'Everyday Language', 'last_speaker': 'FERRIS'}
Last word -> FERRIS : "Bueller. Ferris Bueller."
prediction :  Well the truth is I-I tried to find something that would have to do with him.
Real answer : Bueller. Ferris Bueller.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8000446557998657], 'recall': [0.821555495262146], 'f1': [0.8106574416160583], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 45.249881744384766
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nD.J.: Sorry about that.\n\n', 'answer': "It's cool.", 'gold_tag': 'Everyday Language', 'last_speaker': 'FERRIS'}
Last word -> FERRIS : "It's cool."
prediction :  Okay.
Real answer : It's cool.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9381223917007446], 'recall': [0.8754082322120667], 'f1': [0.9056808948516846], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 6285.04248046875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nD.J.: He has an incredible story.\n\n', 'answer': "I'm going to tell a massive lie here. It's going to by very thick and very steamy. I think radio's a facinating medium, it challenges the imagination. Unlike television which provides the images, radio... You know this. Anyway, it's always been a dream of mine to be on the radio. I have what I consider to be an excellent broadcast voice. I practise it in the bathroom all the time. I use

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8317490816116333], 'recall': [0.7973775863647461], 'f1': [0.814200758934021], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0392156862745098, 'rouge2': 0.0, 'rougeL': 0.0392156862745098, 'rougeLsum': 0.0392156862745098}
bleu 1/2 : 3.7241978670585916e-08 7.268889940274504e-09
ppl : 27.934276580810547
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nFERRIS: I was going to say I knew Springsteen's home phone number and I was going to give out the number of the New Jersey State Police but I thought I might get busted. After I got flunked in driver's ed for sideswiping a mail box, which was not in any way, shape or from my fault. I was putting out a cigarette, like I was told. It was weird. I'm so u

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8212757110595703], 'recall': [0.8564978837966919], 'f1': [0.8385171294212341], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08080808080808081, 'rouge2': 0.0, 'rougeL': 0.0606060606060606, 'rougeLsum': 0.0606060606060606}
bleu 1/2 : 0.024390243902439022 0.005487386648053276
ppl : 20.86052703857422
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICK: How did you get all of this stuff into the country?\nRAYMOND: This is America. There is nothing you can not buy.\nNICK: So why all the secrecy? Why aren't you guys working with the US military?\nRAYMOND: I am not permitted to speak of such things.\nNICK: You said you wanted my trust, then I need yours.\nRAYMOND: I am a patriot. I love my country. 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8328145742416382], 'recall': [0.8767619132995605], 'f1': [0.854223370552063], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 66.78340911865234
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICK: What's with the chewing gum?\nRAYMOND: Makes us look more American.\n\n", 'answer': 'Just let me do all the talking.', 'gold_tag': 'NICK is a leader or decision maker in their current scenario', 'last_speaker': 'NICK'}
Last word -> NICK : "Just let me do all the talking."
prediction :  Well, he must be a regular old American.
Real answer : Just let me do all the talking.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8535842895507812], 'recall': [0.8561052083969116], 'f1': [0.8548429012298584], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 117.87403869628906
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICK: Three eggs. I thought there would be more.\nRAYMOND: You were right.\n\n', 'answer': "That can't be. That's got to be over twenty eggs.", 'gold_tag': 'NICK scrutinizes the number of eggs', 'last_speaker': 'NICK'}
Last word -> NICK : "That can't be. That's got to be over twenty eggs."
prediction :  You were? How many do you think we need?
Real answer : That can't be. That's got to be over twenty eggs.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8335543870925903], 'recall': [0.8476494550704956], 'f1': [0.8405428528785706], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 93.01316833496094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nNICK: Hey, it's me!\n\n", 'answer': 'What happened?', 'gold_tag': 'Everyday Language', 'last_speaker': 'RAYMOND'}
Last word -> RAYMOND : "What happened?"
prediction :  You're not my mom.
Real answer : What happened?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8406825065612793], 'recall': [0.83207106590271], 'f1': [0.8363546133041382], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 64.4334716796875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBUD: Lynn Wheeler. We haven't been properly introduced. I'm Detective Bud Carter.\nLYNN: I know who you are.\nBUD: You have a ride home?\nLYNN: We took the bus.\n\n", 'answer': "Take a cab. 'You need anything...anything at all. Be sure to call me.", 'gold_tag': 'Everyday Language', 'last_speaker': 'BUD'}
Last word -> BUD : "Take a cab. 'You need anything...anything at all. Be sure to call me."
prediction :  I heard about your accident. Al

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.870003342628479], 'recall': [0.84693843126297], 'f1': [0.8583159446716309], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08333333333333333, 'rouge2': 0.0, 'rougeL': 0.08333333333333333, 'rougeLsum': 0.08333333333333333}
bleu 1/2 : 0 0
ppl : 139.93431091308594
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLYNN: It's Officer White, isn't it? Can I get you a drink?\nBUD: Yeah, plain scotch.\nLYNN: I was friendly with Sue Lefferts, but we weren't really friends. You know what I mean?\nBUD: Are you sorry she's dead?\nLYNN: Of course I am. What kind of question is that?\nBUD: Have you ever heard of Dick Stensland?\nLYNN: No I haven't. Do you know why Pierce is humoring you?\nBUD: You use words li

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8227224349975586], 'recall': [0.8802943229675293], 'f1': [0.8505352139472961], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 24.94973373413086
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLYNN: Did you forget some -- I wondered when you might ring the bell again, Officer White.\nBUD: It's Bud.\n\n", 'answer': "You should see yourself. You look like you're ten years old. If you'd called first, I wouldn't look this ridiculous.", 'gold_tag': 'LYNN might be somewhat self-conscious about her appearance , BUD potentially looks young for his age', 'last_speaker': 'LYNN'}
Last word -> LYNN : "You should see yourself. You look l

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8928157687187195], 'recall': [0.8228318691253662], 'f1': [0.8563963770866394], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07142857142857142, 'rouge2': 0.0, 'rougeL': 0.07142857142857142, 'rougeLsum': 0.07142857142857142}
bleu 1/2 : 0.00995741367357279 0.003520477365831486
ppl : 156.31642150878906
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBUD: Who was that guy who was here earlier?\nLYNN: It doesn't matter. All they get is Veronica Lake. You got the real Lynn Margaret Bracken... Where'd this come from?\nBUD: When I was ten, my old man threw a bottle at my mother. I guess I got in the way.\nLYNN: So you saved her.\nBUD: Yeah. But not for long.\nLYNN: Do you like being a cop, Bud?\nBUD: 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8340641856193542], 'recall': [0.8384543061256409], 'f1': [0.8362535238265991], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.28571428571428575, 'rouge2': 0.0, 'rougeL': 0.28571428571428575, 'rougeLsum': 0.28571428571428575}
bleu 1/2 : 0 0
ppl : 199.77679443359375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBUD: Did you talk to Exley?\nLYNN: Come in out of the rain. In the morning we'll have both our stories for breakfast.\nBUD: I want to know about Exley.\nLYNN: He's the opposite of you. He's more like me. Cold, calculating.\nBUD: How'd you get to know so much about him?\nLYNN: Come in out of the rain, Bud.\nBUD: You gonna tell me what happened with you and Exley?\nLYNN: We talked.\nBUD: S

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8501584529876709], 'recall': [0.865427553653717], 'f1': [0.8577250838279724], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08333333333333333, 'rouge2': 0.0, 'rougeL': 0.08333333333333333, 'rougeLsum': 0.08333333333333333}
bleu 1/2 : 0 0
ppl : 37.99229049682617
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHARRY: Empty. Go.\nGIB: Harry Tasker wallet. Harry Tasker passport. Plane ticket stub, hotel receipt, Tasker. Two postcards of Lake Geneva. House keys. Souvenir snowing Swiss village.\nHARRY: What's this for?\nGIB: For Dana, schmuck. Bring your kid something. You know. The dad thing.\nHARRY: Got it. Nice touch. Okay, pick me up at eight. The de-brief is at ten hundred.\nGIB: Hey, hey, hey.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8224545121192932], 'recall': [0.9061936140060425], 'f1': [0.8622958064079285], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 22.598962783813477
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: Kids. Ten seconds of joy. Thirty years of misery.\nHARRY: She knows not to steal. I've taught her better than that.\n\n", 'answer': "Yeah, but you're not her parents, anymore, you and Helen. Her parents are Axl Rose and Madonna. The five minutes you spend a day with her can't compete with that kind of constant bombardment. You're outgunned, amigo.", 'gold_tag': 'Gib is aware of popular culture figures like Axl Rose and Madonna , 



prediction : 
Real answer : Yeah, but you're not her parents, anymore, you and Helen. Her parents are Axl Rose and Madonna. The five minutes you spend a day with her can't compete with that kind of constant bombardment. You're outgunned, amigo.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.0], 'recall': [0.0], 'f1': [0.0], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : nan
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: Twenty here, fifty there... I figured my wife's boyfriend was taking it.\nHARRY: I thought you moved out.\n\n", 'answer': "Well . . . I moved back in. My lawyer said it would give me a better claim on the house in the property settlement. Don't change the subject... you owe me two hundred bucks.", 'gold_tag': "GIB's lawyer's advice has led him to move back in with his spouse , GIB plans to get a beneficial claim from the property settlement", 'last_speaker': 'GIB'}
Last word -> GIB : "Wel

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8540767431259155], 'recall': [0.8421385288238525], 'f1': [0.8480656147003174], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0851063829787234, 'rouge2': 0.0, 'rougeL': 0.0851063829787234, 'rougeLsum': 0.0851063829787234}
bleu 1/2 : 0.014161092357946473 0.004660986106335019
ppl : 76.872802734375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: It's all set up... ghost phones and fax, all the usual stuff. You have a suite at the Marquis Hotel under Renquist. Okay, reality check. Go.\n\n", 'answer': "Hi, I'm Harry Renquist. I own a--", 'gold_tag': 'HARRY is a business owner under the alias of Harry Renquist , HARRY will be taking on the persona of a business owner in the near future', 'last_s

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.821807861328125], 'recall': [0.8435856103897095], 'f1': [0.8325543403625488], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.23529411764705882, 'rouge2': 0.13333333333333333, 'rougeL': 0.23529411764705882, 'rougeLsum': 0.23529411764705882}
bleu 1/2 : 0.14285714285714285 0.048795003647426664
ppl : 18.920557022094727
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: There's another guy, still in the car.\n\n", 'answer': 'Stay on him.', 'gold_tag': 'HARRY is commanding , HARRY suggests a position of authority', 'last_speaker': 'HARRY'}
Last word -> HARRY : "Stay on him."
prediction :  Yeah, I guess that's why I'm here. Who's he? Oh. So this isn't the date I was supposed to meet my girlfriend, r

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.806605339050293], 'recall': [0.8754957318305969], 'f1': [0.8396399021148682], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 14.155574798583984
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: Copy that.\n\n', 'answer': 'And make it fast. My horse is getting tired.', 'gold_tag': "HARRY seems to be in a hurry , HARRY's statement about the horse getting tired implies he may currently be on horseback", 'last_speaker': 'HARRY'}
Last word -> HARRY : "And make it fast. My horse is getting tired."
prediction :  Yeah, I did it. This will work. We can make it work.
Real answer : And make it fast. My horse is getting tired.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8852246403694153], 'recall': [0.8843624591827393], 'f1': [0.8847934007644653], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1904761904761905, 'rouge2': 0.10526315789473685, 'rougeL': 0.1904761904761905, 'rougeLsum': 0.1904761904761905}
bleu 1/2 : 0.16666666666666669 0.12309149097933275
ppl : 30.795839309692383
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: Hey, did that guy Harry chased last night call in to give himself up yet? Cause apparently that's the only way we're going to catch the son of a bitch --\n\n", 'answer': 'Okay. Here she comes.', 'gold_tag': 'HARRY is observant , HARRY values the tactical aspect of his work', 'last_speaker': 'HARRY'}
Last word -> HARRY : "Okay. Here sh

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.800861120223999], 'recall': [0.8896868228912354], 'f1': [0.8429403901100159], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 130.45127868652344
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHARRY: Look at the way the little punk is driving. He's all over the place. Look-- he hasn't signalled once.\n\n", 'answer': "Goddammit! That's outrageous.", 'gold_tag': "GIB is outraged , The cause of GIB's outrage is reckless driving", 'last_speaker': 'GIB'}
Last word -> GIB : "Goddammit! That's outrageous."
prediction :  That's just the way he is. It's nothing. Harry's just a kid, don't you see? Not really like the rest of us.
Real 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8327631950378418], 'recall': [0.8316383361816406], 'f1': [0.8322004079818726], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13333333333333336, 'rouge2': 0.07142857142857142, 'rougeL': 0.13333333333333336, 'rougeLsum': 0.13333333333333336}
bleu 1/2 : 0.04545454545454546 0.014712247158412498
ppl : 12.781267166137695
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHARRY: So, you wanna play huh? You little--\n\n', 'answer': "That's it. Get 'im Harry We'll teach that little puke not to signal.", 'gold_tag': 'HARRY shows a willingness to fight', 'last_speaker': 'GIB'}
Last word -> GIB : "That's it. Get 'im Harry We'll teach that little puke not to signal."
prediction :  I would like - to have my ow

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8357625603675842], 'recall': [0.8389986157417297], 'f1': [0.8373774886131287], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06666666666666667, 'rouge2': 0.0, 'rougeL': 0.06666666666666667, 'rougeLsum': 0.06666666666666667}
bleu 1/2 : 0.0625 0.02041241452319315
ppl : 29.192529678344727
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: Ha! The little prick ditched you! That is so goddamn funny.\nHARRY: Son of a bitch!\n\n', 'answer': 'Can we go to the office now, Mr. Superspy?', 'gold_tag': 'Everyday Language', 'last_speaker': 'GIB'}
Last word -> GIB : "Can we go to the office now, Mr. Superspy?"
prediction :  You did what?!! You don't think I will just leave you with that shit-head just bec

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8083900213241577], 'recall': [0.8461723327636719], 'f1': [0.8268498182296753], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07792207792207792, 'rouge2': 0.0, 'rougeL': 0.051948051948051945, 'rougeLsum': 0.051948051948051945}
bleu 1/2 : 0.030303030303030304 0.0068278874199891905
ppl : 18.695158004760742
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: What's going on?\nHARRY: Look, uh... I've got to talk to Helen about this thing with Dana. I'm just going to run in and see if she can get away for lunch.\nGIB: You want me to just hang-- ?\nHARRY: Just hang a minute.\n\n", 'answer': "I'll just hang then, shall I?", 'gold_tag': 'GIB acts patiently and cooperatively', 'last_speaker': 'GIB'}
La

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8283669948577881], 'recall': [0.805006206035614], 'f1': [0.8165194988250732], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 164.419921875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: You look like you got gut-kicked. What's the matter? You sick?\nHARRY: He-- Helen. Helen-- it's Helen. It's Helen, Gib.\nGIB: Something to do with Helen, is what I'm getting.\nHARRY: She's having an affair.\nGIB: Congratulations. Welcome to the club.\nHARRY: It can't be. Not Helen.\nGIB: Nobody believes it can happen to them.\nHARRY: It can't be.\nGIB: Same thing happened to me with wife two. I had no idea until I came home and the hou

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8539328575134277], 'recall': [0.8192147612571716], 'f1': [0.8362135291099548], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07407407407407407, 'rouge2': 0.02531645569620253, 'rougeL': 0.07407407407407407, 'rougeLsum': 0.07407407407407407}
bleu 1/2 : 0.00011398524569431456 3.8534046149930103e-05
ppl : 62.38795852661133
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: Okay, she's turning on Seventeenth. Make a left, you should see her.\n\n", 'answer': 'There she is.', 'gold_tag': "HARRY displays his alertness , HARRY acts quickly upon GIB's instructions", 'last_speaker': 'HARRY'}
Last word -> HARRY : "There she is."
prediction :  You know she doesn't like me.
Real answer : There she is.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.87071293592453], 'recall': [0.9087542295455933], 'f1': [0.8893269896507263], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2, 'rouge2': 0.0, 'rougeL': 0.2, 'rougeLsum': 0.2}
bleu 1/2 : 0.16666666666666669 0.05773502691896258
ppl : 44.87101364135742
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: Guy's a spook!\nHARRY: Yeah, but for who? *\n\n", 'answer': 'He could be working her to get to you.', 'gold_tag': 'GIB is knowledgeable about intelligence operations , HARRY is likely a person of interest , Others may want to infiltrate HARRY , HARRY has something valuable or secret', 'last_speaker': 'GIB'}
Last word -> GIB : "He could be working her to get to you."
prediction :  That's a lie!
Rea

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8510387539863586], 'recall': [0.8641965389251709], 'f1': [0.8575671911239624], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 74.28396606445312
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHARRY: Shit! It's almost eight. * She's still at my house. *\n\n", 'answer': 'The purse is still at the house.', 'gold_tag': 'Everyday Language', 'last_speaker': 'GIB'}
Last word -> GIB : "The purse is still at the house."
prediction :  I wouldn't wait longer...
Real answer : The purse is still at the house.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.838790237903595], 'recall': [0.8754667043685913], 'f1': [0.8567360639572144], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 169.8761749267578
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: You've reached a new low with this one. I can't believe you're crazy enough to use the room at the Marquis.\nHARRY: Why not? You think I can afford a suite like that on my salary? Is Jean-Claude done yet? *\n\n", 'answer': 'Not quite.', 'gold_tag': 'GIB suggests caution and prudence in operations', 'last_speaker': 'GIB'}
Last word -> GIB : "Not quite."
prediction :  It's 89 degrees in the hallway.
Real answer : Not quite.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8681236505508423], 'recall': [0.871871829032898], 'f1': [0.8699936866760254], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 45.55819320678711
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: I thought this look like your work.\nHARRY: Let's go. I'll brief you in the air.\n\n", 'answer': "You're welcome.", 'gold_tag': 'GIB suggests his job involves analysis or field observation', 'last_speaker': 'GIB'}
Last word -> GIB : "You're welcome."
prediction :  Come on, come on. You really got to get it out. No matter what, I'm going to beat your ass if I gotta.
Real answer : You're welcome.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8209794759750366], 'recall': [0.8480671048164368], 'f1': [0.8343034386634827], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07407407407407407, 'rouge2': 0.0, 'rougeL': 0.07407407407407407, 'rougeLsum': 0.07407407407407407}
bleu 1/2 : 0 0
ppl : 16.800952911376953
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: --you tell the son of bitch this is Bright Boy Alert. Repeat,\nHARRY: ... the Coast Guard has to clear them back to a twenty mile radius. Anybody that can't make the minimum safe distance we need an airlift on, immediately--\nGIB: I can get 3 Marine Corps Harriers here in about 12 minutes. They're on maneuvers out of Boca Chica.\n\n", 'answer': "Get 'em. I'll brief them on the way i

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8145334720611572], 'recall': [0.8651669025421143], 'f1': [0.8390870094299316], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11538461538461536, 'rouge2': 0.0, 'rougeL': 0.07692307692307693, 'rougeLsum': 0.07692307692307693}
bleu 1/2 : 0.05128205128205127 0.011616917255955381
ppl : 24.603858947753906
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGIB: ... well get the Highway Patrol to go through the streets and tell everybody on their damn loudspeakers. Just the basics... get away from windows, don't look at it... yeah--\n\n", 'answer': 'Here they come. Roger, Mike Three Five, you are cleared to engage. Be advised, your targets have Stingers and light machine guns.', 'gold_tag': 'HARRY is in 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8572326898574829], 'recall': [0.820976734161377], 'f1': [0.8387130498886108], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16, 'rouge2': 0.0, 'rougeL': 0.08, 'rougeLsum': 0.08}
bleu 1/2 : 0.000592034515244793 0.00022929398179230649
ppl : 139.25210571289062
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: What the hell are you?\n\n', 'answer': 'Interested? Follow the crow.', 'gold_tag': 'Everyday Language', 'last_speaker': 'SKULL COWBOY'}
Last word -> SKULL COWBOY : "Interested? Follow the crow."
prediction :  (looking at Eric) You are a zombie. A reanimated corpse, and the only way I can understand how it moves are not in any conventional sense of the word, I can understand it if you'll 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8040065765380859], 'recall': [0.8316113948822021], 'f1': [0.817575991153717], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.031746031746031744, 'rouge2': 0.0, 'rougeL': 0.031746031746031744, 'rougeLsum': 0.031746031746031744}
bleu 1/2 : 0.01724137931034483 0.0054998212587135924
ppl : 15.281291007995605
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSKULL COWBOY: Get it?\nERIC: Leave me alone -- !\n\n', 'answer': 'Do something about it.', 'gold_tag': 'Everyday Language', 'last_speaker': 'SKULL COWBOY'}
Last word -> SKULL COWBOY : "Do something about it."
Real answer : Do something about it.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8262126445770264], 'recall': [0.8646923303604126], 'f1': [0.8450146317481995], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 60.115299224853516
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSKULL COWBOY: Glad to see you're finally with the program.\nERIC: Bugger off to the graveyard, skull- face, I'm busy.\nSKULL COWBOY: You work for the dead. Forget that, and you can forget it all.\n\n", 'answer': 'Forget this. She always red red was her color.', 'gold_tag': 'ERIC has a connection with someone who loved the color red', 'last_speaker': 'ERIC'}
Last word -> ERIC : "Forget this. She always red red was her color."
predictio

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8214699029922485], 'recall': [0.8382179737091064], 'f1': [0.8297594785690308], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 165.1239776611328
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSKULL COWBOY: Do this thing and you will be vulnerable. The blood will not return. No powers. No reunion. Nothing.\nERIC: Fine with me.\nSKULL COWBOY: You'll be alone.\n\n", 'answer': "I'm already alone.", 'gold_tag': 'ERIC is in a state of isolation', 'last_speaker': 'ERIC'}
Last word -> ERIC : "I'm already alone."
prediction :  You think I wouldn't think that you would?
Real answer : I'm already alone.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7959832549095154], 'recall': [0.857693612575531], 'f1': [0.8256869912147522], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.15384615384615383, 'rouge2': 0.0, 'rougeL': 0.15384615384615383, 'rougeLsum': 0.15384615384615383}
bleu 1/2 : 0 0
ppl : 53.775882720947266
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: Don't waste my time.\n\n", 'answer': "Very well, it's your ass.", 'gold_tag': 'SKULL COWBOY speaks in a direct, no-nonsense manner', 'last_speaker': 'SKULL COWBOY'}
Last word -> SKULL COWBOY : "Very well, it's your ass."
prediction :  I see.
Real answer : Very well, it's your ass.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8811278343200684], 'recall': [0.8492595553398132], 'f1': [0.8649001717567444], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 602.51904296875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSWAY: Come back when you've learned how to drink, Billy --\nMEMPHIS: See you're still stealing the sailors from the sea --\n\n", 'answer': 'What are you doing here?', 'gold_tag': 'SWAY is familiar with MEMPHIS and is surprised at his presence', 'last_speaker': 'SWAY'}
Last word -> SWAY : "What are you doing here?"
prediction :   I won't give you anything, Billy, I'm only looking to build up my business -- you don't have to steal anything

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8351657390594482], 'recall': [0.8363513946533203], 'f1': [0.8357581496238708], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06666666666666667, 'rouge2': 0.0, 'rougeL': 0.06666666666666667, 'rougeLsum': 0.06666666666666667}
bleu 1/2 : 0.043478260869565216 0.014058038927888336
ppl : 22.26987075805664
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMEMPHIS: Gosh, no. Lipstick? What next? Mascara, blush,\nSWAY: Deodorant.\nMEMPHIS: Wow. Bozo, the very sexy clown --\nSWAY: So, you seeing anybody?\nMEMPHIS: No. I had a girl. She was great. The problem is: great girls come along once every ten years. So I gotta wait another three years before I can even bother to look...\nSWAY: She was so great, why

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8505499958992004], 'recall': [0.8337379693984985], 'f1': [0.8420600295066833], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.125, 'rouge2': 0.0, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0 0
ppl : 45.55460739135742
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSWAY: What are you doing... ?\nMEMPHIS: Seeing if you wanted to go for a ride?\nSWAY: I can't. I got a back load of repairs and one of the mechanics called in sick and I haven't slept and-- (stops short; sees the new Eleanor Where to -- ?\nMEMPHIS: I dunno. I know a place.\nSWAY: This time it's for real?\n\n", 'answer': 'Oh, yeah. For real, point-five.', 'gold_tag': "MEMPHIS invites SWAY for a 'real' car ride", 'last_speaker': 'M

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8522515892982483], 'recall': [0.8300861120223999], 'f1': [0.8410228490829468], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1565.4556884765625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGWEN: Unbelievable.\nJOSH: What?\nGWEN: I'm reading a stupid article in a stupid magazine.\nJOSH: If it's stupid, why read it? Yeah. She's a bore.\nGWEN: She's breaking up with her husband of twenty years. Said a friend,\nJOSH: Some poor tree gave its life so we could know that.\n\n", 'answer': "It's comforting, reading about these broken marriages.", 'gold_tag': 'GWEN finds comfort in reading about broken marriages', 'last_speaker': 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8714146614074707], 'recall': [0.8724560141563416], 'f1': [0.8719350099563599], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 87.12943267822266
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOSH: Why?\nGWEN: It's not enough for our marriage to succeed. Everyone else's must fail.\nJOSH: Is this going to be one of those nights?\nGWEN: What?\nJOSH: One of those nights we talk.\nGWEN: No. There's no need to talk, because there's no problem.\nJOSH: I've been under a lot of stress.\nGWEN: You don't have to apologize. There's nothing wrong with the slump we're going through. In fact, it's a good thing.\nJOSH: How's that?\nGWEN: 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.847759485244751], 'recall': [0.8704543709754944], 'f1': [0.8589571118354797], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 19.560466766357422
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGWEN: You missed the on-ramp.\nJOSH: I know.\nGWEN: Why?\n\n', 'answer': 'Somewhere we need to go first.', 'gold_tag': 'Everyday Language', 'last_speaker': 'JOSH'}
Last word -> JOSH : "Somewhere we need to go first."
prediction : I don't know, what kind of person would miss an on-ramp?!
Real answer : Somewhere we need to go first.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8027899265289307], 'recall': [0.854543924331665], 'f1': [0.8278588652610779], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 28.932296752929688
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGWEN: We don't have to do this tonight.\nJOSH: Yes, we do.\nGWEN: We can't go in here. Everything's different.\n\n", 'answer': "That's okay. We're different, too.", 'gold_tag': 'JOSH is experiencing a change tonight', 'last_speaker': 'JOSH'}
Last word -> JOSH : "That's okay. We're different, too."
prediction :  We can't do this.
Real answer : That's okay. We're different, too.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8697739839553833], 'recall': [0.8790158033370972], 'f1': [0.8743704557418823], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 44.17261505126953
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGWEN: Sarah\'s a very interesting girl. Did you know she\'s training for the marathon?\nJOSH: I don\'t want to talk about Sarah.\nGWEN: What do you want to talk about, then?\nJOSH: I want to talk about me... ...and what a fool I\'ve been. I thought if we came here tonight we could...start again.\nGWEN: But I thought you were over this place. And, look...There\'s nothing left. Not one remn

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8167641162872314], 'recall': [0.8404178023338318], 'f1': [0.8284221291542053], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.029850746268656716, 'rouge2': 0.0, 'rougeL': 0.029850746268656716, 'rougeLsum': 0.029850746268656716}
bleu 1/2 : 0 0
ppl : 22.82757568359375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOSH: By the way, I didn't get your name.\nGWEN: What? Oh, my name. My name is Gail. Gail Bonds.\nJOSH: Well, Gail, I'm not usually this forward on a first date, may I take you to my car and ravish you?\nGWEN: Hmm. On one condition.\nJOSH: What's that?\nGWEN: That you tell me your name.\n\n", 'answer': "Oh, I thought everyone knew my name. I'm Bill Gates. Billionaire.", 'gold_tag': 'JO

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.847610592842102], 'recall': [0.852070689201355], 'f1': [0.8498347401618958], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.125, 'rouge2': 0.0, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0.043443485862611285 0.01586331812240599
ppl : 1176.3826904296875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGWEN: Is it true what they say, Bill, about billionaires? That all the wealth is just a way to compensate...for size?\nJOSH: People are jealous.\nGWEN: I can imagine. It must be so lonely at the top. What are you doing?\nJOSH: What do you think I'm doing?\nGWEN: We're not in a bed.\n\n", 'answer': "You shouldn't read fortune cookies too literally. ...Can anyone see us?...Is anyone looking?...Wh

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8452416658401489], 'recall': [0.7861489057540894], 'f1': [0.8146250247955322], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16, 'rouge2': 0.020408163265306124, 'rougeL': 0.08, 'rougeLsum': 0.08}
bleu 1/2 : 0.015720316857141484 0.002562098186640613
ppl : 17.254648208618164
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOSH: You're right. We could use some more color in the living room.\nGWEN: Yes?\nJOSH: I said you're right.\nGWEN: About what?\nJOSH: About more color in the living room.\n\n", 'answer': "Oh. I'm glad you think so.", 'gold_tag': 'Everyday Language', 'last_speaker': 'GWEN'}
Last word -> GWEN : "Oh. I'm glad you think so."
prediction :  I wouldn't know
Real answer : Oh. I'm glad

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8133939504623413], 'recall': [0.8310229778289795], 'f1': [0.8221139311790466], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.18181818181818182, 'rouge2': 0.0, 'rougeL': 0.18181818181818182, 'rougeLsum': 0.18181818181818182}
bleu 1/2 : 0 0
ppl : 58.507266998291016
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJUDITH: Good morning Tom Quiney. A barrel of Huffcap ale and three flagons of Malmesey wine to be delivered to New Place if you would. On our account.\nQUINEY: Your usual order and my usual reply. Marry me Judith. I would help bring back your smile.\nJUDITH: Yes and every other maid in the county’s. You chase them all.\nQUINEY: But I only ever propose to one.\nJUDITH: I’d think you’d tir

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8666427135467529], 'recall': [0.8314414620399475], 'f1': [0.8486772775650024], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1509433962264151, 'rouge2': 0.0, 'rougeL': 0.11320754716981132, 'rougeLsum': 0.11320754716981132}
bleu 1/2 : 0.008152440795673243 0.0019215487232808008
ppl : 120.99413299560547
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nQUINEY: A glass Miss Shakespeare? To toast this sunny morning?\nJUDITH: I need no excuse for a glass of wine Mr. Quiney. Beat.\nQUINEY: There is another pleasure, equally heady.\nJUDITH: And plenty of country girls for you to get it with.\nQUINEY: Your looks won’t last forever Jude.\n\n', 'answer': 'And I shall be glad when they’re gone because I sha

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8571221232414246], 'recall': [0.837496280670166], 'f1': [0.847195565700531], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08, 'rouge2': 0.0, 'rougeL': 0.08, 'rougeLsum': 0.08}
bleu 1/2 : 0.004578909722183545 0.0016719814290797606
ppl : 478.17132568359375
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nQUINEY: You know that I am not a good man. There have been women. Many women.\n\n', 'answer': 'I have seen too little of life. You’ve seen too much. Between us perhaps we may begin again.', 'gold_tag': 'JUDITH has led a sheltered life , QUINEY acknowledges his moral shortcomings', 'last_speaker': 'JUDITH'}
Last word -> JUDITH : "I have seen too little of life. You’ve seen too much. Between us pe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.818897008895874], 'recall': [0.8270999193191528], 'f1': [0.8229780197143555], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08695652173913043, 'rouge2': 0.0, 'rougeL': 0.08695652173913043, 'rougeLsum': 0.08695652173913043}
bleu 1/2 : 0.007549345855579625 0.0027566313463393993
ppl : 230.6244659423828
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJUDITH: I saw Margaret Wheeler in the Church today Tom.\n\n', 'answer': 'I know it Judith and I have confessed all to you. I can not undo what is done. All I can say is that her child and she will not want.', 'gold_tag': 'QUINEY has confessed a past mistake involving Margaret Wheeler and her child , QUINEY has revealed a level of responsibility', 'las

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8178555965423584], 'recall': [0.7948111295700073], 'f1': [0.8061687350273132], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 13011.365234375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCOBB V.O.: It's cookin', Bud. It's cookin' fast.\n\n", 'answer': "Mike, I've got Edgar Bingham now. Just showed with two of his crew, for a sit down with Kay.", 'gold_tag': 'BUD is involved in some kind of operation , BUD is currently dealing with Edgar Bingham and his crew', 'last_speaker': 'BUD'}
Last word -> BUD : "Mike, I've got Edgar Bingham now. Just showed with two of his crew, for a sit down with Kay."
prediction :  We're out of 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8163514137268066], 'recall': [0.8208819031715393], 'f1': [0.818610429763794], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07407407407407408, 'rouge2': 0.0, 'rougeL': 0.07407407407407408, 'rougeLsum': 0.07407407407407408}
bleu 1/2 : 0.01909314066544796 0.006614057941723061
ppl : 109.56363677978516
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCOBB V.O.: Negative at Tibedaux's. Can't walk in. What's our move?\n\n", 'answer': 'We wait.', 'gold_tag': 'Everyday Language', 'last_speaker': 'BUD'}
Last word -> BUD : "We wait."
prediction :  We have to play it now, Cobb V.O.
Real answer : We wait.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8382891416549683], 'recall': [0.8704086542129517], 'f1': [0.8540470004081726], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1818181818181818, 'rouge2': 0.0, 'rougeL': 0.1818181818181818, 'rougeLsum': 0.1818181818181818}
bleu 1/2 : 0.12500000000000003 0.04225771273642583
ppl : 170.84564208984375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBUD: It's Jesse...\nCOBB V.O.: He's heading toward the alley, Bud. We take him down?\nBUD: Let him go.\nCOBB V.O.: What?!\nBUD: Let him go.\nCOBB V.O.: Bud, that's crazy! Jesse goes in first, can't let that happen.\n\n", 'answer': "That's exactly what we're gonna do! We got no way inside. Let Jesse go in hard and draw fire. He pushes Lucian out to us. Sta

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8355299830436707], 'recall': [0.8362380862236023], 'f1': [0.8358838558197021], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0816326530612245, 'rouge2': 0.0, 'rougeL': 0.0816326530612245, 'rougeLsum': 0.0816326530612245}
bleu 1/2 : 0 0
ppl : 26.120534896850586
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBLANCA: So how’s that thing?\nMARIA: What thing?\nBLANCA: Your job? What do you have to do?\nMARIA: I told you not to talk about it.\nBLANCA: Maria... Maria, look at me!\nMARIA: What the hell are you doing?\nBLANCA: I’m going to be a mule. I talked to Franklin and he arranged it.\nMARIA: You what?\nBLANCA: You heard me. Why not?\nMARIA: Because you can’t... you won’t make it.\nBLANCA: Well,

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8040478229522705], 'recall': [0.8306537866592407], 'f1': [0.8171342611312866], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 13.448177337646484
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBLANCA: ...How are you doing?\n\n', 'answer': '...OK.... You?', 'gold_tag': 'Everyday Language', 'last_speaker': 'MARIA'}
Last word -> MARIA : "...OK.... You?"
prediction :  Fine. Why not?
Real answer : ...OK.... You?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8769177198410034], 'recall': [0.8371959924697876], 'f1': [0.8565966486930847], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 342.1376647949219
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBLANCA: What are you doing?\nMARIA: We can’t stay here?\nBLANCA: What do you mean?\nMARIA: Don’t you get it? They did something to Lucy? Everything’s totally fucked up. We have to get out of here before they come back.\nBLANCA: But...\n\n', 'answer': 'Hurry up!!', 'gold_tag': 'Everyday Language', 'last_speaker': 'MARIA'}
Last word -> MARIA : "Hurry up!!"
prediction :  What?
Real answer : Hurry up!!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8697062730789185], 'recall': [0.7863094210624695], 'f1': [0.8259079456329346], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 5475.181640625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMELANIE: What made it...\n\n', 'answer': "It deliberately came down at you -- you're bleeding...", 'gold_tag': 'MELANIE is currently bleeding', 'last_speaker': 'MITCH'}
Last word -> MITCH : "It deliberately came down at you -- you're bleeding..."
prediction :  You were at the beach.
Real answer : It deliberately came down at you -- you're bleeding...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8822742700576782], 'recall': [0.8281571865081787], 'f1': [0.8543595671653748], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2857142857142857, 'rouge2': 0.0, 'rougeL': 0.14285714285714285, 'rougeLsum': 0.14285714285714285}
bleu 1/2 : 0.08986579282344431 0.03177235575108143
ppl : 109.74031829833984
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMITCH: You\'ll be able to find your way back, won\'t you?\nMELANIE: Oh, yes. Will I be seeing you again? San Francisco\'s a long way from here.\nMITCH: I\'m in San Francisco five days a week. With a lot of time on my hands. I\'d like to see you. Maybe we could go swimming or something. Mother tells me you like to swim.\nMELANIE: How does Mother know wha

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8823518753051758], 'recall': [0.8439359664916992], 'f1': [0.8627164363861084], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.26666666666666666, 'rouge2': 0.0, 'rougeL': 0.26666666666666666, 'rougeLsum': 0.26666666666666666}
bleu 1/2 : 0.14715177646857694 0.024525296078096157
ppl : 61.203006744384766
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMITCH: It smelled of the fire.\nMELANIE: It\'s hard to believe anything at all happened yesterday, isn\'t it? It\'s so beautiful and still now. I think I\'ve got it all figured out, by the way.\nMITCH: Really? Tell me about it.\nMELANIE: It\'s an uprising.\nMITCH: Of birds?\nMELANIE: Certainly, of birds. It all started several months ago with a peasan

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8474525213241577], 'recall': [0.8481442332267761], 'f1': [0.8477982878684998], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14814814814814814, 'rouge2': 0.08, 'rougeL': 0.14814814814814814, 'rougeLsum': 0.14814814814814814}
bleu 1/2 : 0 0
ppl : 43.2509651184082
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMELANIE: They were angry, Mitch. They came out of the chimney in fury. I had the feeling they wanted each and every one of us dead.\nMITCH: Melanie, Melanie...\nMELANIE: I'm frightened, Mitch.\nMITCH: No, no...\nMELANIE: I'm frightened and confused and I... I think I want to go back to San Francisco where there are buildings and... and concrete and...\nMITCH: Melanie...\n\n", 'answer': '.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8087499141693115], 'recall': [0.8465100526809692], 'f1': [0.8271993398666382], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08888888888888888, 'rouge2': 0.0, 'rougeL': 0.08888888888888888, 'rougeLsum': 0.08888888888888888}
bleu 1/2 : 0.03571428571428572 0.011501092655705905
ppl : 13.057580947875977
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMITCH: That was Al on the phone. He wants me to meet him out at the Fawcett place. Says some detectives from Santa Rosa'll be there in a little while. Will you be all right here?\n\n", 'answer': 'Yes. I was just taking her in some tea. Be careful. Please.', 'gold_tag': "MELANIE is caring, as she was about to take someone tea , MELANIE's immediate plan

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8437373638153076], 'recall': [0.8688933253288269], 'f1': [0.8561305403709412], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07692307692307691, 'rouge2': 0.0, 'rougeL': 0.07692307692307691, 'rougeLsum': 0.07692307692307691}
bleu 1/2 : 0.08300915602566021 0.027531022470118405
ppl : 17.204404830932617
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMELANIE: When will they stop?\nMITCH: I thought they'd have stopped by\nMELANIE: What time is it?\nMITCH: Almost two a.m.\nMELANIE: You must be exhausted.\n\n", 'answer': 'How about you? I wonder if Cathy fed them.', 'gold_tag': 'MITCH cares about his family , MITCH remembered to check if Cathy fed the birds', 'last_speaker': 'MITCH'}
Last word -> MIT

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8673031330108643], 'recall': [0.8464511036872864], 'f1': [0.8567502498626709], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10526315789473685, 'rouge2': 0.0, 'rougeL': 0.10526315789473685, 'rougeLsum': 0.10526315789473685}
bleu 1/2 : 0 0
ppl : 18.768848419189453
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMITCH: You should have left when you wanted to. You'd have been safe in San Francisco.\nMELANIE: I don't want to be safe. I want to be with you.\nMITCH: The power.\nMELANIE: Mitch...\nMITCH: Wait here. Don't move. We'd better light some of those lamps.\nMELANIE: No... wait. Hold me.\n\n", 'answer': "I think they're easing off.", 'gold_tag': 'Everyday Language', 'last_speaker': 'MITCH'}
L

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.840863823890686], 'recall': [0.8574971556663513], 'f1': [0.8490990996360779], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07142857142857144, 'rouge2': 0.0, 'rougeL': 0.07142857142857144, 'rougeLsum': 0.07142857142857144}
bleu 1/2 : 0.04545454545454546 0.014712247158412498
ppl : 26.93757438659668
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMITCH: Here we go.\n\n', 'answer': 'Mitch!', 'gold_tag': 'Everyday Language', 'last_speaker': 'MELANIE'}
Last word -> MELANIE : "Mitch!"
prediction :  Yeah, I guess I always do that. I go back. I try to get to know you better.
Real answer : Mitch!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8244595527648926], 'recall': [0.8391187191009521], 'f1': [0.8317245244979858], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 22.412452697753906
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nEMMETT: She married an old man, she's made a living on telling women they're too fat, she hawks her crap on the Home Shopping Network...\nELLE: But maybe it's not what you think.\nEMMETT: Bu t maybe it is-- . How do you figure?\nELLE: Because people aren't always what they seem to be and you refuse to see that. Have a little faith. You might be surprised.\nEMMETT: I can't believe you called me a butt- head. No one's called me a butt-h

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8297311067581177], 'recall': [0.8065178394317627], 'f1': [0.8179597854614258], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 51.0197639465332
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELLE: I'll cover her -- parts.\nEMMETT: Well, that's what we're trying to prove didn't happen. Do you have any reason to believe it did?\nELLE: She's not!\n\n", 'answer': "Did your daughter ever say anything to you about Brooke and Heyworth's relationship? an infomercial? She said they humped like Much as it is for me, hearing you tell about it. Why do you say that? Ys. . . . e I didn't stick around long enough to watch him stick his sw

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8512599468231201], 'recall': [0.8000807166099548], 'f1': [0.8248772025108337], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12987012987012986, 'rouge2': 0.0, 'rougeL': 0.07792207792207792, 'rougeLsum': 0.07792207792207792}
bleu 1/2 : 0.002555192554286581 0.00048725610176622857
ppl : 44.21149444580078
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELLE: I never trust a woman who's not blonde. Except for my friend Serena, but that's only because she's a blonde at heart. That's the whole reason I ' m starting the Blonde Legal Defense Fund. Blondes are discriminated against worldwide! Brooke's a blonde, and people are saying she's sleeping with the cheesy pool boy and shooting her husband. If sh

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7771592140197754], 'recall': [0.8143231272697449], 'f1': [0.7953071594238281], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 2180.72509765625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nEMMETT: Whoa -- Why? internship was a mistake. What're you talking about? You earned it-\nELLE: I didn't earn anything. I got this\nEMMETT: So now you're --? Going back to LA. Maybe I can fulfill my destiny as a useless bimbo and join the Swedish Bikini Team. No more navy blue suits. No more panty-hose. No more trying to be something I ' m not. What if you're trying to be something you are? The hell with Donovan. Stay.\n\n", 'answer': "

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8607943058013916], 'recall': [0.8534955978393555], 'f1': [0.8571294546127319], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.25641025641025644, 'rouge2': 0.05405405405405406, 'rougeL': 0.15384615384615385, 'rougeLsum': 0.15384615384615385}
bleu 1/2 : 0.10526315789473682 0.024182541670333724
ppl : 28.22800636291504
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTREVOR: Hey there, boy. You're a good dog. Yes, you are.\nELLEN: Sorry for reacting the way I did back there.\nTREVOR: Don't apologize. That's how most\nELLEN: Didn't scare me. It just surprised me. ... Thanks for dinner. I had a good time.\nTREVOR: You're welcome.\nELLEN: Here... you need a little life in that place of yours.\nTREVOR: 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8316745758056641], 'recall': [0.9057753086090088], 'f1': [0.8671448230743408], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0.10000000000000002 0.03333333333333335
ppl : 223.73695373535156
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELLEN: ... I have to fly up there for a computer conference.\nTREVOR: When's that?\nELLEN: Day after tomorrow.\n\n", 'answer': 'Never been to San Francisco.', 'gold_tag': 'TREVOR has never been to San Francisco', 'last_speaker': 'TREVOR'}
Last word -> TREVOR : "Never been to San Francisco."
prediction :  You're going to do it.
Real answer : Never been to San Francisco.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8654856085777283], 'recall': [0.8679859638214111], 'f1': [0.8667339086532593], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1818181818181818, 'rouge2': 0.0, 'rougeL': 0.1818181818181818, 'rougeLsum': 0.1818181818181818}
bleu 1/2 : 0.2 0.07071067811865477
ppl : 31.60930061340332
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTREVOR: If you want, I'll take care of Jack.\nELLEN: I can't ask you to do that.\n\n", 'answer': 'Sure you can.', 'gold_tag': 'Trevor is helpful and supportive', 'last_speaker': 'TREVOR'}
Last word -> TREVOR : "Sure you can."
prediction :  How about this? How does this sound? I'll set up dinner at a fancy restaurant, you tell me about Jack.
Real answer : Sure you can.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8213028907775879], 'recall': [0.8671197891235352], 'f1': [0.8435896635055542], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08333333333333333, 'rouge2': 0.0, 'rougeL': 0.08333333333333333, 'rougeLsum': 0.08333333333333333}
bleu 1/2 : 0.05000000000000001 0.01622214211307625
ppl : 29.233844757080078
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELLEN: ... These conferences mainly focus on software now.\nTREVOR: Software sells.\nELLEN: So, how's Jack?\nTREVOR: ... Jack's good.\nELLEN: And what about Trevor? Was Trevor a good boy, too?\nTREVOR: What's that supposed to mean?\nELLEN: Nothing...\nTREVOR: Is that supposed to mean something?\n\n", 'answer': 'Trevor...', 'gold_tag': 'Everyday Languag

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8196873664855957], 'recall': [0.8455556631088257], 'f1': [0.8324205875396729], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 89.47402954101562
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTREVOR: Jack?\n\n', 'answer': 'Here, boy... here, Jack. Come to Mommy, sweetheart. ... Oh, my God. Jack?!', 'gold_tag': "ELLEN calls to Jack like a mother to a child  , ELLEN shows concern when he doesn't respond", 'last_speaker': 'ELLEN'}
Last word -> ELLEN : "Here, boy... here, Jack. Come to Mommy, sweetheart. ... Oh, my God. Jack?!"
prediction :  Who is this?
Real answer : Here, boy... here, Jack. Come to Mommy, sweetheart. ... Oh, 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8283256888389587], 'recall': [0.8076480627059937], 'f1': [0.8178561925888062], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 203.5897216796875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTREVOR: It's aspirin...\nELLEN: ... I can't talk to you right now, Trevor. Please. I'm so upset I can't even drive. I just need some time alone.\n\n", 'answer': "I know about what's not fair. I left part of my lung in a damn hospital because some gangbanger didn't like the grade I gave him. Sometimes you do all the right things. You work hard in school... get a good job... pay your taxes. Things still go bad. I'm so sorry, Ellen.", 'go

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.82562255859375], 'recall': [0.8044478893280029], 'f1': [0.8148977160453796], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06557377049180328, 'rouge2': 0.0, 'rougeL': 0.06557377049180328, 'rougeLsum': 0.06557377049180328}
bleu 1/2 : 1.1962793480322522e-06 4.368194559965586e-07
ppl : 92.06372833251953
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELLEN: ... I'd like to apologize for the other night.\nTREVOR: Shouldn't listen to me. I was half asleep.\nELLEN: We really need to talk.\nTREVOR: ... Not tonight. Have to finish some work...\nELLEN: Tomorrow then? It's important to me.\nTREVOR: ... Okay.\n\n", 'answer': "... How 'bout seven o'clock?", 'gold_tag': "ELLEN wants to have a talk with TRE

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8602321743965149], 'recall': [0.8134851455688477], 'f1': [0.8362058401107788], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 139.81185913085938
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSHEPARD: You want to go back inside?\nWHITACRE: I think they might have it bugged. We're safer out here.\nSHEPARD: Bugged? What?\nWHITACRE: This used to be Dwayne Andreas' house, you know that?\nSHEPARD: I'm aware of that.\nWHITACRE: There are things I know, but if I decide to tell you what's going on, could I be prosecuted for it?\n\n", 'answer': "I can't provide you with immunity, but any information you tell me about your involveme

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8129141330718994], 'recall': [0.8135450482368469], 'f1': [0.813229501247406], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07142857142857142, 'rouge2': 0.0, 'rougeL': 0.07142857142857142, 'rougeLsum': 0.07142857142857142}
bleu 1/2 : 0 0
ppl : 97.8786392211914
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWHITACRE: What if we just move your chair over here.\n\n', 'answer': 'Thatta boy, Mark.', 'gold_tag': 'Everyday Language', 'last_speaker': 'SHEPARD'}
Last word -> SHEPARD : "Thatta boy, Mark."
prediction :  Not at all.
Real answer : Thatta boy, Mark.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.851760745048523], 'recall': [0.8558764457702637], 'f1': [0.8538135886192322], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 171.8991241455078
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWHITACRE: Look at that, look at Bob and Terry. Man, I'd love to be a fly on the wall for that one. I was good wasn't I? I seemed scared.\n\n", 'answer': "Remember what we told you, Mark: tell them that you're cooperating first thing and whatever you do, do not take one of their lawyers.", 'gold_tag': 'SHEPARD is in a position of authority or mentorship , SHEPARD provides guidance and advice to WHITACRE , WHITACRE is possibly involved in

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8413242101669312], 'recall': [0.8296695947647095], 'f1': [0.8354562520980835], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.125, 'rouge2': 0.0, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0.012594669908908207 0.004301898749003176
ppl : 99.9083480834961
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSHEPARD: Mark, you can't be here.\nWHITACRE: They threw me out, Brian.\nSHEPARD: You're jeopardizing everything.\nWHITACRE: Tossed me out of ADM. I'm out.\nSHEPARD: Not here, Mark-- on the phone we can set something up.\nWHITACRE: You gotta help me.\n\n", 'answer': 'If you need to talk, call me.', 'gold_tag': 'SHEPARD acts in a professional capacity', 'last_speaker': 'SHEPARD'}
Last word -> SH

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8530129194259644], 'recall': [0.8585845232009888], 'f1': [0.8557896018028259], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2857142857142857, 'rouge2': 0.0, 'rougeL': 0.2857142857142857, 'rougeLsum': 0.2857142857142857}
bleu 1/2 : 0.14285714285714285 0.048795003647426664
ppl : 75.0289077758789
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKIRK: Aren't you dead? I assume you are loitering here to learn what efficiency rating I plan to give your cadets.\nSPOCK: I am understandably curious.\nKIRK: They destroyed the simulator room and you with it.\nSPOCK: The Kobayshi Maru scenario frequently wreaks havoc with students and equipment. As I recall you took the test three times yourself. Your fin

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8415678143501282], 'recall': [0.9411358833312988], 'f1': [0.8885713219642639], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 85.16597747802734
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKIRK: We've got a problem. Something may be wrong at Regula I. We've been ordered to investigate.\nSPOCK: Regula I is a scientific research laboratory, if memory serves...\nKIRK: I told Starfleet all we had was a boatload of children but we're the only ship in the quadrant. Spock: those cadets of yours -- how good are they? How will they respond under real pressure?\nSPOCK: Like all living beings, Admiral each according to his gifts. T

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8748766183853149], 'recall': [0.8464181423187256], 'f1': [0.8604121208190918], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07407407407407407, 'rouge2': 0.0, 'rougeL': 0.07407407407407407, 'rougeLsum': 0.07407407407407407}
bleu 1/2 : 0.008152440795673243 0.002882323084921202
ppl : 860.0585327148438
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSPOCK: They knew just where to hit us.\nKIRK: WHO? Who knew just where to hit us? And why?\nSPOCK: One thing is certain; we cannot escape on auxiliary power.\nKIRK: Visual! phasers --\nSPOCK: Too late --\n\n', 'answer': 'Hang on!', 'gold_tag': 'KIRK is a determined character', 'last_speaker': 'KIRK'}
Last word -> KIRK : "Hang on!"
prediction :  What i

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8160533905029297], 'recall': [0.8430032730102539], 'f1': [0.8293094038963318], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 111.69694519042969
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKIRK: What IS working around here?\nSPOCK: Not much, Admiral. We have partial main power...\nKIRK: That's it?\n\n", 'answer': 'Best we could do in two hours.', 'gold_tag': 'Spock can work under time pressure , Spock was given two hours to perform a task', 'last_speaker': 'SPOCK'}
Last word -> SPOCK : "Best we could do in two hours."
prediction :  Kirk, we get no reports...
Real answer : Best we could do in two hours.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8549749851226807], 'recall': [0.8684294819831848], 'f1': [0.8616496920585632], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0.13406400920712788 0.04739878501170794
ppl : 1180.984130859375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKIRK: Then I'm coming with you.\nSPOCK: I would cite regulation, but I know you will simply ignore it.\n\n", 'answer': "See, we're getting to know each other.", 'gold_tag': 'Shared understanding that Kirk often ignores regulations', 'last_speaker': 'KIRK'}
Last word -> KIRK : "See, we're getting to know each other."
prediction :  What has Starfleet issu

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8408013582229614], 'recall': [0.8484982252120972], 'f1': [0.8446322679519653], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 970.6597290039062
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSPOCK: What you refer to as the "Red Matter located Captain Pike.\nKIRK: Is he alive?\nSPOCK: Unknown.\n\n', 'answer': "Let's move.", 'gold_tag': 'KIRK takes on a leadership role , KIRK is in a command position', 'last_speaker': 'KIRK'}
Last word -> KIRK : "Let's move."
prediction :  If he's dead, we have to find out -- and in time for a funeral at home station.
Real answer : Let's move.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8230962753295898], 'recall': [0.8664891123771667], 'f1': [0.8442354798316956], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09523809523809525, 'rouge2': 0.0, 'rougeL': 0.09523809523809525, 'rougeLsum': 0.09523809523809525}
bleu 1/2 : 0 0
ppl : 44.689109802246094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROHEIM: . that look on her face, you know the and it's not even me, my body. Some guy, this good-looking guy... he looks like the Marlboro Man, only shirtless. And so... the Marlboro Man's running his banging her... and her knees... by her ears, you know? And that look on her face. That look.\nHORUS: Roheim.\nROHEIM: Bitch.\nHORUS: Come on.\n\n", 'answer': 'Walk with me.', 'gold_tag': 'E

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8709887266159058], 'recall': [0.8755727410316467], 'f1': [0.8732747435569763], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 46.1622314453125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROHEIM: They're going to can me.\nHORUS: No one's canning anyone. No one saw us... watch your step... leave.\n\n", 'answer': 'Should be an extra key over the door.', 'gold_tag': 'ROHEIM expresses awareness of the environment by mentioning the location of the key', 'last_speaker': 'ROHEIM'}
Last word -> ROHEIM : "Should be an extra key over the door."
prediction :  I'll show the council you'll back down to us. I told you, if I have to fa

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8008412718772888], 'recall': [0.8472621440887451], 'f1': [0.8233978748321533], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.04444444444444444, 'rouge2': 0.0, 'rougeL': 0.02222222222222222, 'rougeLsum': 0.02222222222222222}
bleu 1/2 : 0.02702702702702703 0.006084679506313998
ppl : 14.108485221862793
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHORUS: You're going to be okay?\nROHEIM: I don't know. You might have solo act tomorrow night.\nHORUS: Get some sleep.\nROHEIM: You're a good man, Horus.\nHORUS: I have to get home, now. I'll call.\n\n", 'answer': 'You know what Olga said before she walked out on me? Her last words to me. She turned at the front door. Looked around the room and she sa

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8544947504997253], 'recall': [0.8107688426971436], 'f1': [0.8320577144622803], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14285714285714288, 'rouge2': 0.0, 'rougeL': 0.09523809523809523, 'rougeLsum': 0.09523809523809523}
bleu 1/2 : 0.016672780383657982 0.0027173307494639666
ppl : 48.81730651855469
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHORUS: Jesus, Aaron. I don't want you to be here.\n\n", 'answer': 'I need a drink.', 'gold_tag': 'ROHEIM is in need of an alcoholic beverage , This need possibly implies stress or a need to relax', 'last_speaker': 'ROHEIM'}
Last word -> ROHEIM : "I need a drink."
prediction :  Why?
Real answer : I need a drink.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8673676252365112], 'recall': [0.872180700302124], 'f1': [0.8697675466537476], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 8849.7822265625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANGELA: Wait, hold the door! Going down?\n\n', 'answer': 'Not if I can help it.', 'gold_tag': 'JOHN demonstrates a preference for avoiding the downward direction', 'last_speaker': 'JOHN'}
Last word -> JOHN : "Not if I can help it."
prediction :  Wait
Real answer : Not if I can help it.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8275472521781921], 'recall': [0.7677107453346252], 'f1': [0.7965067625045776], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 5696637.0
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANGELA: I saw you at --\nJOHN: -- I remember.\nANGELA: And --\nJOHN: Yeah.\nANGELA: I'd like to ask you a few\nJOHN: I'm not really in the talking mood right now.\nANGELA: Could you just listen then?\nJOHN: Always a catch...\nANGELA: My sister was murdered last week.\nJOHN: Sorry to hear...\nANGELA: Dodson? You don't remember her? You sure?\nJOHN: She look anything like you? I would've remembered.\nANGELA: Then last Tuesday she just decided to

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8420349955558777], 'recall': [0.8201894760131836], 'f1': [0.8309686183929443], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1458.9237060546875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: The cat too?\nANGELA: Yeah, why...?\n\n', 'answer': 'Why, do you have any?', 'gold_tag': 'Everyday Language', 'last_speaker': 'JOHN'}
Last word -> JOHN : "Why, do you have any?"
prediction :  It has to have something to do with my ex.
Real answer : Why, do you have any?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8139528632164001], 'recall': [0.8248412013053894], 'f1': [0.8193608522415161], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.26666666666666666, 'rouge2': 0.0, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0.2 0.04714045207910317
ppl : 31.663713455200195
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: So I took things into my own hands and I found a way out.\nANGELA: You attempted suicide.\n\n', 'answer': 'I never attempt anything.', 'gold_tag': 'JOHN has familiarity with life-threatening situations', 'last_speaker': 'JOHN'}
Last word -> JOHN : "I never attempt anything."
prediction :  I'm not trying to die, Angela . The doctors say I've got months. And I've 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8052210807800293], 'recall': [0.8717341423034668], 'f1': [0.8371585607528687], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.027027027027027025, 'rouge2': 0.0, 'rougeL': 0.027027027027027025, 'rougeLsum': 0.027027027027027025}
bleu 1/2 : 0.015625000000000007 0.004980119205559976
ppl : 11.726683616638184
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: cocky, peddles their influence or hijacks a soul -- I deport their scaly ass right back to Hell. I don't get them all but maybe enough to insure my retirement. Well, what would you do if you were sentenced to a prison where half the inmates were put there by you?\nANGELA: Why you? I mean many go to Hell, why were you able to escape?\nJOHN: I

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8564233183860779], 'recall': [0.799436092376709], 'f1': [0.8269490599632263], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 88.04386901855469
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANGELA: She'd tell everyone about things she said she saw. Crazy things. Things that hadn't happened yet. She'd scare my mother to death, talking on and on about Revelations and the end of the world. Then one day she just stopped... never said another word.\n\n", 'answer': "So you put her in here. Angela doesn't need to be reminded. It hurts even more now.", 'gold_tag': "JOHN is empathetic and understanding about ANGELA's situation , JO

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8484760522842407], 'recall': [0.8444048166275024], 'f1': [0.846435546875], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08333333333333333, 'rouge2': 0.0, 'rougeL': 0.08333333333333333, 'rougeLsum': 0.08333333333333333}
bleu 1/2 : 0.018143590657882503 0.006414727994630807
ppl : 49.81509780883789
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANGELA: There is no 14h act in Corinthians. John's face goes pale --\n\n", 'answer': 'I need a church.', 'gold_tag': 'John demonstrates a sense of urgency , John is oriented towards religious places', 'last_speaker': 'JOHN'}
Last word -> JOHN : "I need a church."
prediction :  Oh. I get you. That's why there's no place 14-hearse.
Real answer : I need a ch

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8475313186645508], 'recall': [0.8969327211380005], 'f1': [0.8715324997901917], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352941, 'rouge2': 0.0, 'rougeL': 0.11764705882352941, 'rougeLsum': 0.11764705882352941}
bleu 1/2 : 0.10000000000000002 0.03333333333333335
ppl : 46.30514144897461
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: 13:29. 13:30 -- Here... Corinthians 14:01... 'The sins of the father would only be exceeded by the ego of the son.' why I couldn't place it... not a normal possession... But he can't cross over -- impossible for the son to cross over...\nANGELA: Whose son? God's?\n\n", 'answer': 'No. The other one. Lucifer had a son too.', 'gold_tag': 'JOHN is kno

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7841764688491821], 'recall': [0.8711303472518921], 'f1': [0.8253695368766785], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05970149253731343, 'rouge2': 0.0, 'rougeL': 0.05970149253731343, 'rougeLsum': 0.05970149253731343}
bleu 1/2 : 0 0
ppl : 15.252408027648926
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: been in the presence of the Creator either. That goes double for us -- God's most prized creations. Mammon would be the last demon we'd ever want crossing over.\nANGELA: that's what you said.\nJOHN: Some psychic with the chops to reach all the way to Hell and the grit to withstand whatever shit she'd pull out. But even a bridge wouldn't help Mammon. Because to cross over, the myth 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.845048189163208], 'recall': [0.8378884792327881], 'f1': [0.8414531350135803], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0625, 'rouge2': 0.0, 'rougeL': 0.0625, 'rougeLsum': 0.0625}
bleu 1/2 : 0 0
ppl : 59.63328552246094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: Hey, Beeman? Beeman?!\nANGELA: Maybe he's not here.\nJOHN: He's always here.\nANGELA: What?\nJOHN: Oh Jesus, no --\nANGELA: Oh God --\nJOHN: Who are you?! Answer me!! Goddamit -- reveal yourself! I summon you! ... this is my fault... I sent them in... I sent them right to their deaths... a damn one-man plague.\nANGELA: John, you need a doctor.\n\n", 'answer': "I've seen a goddamn doctor!! Get away from me!! Stay the hell 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8206226825714111], 'recall': [0.8175579309463501], 'f1': [0.8190874457359314], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 18140.884765625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANGELA: But you knew that already, didn't you? You see something in me... I need to understand -- Why not?\nJOHN: Because you open up to that world, you're inviting whatever's out there back in. That's what happened to Isabel. No, she embraced her gift while you denied yours and that denial is exactly why you're still alive. Stick with me, that'll change. And I don't need another ghost following me around.\nANGELA: Dammit, John -- they k

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8284502029418945], 'recall': [0.8479598164558411], 'f1': [0.8380914926528931], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12121212121212122, 'rouge2': 0.0, 'rougeL': 0.10101010101010101, 'rougeLsum': 0.10101010101010101}
bleu 1/2 : 0.0588235294117647 0.00836827408755723
ppl : 17.122461318969727
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANGELA: Why water?\nJOHN: It's the universal conduit. Lubricates the transition from one plane to another. Now ask me if there's water in Hell. Normally only a portion of the body has to be suspended but you wanted the crash course...\nANGELA: What will I see? His eyes say it all. Enough. Lie down? For how long?\n\n", 'answer': 'Last chance... Take a de

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8717656135559082], 'recall': [0.8641213774681091], 'f1': [0.8679265975952148], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 69.40071868896484
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANGELA: All those people... so sad... ... I've always known... where the bad guys are... where to aim, when to duck... it's not luck... I've always known...\n\n", 'answer': 'Angela?', 'gold_tag': 'Everyday Language', 'last_speaker': 'JOHN'}
Last word -> JOHN : "Angela?"
prediction :  Oh. I hear. Now.
Real answer : Angela?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8908125162124634], 'recall': [0.8648961782455444], 'f1': [0.8776630759239197], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 223.17413330078125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: That half-breed tipped my scales when he started killing my friends. I'm just adding a little counterweight.\nANGELA: What about me? I can't stay here forever.\n\n", 'answer': "You're staying in the car.", 'gold_tag': 'JOHN is authoritative by dictating where ANGELA should stay', 'last_speaker': 'JOHN'}
Last word -> JOHN : "You're staying in the car."
prediction :  You always do what's best for people. Well, it's easy for you to

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8109625577926636], 'recall': [0.8658056259155273], 'f1': [0.8374871611595154], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0606060606060606, 'rouge2': 0.0, 'rougeL': 0.0606060606060606, 'rougeLsum': 0.0606060606060606}
bleu 1/2 : 0 0
ppl : 11.961304664611816
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nANGELA: Something's coming...\nJOHN: What happened to staying in the car?\nANGELA: You were in danger.\n\n", 'answer': "Well now there's a premonition.", 'gold_tag': "JOHN is surprised at ANGELA's intervention", 'last_speaker': 'JOHN'}
Last word -> JOHN : "Well now there's a premonition."
prediction :  Did something happen...?
Real answer : Well now there's a premonition.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8721909523010254], 'recall': [0.8635066747665405], 'f1': [0.8678271174430847], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 4381.99609375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJOHN: soldier's spear. It matches that child's drawing John removed from the fridge -- now lying on his seat. Then you know what this spear would mean to a demon trying to cross over. She doesn't. Mammon needed divine assistance -- how's the blood of God's only son? Not quite. Twins. Here we go --\nANGELA: What is that?\nJOHN: Hang on --\nANGELA: I lost it. He's down --\n\n", 'answer': "He's not down. Gimme your phone.", 'gold_tag': 'Every

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8277535438537598], 'recall': [0.8706135749816895], 'f1': [0.8486427664756775], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10256410256410256, 'rouge2': 0.0, 'rougeL': 0.05128205128205128, 'rougeLsum': 0.05128205128205128}
bleu 1/2 : 0.03125 0.010040241611281232
ppl : 33.7104377746582
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKING KARL: Hi! I'm Karl, and Welcome to my\nEMMET: For Kreativity...\nKING KARL: Exaktly!\nEMMET: Why did you build all this stuff?\n\n", 'answer': 'Well...Why not?', 'gold_tag': 'KING KARL exhibits a whimsical and free-spirited nature with his response of "Why not?" to Emmet\'s question', 'last_speaker': 'KING KARL'}
Last word -> KING KARL : "Well...Why not?"
pred

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8274590969085693], 'recall': [0.8198221921920776], 'f1': [0.8236229419708252], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 97.93363189697266
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKING KARL: If you can conceive it, we will achieve it. Build anything whenever you feel it.\nEMMET: You know we really don't have time--\n\n", 'answer': 'Invent a game, like Roller Cricket. Or buy Bullfighting Tennis tickets...', 'gold_tag': 'KING KARL encourages building and inventing games', 'last_speaker': 'KING KARL'}
Last word -> KING KARL : "Invent a game, like Roller Cricket. Or buy Bullfighting Tennis tickets..."
prediction :  

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8332105278968811], 'recall': [0.8357594013214111], 'f1': [0.8344830274581909], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05405405405405406, 'rouge2': 0.0, 'rougeL': 0.05405405405405406, 'rougeLsum': 0.05405405405405406}
bleu 1/2 : 0.04000000000000001 0.012909944487358061
ppl : 39.84416961669922
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKING KARL: Make any idea thought or spoken. Rules were meant to be broken!\n\n', 'answer': 'No, rules were meant to be followed.', 'gold_tag': 'EMMET believes in following the rules', 'last_speaker': 'EMMET'}
Last word -> EMMET : "No, rules were meant to be followed."
prediction :  I made a mistake..
Real answer : No, rules were meant to be followed.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8849244117736816], 'recall': [0.8651227951049805], 'f1': [0.8749115467071533], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 180.75682067871094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBIZZLEBEK: I've read your stories. They're fantastic.\nKAFKA: I don't know what you could have read.\nBIZZLEBEK: Just what you've published.\nKAFKA: -- In magazines nobody reads.\nBIZZLEBEK: I read the one about the penal colony.\nKAFKA: Did you?\n\n", 'answer': "The needles inscribing the judgement into the flesh of the man. Very good. If I could sculpt as well as that, I'd be quite proud of myself.", 'gold_tag': 'BIZZLEBEK aspires t

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.891973078250885], 'recall': [0.8347669839859009], 'f1': [0.8624224066734314], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06451612903225806, 'rouge2': 0.0, 'rougeL': 0.06451612903225806, 'rougeLsum': 0.06451612903225806}
bleu 1/2 : 0.00011182087596750395 4.33080390380468e-05
ppl : 4585.5703125
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBIZZLEBEK: Where are your friends?\n\n', 'answer': 'Good question. Who are my friends ... would also be of interest.', 'gold_tag': 'KAFKA questions the nature of his friends , KAFKA questions the identity of his friends', 'last_speaker': 'KAFKA'}
Last word -> KAFKA : "Good question. Who are my friends ... would also be of interest."
prediction :  They are

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8644999861717224], 'recall': [0.8506720662117004], 'f1': [0.8575302958488464], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11764705882352942, 'rouge2': 0.0, 'rougeL': 0.11764705882352942, 'rougeLsum': 0.11764705882352942}
bleu 1/2 : 0.0613132401952404 0.021239529438966134
ppl : 326.02496337890625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKAFKA: Gabriela was right -- it's easier for me to understand suicide. I'm a practicing suicide.\nBIZZLEBEK: -- In what sense?\nKAFKA: Bachelorhood is just the slow form. The bachelor doesn't sew seeds. Only the moment matters. The space he occupies grows smaller and smaller -- until the only space right for him is his coffin.\nBIZZLEBEK: These strange

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8398914337158203], 'recall': [0.8058245778083801], 'f1': [0.8225054144859314], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05714285714285715, 'rouge2': 0.0, 'rougeL': 0.05714285714285715, 'rougeLsum': 0.05714285714285715}
bleu 1/2 : 0 0
ppl : 24607.076171875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKAFKA: You work in the cemetery.\nBIZZLEBEK: A man must eat -- And drink.\n\n', 'answer': 'The Castle cemetery.', 'gold_tag': "KAFKA's persona - KAFKA knows about the specific site, KAFKA's persona - KAFKA is detail-oriented", 'last_speaker': 'KAFKA'}
Last word -> KAFKA : "The Castle cemetery."




prediction : 
Real answer : The Castle cemetery.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.0], 'recall': [0.0], 'f1': [0.0], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : nan
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBIZZLEBEK: Are you sure you wouldn't prefer going through official channels?\nKAFKA: Official channels -- a lot of good they've done me. My only hope is to approach the officials personally.\nBIZZLEBEK: -- I'm flattered, of course, to be considered a friend -- even without knowing all the details. (Kafka doesn't take -- To see such determination is reward enough for me.\nKAFKA: The Kafka men are famous for it, you know. Delivering meat barefoot in the depths of winter, picking up sacks of flou

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8083317279815674], 'recall': [0.8267645239830017], 'f1': [0.8174441456794739], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05714285714285715, 'rouge2': 0.0, 'rougeL': 0.05714285714285715, 'rougeLsum': 0.05714285714285715}
bleu 1/2 : 0.04000000000000001 0.012909944487358061
ppl : 18.513402938842773
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBIZZLEBEK: I was restoring some stones here in the upper section one day -- and I found this.\nKAFKA: An empty tomb?\nBIZZLEBEK: A cenotaph -- a monument to someone whose remains ended up elsewhere. But look -- The Castle gates were blockaded in the time of the great plague -- it's an escape route. Don't get stuck.\nKAFKA: Not a chance. I'm the thinne

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8649399280548096], 'recall': [0.8710637092590332], 'f1': [0.867991030216217], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 205.3004150390625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDAVID: No way... I think. we "ve met. You need a better line than that or I\'ll have to assume you\'re a pervert who likes to stare at Your skirt was so short I couldn\'t help it.\nWOMAN: Dress.\nDAVID: Skirt, dress, same thing.\nWOMAN: If you\'re a pervert, yeah. All\nDAVID: So do shorts, dear.\nWOMAN: Yeah, but there\'s that pesky piece of fabric right here that interferes with your fantasy life. Wake me when you have a good come back

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.848432183265686], 'recall': [0.8279469013214111], 'f1': [0.8380643725395203], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.17647058823529413, 'rouge2': 0.0, 'rougeL': 0.058823529411764705, 'rougeLsum': 0.058823529411764705}
bleu 1/2 : 0.15789473684210523 0.029617443887954613
ppl : 29.292903900146484
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWOMAN: Your phone sucks. Don't guys like you get special phones that actually work in the middle of the biggest city in the country?\n\n", 'answer': 'Depends what the telecom lobby thinks of your voting record.', 'gold_tag': 'DAVID is likely a political figure or lawmaker , DAVID made a comment about a voting record impacting the quality of telecommu

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8691840171813965], 'recall': [0.8455837965011597], 'f1': [0.8572215437889099], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 46.33884811401367
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDAVID: I'll pay for the damage.\nWOMAN: Hope your new job pays better than your old one.\nDAVID: Give me your number and I'll send you a check.\nWOMAN: God, that is smooth. Spill coffee on me then ask for my number. Just let me spill some on you and we'll be even.\nDAVID: It's my first day at the office!\nWOMAN: Tell your boss some crazy chick dumped coffee on you on the MI. It's New York; he'll understand.\nDAVID: No!\nWOMAN: Come on.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8436745405197144], 'recall': [0.8168584108352661], 'f1': [0.830049991607666], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.25, 'rouge2': 0.0, 'rougeL': 0.1875, 'rougeLsum': 0.1875}
bleu 1/2 : 0.05629612105077489 0.01287111147310488
ppl : 5.87537956237793
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: And what if I don't want to give up on her?\nROB: You don't call.\nMIKE: But you said I shouldn't call if I wanted to give up on her.\nROB: Right.\nMIKE: So I don't call either way.\nROB: Right.\nMIKE: So what's the difference?\nROB: The only difference between giving up and not giving up is if you take her back when she wants to come back. See, you can't do anything to make her want to co

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8462551832199097], 'recall': [0.89339280128479], 'f1': [0.8691853880882263], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 109.71504211425781
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROB: I don't think I'm gonna take it.\nMIKE: I's a gig.\nROB: I mean, I need the money.\nMIKE: You're an actor. Find the Zen in the role.\nROB: It's definitely a step back for me.\nMIKE: Look, there's not much of a call for Shakespeare in this town.\nROB: There's just something about being\nMIKE: What do you want? You're tall.\nROB: Do you realize how hard it's going to be to tell my parents? I still haven't told them I didn't get the p

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8822161555290222], 'recall': [0.8238328099250793], 'f1': [0.8520255088806152], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06451612903225806, 'rouge2': 0.0, 'rougeL': 0.06451612903225806, 'rougeLsum': 0.06451612903225806}
bleu 1/2 : 0 0
ppl : 80.91272735595703
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMIKE: Haven't you noticed I didn't mention Michelle once today?\nROB: I didn't want to say anything.\nMIKE: Why?\nROB: I don't know. It's like not talking to\nMIKE: What? Like, you didn't want to jinx it?\nROB: Kinda.\nMIKE: I don't talk about her that much.\nROB: Oh no?\nMIKE: I didn't mention her once today.\nROB: Well, until now. Tend the pin.\nMIKE: The only reason I mentioned her at 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8269957304000854], 'recall': [0.8335993885993958], 'f1': [0.8302844166755676], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12500000000000003, 'rouge2': 0.0, 'rougeL': 0.12500000000000003, 'rougeLsum': 0.12500000000000003}
bleu 1/2 : 0 0
ppl : 46.57502365112305
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nROB: So, if the party starts at eight, why are\nMIKE: To get a drink before we meet the guys for a bite at eleven.\nROB: Oh. Where is this place?\nMIKE: It's one of these. For some reason, cool bars in L.A. have to be very hard to find and have no signs out front.\nROB: That doesn't sound too good for business.\nMIKE: It's kinda like a speakeasy kind of thing. It's kinda cool. It's like y

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.839464008808136], 'recall': [0.8565537929534912], 'f1': [0.8479228019714355], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.30769230769230765, 'rouge2': 0.0, 'rougeL': 0.15384615384615383, 'rougeLsum': 0.15384615384615383}
bleu 1/2 : 0 0
ppl : 30.613462448120117
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMELVIN: That's some face they left hanging on you. You look like...\nSIMON: Could you take it just a little easy, Mr. Udall? Thank you. Verdell... sweetheart? By the way, thanks for saving me.\nMELVIN: I called. I never touched you. I didn't leave my name or nothing.\n\n", 'answer': 'Verdell? Hi, sweetheart.', 'gold_tag': 'SIMON shows affection for Verdell', 'last_speaker': 'SIMON'}
Last 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7935916185379028], 'recall': [0.8271432518959045], 'f1': [0.8100201487541199], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 569.8502197265625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSIMON: Hello?... yes... sure... finally, huh? Why, "finally"? Because I called you so many times. Maybe Oh, boy... I was hoping it was something like that. You didn\'t get one of them, huh? \'Cause I mean it wasn\'t only your office -- it was your home, hotel and the cigar club you like in San Francisco. No -- Sarcastic... Of course. I believe you. No, don\'t fire anyone... Please. Maybe I\'m wrong about the 20 times. Take a breath... 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8359994888305664], 'recall': [0.8667165637016296], 'f1': [0.8510809540748596], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 153.2182159423828
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMELVIN: I took a chance you were up. I brought you Chinese soup.\nSIMON: Thanks.\nMELVIN: I have never been so tired in my life. Okay, if I sit here?\nSIMON: Got any easier questions?\nMELVIN: I haven't been sleeping. I haven't been clear or felt like myself. I'm in trouble. Some son of a bitch is burning my bridges behind my back... But the tiredness -- boy... Not just sleepy.\nSIMON: But sick -- nauseous -- where everything looks dis

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8170756101608276], 'recall': [0.8305859565734863], 'f1': [0.8237753510475159], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12500000000000003, 'rouge2': 0.0, 'rougeL': 0.12500000000000003, 'rougeLsum': 0.12500000000000003}
bleu 1/2 : 0 0
ppl : 142.15109252929688
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMELVIN: Can I ask you a personal question? Do you ever get an erection for a woman?\nSIMON: Melvin...\nMELVIN: Wouldn't your lie be a lot easier if you were not...\nSIMON: You consider your life easy.\n\n", 'answer': 'I give you that one... Nice packing.', 'gold_tag': 'Everyday Language', 'last_speaker': 'MELVIN'}
Last word -> MELVIN : "I give you that one... Nice packing."
prediction : 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8363322019577026], 'recall': [0.8437432050704956], 'f1': [0.8400213718414307], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.17391304347826086, 'rouge2': 0.0, 'rougeL': 0.17391304347826086, 'rougeLsum': 0.17391304347826086}
bleu 1/2 : 0.12500000000000003 0.028867513459481287
ppl : 18.96800422668457
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSIMON: Thank you, Melvin. You overwhelm me.\nMELVIN: They did a nice job... Cozy, huh?\nSIMON: I love you.\n\n', 'answer': "I'll tell you, buddy, I'd be the luckiest guy alive if that did it for me.", 'gold_tag': 'Everyday Language', 'last_speaker': 'MELVIN'}
Last word -> MELVIN : "I'll tell you, buddy, I'd be the luckiest guy alive if that did it for 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8393412828445435], 'recall': [0.8471945524215698], 'f1': [0.8432496786117554], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2, 'rouge2': 0.0, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0.05000000000000001 0.011322770341445961
ppl : 20.17217445373535
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSIMON: It's Carol.\nMELVIN: Here. Take the dog.\n\n", 'answer': 'Good luck.', 'gold_tag': 'Everyday Language', 'last_speaker': 'SIMON'}
Last word -> SIMON : "Good luck."
prediction :  Yes. I will. I will. Thank you.
Real answer : Good luck.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8574379682540894], 'recall': [0.8906170129776001], 'f1': [0.8737126588821411], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 28.50046730041504
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMELVIN: You going to come talk to me or not?\nSIMON: I\'m coming. What did she say?\nMELVIN: I\'m a great guy -- "extraordinary"... ... and she doesn\'t want contact with me. I\'m dying here.\nSIMON: Because... ... you love her?\nMELVIN: No... and you\'re supposed to be sensitive and sharp.\nSIMON: Okay... you tell me why --\nMELVIN: I don\'t know... Let me sleep on it and figure it out. Because I\'m stuck! Can\'t go back to what I had

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9414211511611938], 'recall': [0.8615247011184692], 'f1': [0.8997026085853577], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 17029.697265625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLARDNER: You always say that. You always say that. I'm telling you, you never met the man.\nMCCULLERS: Seventeen months ago I was posting a walk in Lisbon, and he was there. He never saw me. But I saw him, though.\nLARDNER: Lisbon?\nMCCULLERS: In Portugal, yes.\nLARDNER: Here's the news: He hasn't been in Portugal since '90. I know that from the file. Why don't you read the file, man?\nMCCULLERS: In fact, I think I talked with him, in Bo

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8600798845291138], 'recall': [0.8447282314300537], 'f1': [0.8523349761962891], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1111111111111111, 'rouge2': 0.0, 'rougeL': 0.1111111111111111, 'rougeLsum': 0.1111111111111111}
bleu 1/2 : 0.0016459494098040061 0.0005819309945812043
ppl : 32.74341583251953
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLARDNER: Well?\nMCCULLERS: I don't think so.\nLARDNER: Well, remember when Frysal's men paid off the Deejay in Cairo to announce a bogus press conference in the --\nMCCULLERS: --Nooo--\nLARDNER: --Yes. And the Munich Olympics in '72. A local radio station started broadcasting news of the massacre two minutes before it happened.\nMCCULLERS: That's stric

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8346803188323975], 'recall': [0.9281543493270874], 'f1': [0.8789390921592712], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 41.731170654296875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMCCULLERS: I wish he'd do his job already so we could do our job.\nLARDNER: We can't do our job unless he does his job.\nMCCULLERS: Why don't we just do his job then, so we can do our job, and get the fuck out of here.\nLARDNER: Do his job? I'm not a cold-blooded killer.\nMCCULLERS: Wait a minute--\nLARDNER: -Look. You want to kill a Good Guy, but not be a Bad Guy, you wait until a Bad Guy kills the Good Guy, and then you come in and 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8091182112693787], 'recall': [0.8672274351119995], 'f1': [0.8371656537055969], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 12.257776260375977
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMCCULLERS: Looks like someone keeps trying to do our job for us.\nLARDNER: If he does our job, he's our job.\n\n", 'answer': 'I get it.', 'gold_tag': 'MCCULLERS shares LARDNER\'s professional space , The use of terms like "our job" signals that they share some professional responsibility', 'last_speaker': 'MCCULLERS'}
Last word -> MCCULLERS : "I get it."
prediction :  What's that?
Real answer : I get it.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8279412984848022], 'recall': [0.8657479286193848], 'f1': [0.8464226722717285], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 64.52029418945312
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLARDNER: Did you see Blank in there?\nMCCULLERS: No...\n\n', 'answer': 'Good. For a second there I thought we were in trouble.', 'gold_tag': "LARDNER's perceived threat from Blank's presence will become irrelevant once they ascertain Blank's absence , MCCULLERS's initial ignorance about Blank's presence becomes irrelevant after LARDNER's clarification", 'last_speaker': 'LARDNER'}
Last word -> LARDNER : "Good. For a second there I thoug

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8882455229759216], 'recall': [0.8839696645736694], 'f1': [0.8861024975776672], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13333333333333333, 'rouge2': 0.0, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0 0
ppl : 80.36962127685547
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDAN: Dan Soulsby.\nMARTIN: Martin. Nice to meet you.\nDAN: Where you heading to?\nMARTIN: Away from here. Pisswater.\n\n', 'answer': "At least it's cold pisswater. Here's to getting away. That was bad. It's gonna take all day to get another bus here.", 'gold_tag': 'DAN shows a sense of humor even in less ideal situations , DAN implies that the wait for the next bus will be all day', 'last

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8416033983230591], 'recall': [0.822170615196228], 'f1': [0.8317735195159912], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 350.694091796875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMARTIN: You have a license?\nDAN: Had one. They took it away.\nMARTIN: I got one. Let's go.\n\n", 'answer': "See that? They're all fake.", 'gold_tag': 'Everyday Language', 'last_speaker': 'DAN'}
Last word -> DAN : "See that? They're all fake."
prediction :  They are not the only ones in the world. It is my right to choose. I get to decide if I want to share my freedom. It is a choice that you made, and now it is up to me.
Real answer : S

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8308693170547485], 'recall': [0.8315646648406982], 'f1': [0.8312168121337891], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08695652173913045, 'rouge2': 0.0, 'rougeL': 0.04347826086956522, 'rougeLsum': 0.04347826086956522}
bleu 1/2 : 0 0
ppl : 12.340144157409668
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMARTIN: You take a hockey puck in the face?\nDAN: A fist. My stepdad socked me. I flew across the room and dented the sheetrock in the kitchen. My mom saw everything and still took his side.\nMARTIN: You got cracked good.\nDAN: I hate that guy. Two hundred pounds of worthless grease.\nMARTIN: Why'd he hit you?\nDAN: I wrecked his sixty-five Mustang and his boat in the same week. His babi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8511891961097717], 'recall': [0.8414509296417236], 'f1': [0.8462921380996704], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12244897959183673, 'rouge2': 0.0425531914893617, 'rougeL': 0.12244897959183673, 'rougeLsum': 0.12244897959183673}
bleu 1/2 : 0.07692307692307693 0.017541160386140592
ppl : 13.531054496765137
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nFERGUS: You all right?\nJUDE: Fucking animal.\nFERGUS: You don't know that.\nJUDE: Fucking do. I had him all over me.\nFERGUS: Tough work, that.\nJUDE: Someone's got to do it. Nah, it was a breeze. Just thought of you. And you know what, Fergus? One of you made me want it...\n\n", 'answer': 'Which one?', 'gold_tag': 'Everyday Language',

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8436688184738159], 'recall': [0.8780220746994019], 'f1': [0.8605027198791504], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 59.657745361328125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJUDE: Hello, stranger. You vanished. What was it, Fergus? Did you blow the gaff on us or did you just fuck up?\nFERGUS: Leave me alone, Jude.\nJUDE: No. That's the last thing I'll do. You never asked what happened.\nFERGUS: I heard.\nJUDE: Eddie and Tinker died.\nFERGUS: I know.\nJUDE: Maguire and me got out by the skin of our teeth. No thanks to you.... What you think of the hair?\nFERGUS: Suits you.\nJUDE: Aye, I was sick of being b

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8421579599380493], 'recall': [0.8570654392242432], 'f1': [0.8495463132858276], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14285714285714282, 'rouge2': 0.0, 'rougeL': 0.14285714285714282, 'rougeLsum': 0.14285714285714282}
bleu 1/2 : 0.03790816623203959 0.01238075524708008
ppl : 17.015260696411133
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJUDE: You a handyman, Fergie?\nFERGUS: I take pride in my work.\n\n', 'answer': 'I sincerely hope so. Tools of the trade. And forget about the girl.', 'gold_tag': "JUDE is a discerning individual who values professionalism , JUDE could have some connection to FERGUS's work", 'last_speaker': 'JUDE'}
Last word -> JUDE : "I sincerely hope so. Tools of the

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8639674186706543], 'recall': [0.8526140451431274], 'f1': [0.8582531809806824], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 100.58392333984375
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHELEN: Oh -- hello--\nKLAATU: May I see you for a minute?\nHELEN: I -- I was Just going to lunch.\nKLAATU: May I walk out with you?\n\n', 'answer': 'Mrs. Benson speaking. Oh, hello...', 'gold_tag': 'HELEN uses formal language, indicating a professional or educated background', 'last_speaker': 'HELEN'}
Last word -> HELEN : "Mrs. Benson speaking. Oh, hello..."
prediction :  That's fine.
Real answer : Mrs. Benson speaking. Oh, hello...


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8547307252883911], 'recall': [0.8597303628921509], 'f1': [0.8572232723236084], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 70.56001281738281
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKLAATU: I saw Bobby this morning before he went to school--\nHELEN: Yes--?\nKLAATU: I want to know what he told you last night.\nHELEN: I -- I didn't really pay much attention--\n\n", 'answer': 'Did you believe what he told you? I have a reason for asking this -- a very important reason.', 'gold_tag': "KLAATU shows concern for Bobby's actions and the implications of his conversations", 'last_speaker': 'KLAATU'}
Last word -> KLAATU : "D

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.845435619354248], 'recall': [0.8487799167633057], 'f1': [0.8471044898033142], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0625, 'rouge2': 0.0, 'rougeL': 0.0625, 'rougeLsum': 0.0625}
bleu 1/2 : 0.05106188922430991 0.016713926606989293
ppl : 141.2675323486328
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHELEN: What is it you want?\nKLAATU: Before I ask you to be honest with me, perhaps I should be completely honest with you--\nHELEN: What happened?\nKLAATU: What time is it?\nHELEN: Just twelve.\nKLAATU: We'll be stuck here for a little while -- about thirty minutes.\nHELEN: We could try pushing the other buttons. I have a flashlight in my purse.\nKLAATU: It won't work.\nHELEN: Why not?\nKLA

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8872337341308594], 'recall': [0.8365087509155273], 'f1': [0.8611249327659607], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.125, 'rouge2': 0.0, 'rougeL': 0.125, 'rougeLsum': 0.125}
bleu 1/2 : 0.12500000000000003 0.04225771273642583
ppl : 105.36029052734375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKLAATU: --I've already told you more than I told Professor Barnhardt, because my life, in a sense, is in your hands. I thought if you knew the facts you'd appreciate the importance of my not being -- apprehended, -- before the meeting tonight.\nHELEN: Yes, of course. Of course I do. You hold great hope for this meeting.\nKLAATU: I can see no other hope for your planet. If the meeting should fa

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7627289295196533], 'recall': [0.8049430251121521], 'f1': [0.783267617225647], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 9243.74609375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHELEN: Where are you going now?\nKLAATU: Back to the boardinghouse. I'll be safe there for the afternoon -- and I can keep an eye on Bobby. He's the only other person who knows anything about--\nHELEN: No, wait a minute -- there's someone else.\nKLAATU: Who?\n\n", 'answer': "Tom... He was there last night when Bobby told me what he saw. Of course, he doesn't know anything definite. He'd talk to me first, anyway, before-- We can't take that 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8509913682937622], 'recall': [0.821648120880127], 'f1': [0.8360623717308044], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0392156862745098, 'rouge2': 0.0, 'rougeL': 0.0392156862745098, 'rougeLsum': 0.0392156862745098}
bleu 1/2 : 0 0
ppl : 78.79364013671875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKLAATU: I'm sure Barnhardt can arrange to hide me until the meeting.\nHELEN: Where is the meeting going to be?\n\n", 'answer': 'At the ship.', 'gold_tag': 'KLAATU is soon going to attend a meeting', 'last_speaker': 'KLAATU'}
Last word -> KLAATU : "At the ship."
prediction :  A warehouse.
Real answer : At the ship.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9336491823196411], 'recall': [0.926976203918457], 'f1': [0.9303007125854492], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 6604.80224609375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKLAATU: Hello.\nHELEN: I -- I thought you were--\nKLAATU: I was.\nHELEN: You mean he has the power of life and death?\nKLAATU: No -- that is a power reserved to This technique, in certain cases, can re-stimulate life for a limited period. It's a refinement of scientific principles known to your own people.\nHELEN: But how -- how long--?\n\n", 'answer': 'How long will I live? That no one can say.', 'gold_tag': 'KLAATU has had his life re-

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8501441478729248], 'recall': [0.8552180528640747], 'f1': [0.8526735901832581], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.14814814814814817, 'rouge2': 0.0, 'rougeL': 0.07407407407407408, 'rougeLsum': 0.07407407407407408}
bleu 1/2 : 0.0588235294117647 0.019174124721184262
ppl : 78.98514556884766
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCLERK: Twelve gauge. You need shells? Moss\nMOSS: Uh-huh. Double ought.\nCLERK: They'll give you a wallop.\nMOSS: You have camping supplies?\nCLERK: Tent poles.\nMOSS: Uh-huh.\nCLERK: You already have the tent?\nMOSS: Somethin' like that.\nCLERK: Well you give me the model number of the tent I can order you the poles.\nMOSS: Never mind. I want a tent.\n

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8041415214538574], 'recall': [0.8365483283996582], 'f1': [0.8200249075889587], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 226.30734252929688
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMOSS: One room, one night.\nCLERK: That's twenty-six dollars.\nMOSS: You on all night?\nCLERK: Yessir, be here til ten tomorrow morning.\nMOSS: For you. I ain't asking you to do anything illegal.\nCLERK: I'm waitin' to hear your description of that.\n\n", 'answer': "There's somebody lookin' for me. Not police. Just call me if anyone else checks in tonight.", 'gold_tag': 'MOSS is not being sought after by the police , MOSS is being sou

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8370339870452881], 'recall': [0.8325639367103577], 'f1': [0.8347930312156677], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 143.82781982421875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCLERK: How those Larries holdin' up?\nMOSS: Good. I need everything else.\nCLERK: Okay.\nMOSS: You get a lot of people come in here with no clothes on?\n\n", 'answer': "No sir, it's unusual.", 'gold_tag': 'CLERK works in a place where they sell or lend clothes', 'last_speaker': 'CLERK'}
Last word -> CLERK : "No sir, it's unusual."
prediction :  Yeah
Real answer : No sir, it's unusual.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8223121166229248], 'recall': [0.8076931238174438], 'f1': [0.8149371147155762], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 7528537.5
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMICHAEL: There's a lot I can't tell you, Tom. I know that's upset you in the past; and you've felt that it was because of some lack of trust or confidence. But it is because I do trust you that I've kept so much secret from you. It's precisely that at this moment, you are the only one that I can completely trust. In time, you'll understand everything.\nHAGEN: But your people... Neri... Rocco; you don't think...\nMICHAEL: No, I have confidence 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8409590721130371], 'recall': [0.8719221949577332], 'f1': [0.8561607003211975], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2564102564102564, 'rouge2': 0.0, 'rougeL': 0.20512820512820512, 'rougeLsum': 0.20512820512820512}
bleu 1/2 : 0.07999999999999999 0.01825741858350553
ppl : 39.58269119262695
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMICHAEL: Do you think they have somebody to back up Cicci?\n\n', 'answer': "No. But if they do have somebody, you'll do three years for perjury if you give them so much as a wrong middle name. Michael, take the Fifth all the way, that way you can't get into trouble.", 'gold_tag': 'HAGEN is a cautious and knowledgeable character , HAGEN is able to provide

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8348674774169922], 'recall': [0.8200139403343201], 'f1': [0.8273740410804749], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.15094339622641512, 'rouge2': 0.0, 'rougeL': 0.07547169811320756, 'rougeLsum': 0.07547169811320756}
bleu 1/2 : 0 0
ppl : 26.35854148864746
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMICHAEL: Al, get me a wet towel. Does Kay know I'm back? Did the boy get something from me for Christmas?\nHAGEN: I took care of it.\nMICHAEL: What was it, so I'll know.\nHAGEN: A little care he can ride in with an electric motor.\nMICHAEL: Fellas, can you wait outside a minute? Where's my brother?\nHAGEN: Roth got out on a private boat. He's in a hospital in Miami. Had a stroke but he's 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8936322331428528], 'recall': [0.9351711869239807], 'f1': [0.9139299988746643], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.5454545454545454, 'rouge2': 0.4444444444444444, 'rougeL': 0.5454545454545454, 'rougeLsum': 0.5454545454545454}
bleu 1/2 : 0.16666666666666669 0.05773502691896258
ppl : 19.21445083618164
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMICHAEL: Christ, Tom; I needed more time with him. I really needed him.\nHAGEN: Did he give you his politicians?\nMICHAEL: Not all...I needed another four months and I would have had them all. I guess you've figured it all out?\nHAGEN: How will they come at you?\nMICHAEL: I know now. I'll make them call me Don.\nHAGEN: Have you agreed on a m

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8229873180389404], 'recall': [0.8336078524589539], 'f1': [0.828263521194458], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.24, 'rouge2': 0.041666666666666664, 'rougeL': 0.12, 'rougeLsum': 0.12}
bleu 1/2 : 0.10923565662166056 0.020490112560187432
ppl : 37.63534927368164
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEAN: They have everything we ordered?\nPJ: Glocks, Uzis, Tec-9s...\nSEAN: What the hell is that?\nPJ: Oh, my hair? Shondra just did it. What? You hate it?\nSEAN: No. I just think it's a choice my character would have made.\nPJ: Don't go there, Sean. You're just jealous you didn't think of it first.\n\n", 'answer': 'Whatever.', 'gold_tag': 'Everyday Language', 'last_speaker': 'SEA

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.820443868637085], 'recall': [0.9302842617034912], 'f1': [0.8719183802604675], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 180.97381591796875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEAN: Okay, got the gangsta wear. What's so funny?\nPJ: That wig, man! I didn't know this was Halloween. Bring me back some candy!\n\n", 'answer': 'Whatever.', 'gold_tag': 'SEAN possibly indicates a more serious personality', 'last_speaker': 'SEAN'}
Last word -> SEAN : "Whatever."
prediction :  I was not expecting <i>that</i>.
Real answer : Whatever.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.762710452079773], 'recall': [0.9381797909736633], 'f1': [0.8413941264152527], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 44.1241340637207
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nPJ: Nope, you're not convincing me. Remember your core character. You're an oppressed black man from\nSEAN: Yeah, I know, I'm having trouble finding this one. Lemme try it again. Gimme your ride, punk, or I will dust yo ass!\n\n", 'answer': "Add a 'beeyotch' and I think you're there.", 'gold_tag': 'PJ plays the role of a coach or guide for acting', 'last_speaker': 'PJ'}
Last word -> PJ : "Add a 'beeyotch' and I think you're there."
predi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8227411508560181], 'recall': [0.8207101821899414], 'f1': [0.8217244148254395], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09999999999999999, 'rouge2': 0.0, 'rougeL': 0.09999999999999999, 'rougeLsum': 0.09999999999999999}
bleu 1/2 : 0 0
ppl : 53.790077209472656
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEAN: What's next?\nPJ: How about... number three?\n\n", 'answer': "Yeah, perfect. Let's take it to another level, my man.", 'gold_tag': 'Everyday Language', 'last_speaker': 'SEAN'}
Last word -> SEAN : "Yeah, perfect. Let's take it to another level, my man."
prediction :  Okay, so... 1. What's in 3...
Real answer : Yeah, perfect. Let's take it to another level, my man.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8562705516815186], 'recall': [0.8611749410629272], 'f1': [0.8587157726287842], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1111111111111111, 'rouge2': 0.0, 'rougeL': 0.1111111111111111, 'rougeLsum': 0.1111111111111111}
bleu 1/2 : 0 0
ppl : 57.728328704833984
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSEAN: How much did you have to give this guy, anyway? What are you talking about? When you set this thing up yesterday, what did you pay?\nPJ: I thought you set it up.\nSEAN: You mean the owner doesn't know this is all fake?\nPJ: Oh damn.\nSEAN: Get in there! He could get shot!\n\n", 'answer': "I'm not going in there! You go in there!", 'gold_tag': 'PJ refuses to approach the owner of the e

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8856886625289917], 'recall': [0.8303235769271851], 'f1': [0.8571130037307739], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.15384615384615383, 'rouge2': 0.0, 'rougeL': 0.15384615384615383, 'rougeLsum': 0.15384615384615383}
bleu 1/2 : 0 0
ppl : 185.90574645996094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nINSPECTOR: Kafka -- is that your real name?\nKAFKA: Yes. Yes, of course -- why wouldn't it be?\nINSPECTOR: When was the last time you saw Mr. Raban?\nKAFKA: Wednesday. We left the office together.\nINSPECTOR: Did you go anywhere afterwards -- to have a drink perhaps?\nKAFKA: No, we said goodbye outside the building. He went off, as usual, toward his house.\nINSPECTOR: Your office is the 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8457860946655273], 'recall': [0.8709067106246948], 'f1': [0.8581625819206238], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07142857142857142, 'rouge2': 0.0, 'rougeL': 0.07142857142857142, 'rougeLsum': 0.07142857142857142}
bleu 1/2 : 0.04000000000000001 0.012909944487358061
ppl : 25.201990127563477
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nINSPECTOR: I don't usually involve myself with you people in the Old Quarter -- but the River runs its own course. It won't be the last time it deposits its unwanted debris on my doorstep.\nKAFKA: Probably not.\nINSPECTOR: Anyway, I'd like you to reflect that in me you have -- I won't say a friend, because we're complete fencers, of course -- from dis

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8411655426025391], 'recall': [0.848198652267456], 'f1': [0.8446674346923828], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.05263157894736842, 'rouge2': 0.0, 'rougeL': 0.05263157894736842, 'rougeLsum': 0.05263157894736842}
bleu 1/2 : 0 0
ppl : 26.380319595336914
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nINSPECTOR: You said she was extremely upset. People who are extremely upset -- Kafka -- are given to disappearing in a hurry. They go and calm down for a day or so and then they come back.\nKAFKA: But that's just my point -- she was more than upset, she was livid. She slammed every door on her way downstairs -- except that one. I was just behind her and I didn't even hear that one shut --

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8619331121444702], 'recall': [0.8333727121353149], 'f1': [0.8474123477935791], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06451612903225806, 'rouge2': 0.0, 'rougeL': 0.06451612903225806, 'rougeLsum': 0.06451612903225806}
bleu 1/2 : 0.03678794411714424 0.012262648039048083
ppl : 67.69734191894531
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nINSPECTOR: Why would someone want to kidnap this woman -- the name is Rossmann?\nKAFKA: You told me to contact you if anything relevant came up -- Gabriela is relevant. When I spoke to you before I didn't know she'd been seeing Eduard.\nINSPECTOR: That's been noted. But where does it lead us? Unless you have something more to add.\nKAFKA: She's missing

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.844226598739624], 'recall': [0.8274899125099182], 'f1': [0.8357744812965393], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06451612903225806, 'rouge2': 0.0, 'rougeL': 0.06451612903225806, 'rougeLsum': 0.06451612903225806}
bleu 1/2 : 0 0
ppl : 50.463897705078125
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKAFKA: Maybe it's true then what she said.\nINSPECTOR: What did she say?\nKAFKA: That the police may have allegiance to something other than truth. -- She didn't think Eduard committed suicide. Any more than I do. She was convinced of it.\n\n", 'answer': "I'm going to say something, and I hope it's quite clear because I won't be repeating it. We don't have to hunt for criminals. We're dra

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8556667566299438], 'recall': [0.8289023637771606], 'f1': [0.8420719504356384], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.04651162790697674, 'rouge2': 0.0, 'rougeL': 0.04651162790697674, 'rougeLsum': 0.04651162790697674}
bleu 1/2 : 0 0
ppl : 136.3125457763672
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGALE: You broke the lock.\nDEWEY: You broke my heart.\nGALE: What are we looking for?\nDEWEY: A bloody knife. A ghost mask. Something like that?\nGALE: Can we turn the light on?\nDEWEY: That's not the way it's done. And it's sexier in the dark.\nGALE: Dewey, you and I would never make it.\nDEWEY: Maybe not. But think about the sex. As frustrated as I am...\nGALE: ... and as lonely as I am

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7971076965332031], 'recall': [0.7893345355987549], 'f1': [0.7932020425796509], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 10.259525299072266
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGALE: Ssshh! What was that?\nDEWEY: Let's find out.\nGALE: This is all too familiar.\n\n", 'answer': 'Over here.', 'gold_tag': 'Everyday Language', 'last_speaker': 'DEWEY'}
Last word -> DEWEY : "Over here."
prediction :  It had to happen this way, I suppose.
Real answer : Over here.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8430286645889282], 'recall': [0.883810818195343], 'f1': [0.8629381656646729], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 37.391780853271484
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDEWEY: Gale! Gale!\n\n', 'answer': 'NOOOOOOO!', 'gold_tag': 'Everyday Language', 'last_speaker': 'GALE'}
Last word -> GALE : "NOOOOOOO!"
prediction :  Yes, what is it? I was about to take out the garbage.
Real answer : NOOOOOOO!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8322161436080933], 'recall': [0.7869114279747009], 'f1': [0.808929979801178], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 40.287540435791016
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDEWEY: They found another note?\nGALE: Uh-huh. Another News clipping. Another one about Sid's mom. Dewey, whether we like it or not it looks like we're going to have to go back into our past.\nDEWEY: You mean OUR past as in Me and Sid. This has nothing to do with you Gale.\nGALE: This has everything to do with me Dewey. I was the one who brought the nation's attention to this shit anyway. I'll be damned if I'm gonna sit around and pass

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8160693645477295], 'recall': [0.8770214319229126], 'f1': [0.8454482555389404], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 12.824409484863281
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDEWEY: Sheriff just radioed me. I'm gonna check out a possible lead. Thought you might like to join me.\nGALE: What kind of lead?\nDEWEY: A car was spotted in the bushes a little ways up the road.\nGALE: I'd love to. If you're sure it's alright?\nDEWEY: Ma'am, I am the Deputy of this town.\nGALE: Can I bring Kenny?\nDEWEY: NO! I mean...I should probably take just you.\n\n", 'answer': "I'll be back.", 'gold_tag': 'GALE will return afte

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8619493246078491], 'recall': [0.8780948519706726], 'f1': [0.8699471950531006], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 121.59587860107422
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nGALE: So is Dewey your real name?\nDEWEY: Dwight. Dewey was something I got stuck with a long time ago.\nGALE: I like it. It's... sexy.\nDEWEY: Nah... it's just this town's way of not taking me serious.\nGALE: What about Gale Weathers? I sound like a meteorologist... People treat me like the Antichrist of television journalism.\nDEWEY: I don't think you're so bad.\n\n", 'answer': 'Are al

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8455454111099243], 'recall': [0.8349599838256836], 'f1': [0.8402193188667297], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10526315789473685, 'rouge2': 0.0, 'rougeL': 0.10526315789473685, 'rougeLsum': 0.10526315789473685}
bleu 1/2 : 0 0
ppl : 16.767040252685547
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDEWEY: You okay?\nGALE: What's that?\nDEWEY: Looks like a car. Shit. It's Neil Prescott's car.\nGALE: Sidney's father?\n\n", 'answer': "We gotta get back. Jesus. He's here. What the fuck is he doing here?", 'gold_tag': 'DEWEY potentially works in a field that demands quick response times and analytical skills , GALE might be less experienced in high-stake, tense situations compared to De

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8580892086029053], 'recall': [0.874184250831604], 'f1': [0.8660619854927063], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.19354838709677422, 'rouge2': 0.0689655172413793, 'rougeL': 0.19354838709677422, 'rougeLsum': 0.19354838709677422}
bleu 1/2 : 0.07122777528017817 0.023443930918208206
ppl : 34.41203308105469
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDEWEY: I'll call for backup.\n\n", 'answer': "I'll get my camera. Kenny! Camera! Quick! Kenny? Dewey? Dewey? Where are you?", 'gold_tag': 'GALE shows assertive behavior , GALE is involved in the reporting or media industry , GALE intends to get a camera and gives a directive to Kenny', 'last_speaker': 'GALE'}
Last word -> GALE : "I'll get

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8036469221115112], 'recall': [0.8097525238990784], 'f1': [0.806688129901886], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1, 'rouge2': 0.0, 'rougeL': 0.1, 'rougeLsum': 0.1}
bleu 1/2 : 0 0
ppl : 28.836828231811523
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSHADOW: You could've crispered half your company with that little stunt, but more importantly you wrecked the physical evidence I use to prove it's arson. You've made my day longer, Probie. Go home and think about that.\nBRIAN: Uh, I'm Brian McCaffrey. Your new assistant.\nSHADOW: Your Dennis' kid. I work alone. Are you still here?\nBRIAN: Get used to me, Inspector. I'm not going anywhere.\nSHADOW: Then go find a corner. I don't want y

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8584979176521301], 'recall': [0.8616971969604492], 'f1': [0.8600946068763733], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 33.696407318115234
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBRIAN: "Trychtichlorate is a binary structured --"\nSHADOW: -- Go to the bottom. Under heat properties.\nBRIAN: "During heat episodes of 2000 Kelvin or higher, Trych breaks down and dissipates. Will consume magnesium".\nSHADOW: Ever burned magnesium? It\'s so hot Son of a bitch tears \'em apart just to eat the oxygen. Wouldn\'t take much at all to melt ten gauge wire. Problem\'s burnt magnesium leaves a powder trace -- unless you coul

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8254165649414062], 'recall': [0.8633016347885132], 'f1': [0.843934178352356], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 24.060277938842773
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSHADOW: Take the top off. Go ahead. Take it off.\nBRIAN: Jesus!\nSHADOW: That's it! Oh, that son of a bitch, he's different, goddamn it! You see what this tells us, huh? Our killer doesn't love fire!\nBRIAN: What?\nSHADOW: I got it after we talked to Ronald. Torches. Want to fry the whole goddamn world. But the fires that killed those guys never really burned up much. -- The burns were all lit in outlets surrounded by double firebreaks

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8364948034286499], 'recall': [0.8703463077545166], 'f1': [0.8530848622322083], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.24324324324324326, 'rouge2': 0.0, 'rougeL': 0.13513513513513514, 'rougeLsum': 0.13513513513513514}
bleu 1/2 : 0.09259259259259259 0.013217526428635742
ppl : 26.171764373779297
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBRIAN: Hey boss, Dekom Trust is owned by Pan Illinois... which is majority controlled by Lakeside Dynamics... which is a division of Windy City Ventures... who's partners are... Alan Seagrave, Donald Cosgrove, and Jeffrey Holcomb.\n\n", 'answer': 'Son of a bitch. They knew each other.', 'gold_tag': 'SHADOW is quick at coming to conclusions', 'last_spe

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8004714846611023], 'recall': [0.8157351016998291], 'f1': [0.8080312013626099], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 135.74156188964844
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBRIAN: So Seagrave and Holcomb were accountants...\nSHADOW: And Cosgrove. Coppers figured he laundered money for the mob before getting into real estate. They weren't very high on Seagrave, either.\nBRIAN: Nice bunch of guys.\n\n", 'answer': "Who all ended up wearing candles for faces... Swayzak's up to his ass in this We need to get a look at his files.", 'gold_tag': "SHADOW implies a need to immediately investigate Swayzak's files",

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8258861303329468], 'recall': [0.8202919363975525], 'f1': [0.823079526424408], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06451612903225806, 'rouge2': 0.0, 'rougeL': 0.06451612903225806, 'rougeLsum': 0.06451612903225806}
bleu 1/2 : 0.012594669908908207 0.004301898749003176
ppl : 133.40501403808594
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSHADOW: Well Brian, I guess you can say it's arson now...\nBRIAN: How ya feeling? Did you pull me out?\nSHADOW: Yeah.\nBRIAN: Did I say thanks?\nSHADOW: No.\nBRIAN: Just wondering.\nSHADOW: I hate hospitals. You're so... so goddamn useless...\nBRIAN: So what do you want me to do?\n\n", 'answer': "I've been lying here hours... just thinking... We're cl

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8762414455413818], 'recall': [0.8433027267456055], 'f1': [0.8594565987586975], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.041666666666666664, 'rouge2': 0.0, 'rougeL': 0.041666666666666664, 'rougeLsum': 0.041666666666666664}
bleu 1/2 : 0 0
ppl : 25.237382888793945
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nSHADOW: Your brother was a good man.\nBRIAN: Yeah.\nSHADOW: Another couple of good men get burned up for their city? Is that how it's going to read? You're the only one that knows.\nBRIAN: Like it never happened...\n\n", 'answer': 'Want to help me with something?', 'gold_tag': "SHADOW is in a position to seek BRIAN's help", 'last_speaker': 'SHADOW'}
Last word -> SHADOW : "Want to help

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8324520587921143], 'recall': [0.8228148221969604], 'f1': [0.8276053667068481], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 89.67752075195312
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWEBSTER: All I want is the money, Benedict. Just give me what's mine and you can go back to stealing cars and screwing your best friend's wife. Pack it in, Vince, you're only wasting time -- I don't give a damn about you, just the money -- He's reloaded during this -- moving forward as we... Vince, if you don't cut this horseshit I may have to hurt you.\n\n", 'answer': 'Leave my brother alone!', 'gold_tag': 'JULIUS is protective , JULI

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8471094369888306], 'recall': [0.8590595126152039], 'f1': [0.8530426025390625], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 13.384500503540039
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWEBSTER: Just exactly who are you?\nJULIUS: Vincent's twin brother -- we're in this together.\nWEBSTER: Keep your hands that way.\n\n", 'answer': "Don't be nervous -- I hate violence and I never carry guns.", 'gold_tag': 'JULIUS dislikes violence , JULIUS never carries guns', 'last_speaker': 'JULIUS'}
Last word -> JULIUS : "Don't be nervous -- I hate violence and I never carry guns."
prediction :  I need this money for some medical re

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8476439118385315], 'recall': [0.8525525331497192], 'f1': [0.8500910997390747], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16, 'rouge2': 0.0, 'rougeL': 0.16, 'rougeLsum': 0.16}
bleu 1/2 : 0.08333333333333333 0.027524094128159027
ppl : 82.11492156982422
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nWEBSTER: 'Shakespeare's Collected Works'?... 'The Mind of Confucius'?... 'Advanced Quantum Mechanics'?... What is this shit?\nJULIUS: I needed something to read on the airplane.\n\n", 'answer': 'Yeah, well your flying days are over...', 'gold_tag': 'WEBSTER implies that JULIUS will not be flying anymore in the foreseeable future', 'last_speaker': 'WEBSTER'}
Last word -> WEBSTER : "Yeah, well your

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8201748132705688], 'recall': [0.8356374502182007], 'f1': [0.827833890914917], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 46.10697937011719
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKICHIJIRO: After I saw you and Padre Garupe for the first time...I thought...I started to believe...that God might take me back. Because in...in my dreams, the fire was no longer so bright.\nRODRIGUES: Jesus said, "Every one therefore that shall confess me before men, I will also confess him before my Father who is in heaven. But he that shall deny me before men, I will also deny him before my Father who is in heaven." Do you want me to

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8328120112419128], 'recall': [0.8573468327522278], 'f1': [0.8449013829231262], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.08163265306122448, 'rouge2': 0.0, 'rougeL': 0.08163265306122448, 'rougeLsum': 0.08163265306122448}
bleu 1/2 : 0.048780487804878044 0.011043152607484655
ppl : 19.52991485595703
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKICHIJIRO: Why did you come here, Padre? This place is dangerous. Where are you going?\nRODRIGUES: Nowhere.\nKICHIJIRO: We must be careful. There’s a price of three hundred pieces of silver for you.\nRODRIGUES: Three hundred...Judas got only thirty.\n\n', 'answer': 'There are Kirishitans in other places. Not far. We can hide there. I will take care of

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8618282675743103], 'recall': [0.8543314933776855], 'f1': [0.8580635190010071], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07142857142857142, 'rouge2': 0.0, 'rougeL': 0.07142857142857142, 'rougeLsum': 0.07142857142857142}
bleu 1/2 : 0.01933361189094467 0.006603685639156038
ppl : 44.558311462402344
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nKICHIJIRO: We won't reach the village today if we can not walk faster. Are you all right, Padre? You seem tired.\nRODRIGUES: Just...no, I'm all right. Just so thirsty. The fish was so salty.\nKICHIJIRO: I will find you some water. Keep walking.\nRODRIGUES: No, that's all right, I...I thirst.\nKICHIJIRO: Our Lord said that.\nRODRIGUES: Yes. I mean...I 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8873302340507507], 'recall': [0.8707884550094604], 'f1': [0.8789815902709961], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0975609756097561, 'rouge2': 0.0, 'rougeL': 0.0975609756097561, 'rougeLsum': 0.0975609756097561}
bleu 1/2 : 0.028298418551649412 0.009314149810376485
ppl : 52.343502044677734
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCLARICE: Good afternoon... I wonder if you could help me. I'm looking for Mrs. Lippman's family?\n\n", 'answer': "They don't live here anymore.", 'gold_tag': "Mr. Gumb has knowledge about Mrs. Lippman's family's current residence status , This suggests a familiarity with them or their former residence", 'last_speaker': 'MR. GUMB'}
Last word -> MR. GUMB 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8368517160415649], 'recall': [0.8570733070373535], 'f1': [0.8468418121337891], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 28.5687313079834
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCLARICE: Excuse me, but I really do need to talk to you. This was Mrs. Lippman's house. Did you know her?\n\n", 'answer': "Just briefly. What's the problem, Officer?", 'gold_tag': 'MR. GUMB had some interaction with Mrs. Lippman , CLARICE is a law enforcement officer, identified as "Officer" by MR. GUMB', 'last_speaker': 'MR. GUMB'}
Last word -> MR. GUMB : "Just briefly. What's the problem, Officer?"
prediction :  I don't think so. I ca

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8530697226524353], 'recall': [0.862901508808136], 'f1': [0.8579574823379517], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 12.351611137390137
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCLARICE: I'm investigating the death of Fredrica Bimmel. Who are you, please? Mr. Gordon, did you know Fredrica when she worked for Mrs. Lippman?\n\n", 'answer': "No. Wait... Was she a great, fat person? I may have seen her, I'm not sure...", 'gold_tag': 'Everyday Language', 'last_speaker': 'MR. GUMB'}
Last word -> MR. GUMB : "No. Wait... Was she a great, fat person? I may have seen her, I'm not sure..."
prediction :  Mrs. Lippman has 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8309916257858276], 'recall': [0.8356863856315613], 'f1': [0.8333324790000916], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06451612903225806, 'rouge2': 0.0, 'rougeL': 0.06451612903225806, 'rougeLsum': 0.06451612903225806}
bleu 1/2 : 0.02272727272727273 0.007270084086278819
ppl : 39.8862190246582
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMR. GUMB: Mrs. Lippman had a son, maybe he could help you. I have his card somewhere. Do you mind stepping inside, while I looks for it?\n\n', 'answer': 'Thanks.', 'gold_tag': 'Everyday Language', 'last_speaker': 'CLARICE'}
Last word -> CLARICE : "Thanks."
prediction :  I have to see her.
Real answer : Thanks.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8749743103981018], 'recall': [0.9469174146652222], 'f1': [0.9095253944396973], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 94.07807922363281
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHOLLY (v.o.): We had our bad moments, like any couple. Kit accused me of only being along for the ride,while at times I wished he'd fall in the river and drown, so I could watch.\nKIT: Take a break. Red... Life of Riley, huh?\n\n", 'answer': 'Mostly. though. we got along fine and stayed in love.', 'gold_tag': 'HOLLY v.o. stays in love , Shared memories between HOLLY v.o. and KIT', 'last_speaker': 'HOLLY (v.o.)'}
Last word -> HOLLY (v.o

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8244374394416809], 'recall': [0.8736714124679565], 'f1': [0.8483406901359558], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.04000000000000001, 'rouge2': 0.0, 'rougeL': 0.04000000000000001, 'rougeLsum': 0.04000000000000001}
bleu 1/2 : 0.025 0.008006407690254359
ppl : 17.61054039001465
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHOLLY (v.o.): For days afterward I lived in dread. At times I wished I could fall asleep and be taken off to some magical land, but this never happened.\n\n', 'answer': 'Holly!', 'gold_tag': 'Everyday Language', 'last_speaker': 'KIT'}
Last word -> KIT : "Holly!"
prediction :  So when did you finally meet and fall in love with your wife?
Real answer : Holly!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8151286244392395], 'recall': [0.8273200392723083], 'f1': [0.8211790919303894], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 24.35802459716797
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nHOLLY (v.o.): We needed supplies, so we went to a rich man's house. Kit figured it'd be safer and quicker than shopping in the downtown... A maid came to the door.\nKIT: Hi, I come to check the meter. My tools are in here. Oh, this Is Holly. She's from Texas.\nHOLLY (v.o.): Later we found out she was deaf and we hadn't even known it.\n\n", 'answer': 'Excuse me.', 'gold_tag': 'Everyday Language', 'last_speaker': 'KIT'}
Last word -> KIT 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8094543218612671], 'recall': [0.833245575428009], 'f1': [0.8211776614189148], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.039215686274509796, 'rouge2': 0.0, 'rougeL': 0.039215686274509796, 'rougeLsum': 0.039215686274509796}
bleu 1/2 : 0 0
ppl : 28.4049072265625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: You want to try that handgun against this rifle, go ahead.\n\n', 'answer': 'Some day you might have to face me without a gun.', 'gold_tag': 'CORBETT believes he could potentially best ERIC in a face-to-face confrontation , The confrontation would be without firearms', 'last_speaker': 'CORBETT'}
Last word -> CORBETT : "Some day you might have to face me without a gun."
prediction : 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8675568103790283], 'recall': [0.8708289861679077], 'f1': [0.8691898584365845], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 42.340484619140625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: How the hell were they smart enough to find us?\nCORBETT: Smart? Sure. That's why I'm sitting in this plane and they're down there blowing me kisses. Been driving long?\nERIC: I needed a pilot's license to take the job here, so I got one in six weeks.\nCORBETT: That makes the flight more interesting. See the blood? Pack of wolves took down a moose. Greedy, gut-ripping sons of bitches. I'd kill the last wolf on earth, right in fr

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8354159593582153], 'recall': [0.8139841556549072], 'f1': [0.8245607614517212], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16842105263157894, 'rouge2': 0.0, 'rougeL': 0.12631578947368421, 'rougeLsum': 0.12631578947368421}
bleu 1/2 : 0.03656495543138742 0.005287283819873442
ppl : 34.39166259765625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: Stay put!\nCORBETT: You got the belly to look me in the eye and pull the trigger?\nERIC: Be no different than shooting a rabid dog. Next time you want to kill yourself, don't include me.\nCORBETT: I took the odds on getting down in one piece, and I made it. Now we're in my territory.\nERIC: With light clothing and no supplies, this is nobody's te

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8964131474494934], 'recall': [0.8258377909660339], 'f1': [0.8596794009208679], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 183.27244567871094
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: It'll be interesting, trying to build a fire without any wood.\nCORBETT: Welcome to the environment, Mr. Ecology. Out here, one mistake is all you get. Why in hell you care enough about me to die taking me in?\n\n", 'answer': "I don't plan on dying.", 'gold_tag': 'ERIC is determined and confident', 'last_speaker': 'ERIC'}
Last word -> ERIC : "I don't plan on dying."
prediction :  I'm from the Environment Commission. We're buildi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.837255597114563], 'recall': [0.8596464395523071], 'f1': [0.8483032584190369], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.058823529411764705, 'rouge2': 0.0, 'rougeL': 0.058823529411764705, 'rougeLsum': 0.058823529411764705}
bleu 1/2 : 0 0
ppl : 19.956518173217773
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: We'll stop here, dig out a snow shelter.\n\n", 'answer': "Snow shelter. Okay. You dig. I'll have a little sit-down. Even in the drifts, this snow's too powdery to make a shelter. When you're done jerking around, reach down the back of my coat.", 'gold_tag': 'CORBETT is tired and possibly older or less physically fit than ERIC , CORBETT requested to sit while ERIC digs , CORBETT i

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8585262298583984], 'recall': [0.847027063369751], 'f1': [0.8527379035949707], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2769230769230769, 'rouge2': 0.031746031746031744, 'rougeL': 0.18461538461538463, 'rougeLsum': 0.18461538461538463}
bleu 1/2 : 0.16964803398269643 0.022335201533141166
ppl : 31.2751522064209
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCORBETT: Still quite a hike to Devil's Cauldron. Days. A long stretch to go without sleep, my friend. You can hide behind that pistol for now, but take your eyes off me long enough to sneeze --\n\n", 'answer': '-- Turn around.', 'gold_tag': 'ERIC commands CORBETT to turn around', 'last_speaker': 'ERIC'}
Last word -> ERIC : "-- Turn around

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8510054349899292], 'recall': [0.8706150650978088], 'f1': [0.8606985211372375], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 97.94441986083984
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: We're not sitting here 'til November. There's a cargo plane coming to Devil's Cauldron in four days, and I'm putting you on it.\nCORBETT: We get wet, we freeze to death in a couple hours.\nERIC: I've been on ice like this when I was a kid, skating. Spread your weight, keep moving. Go on.\nCORBETT: Be my guest.\nERIC: I'm right behind you.\nCORBETT: Wait 'til I'm across! Most dangerous thing in the world: A regular Joe, in over hi

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8465545177459717], 'recall': [0.8379065990447998], 'f1': [0.8422083258628845], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09090909090909091, 'rouge2': 0.0, 'rougeL': 0.09090909090909091, 'rougeLsum': 0.09090909090909091}
bleu 1/2 : 0 0
ppl : 109.49052429199219
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCORBETT: Have to backtrack, find another way down.\nERIC: Forget it. It would take days.\n\n', 'answer': 'Going to be a bit of a challenge with handcuffs on.', 'gold_tag': 'Everyday Language', 'last_speaker': 'CORBETT'}
Last word -> CORBETT : "Going to be a bit of a challenge with handcuffs on."
prediction :  You’ve got time.
Real answer : Going to be a bit of a challenge with handcuffs 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8666850328445435], 'recall': [0.855934739112854], 'f1': [0.8612763285636902], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 92.29166412353516
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: Too bad we're heading west.\nCORBETT: There's a snowmobile. Inside a day we could be on the Yukon. I got money there. Remember that five thousand? Make it ten. Be smart. Take it and walk away.\nERIC: You don't get it, do you? Stop!\nCORBETT: Nothing personal. Just wanted to see what you'd do. Have to get these wet things off.\nERIC: You're not going to slow us down! Keep moving!\n\n", 'answer': "Inside of three hours you'd be drag

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8292450904846191], 'recall': [0.8386026620864868], 'f1': [0.8338976502418518], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.09090909090909091, 'rouge2': 0.0, 'rougeL': 0.09090909090909091, 'rougeLsum': 0.09090909090909091}
bleu 1/2 : 0 0
ppl : 145.13453674316406
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: Stay awake! You want to go\nCORBETT: If that means freeze my balls off, no thanks. I'll be okay. Told you I'm fine!\nERIC: How many do you see?\nCORBETT: What?! Fuck off. Save yourself.\nERIC: You don't feel cold?\nCORBETT: It's a spring day...\nERIC: Wake up, goddammit! You've got classic hypothermia -- crankiness, fatigue, can't feel your coldness. Worst thing you can do is fade 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.848825216293335], 'recall': [0.8633012175559998], 'f1': [0.8560019731521606], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.3333333333333333, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0.12500000000000003 0.04225771273642583
ppl : 24.691381454467773
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCORBETT: Talk to that good-looking girl of yours?\nERIC: You broke the radio, remember?\nCORBETT: I'm sure she's fine. Seemed like a clever kid. You were real resourceful out there. Got me thinking of this perimeter man, froze all his fingers one winter. So he hacked the tips off and sharpened the exposed bones. Gets along better than ever. Yeah, maybe I

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8337451815605164], 'recall': [0.8284395337104797], 'f1': [0.8310838341712952], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.10526315789473685, 'rouge2': 0.0, 'rougeL': 0.10526315789473685, 'rougeLsum': 0.10526315789473685}
bleu 1/2 : 0.0909090909090909 0.03015113445777637
ppl : 60.28529739379883
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCORBETT: Could've told you no one would help... Maybe you can talk sense into your boyfriend.\nERIC: Shut up!\nCORBETT: Either way, you better clear out. I don't want anyone innocent getting harmed.\n\n", 'answer': "We're moving. Let's go.", 'gold_tag': "ERIC had an immediate response to CORBETT's comment", 'last_speaker': 'ERIC'}
Last word -> ERIC : "We

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8358867764472961], 'recall': [0.8851723670959473], 'f1': [0.8598238825798035], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 23.270755767822266
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCORBETT: They know I'm here. I don't say something, they'll plow this town under. You willing to accept that responsibility?\nERIC: Say you're okay. Tell them I'll kill you if they rush us.\n\n", 'answer': 'Bob?', 'gold_tag': 'Everyday Language', 'last_speaker': 'CORBETT'}
Last word -> CORBETT : "Bob?"
prediction :  I know what you're doing. You want to kill every man in town. It's exactly what they told me. This is a test. Will that 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8078157901763916], 'recall': [0.857059895992279], 'f1': [0.8317095041275024], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 18.800853729248047
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nERIC: How come you didn't let him shoot me?\nCORBETT: Like I said, I'd still be up on that mountain, frozen solid, it wasn't for you.\nERIC: We'll get you to a hospital, soon as we get to Fairbanks.\nCORBETT: So they can patch me up and put me in a cage? Forget it. Meyerling's right -- I'm a dinosaur. Greedy bastards like him, it's their turn with this land. Put me in the woods, let me live or die on my own. Look down there, tell me wh

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8538553714752197], 'recall': [0.8397559523582458], 'f1': [0.8467469215393066], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.06666666666666667, 'rouge2': 0.0, 'rougeL': 0.06666666666666667, 'rougeLsum': 0.06666666666666667}
bleu 1/2 : 0 0
ppl : 127.19056701660156
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMITCHELL: Accident Investigation. Detective Mitchell.\nLISA: Hi, my name-is Lisa Cohen. I was a witness in a bus accident case a few weeks ago...\nMITCHELL: Yeah, hi, Lisa, what can I do for you?\nLISA: Well, this is probably gonna sound a little weird, but are you allowed to tell me how to get in touch with that woman's family? I'm obviously probably too late to go to the funeral, but I

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7948750257492065], 'recall': [0.9158496260643005], 'f1': [0.8510849475860596], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 11.501646041870117
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMITCHELL: So now you're sayin' he ran the light.\nLISA: Yes. He wasn't even looking at the road. And I was definitely trying to get his attention...\nMITCHELL: No, I get the picture. So you're flirtin' with this guy, he's wavin' at you, he runs the light, hits the decedent, she dies on the scene...the both of youse lie to me on both of your statements, and then somewheres in there you turn around, decide the guy belongs in jail. That 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7794872522354126], 'recall': [0.8345792293548584], 'f1': [0.8060930371284485], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.2, 'rouge2': 0.0, 'rougeL': 0.2, 'rougeLsum': 0.2}
bleu 1/2 : 0 0
ppl : 88.00093078613281
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMITCHELL: Detective Mitchell.\nLISA: Oh, hi, it's Lisa Cohen calling.\nMITCHELL: Hi Lisa, what can I do for you?\nLISA: Well, I was just wondering what ever happened, if anything, with the case. You said you might re-interview the bus driver...\nMITCHELL: Yes, we did:.We brought him back in --\nLISA: You did? What happened?\nMITCHELL: Well, he basically stuck to his original representation, and that was pretty much it. I brought it up 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8361716866493225], 'recall': [0.8917528390884399], 'f1': [0.8630682826042175], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 24.935420989990234
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHRISTY: A convent? They put me in a convent? What's the problem? Wasn't there any more room in Hell?\nMOTHER SUPERIOR: What is your full name? Your Christian name?\nCHRISTY: Mother Theresa. Gandhi. Don't you understand? This is all a mistake. I'm the good guy. I'm the witness. And I'm being punished!\nMOTHER SUPERIOR: Your name?\nCHRISTY: Christy. Christy Van Cartier.\nMOTHER SUPERIOR: 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8457251787185669], 'recall': [0.8501306772232056], 'f1': [0.8479222059249878], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 49.069435119628906
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHRISTY: No! No! I can't do this!\nMOTHER SUPERIOR: Normally dressed, you are a sore thumb, and you endanger us all. While you are here, you will conduct yourself as a nun. Only I shall know who, and what, you truly are. You will not disrupt the convent. You will not influence the Sisters with your colorful street talk. You will draw no attention to yourself in any way.\nCHRISTY: But look at rites Look at this! I'm a nun! I'm invisibl

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8118543028831482], 'recall': [0.8237195014953613], 'f1': [0.8177438378334045], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 98.66117095947266
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMOTHER SUPERIOR: Clarence Loyola. There are three vows that every nun must accept. The vow of poverty...\nCHRISTY: No problem.\nMOTHER SUPERIOR: The vow of obedience.\nCHRISTY: Like to who?\nMOTHER SUPERIOR: And the vow of chastity.\n\n', 'answer': "I'm outta here.", 'gold_tag': 'CHRISTY indicates a lack of religious conviction or perhaps a more rebellious nature', 'last_speaker': 'CHRISTY'}
Last word -> CHRISTY : "I'm outta here."
pre

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8195335865020752], 'recall': [0.8597028851509094], 'f1': [0.8391377925872803], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 74.46952056884766
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHRISTY: Is this all the food I get?\nMOTHER SUPERIOR: Yes. Until breakfast.\nCHRISTY: That's tomorrow! Don't you have any snacks? Machines?\nMOTHER SUPERIOR: This shall be your cell, Mary Clarence.\nCHRISTY: My what?\n\n", 'answer': 'Your cell. Your room.', 'gold_tag': 'Everyday Language', 'last_speaker': 'MOTHER SUPERIOR'}
Last word -> MOTHER SUPERIOR : "Your cell. Your room."
prediction :  The cell. Mary Clarence.
Real answer : Your

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9279961585998535], 'recall': [0.9323776960372925], 'f1': [0.9301817417144775], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.25, 'rouge2': 0.0, 'rougeL': 0.25, 'rougeLsum': 0.25}
bleu 1/2 : 0.25 0.09128709291752769
ppl : 4154.3291015625
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nMOTHER SUPERIOR: A balloon and a banana! Tell me, Mary Clarence, what were you thinking?\nCHRISTY: Those girls were asking questions. They\'ve got it rough. I wish somebody had talked to me about that stuff when was their age. I wish somebody would talk to me about it now.\nMOTHER SUPERIOR: This is a convent, not a clinic. We offer moral guidance, not hardware.\nCHRISTY: Have you been out in those streets? Do you 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8641891479492188], 'recall': [0.8059753775596619], 'f1': [0.8340677618980408], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 1242.8402099609375
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nCHRISTY: I'm sorry, I'm not a weaver!\nMOTHER SUPERIOR: Months of work, and the loom, absolutely ruined!\n\n", 'answer': "Look, this isn't my kind of thing! I wasn't cut out to make little tapestries, or knit little sweaters! I don't do that! I'm a star!", 'gold_tag': 'CHRISTY does not enjoy craft activities such as knitting or making tapestries', 'last_speaker': 'CHRISTY'}
Last word -> CHRISTY : "Look, this isn't my kind of thing! I 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7997859716415405], 'recall': [0.7710283398628235], 'f1': [0.7851439118385315], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.058823529411764705, 'rouge2': 0.0, 'rougeL': 0.058823529411764705, 'rougeLsum': 0.058823529411764705}
bleu 1/2 : 0.0010216928596160166 0.00037306948403576754
ppl : 69.61486053466797
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nFOLEY: Call everyone in - every car patrol, beat cop, off-duty, too! Pull 'em in, close every street! Now! I'm gonna do what Gordon never could.\nBLAKE: What's that?\nFOLEY: I'm gonna take down the Batman.\n\n", 'answer': 'Sir, what about the armed robbers?', 'gold_tag': 'BLAKE is a police officer , BLAKE shows concern for ongoing crimes', 'last

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.858802080154419], 'recall': [0.8795403838157654], 'f1': [0.8690475225448608], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0 0
ppl : 42.318817138671875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nFOLEY: Negative! Stay on the Batman!\nBLAKE: But he's getting away!\n\n", 'answer': 'Who do you want to catch? Some robber, or the son-of-a-bitch who killed Harvey Dent?', 'gold_tag': "FOLEY believes Batman is responsible for Harvey Dent's death", 'last_speaker': 'FOLEY'}
Last word -> FOLEY : "Who do you want to catch? Some robber, or the son-of-a-bitch who killed Harvey Dent?"
prediction

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8439286351203918], 'recall': [0.8232322335243225], 'f1': [0.8334519267082214], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 2747.90185546875
This is a wo tag evaluation
Last word -> FOLEY : "Pull out! Pull 'em out!"
prediction :  No, no! Pull out your people, you can all escape! All of you, don't get hurt!
Real answer : Pull out! Pull 'em out!


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8354833126068115], 'recall': [0.8649249076843262], 'f1': [0.8499492406845093], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1818181818181818, 'rouge2': 0.1, 'rougeL': 0.1818181818181818, 'rougeLsum': 0.1818181818181818}
bleu 1/2 : 0.0625 0.02041241452319315
ppl : 36.70744705200195
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBLAKE: Foley?\nFOLEY: Jesus, Blake! Every cop in the city's down in those tunnels!\n\n", 'answer': 'Not every cop...', 'gold_tag': 'Blake is a police officer , Blake and Foley share the experience of being police officers dealing with a crisis in the city', 'last_speaker': 'BLAKE'}
Last word -> BLAKE : "Not every cop..."
prediction :  11,733, is that enough for you? Th

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8081783056259155], 'recall': [0.845629096031189], 'f1': [0.8264796733856201], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 46.21240997314453
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nFRANK: Morning, Jim.\nJIM: Morning.\n\n', 'answer': 'Listen, have you got a minute?', 'gold_tag': 'Everyday Language', 'last_speaker': 'FRANK'}
Last word -> FRANK : "Listen, have you got a minute?"
prediction :  And how are you feeling today?
Real answer : Listen, have you got a minute?


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8682733178138733], 'recall': [0.8676229119300842], 'f1': [0.8679479956626892], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.16666666666666666, 'rouge2': 0.0, 'rougeL': 0.16666666666666666, 'rougeLsum': 0.16666666666666666}
bleu 1/2 : 0.16666666666666669 0.05773502691896258
ppl : 78.98401641845703
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nFRANK: We lost water pressure three weeks ago. For a while, I thought we'd be okay with the water in the other flats. The cistern and tanks. But it vanishes so quick you wouldn't believe. You drink it, it\nJIM: They're all empty...\nFRANK: It hasn't rained in over a week. You can set up a plastic sheet to catch dew and condensation. Trap it somehow. I s

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8507413864135742], 'recall': [0.8488583564758301], 'f1': [0.8497987985610962], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 45.61374282836914
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nFRANK: HOLD ON! Fuck.\nJIM: The world's worst place to get a flat.\n\n", 'answer': "Agreed. I think we'd better do this quick.", 'gold_tag': 'Everyday Language', 'last_speaker': 'FRANK'}
Last word -> FRANK : "Agreed. I think we'd better do this quick."
prediction :  Here is your fucking place! Fuck you!
Real answer : Agreed. I think we'd better do this quick.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8227159380912781], 'recall': [0.8406364321708679], 'f1': [0.8315796256065369], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 183.50523376464844
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJIM: Jesus Christ. We're never going to be able to get out of London.\n\n", 'answer': "Don't you worry about that. I've sat twenty years in this seat. I've got my routes.", 'gold_tag': "FRANK has been in the same job for twenty years , FRANK mentions routes , FRANK's job is likely as a bus or taxi driver", 'last_speaker': 'FRANK'}
Last word -> FRANK : "Don't you worry about that. I've sat twenty years in this seat. I've got my routes.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8438674807548523], 'recall': [0.8494679927825928], 'f1': [0.84665846824646], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.11428571428571428, 'rouge2': 0.0, 'rougeL': 0.05714285714285714, 'rougeLsum': 0.05714285714285714}
bleu 1/2 : 0.06236713233544119 0.020414436057463375
ppl : 23.459869384765625
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nFRANK: Hey.\nJIM: What...\nFRANK: Shh. You're having a bad dream, that's all.\n\n", 'answer': 'Okay, Dad.', 'gold_tag': 'JIM is possibly Frank\'s son, as implied by his response "Okay, Dad" , FRANK is possibly older , Jim refers to FRANK as \'Dad\'', 'last_speaker': 'JIM'}
Last word -> JIM : "Okay, Dad."
prediction :  *yawn* I'm just going to work and t

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8149699568748474], 'recall': [0.8824225664138794], 'f1': [0.8473560214042664], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 30.71849822998047
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nJIM: Here you go, Frank. This is how you catch dew. I'm covered in the fucking stuff.\n\n", 'answer': "Okay. Let's get rolling. If we make good time, we'll be there before midday.", 'gold_tag': 'FRANK wants to reach their destination before midday', 'last_speaker': 'FRANK'}
Last word -> FRANK : "Okay. Let's get rolling. If we make good time, we'll be there before midday."
prediction :  Do you want to get it on the road with me, Jim?
Re

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8666090965270996], 'recall': [0.8606510162353516], 'f1': [0.8636198043823242], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.07142857142857144, 'rouge2': 0.0, 'rougeL': 0.07142857142857144, 'rougeLsum': 0.07142857142857144}
bleu 1/2 : 0.07054014374088449 0.023298642673655674
ppl : 44.37327194213867
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTODD: It... grabbed me. It was just like Miss Harmon said. I couldn't believe it. But there it was, the camps, the uniforms, the ...the way they saluted each other. But what am I telling you for? You know more about this stuff than anybody.\nDUSSANDER: Yes, what are you telling me for?\nTODD: Then last year I wrote my big term paper on Dachau. I got an

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8374929428100586], 'recall': [0.8190574645996094], 'f1': [0.8281726241111755], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.04597701149425287, 'rouge2': 0.0, 'rougeL': 0.04597701149425287, 'rougeLsum': 0.04597701149425287}
bleu 1/2 : 1.220274085705462e-12 4.726101211745736e-13
ppl : 1616.1131591796875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTODD: I\'ve never seen anyone drink bourbon with McDonald\'s before. ...So, How did the stuff get to Patin?\nDUSSANDER: In railroad cars marked "Medical Supplies." It came in long crates that looked like coffins.\nTODD: That\'s fitting. Was it always Zyklon-B?\nDUSSANDER: No, from time to time we would be sent something else. Experimental gases. Th

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8829610347747803], 'recall': [0.8443745374679565], 'f1': [0.8632367849349976], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.1818181818181818, 'rouge2': 0.0, 'rougeL': 0.1818181818181818, 'rougeLsum': 0.1818181818181818}
bleu 1/2 : 0.018143590657882503 0.006414727994630807
ppl : 211.57781982421875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDUSSANDER: What for?\nTODD: Just to be safe.\nDUSSANDER: To hell with this. You open it.\nTODD: Oh, relax, old man. Sit down and open it.\nDUSSANDER: You wrapped it yourself, I see. Pete's Quality Costume Clothiers - Serving You Since 1976.\nTODD: Go ahead, try it on.\nDUSSANDER: Are you out of your mind, boy? What makes you think I would put something 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8312356472015381], 'recall': [0.8321101665496826], 'f1': [0.831672728061676], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.42857142857142855, 'rouge2': 0.0, 'rougeL': 0.28571428571428575, 'rougeLsum': 0.28571428571428575}
bleu 1/2 : 0.375 0.07319250547114
ppl : 44.21181869506836
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nTODD: You what?\nDUSSANDER: I told him you were getting back on track by your own accord. I gave him my word. If you get even one flunk card--\nTODD: Are you out of your fucking mind? I'm going to get three or four flunk cards. I failed an economics test today, so there's another\nDUSSANDER: Boy, it was the best I could do without arousing suspicion.\nTODD: I could have 

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.7979593276977539], 'recall': [0.8234558701515198], 'f1': [0.8105071783065796], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.12121212121212122, 'rouge2': 0.0, 'rougeL': 0.06060606060606061, 'rougeLsum': 0.06060606060606061}
bleu 1/2 : 0.037037037037037035 0.01193524790065722
ppl : 20.16913604736328
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDUSSANDER: Come over right away, boy. I'm having a heart attack, a bad one, I think. I understand that you cannot talk, but you can listen. I cannot call 911... at least not yet. There is quite a mess here. I need help... and that means you need help.\nTODD: Well if you put it that way --\nDUSSANDER: Tell your parents I've had a letter, an important le

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.818199872970581], 'recall': [0.8465230464935303], 'f1': [0.8321205377578735], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.057971014492753624, 'rouge2': 0.0, 'rougeL': 0.057971014492753624, 'rougeLsum': 0.057971014492753624}
bleu 1/2 : 0.016949152542372878 0.005405800292019179
ppl : 20.22266960144043
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nDUSSANDER: Hurry up, for God's sake.\nTODD: Don't you die on me, you old bastard. Not now.\nDUSSANDER: I think we're okay.\nTODD: Me too.\nDUSSANDER: Call an ambulance. What are you talking about? For God's sake, boy...\nTODD: I want the key and I want to know where the bank is.\n\n", 'answer': 'Hillcrest Savings on Domingo Avenue. The key is behind

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8918588161468506], 'recall': [0.8320796489715576], 'f1': [0.8609327673912048], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.18181818181818182, 'rouge2': 0.1, 'rougeL': 0.18181818181818182, 'rougeLsum': 0.18181818181818182}
bleu 1/2 : 0.09111615923542012 0.022006643185771023
ppl : 79.45957946777344
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nINDY: We cannot afford to take mythology at face value. Next week: "Egyptology." Starting with the excavation of Naukratis by Blinders Petrie in 1885. I will be in my office if anybody\'s got any problems for the next hour and a half. Marcus! I did it!\nBRODY: You\'ve got itl\nINDY: You know how long I\'ve been looking for that?!\nBRODY: All your life.

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8155430555343628], 'recall': [0.9046769738197327], 'f1': [0.8578007221221924], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13333333333333333, 'rouge2': 0.0, 'rougeL': 0.13333333333333333, 'rougeLsum': 0.13333333333333333}
bleu 1/2 : 0 0
ppl : 23.148927688598633
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nINDY: Dad? He's an academic A bookworm. He's not a field mans Dad? Dad?\nBRODY: What has the old fool got himself into now?\nINDY: I don't know. But whatever it is, he's in over his head! Dad?\nBRODY: It's today's mail. And it's been opened.\nINDY: Mail! That's it, Marcus! Venice, Italy!\nBRODY: What is it?\nINDY: clue he ever followed. Every discovery he made. A complete record of his s

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.8393226265907288], 'recall': [0.846880316734314], 'f1': [0.8430845141410828], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 47.76873779296875
This is a wo tag evaluation
{'prompt': '\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nINDY: How\'s the head?\nBRODY: It\'s better, now I\'ve seen this. It\'s the name of a city. "Alexandretta?" Hmmm...\nINDY: The Knights of the First Crusade laid siege to the city of Alexandretta for over a year. The entire city was destroyed. The present city of Iskenderun is built on its ruins. Marcus� you remember what the Grail Tablet said. "Across the desert and through the mountain to the But where exactly?\nBRODY: Your father woul

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.9300894141197205], 'recall': [0.9238990545272827], 'f1': [0.9269838929176331], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.6, 'rouge2': 0.25, 'rougeL': 0.6, 'rougeLsum': 0.6}
bleu 1/2 : 0.5 0.12909944487358058
ppl : 28.1750431060791
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBRODY: I've got to talk to you.\nINDY: This isn't really a good time.\nBRODY: Indy, it's important.\n\n", 'answer': 'All right. Come on in.', 'gold_tag': 'Everyday Language', 'last_speaker': 'INDY'}
Last word -> INDY : "All right. Come on in."
prediction :  I know. I'm sorry.
Real answer : All right. Come on in.


  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.884502649307251], 'recall': [0.8905899524688721], 'f1': [0.887535810470581], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}
bleu 1/2 : 0 0
ppl : 26.082244873046875
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nBRODY: They want you to go for it. And they'll pay.\nINDY: Good work, Marcus. I had a feeling this would happen. And, of course, the Museum gets the Ark when we're done.\nBRODY: Of course.\nINDY: Okay, here's the way it's gonna be. First, I'll high-tail it to Shanghai and get the piece from General Hok. Then I think I know where I can find Ravenwood. If only I can get -\nBRODY: General Hok's a tough customer. They don't call him the Wil

  0%|                                                                                                                                                                      | 0/1 [00:00<?, ?it/s]


Bert Score : {'precision': [0.865412712097168], 'recall': [0.8595024347305298], 'f1': [0.8624475002288818], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.41.0.dev0)'}
Rouge Score : {'rouge1': 0.13953488372093023, 'rouge2': 0.04878048780487805, 'rougeL': 0.13953488372093023, 'rougeLsum': 0.13953488372093023}
bleu 1/2 : 0.08070606580348652 0.018638267526751434
ppl : 45.05147933959961
This is a wo tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nELENA: You have a great future as a driver. If they don't arrest you first!!\nSALVATORE: That's nothing to do with it, it's the car that's still being run in...\nELENA: SO now how do we get home?\n\n", 'answer': 'Hello, Dr Mendola...Hem...', 'gold_tag': 'SALVATORE is familiar with Dr. Mendola', 'last_speaker': 'SALVATORE'}
Last word -> S

KeyboardInterrupt: 

In [56]:
def calculate_mean(tensor_list):
    # Move tensors to CPU and convert to numpy arrays
    cpu_tensors = [t.cpu().numpy() for t in tensor_list]
    # Calculate the mean
    mean_value = np.mean(cpu_tensors)
    return mean_value

In [78]:
import pandas as pd
pd.Series([i.cpu().numpy() for i in ppl_list]).dropna()

0      11553.602
1       16.77277
2      395.32742
3      35.409416
4      1100.4786
         ...    
661    23.148928
662    47.768738
663    28.175043
664    26.082245
665     45.05148
Length: 666, dtype: object

In [57]:
import numpy as np

ber = [i['precision'][0] for i in bert]

rouge1 = np.mean([i['rouge1'] for i in rough])

rouge2 = np.mean([i['rouge2'] for i in rough])

rougeL = np.mean([i['rougeL'] for i in rough])

rougeLsum = np.mean([i['rougeLsum'] for i in rough])

mean_bleu = np.mean([i['bleu'] for i in bleu])

print(f'PPL : {calculate_mean(ppl_list)}, \nBertScore : {np.mean(ber)} \nrouge1 : {rouge1} \nrouge2 : {rouge2} \nrougeL : {rougeL} \nrougeLsum : {rougeLsum}, \nbleu : {mean_bleu}')

PPL : nan, 
BertScore : 0.8401927087937031 
rouge1 : 0.08406581663518085 
rouge2 : 0.010053239872923192 
rougeL : 0.07480809852389264 
rougeLsum : 0.07480809852389264, 
bleu : 0.0


In [82]:
np.mean(bleu_1_list)

0.0278158717434742