In [1]:
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point in the kernel session is hidden.
warnings.filterwarnings('ignore')


def read_json_file(file_path):
    """
    Read a JSON Lines file (one JSON object per line) into a list.

    Blank or whitespace-only lines are skipped. A line that fails to parse is
    reported on stdout and skipped; the remaining lines are still returned.

    Args:
        file_path (str): The path to the JSON Lines file.

    Returns:
        list | None: The parsed objects in file order, or None when the file
        does not exist or any other error occurs while opening/reading it
        (the problem is printed instead of raised).
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having executed `import json` first.
    import json

    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    # Spacer lines / trailing newlines are not records; passing
                    # them to json.loads would only produce a spurious error.
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"JSONDecodeError in line: {line.strip()}")
                    print(f"Error message: {e}")
        return data
    except FileNotFoundError:
        print(f"The file at {file_path} does not exist.")
        return None
    except Exception as e:
        # Deliberate best-effort: callers get None and must check for it.
        print(f"An error occurred: {e}")
        return None


In [2]:
def extract_data_without_tag(prompt):
    """
    Build the generation input for the tag-free setting.

    The speaker cue "<last_speaker> :" is appended directly to the stored
    prompt text so the model continues with that speaker's utterance.

    Args:
        prompt (dict): Record with the keys 'prompt', 'last_speaker' and 'answer'.

    Returns:
        tuple: (dialogue, last_speaker, answer).
    """
    history = prompt['prompt']
    speaker = prompt['last_speaker']
    speaker_cue = f"{speaker} :"
    return history + speaker_cue, speaker, prompt['answer']
    

def extract_data_with_tag(prompt):
    """
    Build the generation input for the tagged setting.

    The cue "<last_speaker> : (<gold_tag>)" is appended directly to the stored
    prompt text, so the gold tag is visible to the model before it generates.

    Args:
        prompt (dict): Record with the keys 'prompt', 'last_speaker',
            'gold_tag' and 'answer'.

    Returns:
        tuple: (dialogue, last_speaker, gold_tag, answer).
    """
    history = prompt['prompt']
    speaker = prompt['last_speaker']
    tag = prompt['gold_tag']
    speaker_cue = f"{speaker} : ({tag})"
    return history + speaker_cue, speaker, tag, prompt['answer']

def distinct_ngrams(sentences, n):
    """
    Calculate the distinct-n metric for a list of sentences.

    distinct-n is the number of unique n-grams divided by the total number of
    n-grams, pooled over all sentences. Tokens are obtained with `str.split()`.

    Args:
        sentences (list of str): The list of sentences generated by the model.
        n (int): The n-gram length.

    Returns:
        float: The distinct-n score, or 0 when no n-gram could be formed.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having executed `from collections import Counter` first.
    from collections import Counter

    ngrams = Counter()
    total_ngrams = 0

    for sentence in sentences:
        tokens = sentence.split()
        sentence_ngrams = list(zip(*(tokens[i:] for i in range(n))))
        ngrams.update(sentence_ngrams)
        # Count what was actually produced. `len(tokens) - n + 1` is negative
        # for sentences shorter than n - 1 tokens, which would shrink the
        # denominator and inflate the score.
        total_ngrams += len(sentence_ngrams)

    return len(ngrams) / total_ngrams if total_ngrams > 0 else 0


def get_ppl(text, model, tokenizer, device=None, stride=512):
    """
    Compute the perplexity of `text` under `model` using a strided sliding window.

    The token sequence is scored in windows of at most
    `model.config.max_position_embeddings` tokens that advance by `stride`.
    In each window only the tokens not scored by a previous window contribute
    to the loss; earlier tokens serve as context (label -100).

    The per-window losses are means over the tokens scored in that window, so
    they are combined weighted by that token count. A plain mean over windows
    would over-weight a short final window.

    Args:
        text (str): Text to score.
        model: Causal LM called as `model(input_ids, labels=...)` whose output
            exposes `.loss`, and which has `config.max_position_embeddings`.
        tokenizer: Callable returning an encoding with `input_ids` when called
            with `return_tensors="pt"`.
        device (optional): Device the input ids are moved to. Defaults to the
            device of the model's first parameter.
        stride (int): Window step in tokens. Should not exceed the model's
            maximum length, otherwise tokens between windows are never scored.

    Returns:
        torch.Tensor: Scalar perplexity. NaN when no token can be scored
        (empty input or a single token, which has no left context).
    """
    if device is None:
        # Follow the model instead of relying on a notebook-global `device`.
        device = next(model.parameters()).device

    encodings = tokenizer(text, return_tensors="pt")

    max_length = model.config.max_position_embeddings
    seq_len = encodings.input_ids.size(1)

    nll_sum = None
    n_scored = 0
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        # The model shifts the labels left by one internally, so the label at
        # window position 0 never contributes to the loss.
        n_window = int((target_ids[:, 1:] != -100).sum())
        if n_window > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)

            # outputs.loss is the mean NLL over the n_window scored labels;
            # turn it back into a sum so windows can be pooled per token.
            window_nll = outputs.loss * n_window
            nll_sum = window_nll if nll_sum is None else nll_sum + window_nll
            n_scored += n_window

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_scored == 0:
        return torch.tensor(float("nan"))
    return torch.exp(nll_sum / n_scored)


In [12]:
import os, sys
import re
import torch
import evaluate
from evaluate import load
# Append the parent of the current working directory to sys.path -- presumably
# so that the `utils` package imported just below can be resolved (verify the
# notebook is launched from the expected sub-directory).
current_dir = os.getcwd()
episode_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(episode_dir)

from utils.model_utils import get_peft_checkpoint, generate, get_peft_checkpoint_
from tqdm import tqdm


from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer)

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluation_chat_system(num, prompt, model, tokenizer, device, bert_eval, rough_eval):
    """
    Generate the next utterance for one test record and score it against the gold answer.

    Args:
        num (int): Evaluation mode. 2 builds the input without the gold tag
            (`extract_data_without_tag`), 3 builds it with the gold tag
            (`extract_data_with_tag`).
        prompt (dict): Test record as consumed by the extract_* helpers.
        model: Language model passed to `generate` and `get_ppl`.
        tokenizer: Tokenizer used to encode the input and passed on with the model.
        device: Device the encoded input is moved to.
        bert_eval: Metric object whose `compute(predictions=, references=, lang=)`
            result is returned as the BERTScore entry.
        rough_eval: Metric object whose `compute(predictions=, references=)`
            result is returned as the ROUGE entry.

    Returns:
        tuple: (bert_score, rouge_score, bleu_unigram, bleu_bigram, response, ppl),
        where `response` is the first line of the generated continuation.

    Raises:
        ValueError: If `num` is neither 2 nor 3.
    """
    if num == 2:
        print("This is a wo tag evaluation")
        print(prompt)
        input__, person, utterance = extract_data_without_tag(prompt)
        print(f'Last word -> {person} : "{utterance}"')
    elif num == 3:
        print("This is with tag evaluation")
        print(prompt)
        input__, person, trait, utterance = extract_data_with_tag(prompt)
        print(f'Last word -> {person} : ({trait}) "{utterance}"')
    else:
        # Previously an unknown mode fell through and returned None, which only
        # surfaced as an unpacking error at the call site.
        raise ValueError(f"num must be 2 (without tag) or 3 (with tag), got {num!r}")

    input_ = tokenizer(input__, return_tensors='pt').to(device)
    # `generate` comes from utils.model_utils; its result is treated as the
    # decoded text (assumed to echo the prompt -- TODO confirm), hence the
    # prompt is stripped below.
    output = generate(model, tokenizer,
                      input_,
                      num_beams=1,
                      num_return_sequences=1,
                      max_new_tokens=100)

    # utterance : correct answer
    # response  : model-generated answer, cut at the first newline so only one
    #             turn is scored
    response = output.replace(input__, '').split("\n")[0]
    print(f"prediction : {response}")
    print(f"Real answer : {utterance}")

    output_list = [response.strip()]
    last_utter_list = [utterance.strip()]

    bert_score = bert_eval.compute(predictions=output_list, references=last_utter_list, lang="en")
    # Use the metric that was passed in; the original body read a global named
    # `rouge_eval` and silently ignored this argument.
    rouge_score = rough_eval.compute(predictions=output_list, references=last_utter_list)

    # BLEU on whitespace tokens; smoothing keeps short outputs from scoring 0
    # just because a higher-order n-gram is missing.
    reference = [utterance.split()]
    candidate = response.split()
    smoothing = SmoothingFunction().method1
    bleu_unigram = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=smoothing)
    bleu_bigram = sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)

    # Perplexity of the generated response alone (no dialogue context).
    ppl = get_ppl(response, model, tokenizer)

    print(f"Bert Score : {bert_score}")
    print(f"Rouge Score : {rouge_score}")
    print(f"bleu 1/2 : {bleu_unigram} {bleu_bigram}")
    print(f"ppl : {ppl}")

    return bert_score, rouge_score, bleu_unigram, bleu_bigram, response, ppl

In [5]:
# Checkpoint identifier handed to the project's loader. The recorded error
# output further down shows it being looked up on the Hugging Face Hub.
path = 'chano12/gemma_without_tag'

# First CUDA device when available, CPU otherwise. `device` is also read as a
# global inside get_ppl, so it must exist before any evaluation runs.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# get_peft_checkpoint_ is imported from utils.model_utils (not shown here).
model, tokenizer = get_peft_checkpoint_(path, device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
import json

# NOTE(review): machine-specific absolute path; `path` also overwrites the
# checkpoint identifier assigned in the model-loading cell.
path = '/home/chanho/Model/SHARE/Refactorizing/result/dataset/test_without_tag.json'

json_data = read_json_file(path)
# read_json_file reports problems on stdout and returns None instead of
# raising, so stop here rather than failing later in the evaluation loop with
# an opaque "'NoneType' object is not iterable".
if json_data is None:
    raise RuntimeError(f"Could not load evaluation data from {path}")

In [13]:
# Per-example accumulators filled by the evaluation loop (one entry per record).
bert = []         # BERTScore result objects
rough = []        # ROUGE result objects (name kept as spelled; read by later cells)
bleu_1_list = []  # unigram BLEU values
bleu_2_list = []  # bigram BLEU values
infer = []        # generated responses
ppl_list = []     # perplexity values returned by get_ppl

In [14]:
# Load the two metric objects once; they are reused for every record.
bertscore_eval = load("bertscore")
rouge_eval = evaluate.load('rouge') 

# Mode 3 selects the "with tag" branch of evaluation_chat_system.
# NOTE(review): the checkpoint and dataset paths above both say "without_tag"
# -- confirm that mode 3 is the intended setting for this run.
# The result lists are created in a separate cell, so re-running only this
# cell appends a second copy of every score.
for prompt in json_data:
    bert_score, rough_score, bleu_1, bleu_2, infer_sentence, ppl = evaluation_chat_system(3, prompt ,model, tokenizer,device, bertscore_eval, rouge_eval)
    bert.append(bert_score)
    rough.append(rough_score)
    bleu_1_list.append(bleu_1)
    bleu_2_list.append(bleu_2)
    infer.append(infer_sentence)
    ppl_list.append(ppl)

This is with tag evaluation
{'prompt': "\nTask: Generate the next response in the dialogue based on the provided history. The response should logically follow and predict the next reply considering the context of the conversation.\n\n**Dialogue History**:\nLOLA: Hello, Mr. Neff. It's me.\nNEFF: Something the matter?\nLOLA: I've been waiting for you.\nNEFF: For me? What for?\nLOLA: I thought you could let me ride with you, if you're going my way.\n\n", 'answer': "Which way would that be? Oh, sure. Vermont and Franklin. North- west corner, wasn't it? Be glad to, Miss Dietrichson.", 'gold_tag': 'NEFF is familiar with the local geographic area , NEFF references specific streets', 'last_speaker': 'NEFF'}
Last word -> NEFF : (NEFF is familiar with the local geographic area , NEFF references specific streets) "Which way would that be? Oh, sure. Vermont and Franklin. North- west corner, wasn't it? Be glad to, Miss Dietrichson."
prediction :  Why don't you ride with me on the way to dinner? The

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OSError: chano12/gemma_without_tag does not appear to have a file named config.json. Checkout 'https://huggingface.co/chano12/gemma_without_tag/tree/main' for available files.

In [None]:
# Keep only the first 'precision' value of each BERTScore result.
# NOTE(review): this is precision, not F1 -- confirm which one should be
# reported as "BertScore". The same statement is repeated in a later cell.
ber = [i['precision'][0] for i in bert]

In [11]:
def calculate_mean(tensor_list):
    """
    Convert tensors to plain Python floats, leaving out NaN entries.

    Despite the name, no averaging happens here: the caller receives the list
    of finite-or-infinite values and applies `np.mean` (or a trimmed mean) itself.

    Args:
        tensor_list (list of torch.Tensor): Scalar tensors, on any device.

    Returns:
        list of float: One float per non-NaN tensor, in input order.
    """
    return [
        float(value.cpu().numpy())  # move off the accelerator before converting
        for value in tensor_list
        if not torch.isnan(value)
    ]

In [20]:
# Trimmed mean of the perplexities: sort ascending and drop the 50 largest
# values before averaging. With 50 or fewer values the slice is empty.
# NOTE(review): 50 is a magic number, and `np` is only imported in a later cell.
np.mean(sorted(calculate_mean(ppl_list))[:-50])

722.6492043087733

In [13]:
import numpy as np

# First 'precision' value of each BERTScore result (precision, not F1).
ber = [i['precision'][0] for i in bert]

# Average each ROUGE variant over all evaluated records.
rouge1 = np.mean([i['rouge1'] for i in rough])

rouge2 = np.mean([i['rouge2'] for i in rough])

rougeL = np.mean([i['rougeL'] for i in rough])

rougeLsum = np.mean([i['rougeLsum'] for i in rough])

mean_bleu_1 = np.mean(bleu_1_list)

mean_bleu_2 = np.mean(bleu_2_list)

# calculate_mean returns the NaN-filtered list of floats, so the mean is taken
# here. Unlike the trimmed variant in the cell above, no outliers are dropped.
print(f'PPL : {np.mean(calculate_mean(ppl_list))}, \nBertScore : {np.mean(ber)} \nrouge1 : {rouge1} \nrouge2 : {rouge2} \nrougeL : {rougeL} \nrougeLsum : {rougeLsum}, \nbleu_1 : {mean_bleu_1} \nbleu_2 : {mean_bleu_2} ')

PPL : 1351.1161067331393, 
BertScore : 0.8786377304240572 
rouge1 : 0.19238813223084816 
rouge2 : 0.07725592091197174 
rougeL : 0.17753343392079152 
rougeLsum : 0.17753343392079152, 
bleu_1 : 0.0851429316663488 
bleu_2 : 0.053337645109041605 


In [15]:
# distinct-1 over all generated responses. distinct_ngrams needs `Counter`,
# which is only imported in the following cell.
distinct_ngrams(infer, 1)


0.21012395160691755

In [14]:
from collections import Counter
# Only the last expression of a cell is echoed, so the distinct-1 result on
# the next line is discarded and the displayed value is distinct-2.
distinct_ngrams(infer, 1)
distinct_ngrams(infer, 2)

0.5886313277654679

In [22]:
# Mean of the per-example BERTScore precision values collected in `ber`.
np.mean(ber)

0.8492068219184875

In [26]:
# Average each ROUGE variant over all evaluated records (the same statements
# also appear in the summary cell above).
rouge1 = np.mean([i['rouge1'] for i in rough])

rouge2 = np.mean([i['rouge2'] for i in rough])

rougeL = np.mean([i['rougeL'] for i in rough])

rougeLsum = np.mean([i['rougeLsum'] for i in rough])

In [29]:
# NOTE(review): `bleu` is not defined anywhere in the visible notebook (the
# loop fills bleu_1_list / bleu_2_list instead) -- this looks like a leftover
# from an earlier version and would raise NameError on a fresh kernel.
mean_bleu = np.mean([i['bleu'] for i in bleu])

In [33]:
# NOTE(review): calculate_mean as defined above returns a list, so "PPL" here
# would print the whole list; the scalar in the recorded output presumably
# comes from an earlier definition. Also depends on `mean_bleu` from the cell above.
print(f'PPL : {calculate_mean(ppl_list)}, \nBertScore : {np.mean(ber)} \nrouge1 : {rouge1} \nrouge2 : {rouge2} \nrougeL : {rougeL} \nrougeLsum : {rougeLsum}, \nbleu : {mean_bleu}')

PPL : 6.603209018707275, 
BertScore : 0.8492068219184875 
rouge1 : 0.0864367417136198 
rouge2 : 0.016000117526532315 
rougeL : 0.07990215447014093 
rougeLsum : 0.07990215447014093, 
bleu : 0.007389715194612174
