In [None]:
import re
import string
import pandas as pd
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI
from types import SimpleNamespace
import json
import ast
import evaluate

def load_config():
    """Load ``config.json`` from the current working directory.

    Every JSON object is converted into a ``SimpleNamespace`` so settings can
    be read with attribute access (e.g. ``config.chat.model``).

    Returns:
        The parsed configuration (a ``SimpleNamespace`` when the top-level
        JSON value is an object).

    Raises:
        FileNotFoundError: If ``config.json`` does not exist.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale-specific default encoding.
        with open(r"config.json", encoding="utf-8") as f:
            return json.load(f, object_hook=lambda d: SimpleNamespace(**d))
    except FileNotFoundError as err:
        # Keep the original exception attached as the cause for debugging.
        raise FileNotFoundError("Config file not found. Please check the path.") from err

def initialize_azure_client(config):
    """Build an ``AzureOpenAI`` client whose API key comes from Azure Key Vault.

    Args:
        config: Configuration object exposing ``key_vault_url``,
            ``dev_secret_name``, ``chat.api_version`` and
            ``chat.azure_endpoint``.

    Returns:
        An ``AzureOpenAI`` client created with the retrieved secret value.
    """
    vault_url = config.key_vault_url
    vault = SecretClient(vault_url=vault_url, credential=DefaultAzureCredential())
    api_key = vault.get_secret(config.dev_secret_name).value
    return AzureOpenAI(
        api_key=api_key,
        api_version=config.chat.api_version,
        azure_endpoint=config.chat.azure_endpoint,
    )

# Initialize Azure OpenAI client
# NOTE: this runs at import/cell-execution time. It reads config.json and
# fetches the API key from Key Vault before any function below is called.
config = load_config()
llm = initialize_azure_client(config)

# `evaluate` metric objects, loaded once and reused for every row by
# evaluate_dataset_row.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def normalize_to_set(s):
    """Normalize a value and return its set of unique tokens.

    Steps, in order: lowercase, drop ``http://``/``https://`` URLs, delete
    ASCII punctuation, drop the articles "a", "an" and "the", and split on
    whitespace.

    Args:
        s: The value to tokenize. ``None`` yields an empty set; any other
            non-string value is converted with ``str()`` first.

    Returns:
        set[str]: The unique normalized tokens.
    """
    if s is None:
        s = ""
    elif not isinstance(s, str):
        # Only None means "missing". The previous truthiness test also threw
        # away legitimate falsy answers such as the number 0.
        s = str(s)

    text = s.lower()
    # URLs have to go while "://" is still intact, i.e. before punctuation
    # is stripped.
    text = re.sub(r'http[s]?://\S+', '', text)
    # Single C-level pass instead of a per-character Python loop.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    # str.split() without arguments already collapses whitespace runs.
    return set(text.split())

def process_retrieved_items(retrieved_list):
    """
    Flatten the 'retrieved result' records into a list of value strings.

    Every value of every mapping in `retrieved_list` is converted with
    str(), except values stored under a key equal to "sources" (compared
    case-insensitively). Record order and key order are preserved.
    """
    return [
        str(value)
        for record in retrieved_list
        for field, value in record.items()
        if field.lower() != "sources"
    ]

# Function to calculate comprehensiveness using LLM

def calculate_comprehensiveness(query, llm_facts):
    """Ask the LLM to grade how comprehensive a response is for a query.

    Args:
        query: The user query.
        llm_facts: The system response being evaluated.

    Returns:
        The number found in the reply's "Score:" entry as a float, or 0 when
        the reply is empty or contains no parsable score.
    """
    prompt = [
        {
            "role": "system",
            "content": f"""
    ---Role---
    You are a helpful assistant responsible for evaluating the system response on the basis of comprehensiveness.
    ---Goal---
    Given a user query and system response, assess if the response is suited according to
    the following measure:
    ---Measure---
    Comprehensiveness: "How much detail does the answer provide to cover all the aspects and details of the
    question? A comprehensive answer should be thorough and complete, without being redundant or irrelevant.
    For example, if the question is 'What are the benefits and drawbacks of nuclear energy?', a comprehensive
    answer would provide both the positive and negative aspects of nuclear energy, such as its efficiency,
    environmental impact, safety, cost, etc. A comprehensive answer should not leave out any important points
    or provide irrelevant information. For example, an incomplete answer would only provide the benefits of
    nuclear energy without describing the drawbacks, or a redundant answer would repeat the same information
    multiple times."

    Your assessment should include two parts:
    Score: between 0 and 1 , depending how comprehensive the response is. Where 0 is not comprehensive and 1 is the most comprehensive.
    Reasoning: a short explanation of why you gave the score with respect to the measure described above.

    ---query---
    User query: {query}
    ---system response---
    System Response: {llm_facts}

    """
        }]
    response = llm.chat.completions.create(model=config.chat.model, messages=prompt)
    # `content` may be None (e.g. a filtered completion); treat it as empty.
    result = (response.choices[0].message.content or "").strip()
    # Tolerate indented or markdown-decorated replies ("**Score:** 0.8") and
    # trailing text ("0.8/1"); a strict startswith("score:") + float() parse
    # turns all of those into a silent 0.
    match = re.search(
        r"\bscore\W*?[:=]\W*?([-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?)",
        result,
        flags=re.IGNORECASE,
    )
    return float(match.group(1)) if match else 0

# Function to calculate relevance using LLM
def calculate_relevance(query, llm_facts):
    """Ask the LLM to grade how relevant a response is to a query.

    Args:
        query: The user query.
        llm_facts: The system response being evaluated.

    Returns:
        The number found in the reply's "Score:" entry as a float, or 0 when
        the reply is empty or contains no parsable score.
    """
    prompt = [
        {
            "role": "system",
            "content": f"""
    ---Role---
    You are a helpful assistant responsible for evaluating the system response on the basis of relevance.
    ---Goal---
    Given a user query and system response, assess if the response is suited according to
    the following measure:
    ---Measure---
    Relevance: "How well does the answer address the user query? A relevant answer should be directly related
    to the question asked and provide information that is useful and on-topic. For example, if the question is
    'What is the capital of France?', a relevant answer would be 'Paris'. A relevant answer should not provide
    information that is unrelated to the question or that does not directly address the user's information need.
    For example, an irrelevant answer would be 'The Eiffel Tower is located in Paris'."
    Your assessment should include two parts:
    Score: between 0 and 1 , depending how relevant the response is. Where 0 is not relevant and 1 is the most relevant.
    Reasoning: a short explanation of why you gave the score with respect to the measure described above.

    ---query---
    User query: {query}
    ---system response---
    System Response: {llm_facts}
    
    """
        }]
    response = llm.chat.completions.create(model=config.chat.model, messages=prompt)
    # `content` may be None (e.g. a filtered completion); treat it as empty.
    result = (response.choices[0].message.content or "").strip()
    # Tolerate indented or markdown-decorated replies ("**Score:** 0.8") and
    # trailing text ("0.8/1"); a strict startswith("score:") + float() parse
    # turns all of those into a silent 0.
    match = re.search(
        r"\bscore\W*?[:=]\W*?([-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?)",
        result,
        flags=re.IGNORECASE,
    )
    return float(match.group(1)) if match else 0

# Function to calculate directness using LLM
def calculate_directness(query, llm_facts):
    """Ask the LLM to grade how directly a response answers a query.

    Args:
        query: The user query.
        llm_facts: The system response being evaluated.

    Returns:
        The number found in the reply's "Score:" entry as a float, or 0 when
        the reply is empty or contains no parsable score.
    """
    prompt = [
        {
            "role": "system",
            "content": f"""
    ---Role---
    You are a helpful assistant responsible for evaluating the system response on the basis of directness.
    ---Goal---
    Given a user query and system response, assess if the response is suited according to
    the following measure:
    ---Measure---
    directness: "How specifically and clearly does the answer address the question? A direct answer should
    provide a clear and concise answer to the question. For example, if the question is 'What is the capital
    of France?', a direct answer would be 'Paris'. A direct answer should not provide any irrelevant or
    unnecessary information that does not answer the question. For example, an indirect answer would be 'The
    capital of France is located on the river Seine'."
    
    Your assessment should include two parts:
    Score: between 0 and 1 , depending how direct the response is. Where 0 is not direct and 1 is the most direct.
    Reasoning: a short explanation of why you gave the score with respect to the measure described above.

    ---query---
    User query: {query}
    ---system response---
    System Response: {llm_facts}

    """
        }]
    response = llm.chat.completions.create(model=config.chat.model, messages=prompt)
    # `content` may be None (e.g. a filtered completion); treat it as empty.
    result = (response.choices[0].message.content or "").strip()
    # Tolerate indented or markdown-decorated replies ("**Score:** 0.8") and
    # trailing text ("0.8/1"); a strict startswith("score:") + float() parse
    # turns all of those into a silent 0.
    match = re.search(
        r"\bscore\W*?[:=]\W*?([-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?)",
        result,
        flags=re.IGNORECASE,
    )
    return float(match.group(1)) if match else 0
    
from collections import defaultdict
import string

def separate_characters(line):
    """Return the characters of `line` as a list, with spaces removed.

    Surrounding whitespace is stripped first; inside the text only the plain
    space character is dropped (tabs and other whitespace are kept).
    """
    return [ch for ch in line.strip() if ch != " "]

def separate_punctuation(line):
    """Split `line` on whitespace and detach one edge punctuation mark per word.

    For words longer than one character, a trailing punctuation character
    becomes its own token; otherwise a leading punctuation character does.
    At most one character is detached per word, and single-character words
    are kept unchanged.
    """
    marks = string.punctuation
    tokens = []
    for word in line.strip().split():
        multi_char = len(word) > 1
        if multi_char and word[-1] in marks:
            tokens.extend((word[:-1], word[-1]))
        elif multi_char and word[0] in marks:
            tokens.extend((word[0], word[1:]))
        else:
            tokens.append(word)
    return tokens
    
def ngram_counts(wordList, order):
    """Count every n-gram of length 1..`order` in `wordList`.

    Returns a nested defaultdict: the outer key is ``n - 1`` (0 for
    unigrams), the inner key is the n-gram as a tuple, and the value is its
    occurrence count stored as a float.
    """
    table = defaultdict(lambda: defaultdict(float))
    total = len(wordList)
    for start in range(total):
        # Longest n-gram that still fits between `start` and the end.
        longest = min(order, total - start)
        for size in range(1, longest + 1):
            table[size - 1][tuple(wordList[start:start + size])] += 1
    return table

def ngram_matches(ref_ngrams, hyp_ngrams):
    """Tally matching and total n-gram counts for each order in the reference.

    Only orders present in `ref_ngrams` are visited. A match contributes the
    smaller of the reference and hypothesis counts (clipped matching).

    Returns:
        Three defaultdicts keyed by order: (matching counts, reference
        totals, hypothesis totals).
    """
    matched = defaultdict(float)
    ref_totals = defaultdict(float)
    hyp_totals = defaultdict(float)
    for order, ref_level in ref_ngrams.items():
        hyp_level = hyp_ngrams[order]
        for hyp_count in hyp_level.values():
            hyp_totals[order] += hyp_count
        for ngram, ref_count in ref_level.items():
            ref_totals[order] += ref_count
            if ngram in hyp_level:
                matched[order] += min(ref_count, hyp_level[ngram])
    return matched, ref_totals, hyp_totals

def ngram_precrecf(matching, reflen, hyplen, beta):
    """Return the F-beta score computed from counts pooled over all orders.

    Precision is matches / hypothesis total and recall is matches /
    reference total. The value 1e-16 stands in for precision or recall when
    the corresponding total is zero, and for the F-score when its
    denominator is not positive.
    """
    beta_sq = beta**2
    hits = sum(matching.values())
    hyp_total = sum(hyplen.values())
    ref_total = sum(reflen.values())

    if hyp_total:
        precision = hits / hyp_total
    else:
        precision = 1e-16
    if ref_total:
        recall = hits / ref_total
    else:
        recall = 1e-16

    weighted = beta_sq * precision + recall
    if weighted > 0:
        return (1 + beta_sq) * precision * recall / weighted
    return 1e-16

def calculate_chrF(reference, hypothesis, nworder=2, ncorder=6, beta=2.0):
    """Score `hypothesis` against `reference` with word and character n-grams.

    A word-level F-beta (n-grams up to `nworder`, tokens from
    separate_punctuation) and a character-level F-beta (n-grams up to
    `ncorder`, tokens from separate_characters) are computed independently;
    the result is their arithmetic mean.
    """
    def _fscore(tokenize, max_order):
        # Tokenize both texts the same way and run the shared pipeline.
        ref_table = ngram_counts(tokenize(reference), max_order)
        hyp_table = ngram_counts(tokenize(hypothesis), max_order)
        matched, ref_totals, hyp_totals = ngram_matches(ref_table, hyp_table)
        return ngram_precrecf(matched, ref_totals, hyp_totals, beta)

    word_level = _fscore(separate_punctuation, nworder)
    char_level = _fscore(separate_characters, ncorder)

    # Average word and char f-scores
    return (word_level + char_level) / 2

def evaluate_dataset_row(gold_str, final_ans_str, retrieved_list):
    """
    Compute the token-overlap and text-similarity metrics for a single query/row.

    1) Accuracy: 1.0 if every gold token appears in the final-answer tokens
    2) Precision/Recall of the retrieved tokens against the gold tokens
    3) F1 of that precision/recall
    4) Hits: 1.0 if the retrieved tokens share at least one token with gold
    5) MRR: reciprocal rank of the first retrieved value sharing a gold token
    6) BLEU, 7) ROUGE-1/2/L and 8) chrF between the sorted final-answer
       tokens and the sorted gold tokens

    The LLM-judged scores (comprehensiveness, relevance, directness) are not
    computed here.

    Args:
        gold_str: Ground-truth answer.
        final_ans_str: Final answer produced by the LLM.
        retrieved_list: Iterable of mappings holding the retrieved values.

    Returns a dict of these metrics for that query.
    """

    # Convert 'retrieved_list' -> list of non-source strings
    retrieved_clean = process_retrieved_items(retrieved_list)

    # Normalize tokens
    gold_tokens = normalize_to_set(gold_str)
    final_ans_tokens = normalize_to_set(final_ans_str)

    # 1) Accuracy: check if all gold tokens appear in final_ans_tokens
    acc = 1.0 if gold_tokens.issubset(final_ans_tokens) else 0.0
    print("accuracy", acc)

    # 2) Precision/Recall for retrieved
    pred_set = normalize_to_set(" ".join(retrieved_clean))
    intersection = pred_set & gold_tokens
    print("predicted", pred_set)
    print("gold", gold_tokens)
    print("intersection", intersection)
    num_common = len(intersection)
    if not pred_set and not gold_tokens:
        # Nothing expected and nothing retrieved counts as a perfect match.
        p_ret, r_ret = 1.0, 1.0
    else:
        p_ret = num_common / len(pred_set) if pred_set else 0.0
        r_ret = num_common / len(gold_tokens) if gold_tokens else 0.0
    print("precision", p_ret)
    print("recall", r_ret)

    # 3) F1 for retrieved
    f1_score = 2 * (p_ret * r_ret) / (p_ret + r_ret) if (p_ret + r_ret) > 0 else 0.0
    print("f1", f1_score)

    # 4) Hits: any overlap between retrieved tokens and gold tokens
    hits1 = 1.0 if num_common > 0 else 0.0
    print("hits1", hits1)

    # 5) MRR: 1-based rank of the first retrieved value sharing a gold token
    rank = next(
        (
            position
            for position, candidate_str in enumerate(retrieved_clean, start=1)
            if normalize_to_set(candidate_str) & gold_tokens
        ),
        None,
    )
    mrr_val = 1.0 / rank if rank is not None else 0.0
    print("mrr", mrr_val)

    # 6-8) Rebuild deterministic "sentences" from the token sets. Join the
    # sorted tokens themselves: sorting str(set) would sort the characters of
    # the set's repr (braces, quotes, commas included) and make every
    # text-similarity score meaningless.
    predicted_sentence = " ".join(sorted(final_ans_tokens))
    ground_truth_sentence = " ".join(sorted(gold_tokens))
    print("predicted string for blue and rouge", predicted_sentence)
    print("ground truth string for blue and rouge", ground_truth_sentence)

    try:
        bleu_result = bleu_metric.compute(predictions=[predicted_sentence], references=[[ground_truth_sentence]])
        rouge_result = rouge_metric.compute(predictions=[predicted_sentence], references=[ground_truth_sentence])
        bleu_score = bleu_result["bleu"]
        rouge_1 = rouge_result["rouge1"]
        rouge_2 = rouge_result["rouge2"]
        rouge_l = rouge_result["rougeL"]
        chrf_score = calculate_chrF(ground_truth_sentence, predicted_sentence)
    except ZeroDivisionError as e:
        # Zero all similarity scores when a metric divides by zero.
        print(f"Error computing BLEU for row : {e}")
        bleu_score = 0
        rouge_1 = rouge_2 = rouge_l = chrf_score = 0

    print("bleu, rouge1, rouge2, rougeL", bleu_score, rouge_1, rouge_2, rouge_l)

    return {
        "Accuracy": acc,
        "Precision(Retrieved)": p_ret,
        "Recall(Retrieved)": r_ret,
        "F1": f1_score,
        "Hits": hits1,
        "MRR": mrr_val,
        "BLEU Score": bleu_score,
        "ROUGE-1": rouge_1,
        "ROUGE-2": rouge_2,
        "ROUGE-L": rouge_l,
        "chrF": chrf_score
    }

def evaluate_dataset_per_row(df):
    """
    For each row in df, compute the row-level metrics and store them in new columns.

    Reads the columns "Query", "Ground Truth Answer", "LLM Answer" and
    "Blazegraph Response". The frame is modified in place and also returned.

    Args:
        df: DataFrame with one evaluated query per row.

    Returns:
        The same DataFrame with one added column per metric.
    """
    # Metrics copied straight out of the dict returned by evaluate_dataset_row.
    row_metric_columns = [
        "Accuracy",
        "Precision(Retrieved)",
        "Recall(Retrieved)",
        "Hits",
        "MRR",
        "BLEU Score",
        "ROUGE-1",
        "ROUGE-2",
        "ROUGE-L",
        "chrF",
        # Previously computed but never written to the frame. Kept last so
        # the positions of the pre-existing columns do not move.
        "F1",
    ]
    output_columns = [
        "Accuracy",
        "Precision(Retrieved)",
        "Recall(Retrieved)",
        "Hits",
        "MRR",
        "Comprehensiveness",
        "Relevance",
        "Directness",
        "BLEU Score",
        "ROUGE-1",
        "ROUGE-2",
        "ROUGE-L",
        "chrF",
        "F1",
    ]

    # Initialize the columns as floats so row-wise assignment keeps the dtype.
    for column in output_columns:
        df[column] = 0.0

    for idx, row in df.iterrows():
        query = row["Query"]
        gold_str = row["Ground Truth Answer"]
        final_ans_str = row["LLM Answer"]
        retrieved_list = row["Blazegraph Response"]

        metrics_dict = evaluate_dataset_row(gold_str, final_ans_str, retrieved_list)
        # One LLM call per judged criterion.
        compre_score = calculate_comprehensiveness(query, final_ans_str)
        rel_score = calculate_relevance(query, final_ans_str)
        dir_score = calculate_directness(query, final_ans_str)

        for column in row_metric_columns:
            df.at[idx, column] = metrics_dict[column]
        df.at[idx, "Comprehensiveness"] = compre_score
        df.at[idx, "Relevance"] = rel_score
        df.at[idx, "Directness"] = dir_score

    return df

################
# Example usage  
################
if __name__ == "__main__":
    import ast
    import json

    def _parse_blazegraph_response(raw_ret):
        """Return one 'Blazegraph Response' cell as a list of records.

        Strings are parsed as JSON first and as a Python literal second.
        A single dict is wrapped in a list; anything that is not a list or
        dict (NaN, numbers, unparsable text) becomes an empty list.
        """
        if isinstance(raw_ret, str):
            try:
                raw_ret = json.loads(raw_ret)
            except json.JSONDecodeError:
                # A cell written from a Python object holds its repr
                # (single-quoted), which is not valid JSON.
                try:
                    raw_ret = ast.literal_eval(raw_ret)
                except (ValueError, SyntaxError, RecursionError, MemoryError):
                    return []
        if isinstance(raw_ret, dict):
            return [raw_ret]
        return raw_ret if isinstance(raw_ret, list) else []

    df_in = pd.read_excel(r"Outputs/LLM_responses_blazegraph_multiHop.xlsx")

    # Convert 'Retrieved result' to list of dicts
    df_in["Blazegraph Response"] = df_in["Blazegraph Response"].apply(_parse_blazegraph_response)

    df_with_metrics = evaluate_dataset_per_row(df_in)
    df_with_metrics.to_excel(r"Outputs/LLM_metrics_blazegraph_multihop.xlsx", index=False)


In [None]:
import re
import string
import json
import pandas as pd
import evaluate
import six
from types import SimpleNamespace
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from openai import AzureOpenAI

##############################################
# 1) Azure OpenAI Client Initialization
##############################################

def load_config():
    """Load ``config.json`` from the current working directory.

    Every JSON object is converted into a ``SimpleNamespace`` so settings can
    be read with attribute access (e.g. ``config.chat.model``).

    Returns:
        The parsed configuration (a ``SimpleNamespace`` when the top-level
        JSON value is an object).

    Raises:
        FileNotFoundError: If ``config.json`` does not exist.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale-specific default encoding.
        with open(r"config.json", encoding="utf-8") as f:
            return json.load(f, object_hook=lambda d: SimpleNamespace(**d))
    except FileNotFoundError as err:
        # Keep the original exception attached as the cause for debugging.
        raise FileNotFoundError("Config file not found. Please check the path.") from err

def initialize_azure_client(config):
    """Return an ``AzureOpenAI`` client keyed by a secret read from Key Vault.

    Args:
        config: Configuration object exposing ``key_vault_url``,
            ``dev_secret_name``, ``chat.api_version`` and
            ``chat.azure_endpoint``.

    Returns:
        An ``AzureOpenAI`` client created with the retrieved secret value.
    """
    secret_store = SecretClient(
        vault_url=config.key_vault_url,
        credential=DefaultAzureCredential(),
    )
    key_secret = secret_store.get_secret(config.dev_secret_name)
    client_kwargs = {
        "api_key": key_secret.value,
        "api_version": config.chat.api_version,
        "azure_endpoint": config.chat.azure_endpoint,
    }
    return AzureOpenAI(**client_kwargs)

# Module-level setup, executed as soon as this cell/module runs: reads
# config.json and builds the shared Azure OpenAI client used by the
# calculate_* scoring functions.
config = load_config()
llm = initialize_azure_client(config)


##############################
# 2) Normalization and Parsing helpers
##############################

def normalize_to_set(s):
    """
    Tokenizes a value by:
    1) Lowercasing
    2) Removing URLs (http:// or https://)
    3) Removing ASCII punctuation
    4) Removing articles (a, an, the)
    5) Collapsing extra whitespace
    Returns a set of tokens.

    None yields an empty set; any other non-string value is converted with
    str() first.
    """
    if s is None:
        s = ""
    elif not isinstance(s, str):
        # Only None means "missing". The previous truthiness test also threw
        # away legitimate falsy answers such as the number 0.
        s = str(s)

    text = s.lower()
    # URLs have to go while "://" is still intact, i.e. before punctuation
    # is stripped.
    text = re.sub(r'http[s]?://\S+', '', text)
    # Single C-level pass instead of a per-character Python loop.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    # str.split() without arguments already collapses whitespace runs.
    return set(text.split())

def parse_documents_context(retrieved_text):
    """
    Parses a string that represents a list of Document objects.
    Extracts all non-empty `page_content` values via regex and joins them
    with single spaces.

    Each value is read up to the quote character that opened it, so an
    apostrophe inside a double-quoted value (or a backslash-escaped quote)
    no longer cuts the content short. Escape sequences are left as written.
    """
    # group 1: opening quote; group 2: everything up to the matching,
    # unescaped closing quote. DOTALL lets a value span several lines.
    pattern = r"page_content=(['\"])((?:\\.|(?!\1).)*)\1"
    contents = (
        match.group(2)
        for match in re.finditer(pattern, retrieved_text, flags=re.DOTALL)
    )
    return " ".join(content for content in contents if content)

def parse_colon_delimited_data(doc_text):
    """
    Flatten colon-delimited record blocks into one space-joined string.

    The text is cut into blocks at every '----'. A leading "PROCESS DATA:",
    "EMPLOYEE DATA:", "APPLICATION DATA:" or "ORGANISATION DATA:" header is
    removed from a block, then each line is split on '|'. A segment holding
    a ':' contributes its key and its value (split at the first colon); any
    other non-empty segment is kept whole.
    """
    known_headers = ("PROCESS DATA:", "EMPLOYEE DATA:", "APPLICATION DATA:", "ORGANISATION DATA:")
    tokens = []
    for raw_block in doc_text.split('----'):
        body = raw_block.strip()
        if not body:
            continue
        # Drop at most one recognised header from the start of the block.
        header = next((h for h in known_headers if body.startswith(h)), None)
        if header is not None:
            body = body[len(header):].strip()
        for raw_line in body.split('\n'):
            for raw_segment in raw_line.strip().split('|'):
                segment = raw_segment.strip()
                if not segment:
                    continue
                key, colon, value = segment.partition(':')
                if colon:
                    tokens.extend((key.strip(), value.strip()))
                else:
                    tokens.append(segment)
    return " ".join(tokens)

def parse_triples(triples_text):
    """
    Collect the object part of every "subject predicate object" line.

    Each line is split on whitespace into at most three parts; lines with
    fewer than three parts are ignored. The third parts (which may contain
    spaces) are joined with single spaces.
    """
    split_lines = (line.split(maxsplit=2) for line in triples_text.split('\n'))
    return " ".join(parts[2] for parts in split_lines if len(parts) == 3)

def parse_subgraphs(subgraph_text):
    """
    Collect the object part of every triple line in a subgraph dump.

    The text is split at each "Subgraph for ..." heading (matched
    case-insensitively, up to the end of its line). Within the remaining
    blocks, every line made of at least three whitespace-separated parts
    contributes its third part; the parts are joined with single spaces.
    """
    collected = []
    for chunk in re.split(r'(?i)Subgraph for .*', subgraph_text):
        chunk = chunk.strip()
        if chunk:
            for raw_line in chunk.split('\n'):
                stripped = raw_line.strip()
                if stripped:
                    pieces = stripped.split(maxsplit=2)
                    if len(pieces) == 3:
                        collected.append(pieces[2])
    return " ".join(collected)

def parse_retrieved_results(retrieved_text):
    """
    Pick a parser for the retrieved results based on markers in the text.

    Checked in this order:
    1) contains " DATA:"                          -> parse_colon_delimited_data
    2) contains "Subgraph for"                    -> parse_subgraphs
    3) contains "Document(" and "page_content"    -> parse_documents_context
    4) anything else                              -> parse_triples

    Non-string input is converted with str() when truthy, otherwise it is
    treated as an empty string.
    """
    if isinstance(retrieved_text, str):
        text = retrieved_text
    elif retrieved_text:
        text = str(retrieved_text)
    else:
        text = ""

    if " DATA:" in text:
        return parse_colon_delimited_data(text)
    if "Subgraph for" in text:
        return parse_subgraphs(text)
    if "Document(" in text and "page_content" in text:
        return parse_documents_context(text)
    return parse_triples(text)

##############################
# 3) The LLM-based scoring functions
##############################

# Load metrics
# Created once at module level (executed when this cell/module runs) and
# bound to module-level names for later use.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def calculate_comprehensiveness(query, llm_facts):
    """Ask the LLM to grade how comprehensive a response is for a query.

    Args:
        query: The user query.
        llm_facts: The system response being evaluated.

    Returns:
        The number found in the reply's "Score:" entry as a float, or 0 when
        the reply is empty or contains no parsable score.
    """
    prompt = [
        {
            "role": "system",
            "content": f"""
    ---Role---
    You are a helpful assistant responsible for evaluating the system response on the basis of comprehensiveness.
    ---Goal---
    Given a user query and system response, assess if the response is suited according to
    the following measure:
    ---Measure---
    Comprehensiveness: "How much detail does the answer provide to cover all the aspects and details of the
    question? A comprehensive answer should be thorough and complete, without being redundant or irrelevant.
    For example, if the question is 'What are the benefits and drawbacks of nuclear energy?', a comprehensive
    answer would provide both the positive and negative aspects of nuclear energy, such as its efficiency,
    environmental impact, safety, cost, etc. A comprehensive answer should not leave out any important points
    or provide irrelevant information. For example, an incomplete answer would only provide the benefits of
    nuclear energy without describing the drawbacks, or a redundant answer would repeat the same information
    multiple times."

    Your assessment should include two parts:
    Score: between 0 and 1 , depending how comprehensive the response is. Where 0 is not comprehensive and 1 is the most comprehensive.
    Reasoning: a short explanation of why you gave the score with respect to the measure described above.

    ---query---
    User query: {query}
    ---system response---
    System Response: {llm_facts}

    """
        }]
    response = llm.chat.completions.create(model=config.chat.model, messages=prompt)
    # `content` may be None (e.g. a filtered completion); treat it as empty.
    result = (response.choices[0].message.content or "").strip()
    # Tolerate indented or markdown-decorated replies ("**Score:** 0.8") and
    # trailing text ("0.8/1"); a strict startswith("score:") + float() parse
    # turns all of those into a silent 0.
    match = re.search(
        r"\bscore\W*?[:=]\W*?([-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?)",
        result,
        flags=re.IGNORECASE,
    )
    return float(match.group(1)) if match else 0

# Function to calculate relevance using LLM
def calculate_relevance(query, llm_facts):
    """Ask the LLM to grade how relevant a response is to a query.

    Args:
        query: The user query.
        llm_facts: The system response being evaluated.

    Returns:
        The number found in the reply's "Score:" entry as a float, or 0 when
        the reply is empty or contains no parsable score.
    """
    prompt = [
        {
            "role": "system",
            "content": f"""
    ---Role---
    You are a helpful assistant responsible for evaluating the system response on the basis of relevance.
    ---Goal---
    Given a user query and system response, assess if the response is suited according to
    the following measure:
    ---Measure---
    Relevance: "How well does the answer address the user query? A relevant answer should be directly related
    to the question asked and provide information that is useful and on-topic. For example, if the question is
    'What is the capital of France?', a relevant answer would be 'Paris'. A relevant answer should not provide
    information that is unrelated to the question or that does not directly address the user's information need.
    For example, an irrelevant answer would be 'The Eiffel Tower is located in Paris'."
    Your assessment should include two parts:
    Score: between 0 and 1 , depending how relevant the response is. Where 0 is not relevant and 1 is the most relevant.
    Reasoning: a short explanation of why you gave the score with respect to the measure described above.
    ---query---
    User query: {query}
    ---system response---
    System Response: {llm_facts}
    
    """
        }]
    response = llm.chat.completions.create(model=config.chat.model, messages=prompt)
    # `content` may be None (e.g. a filtered completion); treat it as empty.
    result = (response.choices[0].message.content or "").strip()
    # Tolerate indented or markdown-decorated replies ("**Score:** 0.8") and
    # trailing text ("0.8/1"); a strict startswith("score:") + float() parse
    # turns all of those into a silent 0.
    match = re.search(
        r"\bscore\W*?[:=]\W*?([-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?)",
        result,
        flags=re.IGNORECASE,
    )
    return float(match.group(1)) if match else 0

# Function to calculate directness using LLM
def calculate_directness(query, llm_facts):
    """Ask the LLM to grade how directly a response answers a query.

    Args:
        query: The user query.
        llm_facts: The system response being evaluated.

    Returns:
        The number found in the reply's "Score:" entry as a float, or 0 when
        the reply is empty or contains no parsable score.
    """
    # The measure section describes directness only. A pasted "empowerment"
    # rubric fragment (with dangling dict punctuation) was removed so the
    # model is graded against the single criterion the score refers to.
    prompt = [
        {
            "role": "system",
            "content": f"""
    ---Role---
    You are a helpful assistant responsible for evaluating the system response on the basis of directness.
    ---Goal---
    Given a user query and system response, assess if the response is suited according to
    the following measure:
    ---Measure---
    directness: "How specifically and clearly does the answer address the question? A direct answer should
    provide a clear and concise answer to the question. For example, if the question is 'What is the capital
    of France?', a direct answer would be 'Paris'. A direct answer should not provide any irrelevant or
    unnecessary information that does not answer the question. For example, an indirect answer would be 'The
    capital of France is located on the river Seine'."

    Your assessment should include two parts:
    Score: between 0 and 1 , depending how direct the response is. Where 0 is not direct and 1 is the most direct.
    Reasoning: a short explanation of why you gave the score with respect to the measure described above.

    ---query---
    User query: {query}
    ---system response---
    System Response: {llm_facts}

    """
        }]
    response = llm.chat.completions.create(model=config.chat.model, messages=prompt)
    # `content` may be None (e.g. a filtered completion); treat it as empty.
    result = (response.choices[0].message.content or "").strip()
    # Tolerate indented or markdown-decorated replies ("**Score:** 0.8") and
    # trailing text ("0.8/1"); a strict startswith("score:") + float() parse
    # turns all of those into a silent 0.
    match = re.search(
        r"\bscore\W*?[:=]\W*?([-+]?(?:\d+\.?\d*|\.\d+)(?:e[-+]?\d+)?)",
        result,
        flags=re.IGNORECASE,
    )
    return float(match.group(1)) if match else 0

###########################################
# 4) chrF Helper Functions and Implementation
###########################################

def separate_characters(line):
    """Return every character of the stripped `line` except plain spaces.

    Only the space character is filtered out; other inner whitespace such
    as tabs is kept as a character.
    """
    trimmed = line.strip()
    return [symbol for symbol in trimmed if symbol != " "]

def separate_punctuation(line):
    """Tokenize `line` on whitespace, splitting off one edge punctuation mark.

    A word longer than one character loses its last character as a separate
    token when that character is punctuation; failing that, its first
    character is split off when it is punctuation. Single-character words
    and words without edge punctuation are kept whole.
    """
    marks = string.punctuation
    pieces = []
    for token in line.strip().split():
        if len(token) == 1:
            pieces.append(token)
            continue
        head, tail = token[0], token[-1]
        if tail in marks:
            pieces.extend([token[:-1], tail])
        elif head in marks:
            pieces.extend([head, token[1:]])
        else:
            pieces.append(token)
    return pieces

def ngram_counts(wordList, order):
    """Count all n-grams of length 1 through `order` found in `wordList`.

    The result is a nested defaultdict indexed first by ``n - 1`` and then
    by the n-gram tuple; counts are stored as floats.
    """
    from collections import defaultdict

    table = defaultdict(lambda: defaultdict(float))
    size_limit = len(wordList)
    for start in range(size_limit):
        # `stop` is the exclusive end index, capped at the end of the list.
        for stop in range(start + 1, min(start + order, size_limit) + 1):
            table[stop - start - 1][tuple(wordList[start:stop])] += 1
    return table

def ngram_matches(ref_ngrams, hyp_ngrams):
    """Accumulate clipped match counts and totals per n-gram order.

    Iterates over the orders present in `ref_ngrams`. For each order it
    totals the hypothesis counts, totals the reference counts, and adds
    min(reference count, hypothesis count) for every shared n-gram.

    Returns:
        (matching, reference totals, hypothesis totals), each a defaultdict
        keyed by order.
    """
    from collections import defaultdict

    overlap = defaultdict(float)
    ref_sum = defaultdict(float)
    hyp_sum = defaultdict(float)
    for order, ref_bucket in ref_ngrams.items():
        hyp_bucket = hyp_ngrams[order]
        for count in hyp_bucket.values():
            hyp_sum[order] += count
        for gram, ref_count in ref_bucket.items():
            ref_sum[order] += ref_count
            if gram in hyp_bucket:
                overlap[order] += min(ref_count, hyp_bucket[gram])
    return overlap, ref_sum, hyp_sum

def ngram_precrecf(matching, reflen, hyplen, beta):
    """Compute the F-beta score from n-gram counts summed over all orders.

    Precision and recall fall back to 1e-16 when the hypothesis or reference
    total is not positive; the F-score falls back to 1e-16 when its
    denominator is not positive.
    """
    beta_sq = beta**2
    hits = sum(matching.values())
    hyp_total = sum(hyplen.values())
    ref_total = sum(reflen.values())

    precision = hits / hyp_total if hyp_total > 0 else 1e-16
    recall = hits / ref_total if ref_total > 0 else 1e-16

    weighted = beta_sq * precision + recall
    if weighted > 0:
        return (1 + beta_sq) * precision * recall / weighted
    return 1e-16

def calculate_chrF(reference, hypothesis, nworder=2, ncorder=6, beta=2.0):
    """Score ``hypothesis`` against ``reference`` with a chrF-style metric.

    Two F-beta scores are computed with the same pipeline (n-gram counting,
    clipped matching, pooled precision/recall): one over word tokens up to
    ``nworder``-grams and one over characters up to ``ncorder``-grams. The
    returned value is the plain average of those two scores.

    Args:
        reference: reference text.
        hypothesis: candidate text.
        nworder: maximum word n-gram length.
        ncorder: maximum character n-gram length.
        beta: recall weight passed to the F-score.

    Returns:
        float: mean of the word-level and character-level F-scores.
    """
    def _fscore(ref_tokens, hyp_tokens, max_order):
        # Shared pipeline for both granularities.
        ref_counts = ngram_counts(ref_tokens, max_order)
        hyp_counts = ngram_counts(hyp_tokens, max_order)
        matched, ref_totals, hyp_totals = ngram_matches(ref_counts, hyp_counts)
        return ngram_precrecf(matched, ref_totals, hyp_totals, beta)

    word_score = _fscore(
        separate_punctuation(reference),
        separate_punctuation(hypothesis),
        nworder,
    )
    char_score = _fscore(
        separate_characters(reference),
        separate_characters(hypothesis),
        ncorder,
    )
    return (word_score + char_score) / 2


def _coerce_missing_to_text(value):
    """Return ``value`` as text, mapping missing/falsy non-strings to ''."""
    if isinstance(value, str):
        return value
    # pandas reads empty Excel cells as float NaN. NaN is truthy, so without
    # this check it would be stringified into the spurious token "nan".
    if isinstance(value, float) and value != value:
        return ""
    return str(value) if value else ""


def evaluate_single(gold_str, retrieved_str, final_answer_str):
    """
    Compute evaluation metrics for one question.

    All comparisons are made on normalized token sets (see
    ``normalize_to_set``), so token order and repetition are ignored.

    Args:
        gold_str: ground-truth answer.
        retrieved_str: raw retrieved result; it is passed through
            ``parse_retrieved_results`` before tokenization.
        final_answer_str: the generated final answer.
        Non-string ``retrieved_str`` / ``final_answer_str`` values are
        converted to text; missing values (None, NaN, other falsy values)
        become the empty string.

    Returns:
        dict with the keys:
        - 'Accuracy': 1.0 if every gold token occurs in the final answer.
        - 'Precision(Retrieved)' / 'Recall(Retrieved)': overlap of retrieved
          tokens with gold tokens; each defaults to 1.0 when its denominator
          set is empty.
        - 'F1': harmonic mean of the two values above (0.0 if both are 0).
        - 'Hits': 1.0 if at least one gold token was retrieved.
        - 'BLEU Score', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'chrF Score':
          computed between the sorted final-answer tokens and the sorted gold
          tokens; all five are 0 if the computation raises ZeroDivisionError.
    """
    retrieved_str = _coerce_missing_to_text(retrieved_str)
    final_answer_str = _coerce_missing_to_text(final_answer_str)

    gold_tokens = normalize_to_set(gold_str)
    parsed_objects_str = parse_retrieved_results(retrieved_str)
    retrieved_tokens = normalize_to_set(parsed_objects_str)
    final_ans_tokens = normalize_to_set(final_answer_str)

    print("Gold tokens:", gold_tokens)
    print("Retrieved tokens:", retrieved_tokens)
    print("Final answer tokens:", final_ans_tokens)

    # Accuracy: the answer counts as correct only if it contains every gold token.
    accuracy = 1.0 if gold_tokens.issubset(final_ans_tokens) else 0.0

    # Precision / Recall of the retrieved tokens against the gold tokens.
    intersection_retrieved = retrieved_tokens.intersection(gold_tokens)
    num_common_ret = len(intersection_retrieved)
    p_ret = num_common_ret / len(retrieved_tokens) if retrieved_tokens else 1.0
    r_ret = num_common_ret / len(gold_tokens) if gold_tokens else 1.0
    f1_score = 2 * (p_ret * r_ret) / (p_ret + r_ret) if (p_ret + r_ret) > 0 else 0.0

    # Hits@1: if any common token is found, count as hit
    hits1 = 1.0 if num_common_ret > 0 else 0.0

    # BLEU / ROUGE / chrF operate on sentences, so rebuild deterministic
    # "sentences" from the token sets by sorting them.
    predicted_sentence = " ".join(sorted(final_ans_tokens))
    ground_truth_sentence = " ".join(sorted(gold_tokens))
    try:
        bleu_result = bleu_metric.compute(predictions=[predicted_sentence], references=[[ground_truth_sentence]])
        rouge_result = rouge_metric.compute(predictions=[predicted_sentence], references=[ground_truth_sentence])
        bleu_score = bleu_result["bleu"]
        rouge_1 = rouge_result["rouge1"]
        rouge_2 = rouge_result["rouge2"]
        rouge_l = rouge_result["rougeL"]
        chrf_score = calculate_chrF(ground_truth_sentence, predicted_sentence)
    except ZeroDivisionError:
        # Raised for degenerate (e.g. empty) inputs; report zero overlap.
        bleu_score = rouge_1 = rouge_2 = rouge_l = chrf_score = 0

    print("Intersection (retrieved vs gold):", intersection_retrieved)
    print("Precision (retrieved):", p_ret)
    print("Recall (retrieved):", r_ret)
    print("F1 Score (retrieved):", f1_score)
    print("Hits@1:", hits1)
    print("BLEU, ROUGE1, ROUGE2, ROUGEL, chrf_score:", bleu_score, rouge_1, rouge_2, rouge_l, chrf_score)

    return {
        'Accuracy': accuracy,
        'Precision(Retrieved)': p_ret,
        'Recall(Retrieved)': r_ret,
        'F1': f1_score,
        'Hits': hits1,
        "BLEU Score": bleu_score,
        "ROUGE-1": rouge_1,
        "ROUGE-2": rouge_2,
        "ROUGE-L": rouge_l,
        "chrF Score": chrf_score
    }

##############################
# 5) Example evaluation over an Excel file
##############################

def evaluate_topk_excel(input_excel, output_excel):
    """
    Evaluate every row of an Excel sheet at several top-k cut-offs.

    Columns read from ``input_excel``:
      - 'Query'
      - 'Ground Truth Answer'
      - 'Top-{k} Subgraphs' and 'Top-{k} Answer' for each k in the cut-off
        list; a cut-off is skipped when either of its two columns is absent.

    For each row and each available k, the metrics returned by
    ``evaluate_single`` (including F1) are written to '<metric>@<k>' columns.
    At k == 13 the 'Comprehensiveness@13', 'Relevance@13' and 'Directness@13'
    columns are additionally filled from the corresponding ``calculate_*``
    helpers. 'MRR' is 1/k for the first cut-off k whose retrieved tokens
    contain every gold token, or 0.0 if there is none.

    Args:
        input_excel: path of the workbook to read.
        output_excel: path of the workbook to write (index not included).
    """
    top_k_values = [1,3,5,8,13,15,21]
    # The query-aware calculate_* metrics are only computed at this cut-off.
    judged_k = 13
    df = pd.read_excel(input_excel)
    col_names = [
        'Accuracy',
        'Precision(Retrieved)',
        'Recall(Retrieved)',
        # F1 is returned by evaluate_single and must be listed here,
        # otherwise it is computed but never written to the output.
        'F1',
        'Hits',
        "BLEU Score",
        "ROUGE-1",
        "ROUGE-2",
        "ROUGE-L",
        "chrF Score"
    ]
    for k in top_k_values:
        for c in col_names:
            df[f"{c}@{k}"] = None
    df["MRR"] = None

    for i, row in df.iterrows():
        gold_str = row.get("Ground Truth Answer", "")
        gold_tokens = normalize_to_set(gold_str)
        rank = None
        for k in top_k_values:
            triple_key = f"Top-{k} Subgraphs"
            answer_key = f"Top-{k} Answer"
            if triple_key not in df.columns or answer_key not in df.columns:
                continue
            retrieved_str = row.get(triple_key, "")
            final_ans_str = row.get(answer_key, "")
            # NOTE(review): an empty Excel cell arrives here as NaN and is
            # handed to parse_retrieved_results uncoerced — confirm it copes.
            parsed_objects_str = parse_retrieved_results(retrieved_str)
            retrieved_tokens = normalize_to_set(parsed_objects_str)
            metrics = evaluate_single(gold_str, retrieved_str, final_ans_str)
            for c in col_names:
                df.at[i, f"{c}@{k}"] = metrics[c]
            if k == judged_k:
                query = row.get("Query", "")
                df.at[i, f"Comprehensiveness@{k}"] = calculate_comprehensiveness(query, final_ans_str)
                df.at[i, f"Relevance@{k}"] = calculate_relevance(query, final_ans_str)
                df.at[i, f"Directness@{k}"] = calculate_directness(query, final_ans_str)
            # Remember the smallest cut-off at which the full gold answer was retrieved.
            if rank is None and gold_tokens.issubset(retrieved_tokens):
                rank = k
        df.at[i, "MRR"] = 1.0 / rank if rank else 0.0

    df.to_excel(output_excel, index=False)
    print(f"Done. Metrics written to {output_excel}")

#########################################
# 6) Example usage with an Excel file as input
#########################################

if __name__ == "__main__":
    # Single-row smoke test, kept commented out. Uncomment to score one
    # hand-written example with evaluate_single instead of a whole workbook.
    # data = [
    #     {
    #         "Query": "What is Michael Woerther's GID",
    #         "Ground Truth Answer": "Z002H0WF",
    #         "Final Answer": "Michael Wörther's GID is **Z002H0WF**.",
    #         "Retrieved response": """[Document(metadata={'source': ['https://example.com/1'], 'search_space': 'organization'}, page_content='Z002H0WF is the gid of Michael Wörther'), Document(metadata={'source': ['https://example.com/2'], 'search_space': 'organization'}, page_content='Z000MJDW is the gid of Michael Goerz'), Document(metadata={'source': ['https://example.com/3'], 'search_space': 'organization'}, page_content='Z0006BDZ is the gid of Michael Wedemeyer'), Document(metadata={'source': ['https://example.com/4'], 'search_space': 'organization'}, page_content='Z000KR4F is the gid of Michael Hölzl'), Document(metadata={'source': ['https://example.com/5'], 'search_space': 'organization'}, page_content='Z000GGGG is the gid of Michaela Lehning')]"""
    #     }
    # ]
    # df = pd.DataFrame(data)
    # for i, row in df.iterrows():
    #     metrics_dict = evaluate_single(row["Ground Truth Answer"], row["Retrieved response"], row["Final Answer"])
    #     print("Evaluation Metrics:", metrics_dict)

    # Score the full workbook and write the metrics to a sibling file.
    input_file, output_file = (
        r"Outputs/LLM_metrics_subgraph_multiHop.xlsx",
        r"Outputs/LLM_metrics_subgraph_multiHop_complete.xlsx",
    )
    evaluate_topk_excel(input_excel=input_file, output_excel=output_file)