# Evaluation of ChroKnowPrompt

In [None]:
from collections import defaultdict

from tqdm import tqdm

from sources.utils import *
from sources.process import *

## Results of Total Span

In [None]:
def transition_rule_last_entry(entry, benchmark):
    """
    Decide whether a 'Partial_known' entry qualifies as 'Known', judging each
    year by the final answer in its cleaned 'chrono_ans' list only.

    The entry is updated in place: each year's 'chrono_ans' is replaced by the
    cleaned list, and its 'category' becomes 'chrono_known' when the final
    answer matches the benchmark for that year according to is_fuzz_match.

    Args:
        entry: Mapping of year -> per-year dict. Each per-year dict must carry
            a 'category' key and may carry a 'chrono_ans' list.
        benchmark: Mapping of year -> reference answers; a year that is absent
            is compared against an empty set.

    Returns:
        True when every year ends up as 'chrono_known' or 'correct'
        (also True for an empty entry), otherwise False.
    """
    accepted_categories = ('chrono_known', 'correct')
    every_year_known = True

    for year, info in entry.items():
        if 'chrono_ans' in info:
            # Store the cleaned answers back so later consumers see the same list.
            cleaned_answers = clean_chrono_ans_list(info['chrono_ans'])
            info['chrono_ans'] = cleaned_answers

            # Only the final answer is compared against the benchmark.
            if cleaned_answers and is_fuzz_match(cleaned_answers[-1], benchmark.get(year, set())):
                info['category'] = 'chrono_known'

        # Keep looping after a failure: later years still need their updates.
        if info['category'] not in accepted_categories:
            every_year_known = False

    return every_year_known


def results_of_total_span(new_data, bench_entries, bench_name):
    """
    Classify data entries into Known, Unknown, Cut-off, and Partial_known,
    promoting a Partial_known entry to Known when transition_rule_last_entry
    reports that all of its years are 'chrono_known' or 'correct'.

    Args:
        new_data: Mapping of category name -> sized iterable of
            (idx, year_classifications) pairs. Missing categories count as empty.
            The year_classifications of Partial_known items are updated in
            place by transition_rule_last_entry.
        bench_entries: Container indexed by idx; the selected element is
            passed to get_benchmark.
        bench_name: Forwarded unchanged to get_benchmark.

    Returns:
        A 3-tuple:
        - fine_grained_results: defaultdict(int) with the item count per final category
        - classification_indices: defaultdict(list) with the
          (idx, year_classifications) pairs per final category
        - previous_known_count: the Known count before any promotion
          (final Known count minus the number of promoted items)

    Raises:
        ValueError: If the number of classified items differs from the number
            of input items (for example when a promoted index also occurs in
            a non-promoted Partial_known record).
    """
    categories = ['Known', 'Unknown', 'Cut-off', 'Partial_known']

    fine_grained_results = defaultdict(int)
    classification_indices = defaultdict(list)
    partial_known_to_known = 0
    moved_indices = set()

    for category in categories:
        for idx, year_classifications in new_data.get(category, []):
            final_category = category

            if category == 'Partial_known':
                # The benchmark is only consulted by the transition rule, so
                # it is built for Partial_known items only.
                benchmark = get_benchmark(bench_entries[idx], bench_name)
                if transition_rule_last_entry(year_classifications, benchmark):
                    final_category = 'Known'
                    partial_known_to_known += 1
                    moved_indices.add(idx)

            fine_grained_results[final_category] += 1
            classification_indices[final_category].append((idx, year_classifications))

    final_known_count = len(classification_indices['Known'])
    previous_known_count = final_known_count - partial_known_to_known

    print(f"Total items moved from Partial_known to Known: {partial_known_to_known}")

    # Drop any Partial_known record that shares an index with a promoted item.
    # One set-based pass replaces rebuilding the list once per promoted item.
    if moved_indices:
        classification_indices['Partial_known'] = [
            item for item in classification_indices['Partial_known']
            if item[0] not in moved_indices
        ]

    # Debug final counts
    print(f"Final Known count: {len(classification_indices['Known'])}")
    print(f"Final Partial_known count: {len(classification_indices['Partial_known'])}")

    # Ensure total count consistency between classified and input items.
    total_items = sum(len(classification_indices[name]) for name in categories)
    expected_total = sum(len(new_data.get(name, [])) for name in categories)

    if total_items != expected_total:
        raise ValueError(f"Total item count mismatch! Expected: {expected_total}, Got: {total_items}")

    return fine_grained_results, classification_indices, previous_known_count

In [None]:
def get_results_total_span(model_name_list, domain):
    """
    Run the total-span evaluation for every model under both temporal states
    and print a per-category summary for each (model, state) pair.

    Args:
        model_name_list: Model names; each one selects the model's results via
            load_result and its "./ChronoGap/<model>/Updated_Timestamp_*.json" file.
        domain: Domain name, used for loading and also passed to
            results_of_total_span as the benchmark name.

    Returns:
        A list with two elements, for "Dynamic" then "Static". Each element is
        a list of per-model summary dicts holding the raw counts, the
        per-category items, and the Known percentages before and after the
        Partial_known -> Known transition.
    """
    def percent(count, total):
        # Guard against division by zero when no items were classified.
        return (count / total) * 100 if total > 0 else 0

    per_state_results = []

    for t_state in ("Dynamic", "Static"):
        per_model_results = []

        for model_name in tqdm(model_name_list):
            # Only the first value returned by load_result is used here.
            bench, _temp0_parsed, _temp7_parsed = load_result(
                model_name=model_name,
                domain=domain,
                temp_state=t_state,
                mode="generation",
            )

            updated_path = f"./ChronoGap/{model_name}/Updated_Timestamp_{domain}_{t_state}.json"
            new_data = read_json_file(updated_path)

            results, indices, previous_known_count = results_of_total_span(new_data, bench, domain)

            total = sum(results.values())
            previous_known_percentage = percent(previous_known_count, total)
            current_known_count = results.get('Known', 0)
            current_known_percentage = percent(current_known_count, total)
            known_increase_percentage = current_known_percentage - previous_known_percentage

            print(f"\n[model: {model_name}, temp_state: {t_state}] ChroKnowPrompt results:")
            for category, count in results.items():
                line = f"{category}: {count} ({percent(count, total):.2f}%)"
                if category == 'Known':
                    # Only the Known row reports the gain from promoted items.
                    line += f" (+{known_increase_percentage:.2f}% overall increase)"
                print(line)
            print(f"Total: {total}")

            per_model_results.append(dict(
                model_name=model_name,
                results=results,
                indices=indices,
                previous_known_count=previous_known_count,
                total=total,
                previous_known_percentage=previous_known_percentage,
                current_known_count=current_known_count,
                current_known_percentage=current_known_percentage,
                known_increase_percentage=known_increase_percentage,
            ))

        per_state_results.append(per_model_results)

    return per_state_results

In [None]:
# Models to evaluate with the total-span rule.
model_name_list = [
    "Llama3.1_8B",
    "Mistral7B",
    "Phi3.5_Mini",
    "SOLAR_10.7B",
    "Gemma2_9B",
    "gpt-4o-mini",
]

# time variant domains: General, Biomedical, Legal
domain = "General"

# Outer list is ordered [Dynamic, Static]; each holds one summary per model.
results_list_temp_state = get_results_total_span(
    model_name_list=model_name_list,
    domain=domain,
)

## Results of Previous Span

In [None]:
def transition_rule_previous_only_last_entry(entry, benchmark):
    """
    Decide whether a 'Partial_known' entry qualifies as 'Known', judging each
    year only by the answer paired with its last 'previous' step.

    The entry is updated in place: for years that have both 'chrono_ans' and
    'steps', 'chrono_ans' is replaced by the cleaned list, and 'category'
    becomes 'chrono_known' when the last 'previous' answer matches the
    benchmark for that year according to is_fuzz_match. Years without a
    'previous' step, or without a match, keep their original category.

    Args:
        entry: Mapping of year -> per-year dict. Each per-year dict must carry
            a 'category' key and may carry 'chrono_ans' and 'steps' lists.
        benchmark: Mapping of year -> reference answers; a year that is absent
            is compared against an empty set.

    Returns:
        True when every year ends up as 'chrono_known' or 'correct'
        (also True for an empty entry), otherwise False.
    """
    accepted_categories = ('chrono_known', 'correct')
    every_year_known = True

    for year, info in entry.items():
        if 'chrono_ans' in info and 'steps' in info:
            cleaned_answers = clean_chrono_ans_list(info['chrono_ans'])
            step_labels = info['steps']
            info['chrono_ans'] = cleaned_answers

            # NOTE(review): steps are paired positionally with the *cleaned*
            # answers; if clean_chrono_ans_list drops items, the pairing may
            # shift relative to the original steps - confirm.
            previous_answers = [
                answer
                for label, answer in zip(step_labels, cleaned_answers)
                if label == "previous"
            ]

            # Only the most recent 'previous' answer is compared.
            if previous_answers and is_fuzz_match(previous_answers[-1], benchmark.get(year, set())):
                info['category'] = 'chrono_known'

        # Keep looping after a failure: later years still need their updates.
        if info['category'] not in accepted_categories:
            every_year_known = False

    return every_year_known


def results_of_previous_span(new_data, bench_entries, bench_name):
    """
    Classify data entries into Known, Unknown, Cut-off, and Partial_known,
    promoting a Partial_known entry to Known when
    transition_rule_previous_only_last_entry reports that all of its years are
    'chrono_known' or 'correct'.

    Args:
        new_data: Mapping of category name -> sized iterable of
            (idx, year_classifications) pairs. Missing categories count as empty.
            The year_classifications of Partial_known items are updated in
            place by transition_rule_previous_only_last_entry.
        bench_entries: Container indexed by idx; the selected element is
            passed to get_benchmark.
        bench_name: Forwarded unchanged to get_benchmark.

    Returns:
        A 3-tuple:
        - fine_grained_results: defaultdict(int) with the item count per final category
        - classification_indices: defaultdict(list) with the
          (idx, year_classifications) pairs per final category
        - previous_known_count: the Known count before any promotion
          (final Known count minus the number of promoted items)

    Raises:
        ValueError: If the number of classified items differs from the number
            of input items (for example when a promoted index also occurs in
            a non-promoted Partial_known record).
    """
    categories = ['Known', 'Unknown', 'Cut-off', 'Partial_known']

    fine_grained_results = defaultdict(int)
    classification_indices = defaultdict(list)
    partial_known_to_known = 0
    moved_indices = set()

    for category in categories:
        for idx, year_classifications in new_data.get(category, []):
            final_category = category

            if category == 'Partial_known':
                # The benchmark is only consulted by the transition rule, so
                # it is built for Partial_known items only.
                benchmark = get_benchmark(bench_entries[idx], bench_name)
                if transition_rule_previous_only_last_entry(year_classifications, benchmark):
                    final_category = 'Known'
                    partial_known_to_known += 1
                    moved_indices.add(idx)

            fine_grained_results[final_category] += 1
            classification_indices[final_category].append((idx, year_classifications))

    final_known_count = len(classification_indices['Known'])
    previous_known_count = final_known_count - partial_known_to_known

    print(f"Total items moved from Partial_known to Known: {partial_known_to_known}")

    # Drop any Partial_known record that shares an index with a promoted item.
    # One set-based pass replaces rebuilding the list once per promoted item.
    if moved_indices:
        classification_indices['Partial_known'] = [
            item for item in classification_indices['Partial_known']
            if item[0] not in moved_indices
        ]

    # Debug final counts
    print(f"Final Known count: {len(classification_indices['Known'])}")
    print(f"Final Partial_known count: {len(classification_indices['Partial_known'])}")

    # Ensure total count consistency between classified and input items.
    total_items = sum(len(classification_indices[name]) for name in categories)
    expected_total = sum(len(new_data.get(name, [])) for name in categories)

    if total_items != expected_total:
        raise ValueError(f"Total item count mismatch! Expected: {expected_total}, Got: {total_items}")

    return fine_grained_results, classification_indices, previous_known_count

In [None]:
def get_results_previous_span(model_name_list, domain):
    """
    Run the previous-span evaluation for every model under both temporal
    states and print a per-category summary for each (model, state) pair.

    Args:
        model_name_list: Model names; each one selects the model's results via
            load_result and its "./ChronoGap/<model>/Updated_Timestamp_*.json" file.
        domain: Domain name, used for loading and also passed to
            results_of_previous_span as the benchmark name.

    Returns:
        A list with two elements, for "Dynamic" then "Static". Each element is
        a list of per-model summary dicts holding the raw counts, the
        per-category items, and the Known percentages before and after the
        Partial_known -> Known transition.
    """
    def percent(count, total):
        # Guard against division by zero when no items were classified.
        return (count / total) * 100 if total > 0 else 0

    per_state_results = []

    for t_state in ("Dynamic", "Static"):
        per_model_results = []

        for model_name in tqdm(model_name_list):
            # Only the first value returned by load_result is used here.
            bench, _temp0_parsed, _temp7_parsed = load_result(
                model_name=model_name,
                domain=domain,
                temp_state=t_state,
                mode="generation",
            )

            updated_path = f"./ChronoGap/{model_name}/Updated_Timestamp_{domain}_{t_state}.json"
            new_data = read_json_file(updated_path)

            results, indices, previous_known_count = results_of_previous_span(new_data, bench, domain)

            total = sum(results.values())
            previous_known_percentage = percent(previous_known_count, total)
            current_known_count = results.get('Known', 0)
            current_known_percentage = percent(current_known_count, total)
            known_increase_percentage = current_known_percentage - previous_known_percentage

            print(f"\n[model: {model_name}, temp_state: {t_state}] ChroKnowPrompt results:")
            for category, count in results.items():
                line = f"{category}: {count} ({percent(count, total):.2f}%)"
                if category == 'Known':
                    # Only the Known row reports the gain from promoted items.
                    line += f" (+{known_increase_percentage:.2f}% overall increase)"
                print(line)
            print(f"Total: {total}")

            per_model_results.append(dict(
                model_name=model_name,
                results=results,
                indices=indices,
                previous_known_count=previous_known_count,
                total=total,
                previous_known_percentage=previous_known_percentage,
                current_known_count=current_known_count,
                current_known_percentage=current_known_percentage,
                known_increase_percentage=known_increase_percentage,
            ))

        per_state_results.append(per_model_results)

    return per_state_results

In [None]:
# Models to evaluate with the previous-span rule.
model_name_list = [
    "Llama3.1_8B",
    "Mistral7B",
    "Phi3.5_Mini",
    "SOLAR_10.7B",
    "Gemma2_9B",
    "gpt-4o-mini",
]

# time variant domains: General, Biomedical, Legal
domain = "General"

# Outer list is ordered [Dynamic, Static]; each holds one summary per model.
# This rebinds results_list_temp_state, replacing any value assigned earlier.
results_list_temp_state = get_results_previous_span(
    model_name_list=model_name_list,
    domain=domain,
)