In [None]:
import pandas as pd

TASKNAMES = ["subtask1", "subtask2"]
TYPES = ["test", "test_preprocess"]

# Directory that holds the `{taskname}_{split}.csv` files.
prefix = "dataset"
# (taskname, split) pairs that this cell deliberately does not load.
# NOTE(review): presumably no preprocessed file exists for them - confirm.
SKIPPED_SPLITS = {("subtask2", "test_preprocess")}

dataset_loader = dict()

for taskname in TASKNAMES:
    # The loop variable is called `split_name` rather than `type` so the builtin
    # `type()` is not shadowed for every later cell of the kernel.
    for split_name in TYPES:
        # Decide about skipping first, so the log only lists files that are read.
        if (taskname, split_name) in SKIPPED_SPLITS:
            continue
        print(f"Processing {taskname} for {split_name}")
        # Load the data
        data = pd.read_csv(f"{prefix}/{taskname}_{split_name}.csv")
        dataset_loader[f"{taskname}_{split_name}"] = data

# Swap the raw test claims for the preprocessed ones. pandas aligns the assigned
# column on the row index, so both frames are assumed to share the same index.
dataset_loader["subtask1_test"]["claim"] = dataset_loader["subtask1_test_preprocess"]["claim"]

In [None]:
from nltk.tokenize import sent_tokenize
from rapidfuzz import fuzz
from utils import get_latest_result_by_prefix

from kg_compare import get_src_tgt_dict, get_map_src_tgt_dict, compare_knowledge_graphs_similarity
# Scratch run for a single test example: match claim-graph nodes against
# reference-graph nodes and derive the claim -> reference edge mapping.
test_data = dataset_loader["subtask1_test"]

index = 6
# NOTE(review): the helper is called synchronously here but awaited in the
# classification loop further down - confirm which form matches `utils`.
claim_nodes, claim_edges = get_latest_result_by_prefix(f"claim_test_{index}_")
reference_nodes, reference_edges = get_latest_result_by_prefix(f"reference_test_{index}_")
reference_content = test_data.at[index, "reference"]

claim_node_list = [*claim_nodes.keys()]
reference_node_list = [*reference_nodes.keys()]

# Only a warning: the comparison below still runs when either side is empty.
if not (claim_node_list and reference_node_list):
    print("Claim or reference is empty")

similarity_matrix = compare_knowledge_graphs_similarity(
    claim_node_list,
    reference_node_list,
    if_plot=False,
)
src_tgt_dict = get_src_tgt_dict(similarity_matrix, claim_node_list, reference_node_list)
claim_reference_edge_map = get_map_src_tgt_dict(src_tgt_dict, claim_edges)

def find_best_span_for_all_evidences(reference_edges,
                                     claim_reference_edge_map,
                                     paragraph,
                                     max_window_size=5):
    """Return the sentences of `paragraph` that best match the mapped reference edges.

    For every edge key that appears both in a value of `claim_reference_edge_map`
    and in `reference_edges`, windows of 1..`max_window_size` consecutive
    sentences are scored with `fuzz.ratio` against the edge's first description,
    and the best-scoring window is kept (the first one wins on ties). The kept
    windows are de-duplicated, merged where they overlap or touch, and the
    covered sentences are joined with single spaces in document order.

    Args:
        reference_edges: mapping from edge key to a sequence whose first element
            is a mapping with a "description" entry.
        claim_reference_edge_map: mapping whose values are iterables of edge keys;
            its own keys are not used.
        paragraph: text to search. Anything that is not a string (for instance a
            NaN read from an empty CSV cell) yields an empty result.
        max_window_size: largest number of consecutive sentences per window.

    Returns:
        The selected sentences as one string, or "" when nothing was selected.
    """
    # A missing reference has nothing to tokenize; report "no evidence" instead
    # of failing inside the tokenizer.
    if not isinstance(paragraph, str):
        return ""

    sentences = sent_tokenize(paragraph)

    known_edges = set(reference_edges.keys())
    largest_window = min(max_window_size, len(sentences))

    best_span_list = []
    for _, reference_edge_list in claim_reference_edge_map.items():
        evidence_edges = set(reference_edge_list) & known_edges

        for evidence_edge in evidence_edges:
            description = reference_edges[evidence_edge][0]["description"]

            best_score = -1
            best_span = None

            for window_size in range(1, largest_window + 1):
                for i in range(len(sentences) - window_size + 1):
                    window = ' '.join(sentences[i:i + window_size])
                    score = fuzz.ratio(description, window)
                    if score > best_score:
                        best_score = score
                        # Half-open sentence range [start, end).
                        best_span = (i, i + window_size)

            # No window was scored when there are no sentences or when
            # max_window_size < 1; a None span would break the sort below.
            if best_span is not None:
                best_span_list.append(best_span)

    # Tuples sort by start and then end, which is all the merge needs.
    intervals = sorted(set(best_span_list))
    merged = []
    for interval in intervals:
        if not merged or merged[-1][1] < interval[0]:
            merged.append(interval)
        else:
            # Overlapping or touching ranges collapse into one.
            merged[-1] = (merged[-1][0], max(merged[-1][1], interval[1]))

    # Trace of the merged sentence ranges.
    print(merged)
    evidence_sentences = [' '.join(sentences[start:end]) for start, end in merged]
    return ' '.join(evidence_sentences)


# Show the evidence text selected for the example prepared above.
find_best_span_for_all_evidences(
    reference_edges, claim_reference_edge_map, reference_content
)


In [None]:
# Toy check of the span-merging step on hand-written, repetitive spans.
intervals = 3 * [(1, 2), (9, 10), (13, 14), (15, 16), (13, 14)] + [(5, 6), (9, 10)]

# Step 1: drop duplicates and order the spans by their start.
intervals = sorted(set(intervals), key=lambda span: span[0])

# Step 2: fold each span into the previous one when it overlaps or touches it.
merged = []
for interval in intervals:
    if merged and interval[0] <= merged[-1][1]:
        merged[-1] = (merged[-1][0], max(merged[-1][1], interval[1]))
    else:
        merged.append(interval)

print(merged)

In [None]:
# Inspect the mapping returned by get_map_src_tgt_dict in the earlier cell.
claim_reference_edge_map

In [None]:
import pandas as pd

TASKNAMES = ["subtask1", "subtask2"]
BATCHNAMES = ["batch1", "batch2", "batch3"]

# No trailing slash: the separator is added when the path is formatted below,
# which keeps the path free of a doubled "//" and matches the other loader cells.
prefix = "dataset"

dataset_loader = dict()

for taskname in TASKNAMES:
    for batchname in BATCHNAMES:
        print(f"Processing {taskname} for {batchname}")
        # Load the data
        data = pd.read_csv(f"{prefix}/{taskname}_train_{batchname}.csv")
        # Keyed as "<task>_<batch>", e.g. "subtask1_batch2".
        dataset_loader[f"{taskname}_{batchname}"] = data

In [None]:
# First answer of subtask1 / batch2.
first_answer = dataset_loader["subtask1_batch2"]["answer"].tolist()[0]
print(first_answer)

In [None]:
# First claim of subtask1 / batch2.
first_claim = dataset_loader["subtask1_batch2"]["claim"].tolist()[0]
print(first_claim)

In [None]:
# Report which claim-graph node keys also exist in the reference graph.
# NOTE(review): `read_knowledge_graph_from_pickle` and `files` are not defined in
# the cells visible above - confirm they exist on a fresh kernel.
nodes_reference, edges_reference = read_knowledge_graph_from_pickle(files[0])
nodes_claim, edges_claim = read_knowledge_graph_from_pickle(files[1])
for key in nodes_claim.keys():
    if key not in nodes_reference.keys():
        print(f"Key `{key}` not found in nodes_reference")
        continue
    print(f"`{key}`")

In [None]:
# Print every claim edge as `source` -> `target`; each edge key is unpacked as a 2-tuple.
for key_src, key_tgt in edges_claim.keys():
    print(f"`{key_src}` -> `{key_tgt}`")

In [1]:
from geminillm import gemini_complete_if_cache
from json_kv_iml import JsonKVStorage
from operationCheatSheet import preprocessing_claim
from openaillm import openai_embed

# Model name passed to every completion request made through `llm_wrapper`.
LLM_MODEL_NAME = "gemini-2.5-flash-preview-05-20"

# Settings handed to the `JsonKVStorage` instance that serves as the LLM cache.
# NOTE(review): `base_url` targets the OpenAI API although the completion model is
# a Gemini one - presumably only the embedding function relies on it; confirm.
kv_global_config = {
    "working_dir": "/tmp",
    "llm_model_name": LLM_MODEL_NAME,
    "embedding_batch_num": 64,  # adjust to your setup
    "vector_db_storage_cls_kwargs": {
        "cosine_better_than_threshold": 0.2,  # adjust to taste
    },
    "base_url": "https://api.openai.com/v1",
}

llm_cache = JsonKVStorage(
    namespace="llm_cache",
    global_config=kv_global_config,
    embedding_func=openai_embed,
)
    
async def llm_wrapper(prompt, history_messages=None, max_tokens=None, **kwargs):
    """Send `prompt` to the configured model through the cached completion helper.

    Args:
        prompt: text of the request.
        history_messages: earlier messages of the conversation; a fresh empty
            list is used when None is given.
        max_tokens: completion budget; any falsy value (None or 0) falls back
            to 1024.
        **kwargs: accepted so callers may pass extra options, but not forwarded.

    Returns:
        Whatever `gemini_complete_if_cache` returns for the request.
    """
    messages = [] if history_messages is None else history_messages
    token_budget = max_tokens if max_tokens else 1024

    # Use Google GenAI
    completion = await gemini_complete_if_cache(
        model=LLM_MODEL_NAME,
        prompt=prompt,
        history_messages=messages,
        hashing_kv=llm_cache,
        temperature=0.2,
        max_tokens=token_budget,
    )
    return completion
# answer = dataset_loader["subtask1_batch2"].answer.tolist()[0]
# claim = dataset_loader["subtask1_batch2"].claim.tolist()[0]
# response = await preprocessing_claim(claim, answer, llm_wrapper, llm_cache, 1024, [])
# response

In [None]:
all_answer_batch_2 = dataset_loader["subtask1_batch2"].answer.tolist()
all_claim_batch_2 = dataset_loader["subtask1_batch2"].claim.tolist()

preprocess_path = "dataset/subtask1_train_batch2_preprocess.csv"
df = pd.read_csv(preprocess_path)
for index, (answer, claim) in enumerate(zip(all_answer_batch_2, all_claim_batch_2)):
    if index % 200 == 0:
        print(f"Processing index {index}")
    try:
        response = await preprocessing_claim(claim, answer, llm_wrapper, llm_cache, 1024, [])
        if response is None:
            print(f"Index {index} has no response")
            continue
        else:
            df.at[index, "claim"] = response
    except Exception as e:
        print(f"Index {index} has error")
        print(e)

df.to_csv(preprocess_path, index=False)

In [None]:
from pretty_prompt_compare import PrettyCompare

# Words handed to PrettyCompare as `target` (presumably the words it highlights -
# confirm). The list is kept exactly as written, repeated entries included.
compare_targets = [
    "they", "They",
    "them", "Them",
    "their", "Their",
    "theirs", "Theirs",
    "them", "Them",
    "these", "These",
    "those", "Those",
    "this", "This",
    "that", "That",
    "these", "These",
    "those", "Those",
    "this", "This",
]
pretty_compare = PrettyCompare(compare_response=True, target=compare_targets)

claim_processed = df.claim.tolist()
all_claim_batch_2 = dataset_loader["subtask1_batch2"].claim.tolist()

# Row to inspect: original claim on the left, preprocessed claim on the right.
index = 21

all_claim_batch_2[index] |pretty_compare| claim_processed[index]

In [None]:
# NOTE(review): scratch experiment - this overwrites the first claim of the
# in-memory `df` with the placeholder "2". Nothing in this cell writes `df` back
# to disk, but any later cell that reuses `df` sees the placeholder.
df.at[0, "claim"] = "2"
df.head()

In [None]:
#!/usr/bin/env python3

import os
import re
from datetime import datetime
from collections import defaultdict

def find_and_rename_newest_files_by_number(outputs_dir="outputs", expected_count=1000):
    """Rename the newest `claim_batch2` result files of every example number.

    Scans `outputs_dir` for files named exactly
    `claim_batch2_{number}_result_{YYYYMMDD_HHMMSS}.{pkl|txt}`, picks the latest
    timestamp per number and renames every file carrying that timestamp to
    `claim_test_result_{number}_{timestamp}.{ext}`. Files with older timestamps
    are left alone. Progress and a summary are printed; nothing is returned.

    NOTE(review): a later cell renames `claim_test_result_{number}_{timestamp}`
    files once more, to `claim_test_{number}_result_{timestamp}`.

    Args:
        outputs_dir: directory that is scanned and renamed in.
        expected_count: numbers 0..expected_count-1 are listed as missing when
            no matching file was found for them.
    """
    # The whole filename must match: with a prefix match, "....pkl.bak" would be
    # accepted and renamed to "....pkl".
    pattern = re.compile(r'claim_batch2_(\d+)_result_(\d{8}_\d{6})\.(pkl|txt)')

    # number -> timestamp -> [(filename, extension), ...]
    files_by_number = defaultdict(lambda: defaultdict(list))

    # Scan all files in outputs directory
    for filename in os.listdir(outputs_dir):
        match = pattern.fullmatch(filename)
        if match is None:
            continue
        number, timestamp, extension = match.groups()
        files_by_number[number][timestamp].append((filename, extension))

    if not files_by_number:
        print("No files matching the pattern found!")
        return

    print(f"Found files for {len(files_by_number)} different numbers")

    renamed_count = 0
    # Numbers stay strings (as captured) but are ordered numerically.
    numbers_processed = sorted(files_by_number, key=int)

    for number in numbers_processed:
        timestamps_for_number = files_by_number[number]

        # Timestamps are fixed-width YYYYMMDD_HHMMSS, so the largest string is
        # also the most recent one.
        newest_timestamp = max(timestamps_for_number)
        newest_files = timestamps_for_number[newest_timestamp]

        print(f"\nNumber {number}: Found {len(newest_files)} files with newest timestamp {newest_timestamp}")

        for filename, extension in newest_files:
            # New filename: claim_test_result_{number}_{timestamp}.{extension}
            new_filename = f"claim_test_result_{number}_{newest_timestamp}.{extension}"
            old_path = os.path.join(outputs_dir, filename)
            new_path = os.path.join(outputs_dir, new_filename)

            try:
                os.rename(old_path, new_path)
                print(f"  Renamed: {filename} -> {new_filename}")
                renamed_count += 1
            except OSError as e:
                # One failing file must not stop the remaining renames.
                print(f"  Error renaming {filename}: {e}")

    print(f"\nSuccessfully renamed {renamed_count} files for {len(numbers_processed)} different numbers!")

    # Show summary
    print(f"\nProcessed numbers: {', '.join(numbers_processed)}")

    # Show which of the expected numbers have no file at all
    all_numbers = set(str(i) for i in range(expected_count))
    missing_numbers = all_numbers - set(files_by_number)

    if missing_numbers:
        missing_sorted = sorted(missing_numbers, key=int)
        print(f"\nNumbers with no files found: {', '.join(missing_sorted[:20])}")
        if len(missing_numbers) > 20:
            print(f"... and {len(missing_numbers) - 20} more")


# Run the rename once. `find_and_rename_newest_files_by_number` is the only rename
# helper defined in this cell; calling any other name here raises NameError on a
# fresh kernel.
find_and_rename_newest_files_by_number()

In [None]:
from utils import get_latest_result_by_prefix
# Fetch the stored claim graph for example 0 via the `claim_test_result_{index}_` prefix.
# NOTE(review): called without `await` here, while the classification loop awaits
# the same helper - confirm which form matches `utils`.
index  = 0
claim_nodes, claim_edges = get_latest_result_by_prefix(f"claim_test_result_{index}_")


In [None]:
import os
import re

def rename_claim_test_files(outputs_dir="outputs"):
    """Move the example number in front of `result` in claim-test filenames.

    Renames every file in `outputs_dir` named exactly
    `claim_test_result_{number}_{timestamp}.{ext}` (ext is pkl or txt) to
    `claim_test_{number}_result_{timestamp}.{ext}`. The planned renames are
    listed first, then carried out one by one; a failure on one file is
    reported and does not stop the others. Nothing is returned.

    Args:
        outputs_dir: directory that is scanned and renamed in.
    """
    # The whole filename must match, otherwise names with an extra suffix such
    # as "....pkl.bak" would be picked up and lose that suffix.
    pattern = re.compile(r'claim_test_result_(\d+)_(\d{8}_\d{6})\.(pkl|txt)')

    # Collect the planned renames before touching anything.
    files_to_rename = []

    for filename in os.listdir(outputs_dir):
        match = pattern.fullmatch(filename)
        if match is None:
            continue
        number, timestamp, extension = match.groups()

        # New filename: claim_test_{number}_result_{timestamp}.{extension}
        new_filename = f"claim_test_{number}_result_{timestamp}.{extension}"

        files_to_rename.append({
            'old_filename': filename,
            'new_filename': new_filename,
            'old_path': os.path.join(outputs_dir, filename),
            'new_path': os.path.join(outputs_dir, new_filename)
        })

    if not files_to_rename:
        print("No files matching the pattern 'claim_test_result_*' found!")
        return

    print(f"Found {len(files_to_rename)} files to rename:")
    for file_info in files_to_rename:
        print(f"  {file_info['old_filename']} -> {file_info['new_filename']}")

    print(f"\nProceeding with renaming {len(files_to_rename)} files...")

    # Perform the renaming
    renamed_count = 0
    for file_info in files_to_rename:
        try:
            os.rename(file_info['old_path'], file_info['new_path'])
            print(f"✓ Renamed: {file_info['old_filename']} -> {file_info['new_filename']}")
            renamed_count += 1
        except OSError as e:
            print(f"✗ Error renaming {file_info['old_filename']}: {e}")

    print(f"\nSuccessfully renamed {renamed_count} out of {len(files_to_rename)} files!")

# Apply the filename fix to the matching files in the outputs directory.
rename_claim_test_files()

In [None]:
from utils import get_latest_result_by_prefix

# Fetch what get_latest_result_by_prefix returns for the `claim_test_{index}_`
# prefix of example 1 and display the node part.
# NOTE(review): called without `await` here, while the classification loop awaits
# the same helper - confirm which form matches `utils`.
index = 1
claim_nodes, claim_edges = get_latest_result_by_prefix(f"claim_test_{index}_")

claim_nodes

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import numpy as np
from kg_compare import get_src_tgt_dict, get_map_src_tgt_dict, find_best_span

from kg_compare import compare_knowledge_graphs_similarity, find_best_span_for_all_evidences
from utils import get_latest_result_by_prefix
import seaborn as sns

import numpy as np
from sklearn.cluster import KMeans
import os
# Collects one label per processed row; the classification loop appends to it.
all_responses = []

import pandas as pd

from geminillm import gemini_complete_if_cache
from json_kv_iml import JsonKVStorage
from operationCheatSheet import preprocessing_claim
from openaillm import openai_embed

In [2]:
from geminillm import gemini_complete_if_cache
from json_kv_iml import JsonKVStorage
from operationCheatSheet import preprocessing_claim
from openaillm import openai_embed

# Model name used by `llm_wrapper` for every completion request.
LLM_MODEL_NAME = "gemini-2.5-flash-preview-05-20"

# Configuration passed to the `JsonKVStorage` instance that acts as the LLM cache.
# NOTE(review): `base_url` is the OpenAI endpoint even though the completion model
# is a Gemini one - presumably it only matters for the embedding function; confirm.
kv_global_config = {
    "working_dir": "/tmp",
    "llm_model_name": LLM_MODEL_NAME,
    "embedding_batch_num": 64,  # pick a value that fits your setup
    "vector_db_storage_cls_kwargs": {
        "cosine_better_than_threshold": 0.2,  # pick the threshold you want
    },
    "base_url": "https://api.openai.com/v1",
}

llm_cache = JsonKVStorage(
    namespace="llm_cache",
    global_config=kv_global_config,
    embedding_func=openai_embed,
)
    
async def llm_wrapper(prompt, history_messages=None, max_tokens=None, **kwargs):
    """Send `prompt` to the configured model through the cached completion helper.

    Args:
        prompt: text of the request.
        history_messages: earlier messages of the conversation; a fresh empty
            list is used when None is given.
        max_tokens: completion budget; any falsy value (None or 0) falls back
            to 10000.
        **kwargs: accepted so callers may pass extra options, but not forwarded.

    Returns:
        Whatever `gemini_complete_if_cache` returns for the request.
    """
    messages = [] if history_messages is None else history_messages
    token_budget = max_tokens if max_tokens else 10000

    # Use Google GenAI
    completion = await gemini_complete_if_cache(
        model=LLM_MODEL_NAME,
        prompt=prompt,
        history_messages=messages,
        hashing_kv=llm_cache,
        temperature=0.2,
        max_tokens=token_budget,
    )
    return completion
# answer = dataset_loader["subtask1_batch2"].answer.tolist()[0]
# claim = dataset_loader["subtask1_batch2"].claim.tolist()[0]
# response = await preprocessing_claim(claim, answer, llm_wrapper, llm_cache, 1024, [])
# response

In [3]:
TASKNAMES = ["subtask1", "subtask2"]
TYPES = ["test", "test_preprocess", "train_batch2", "train_batch2_preprocess"]

# Directory that holds the `{taskname}_{split}.csv` files.
prefix = "dataset"
# (taskname, split) pairs that this cell deliberately does not load.
# NOTE(review): presumably no preprocessed file exists for them - confirm.
SKIPPED_SPLITS = {
    ("subtask2", "test_preprocess"),
    ("subtask2", "train_batch2_preprocess"),
}

dataset_loader = dict()

for taskname in TASKNAMES:
    # The loop variable is called `split_name` rather than `type` so the builtin
    # `type()` is not shadowed for every later cell of the kernel.
    for split_name in TYPES:
        # Decide about skipping first, so the log only lists files that are read.
        if (taskname, split_name) in SKIPPED_SPLITS:
            continue
        print(f"Processing {taskname} for {split_name}")
        # Load the data
        data = pd.read_csv(f"{prefix}/{taskname}_{split_name}.csv")
        dataset_loader[f"{taskname}_{split_name}"] = data

# Swap the raw claims for the preprocessed ones. pandas aligns the assigned column
# on the row index, so each pair of frames is assumed to share the same index.
dataset_loader["subtask1_test"]["claim"] = dataset_loader["subtask1_test_preprocess"]["claim"]
dataset_loader["subtask1_train_batch2"]["claim"] = dataset_loader["subtask1_train_batch2_preprocess"]["claim"]

# Make sure the folder of the prediction log exists before anything is appended.
output_file = "outputs/response_test.txt"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

Processing subtask1 for test
Processing subtask1 for test_preprocess
Processing subtask1 for train_batch2
Processing subtask1 for train_batch2_preprocess
Processing subtask2 for test
Processing subtask2 for test_preprocess
Processing subtask2 for train_batch2
Processing subtask2 for train_batch2_preprocess


In [4]:
LLM_MODEL_NAME = "gemini-2.5-flash-preview-05-20"
def write_prediction_to_file(index, row_id, prediction, output_file="outputs/response_test.txt"):
    """
    Append one `index, row_id, prediction` record as a single line of the output file

    Args:
        index: Position of the row in the processed dataset
        row_id: The ID of the row being processed
        prediction: The prediction result; line breaks and surrounding
            whitespace are removed so that the record stays on one line
        output_file: Path to the output file; missing parent folders are created
    """
    # Ensure the target directory exists. A bare filename has no directory part,
    # and os.makedirs("") would raise, so only create one when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The file is read back line by line, so a model answer that carries a
    # newline must not spill into a second line.
    flat_prediction = " ".join(
        part.strip() for part in str(prediction).splitlines() if part.strip()
    )

    # Write the prediction with row ID to file
    with open(output_file, 'a', encoding='utf-8') as f:
        f.write(f"{index}, {row_id}, {flat_prediction}\n")

async def llm_wrapper(prompt, history_messages=None, max_tokens=None, **kwargs):
    """Send `prompt` to the configured model through the cached completion helper.

    Args:
        prompt: text of the request.
        history_messages: earlier messages of the conversation; a fresh empty
            list is used when None is given.
        max_tokens: completion budget; any falsy value (None or 0) falls back
            to 1024.
        **kwargs: accepted so callers may pass extra options, but not forwarded.

    Returns:
        Whatever `gemini_complete_if_cache` returns for the request.
    """
    messages = [] if history_messages is None else history_messages
    token_budget = max_tokens if max_tokens else 1024

    # Use Google GenAI
    completion = await gemini_complete_if_cache(
        model=LLM_MODEL_NAME,
        prompt=prompt,
        history_messages=messages,
        hashing_kv=llm_cache,
        temperature=0.2,
        max_tokens=token_budget,
    )
    return completion

In [5]:
import os
import pandas as pd
import glob
from collections import Counter

folder_path = "outputs/kaggle_results/"
prefix = "response_clean"

# Step 1: every CSV in the folder whose name starts with the prefix
pattern = os.path.join(folder_path, f"{prefix}*.csv")
csv_files = glob.glob(pattern)

# Step 2: the file tagged 'jun26_1158' is the main one; all remaining files only
# contribute votes. Indexing [0] fails with IndexError when no file has the tag.
main_candidates = [path for path in csv_files if 'jun26_1158' in path]
main_file = main_candidates[0]
other_files = [path for path in csv_files if path != main_file]

# Step 3: Read each CSV into a DataFrame (the later steps use its ID and label columns as they are)
dfs = []
def load_and_split(file):
    """Read one submission CSV into a DataFrame.

    NOTE(review): despite the name, nothing is split here - the file is read
    as it is.
    """
    return pd.read_csv(file)

dfs = list(map(load_and_split, csv_files))
main_df = load_and_split(main_file)
other_dfs = list(map(load_and_split, other_files))

# Step 4: stack the non-main files so that their labels can be grouped per ID
other_data = pd.concat(other_dfs)

# Step 5: one list per ID holding every label the other files gave it
other_labels = other_data.groupby("ID")["label"].apply(list)

# Step 6: attach those lists to the main frame. Both sides carry a `label`
# column, so the merge yields `label_x` (main) and `label_y` (others), which are
# renamed right away for clarity.
main_df = (
    main_df
    .merge(other_labels, on="ID", how="left")
    .rename(columns={"label_x": "main_label", "label_y": "other_labels"})
)

# Step 7: Apply conditional rule
def decide_label(row):
    """Choose the final label for one ID.

    The main label is kept unless there is at least one other vote and every
    other vote disagrees with it; only then the most common other vote wins.

    Args:
        row: mapping-like row with a 'main_label' entry and an 'other_labels'
            entry (list of labels from the other files, or a missing value).

    Returns:
        The label to use for this row.
    """
    main_label = row['main_label']
    other_votes = row['other_labels']

    # An ID that appears in no other file comes out of the left merge as NaN
    # (a float): it is truthy but cannot be iterated, so treat anything that is
    # not a non-empty list of votes as "no other opinion".
    if not isinstance(other_votes, (list, tuple)) or not other_votes:
        return main_label

    if all(label != main_label for label in other_votes):
        # Override with majority vote
        return Counter(other_votes).most_common(1)[0][0]
    return main_label

main_df['final_label'] = main_df.apply(decide_label, axis=1)

# Step 8: Build final result
# NOTE(review): `final_df` is not read by any later cell visible here - confirm
# whether it is still needed.
final_df = main_df.loc[:, ['ID', 'final_label']].rename(columns={'final_label': 'label'})

# Row positions of the main submission that are currently labelled "unver".
best_df = pd.read_csv("outputs/kaggle_results/response_clean_jun26_1158.csv")
unver_list = best_df.index[best_df["label"] == "unver"].tolist()

In [6]:
import csv

# This script demonstrates how to read and parse the summary_test.txt file
# and store the data in a dictionary for easy access by the line number identifier from the file.

file_path = 'outputs/summary/summary_test.txt'


def load_summary_by_linenum(path):
    """Parse a summary file into `{line_number: {'id': ..., 'content': ...}}`.

    Every usable row has the form `line_number, id, content`, where the content
    may contain further commas. Blank rows and rows with fewer than two fields
    are skipped. A line number that occurs again replaces the earlier entry.

    Args:
        path: location of the UTF-8 encoded summary file.

    Returns:
        Dict keyed by the line number (as a string).

    Raises:
        FileNotFoundError: when `path` does not exist.
    """
    records = {}
    with open(path, 'r', encoding='utf-8') as f:
        for row in csv.reader(f):
            # Blank rows and rows too short to carry both identifiers.
            if len(row) < 2:
                continue

            # The first column is the line number identifier
            line_number = row[0].strip()
            item_id = row[1].strip()
            # The reader split the content on its own commas and kept the space
            # after each of them; strip every piece before joining so that
            # ", " is restored instead of being doubled to ",  ".
            content = ', '.join(piece.strip() for piece in row[2:])

            records[line_number] = {
                'id': item_id,
                'content': content
            }
    return records


# Stays empty when the file cannot be read.
data_by_linenum = {}

try:
    data_by_linenum = load_summary_by_linenum(file_path)

    # Now you can easily access data by its line number.
    print(f"Total items loaded: {len(data_by_linenum)}\n")

    # --- Example Usage ---
    # Let's get the content for the line number '994'
    target_line = '994'

    if target_line in data_by_linenum:
        print(f"Content for line number '{target_line}':")
        print(data_by_linenum[target_line]['content'])
    else:
        print(f"Could not find data for line number '{target_line}'.")

    # Another example for line '6'
    target_line_2 = '6'
    if target_line_2 in data_by_linenum:
        print(f"\nContent for line number '{target_line_2}':")
        print(data_by_linenum[target_line_2]['content'])
    else:
        print(f"\nCould not find data for line number '{target_line_2}'.")

except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")

Total items loaded: 473

Content for line number '994':
Reference [4] focuses on the diagnosis of tooth surface damage faults in gears using induction machine electrical signature analysis,  emphasizing the crucial role of gear condition monitoring in mechanical power transmission. Additionally,  Reference [2] notes that for applications like compressors,  the interaction between the rotor and the surrounding medium can be significant.

Content for line number '6':
Unpleasant odor is identified as one of the major problems encountered in patients with chronic wounds.


In [7]:
# Re-classify the test rows whose current best label is "unver": rows outside
# `unver_list` keep the label from `best_df`, the others are sent to the LLM with
# evidence extracted from the reference text. Every result is appended to
# `all_responses` and to the prediction file.
test_data = dataset_loader["subtask1_test"]

# Middle part of the prefixes passed to get_latest_result_by_prefix below.
pickle_prefix = "test"
# pickle_prefix = "batch2"

for index in range(len(test_data)):
    if index % 50 == 0:
        print(f"Processing index {index}")

    row_id = test_data.at[index, "ID"]
    
    # NOTE(review): `best_df` and `test_data` are matched by row position
    # (`.at[index, ...]`), not by ID - confirm both files share the same row order.
    if index not in unver_list:
        # Keep the existing label as it is (short form, straight from `best_df`).
        response = best_df.at[index, "label"]
        all_responses.append(response)
        write_prediction_to_file(index, row_id, response)
        continue
    
    claim_content = test_data.at[index, "claim"]

    # NOTE(review): the helper is awaited here but called without `await` in the
    # earlier scratch cells - confirm which form matches `utils`.
    claim_nodes, claim_edges = await get_latest_result_by_prefix(f"claim_{pickle_prefix}_{index}_")
    reference_nodes, reference_edges = await get_latest_result_by_prefix(f"reference_{pickle_prefix}_{index}_")
    reference_content = test_data.at[index, "reference"]

    claim_node_list = list(claim_nodes.keys())
    reference_node_list = list(reference_nodes.keys())

    if claim_node_list == [] or reference_node_list == []:
        # Without nodes on either side there is nothing to align: use the summary
        # stored for this row if there is one, otherwise the whole reference.
        if str(index) in data_by_linenum:
            evidence_text = data_by_linenum[str(index)]["content"]
        else:
            evidence_text = reference_content
    else:
        similarity_matrix = compare_knowledge_graphs_similarity(claim_node_list,
                                                                reference_node_list,
                                                                if_plot=False)
        src_tgt_dict = get_src_tgt_dict(similarity_matrix, claim_node_list, reference_node_list)

        claim_reference_edge_map = get_map_src_tgt_dict(src_tgt_dict, claim_edges)

        evidence_text = find_best_span_for_all_evidences(reference_edges,
                                    claim_reference_edge_map,
                                    reference_content)
    
    # Same fallback when the span search selected nothing.
    if evidence_text == "":
        if str(index) in data_by_linenum:
            evidence_text = data_by_linenum[str(index)]["content"]
        else:
            evidence_text = reference_content
    
    # Few-shot prompt; only the claim and the evidence are interpolated.
    classification_prompt = f"""
    ### INSTRUCTION ###
    You are an expert fact-checking AI. Your task is to meticulously analyze a claim against a piece of evidence and classify their relationship. Follow these steps:

    1.  **Analyze the Claim**: Identify the core assertion being made in the claim.
    2.  **Analyze the Evidence**: Understand what information the evidence provides and what it does not.
    3.  **Compare**: Compare the claim's core assertion to the information in the evidence.
    4.  **Classify**: Based on your comparison, classify the claim into ONE of the three categories defined below.

    ### CATEGORY DEFINITIONS ###

    1.  **Entailment**: Choose this if the evidence **directly supports, paraphrases, or is a logical consequence of** the claim. The evidence doesn't need to be a word-for-word match. If the core assertion of the claim is validated by the evidence, it is Entailment.

    2.  **Contradiction**: Choose this if the evidence **directly contradicts** the claim. This includes claims that **exaggerate or overstate** what the evidence says (e.g., claim says "will completely solve" but evidence says "will help assist").

    3.  **Unverifiable**: Choose this ONLY if a **key piece of the claim's main assertion is not mentioned** in the evidence, making it impossible to confirm or deny. Do NOT use this just because the wording is different. If the evidence supports the *gist* of the claim, it is likely Entailment, not Unverifiable.

    ### EXAMPLES ###

    **Example 1 (Clear Entailment):**
    Claim: Machine learning in VR can create intelligent agents that adapt to user behavior for a more personalized experience.
    Evidence: A novel approach for automated navigation and searching is proposed by incorporating machine learning in virtual reality. An intelligent virtual agent learns objects of interest along with the paths followed for navigation.
    Classification: Entailment

    **Example 2 (Unverifiable - Key Info Missing):**
    Claim: AI-driven VR environments are used for safe training in healthcare and engineering.
    Evidence: Virtual Reality frameworks are a promising innovation relevant in areas like preparing test systems, therapeutic and human services, training, and the stimulation industry.
    *Reasoning*: The evidence mentions VR for training and healthcare, but it crucially omits any mention of "AI-driven" or "engineering". A key component of the claim is missing.
    Classification: Unverifiable

    **Example 3 (Contradiction - Overstatement):**
    Claim: Chatbots can assist students in choosing a major, which may **completely eliminate** the stress and confusion associated with this decision.
    Evidence: The making of this chatbot aims to **assist** prospective students in determining majors according to their personality. The majors' questionnaire is one way to **assist** students in recommending what majors they should have.
    *Reasoning*: The claim's use of "completely eliminate" is a massive exaggeration of the evidence's "assist". This is a contradiction.
    Classification: Contradiction

    **Example 4 (Entailment - Paraphrasing/Inference):**
    Claim: The system makes it easier for users to navigate virtual spaces.
    Evidence: A novel approach for automated navigation and searching is proposed by incorporating machine learning in virtual reality.
    *Reasoning*: "Easier to navigate" is a reasonable summary of "automated navigation and searching". This is Entailment.
    Classification: Entailment

    ### YOUR TASK ###
    Now, perform your step-by-step analysis on the following claim and evidence, then provide your final classification.

    Claim:
    {claim_content}

    Evidence:
    {evidence_text}

    ### OUTPUT REQUIREMENTS ###
    - You MUST respond with ONLY one of the three words: "Entailment", "Contradiction", or "Unverifiable".
    - Your entire response must be EXACTLY one of these three words.
    - Do NOT provide explanations, justifications, or any additional text in your final output.

    Classification:
    """

    # The raw model answer is stored as it comes back; the cleanup cell maps the
    # long label names to the short ones.
    response = await llm_wrapper(classification_prompt)
    all_responses.append(response)
    write_prediction_to_file(index, row_id, response)
    
    

Processing index 0
Processing index 50
Processing index 100
Processing index 150
Processing index 200
Processing index 250
Processing index 300
Processing index 350
Processing index 400
Processing index 450
Processing index 500
Processing index 550
Processing index 600
Processing index 650
Processing index 700
Processing index 750
Processing index 800
Processing index 850
Processing index 900
Processing index 950


In [8]:
import pandas as pd

# Read the original file
# Read the prediction log with the encoding it is written in (utf-8).
with open("outputs/response_test.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Each record is "index, ID, label". Drop the index and keep ID and label.
# maxsplit=2 keeps a label that itself contains ", " in one piece, and blank lines
# are ignored, so every row has exactly the two expected fields.
full_data = [line.strip().split(", ", 2)[1:] for line in lines if line.strip()]

# Create DataFrame
df = pd.DataFrame(full_data, columns=["ID", "label"])
# Map the long label names returned by the model to the short submission labels.
df["label"] = df["label"].replace({
    "Unverifiable": "unver",
    "Contradiction": "contra",
    "Entailment": "entail"
})

# Drop exact duplicate rows (same ID and label)
df = df.drop_duplicates()

# NOTE(review): this file name also matches the "response_clean*.csv" pattern that
# the voting cell globs in the same folder - confirm that is intended.
df.to_csv("outputs/kaggle_results/response_clean.csv", index=False)

In [None]:
# Display the cleaned ID/label table built in the previous cell.
df

In [None]:
# Pair the first 100 rows of the training batch with the last 100 predictions.
# NOTE(review): the pairing is purely positional - confirm that the last 100 rows
# of `df` really belong to these 100 training rows.
results_df = dataset_loader["subtask1_train_batch2"].head(100).copy()
results_df = results_df.assign(
    true_label=results_df["label"],
    predicted_label=df["label"].tail(100).to_numpy(),
)
results_df

In [None]:
from sklearn.metrics import f1_score, classification_report


# Calculate evaluation metrics
# Gold and predicted labels as plain lists for the sklearn metrics.
true_labels = results_df['true_label'].tolist()
predicted_labels = results_df['predicted_label'].tolist()

# Weighted F1 is the main evaluation metric.
weighted_f1 = f1_score(true_labels, predicted_labels, average='weighted')

print("\n=== EVALUATION RESULTS ===")
print(f"Total samples processed: {len(results_df)}")
print(f"Weighted F1 Score: {weighted_f1:.4f}")

# Share of rows whose prediction equals the gold label.
accuracy = results_df['true_label'].eq(results_df['predicted_label']).mean()
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1.
print("\n=== DETAILED CLASSIFICATION REPORT ===")
print(classification_report(true_labels, predicted_labels))

# Accuracy and weighted F1 for every value of the `batch` column.
# NOTE(review): requires a `batch` column in `results_df` - confirm it exists.
print("\n=== RESULTS BY BATCH ===")
for batch in results_df['batch'].unique():
    batch_results = results_df.loc[results_df['batch'] == batch]
    batch_accuracy = batch_results['true_label'].eq(batch_results['predicted_label']).mean()
    batch_f1 = f1_score(
        batch_results['true_label'],
        batch_results['predicted_label'],
        average='weighted',
    )
    print(f"{batch}: Accuracy={batch_accuracy:.4f}, Weighted F1={batch_f1:.4f}")

# Weighted F1 for batch2 and batch3 on their own, when they are present.
print("\n=== INDIVIDUAL BATCH F1 SCORES ===")
if 'batch2' in results_df['batch'].values:
    batch2_results = results_df.loc[results_df['batch'] == 'batch2']
    batch2_f1 = f1_score(
        batch2_results['true_label'],
        batch2_results['predicted_label'],
        average='weighted',
    )
    print(f"Batch2 Weighted F1 Score: {batch2_f1:.4f}")

if 'batch3' in results_df['batch'].values:
    batch3_results = results_df.loc[results_df['batch'] == 'batch3']
    batch3_f1 = f1_score(
        batch3_results['true_label'],
        batch3_results['predicted_label'],
        average='weighted',
    )
    print(f"Batch3 Weighted F1 Score: {batch3_f1:.4f}")

# First rows of the table as a quick look at the data.
print("\n=== SAMPLE RESULTS ===")
print(results_df.head(10))


In [None]:
# Row labels currently present in `df`; the index is not reset after
# `drop_duplicates()`, so some labels may be missing from the range.
set(df.index.unique())