In [3]:
import json
import os
import pandas as pd

In [4]:
# Input locations. Both are relative paths, so they resolve against the
# kernel's current working directory.

# JSON file holding the ground-truth cases.
GROUND_TRUTH_PATH = "../../data/duo-student-finance/cases.json"
# Directory the per-model results files are read from.
RESULTS_PATH = "../../results/duo-student-finance/"

In [5]:
# Results files to evaluate, looked up inside RESULTS_PATH.
# File names follow '<model>-<temperature>-<decision tree yes/no>-<suffix>.json';
# the numeric suffix looks like a run timestamp (TODO confirm).
result_files = [
    # Runs without the decision tree.
    "deepseek-r1:8b-0.8-no-20250129110014.json",
    "llama3.2-0.8-no-20250129091911.json",
    "qwen2.5:1.5b-0.8-no-20250129094820.json",
    # Runs with the decision tree.
    "deepseek-r1:8b-0.8-yes-20250129105858.json",
    "llama3.2-0.8-yes-20250129085941.json",
    "qwen2.5:1.5b-0.8-yes-20250129093907.json",
]

In [6]:
# Load the ground-truth cases from GROUND_TRUTH_PATH into a DataFrame.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open()
# without `encoding` falls back to the platform's locale default and can
# mis-decode or fail on non-ASCII characters.
with open(GROUND_TRUTH_PATH, 'r', encoding='utf-8') as f:
    ground_truth_data = json.load(f)

# Later cells merge on the 'id' column of this frame and compare its rows
# against model predictions.
df_truth = pd.DataFrame(ground_truth_data)

In [7]:
# Load every results file, join it to the ground truth, and store the merged
# frame under a key of the form '<model>-<temperature>-<decision tree yes/no>'.
df_results = {}

for file in result_files:
    result_path = os.path.join(RESULTS_PATH, file)
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # locale default.
    with open(result_path, 'r', encoding='utf-8') as f:
        result_data = json.load(f)
    df_result = pd.DataFrame(result_data)
    # Normalise the label spelling 'Not Eligible' to 'NotEligible' so it can be
    # compared by exact equality later. Files without a 'prediction' column
    # are passed through unchanged.
    if 'prediction' in df_result.columns:
        df_result['prediction'] = df_result['prediction'].replace('Not Eligible', 'NotEligible')
    # Inner join on ground-truth 'id' == result 'case_id': cases without a
    # matching result row are dropped, so any metric that divides by
    # len(df_merged) ignores them.
    # NOTE(review): confirm that excluding unanswered cases is intended.
    df_merged = df_truth.merge(df_result, left_on='id', right_on='case_id', how='inner')
    # Derive the dictionary key: strip the file extension first, then cut off
    # the final '-<suffix>' segment.
    model_key = os.path.splitext(file)[0].rsplit('-', 1)[0]
    # Two files that reduce to the same key would silently overwrite each
    # other; fail loudly instead.
    if model_key in df_results:
        raise ValueError(f'Duplicate results key {model_key!r} derived from {file!r}')
    df_results[model_key] = df_merged

In [8]:
# Qwen-2.5 (1.5B parameters, temperature 0.8), run without the decision tree.
df_qwen_no = df_results['qwen2.5:1.5b-0.8-no']
total_cases = len(df_qwen_no)
# Count rows whose 'prediction' exactly equals 'decision'
# (presumably the ground-truth label; verify against the cases file).
correct_predictions = df_qwen_no['decision'].eq(df_qwen_no['prediction']).sum()

# Accuracy = correct predictions / total cases; 0.0 when there are no rows.
if total_cases > 0:
    accuracy_ratio_qwen_no = correct_predictions / total_cases
else:
    accuracy_ratio_qwen_no = 0.0
print(f'Accuracy Ratio for Qwen-2.5 (no decision tree): {accuracy_ratio_qwen_no:.2f}')

Accuracy Ratio for Qwen-2.5 (no decision tree): 0.57


In [9]:
# Qwen-2.5 (1.5B parameters, temperature 0.8), run with the decision tree.
df_qwen_yes = df_results['qwen2.5:1.5b-0.8-yes']
total_cases = len(df_qwen_yes)
# Count rows whose 'prediction' exactly equals 'decision'
# (presumably the ground-truth label; verify against the cases file).
correct_predictions = df_qwen_yes['decision'].eq(df_qwen_yes['prediction']).sum()

# Accuracy = correct predictions / total cases; 0.0 when there are no rows.
if total_cases > 0:
    accuracy_ratio_qwen_yes = correct_predictions / total_cases
else:
    accuracy_ratio_qwen_yes = 0.0
print(f'Accuracy Ratio for Qwen-2.5 (with decision tree): {accuracy_ratio_qwen_yes:.2f}')

Accuracy Ratio for Qwen-2.5 (with decision tree): 0.51


In [10]:
# LLaMa-3.2 (3.2B parameters, temperature 0.8), run without the decision tree.
df_llama_no = df_results['llama3.2-0.8-no']
total_cases = len(df_llama_no)
# Count rows whose 'prediction' exactly equals 'decision'
# (presumably the ground-truth label; verify against the cases file).
correct_predictions = df_llama_no['decision'].eq(df_llama_no['prediction']).sum()

# Accuracy = correct predictions / total cases; 0.0 when there are no rows.
if total_cases > 0:
    accuracy_ratio_llama_no = correct_predictions / total_cases
else:
    accuracy_ratio_llama_no = 0.0
print(f'Accuracy Ratio for LLaMa-3.2 (no decision tree): {accuracy_ratio_llama_no:.2f}')

Accuracy Ratio for LLaMa-3.2 (no decision tree): 0.56


In [11]:
# LLaMa-3.2 (3.2B parameters, temperature 0.8), run with the decision tree.
df_llama_yes = df_results['llama3.2-0.8-yes']
total_cases = len(df_llama_yes)
# Count rows whose 'prediction' exactly equals 'decision'
# (presumably the ground-truth label; verify against the cases file).
correct_predictions = df_llama_yes['decision'].eq(df_llama_yes['prediction']).sum()

# Accuracy = correct predictions / total cases; 0.0 when there are no rows.
if total_cases > 0:
    accuracy_ratio_llama_yes = correct_predictions / total_cases
else:
    accuracy_ratio_llama_yes = 0.0
print(f'Accuracy Ratio for LLaMa-3.2 (with decision tree): {accuracy_ratio_llama_yes:.2f}')

Accuracy Ratio for LLaMa-3.2 (with decision tree): 0.59


In [14]:
# DeepSeek-R1 (8B parameters, temperature 0.8), run without the decision tree.
df_deepseek_no = df_results['deepseek-r1:8b-0.8-no']
total_cases = len(df_deepseek_no)
# Count rows whose 'prediction' exactly equals 'decision'
# (presumably the ground-truth label; verify against the cases file).
correct_predictions = df_deepseek_no['decision'].eq(df_deepseek_no['prediction']).sum()

# Accuracy = correct predictions / total cases; 0.0 when there are no rows.
if total_cases > 0:
    accuracy_ratio_deepseek_no = correct_predictions / total_cases
else:
    accuracy_ratio_deepseek_no = 0.0
print(f'Accuracy Ratio for DeepSeek-R1 (no decision tree): {accuracy_ratio_deepseek_no:.2f}')

Accuracy Ratio for DeepSeek-R1 (no decision tree): 0.68


In [15]:
# DeepSeek-R1 (8B parameters, temperature 0.8), run with the decision tree.
df_deepseek_yes = df_results['deepseek-r1:8b-0.8-yes']
total_cases = len(df_deepseek_yes)
# Count rows whose 'prediction' exactly equals 'decision'
# (presumably the ground-truth label; verify against the cases file).
correct_predictions = df_deepseek_yes['decision'].eq(df_deepseek_yes['prediction']).sum()

# Accuracy = correct predictions / total cases; 0.0 when there are no rows.
if total_cases > 0:
    accuracy_ratio_deepseek_yes = correct_predictions / total_cases
else:
    accuracy_ratio_deepseek_yes = 0.0
print(f'Accuracy Ratio for DeepSeek-R1 (with decision tree): {accuracy_ratio_deepseek_yes:.2f}')

Accuracy Ratio for DeepSeek-R1 (with decision tree): 0.87
