In [4]:
import sys
# NOTE(review): hardcoded absolute, machine-specific path pushed to the front of sys.path;
# presumably the project root so that the `src.*` imports below resolve — confirm, and
# consider deriving it from a configurable location so the notebook runs elsewhere.
sys.path.insert(0, '/home/dom/AIR/air25')

import joblib
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# Project-local modules (found via the sys.path entry added above).
from src.config import CLAIMS_PATH
from src.a2.pipeline import run_pipeline
from src.a2.eval import evaluate_voting_results

In [5]:
# --- Experiment configuration -------------------------------------------------

# Train/validation split: seed and validation fraction given to train_test_split.
RANDOM_STATE: int = 21
TEST_SIZE: float = 0.2

# Pipeline options, forwarded to run_pipeline as k_retrieve / k_vote / use_weighted.
K_RETRIEVE: int = 100
K_VOTE: int = 10
USE_WEIGHTED: bool = False

In [6]:
# Build the hold-out validation batch from the full claims table.
all_claims = pd.read_csv(CLAIMS_PATH)

# Split options: fixed seed for reproducibility, stratified on the 'claim_label'
# column so the label distribution is preserved in both partitions.
split_options = dict(
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=all_claims['claim_label'],
)
train_set, val_set = train_test_split(all_claims, **split_options)

print(f"Validation batch size: {len(val_set)}")

Validation batch size: 276


In [7]:
# run (with on-disk cache)
#
# The pipeline run appears to be long (the saved output of this cell shows it being
# interrupted), so results persisted by a previous run are reused when the cache file
# exists; the pipeline is only executed, and its results persisted, when it is missing.
#
# NOTE: the cache is NOT keyed on RANDOM_STATE / TEST_SIZE / K_RETRIEVE / K_VOTE /
# USE_WEIGHTED. Delete the cache file after changing any of them to force a fresh run.
RESULTS_CACHE = 'pipeline_results_2_ret.joblib'

try:
    # joblib.load unpickles the file: only load caches produced by this notebook.
    results = joblib.load(RESULTS_CACHE)
    print(f"Loaded cached pipeline results from {RESULTS_CACHE}")
except FileNotFoundError:
    results = run_pipeline(
        claims_df=val_set,
        k_retrieve=K_RETRIEVE,
        k_vote=K_VOTE,
        use_weighted=USE_WEIGHTED,
    )
    joblib.dump(results, RESULTS_CACHE)

KeyboardInterrupt: 

In [None]:
# Score the pipeline output; evaluate_voting_results is given the results and the
# claims file path and returns a 6-tuple, unpacked into the names used by later cells.
evaluation = evaluate_voting_results(results, CLAIMS_PATH)
acc, f1_macro, f1_weighted, report, y_pred, y_true = evaluation

# Scalar metrics, each printed with four decimals.
scalar_metrics = (
    ("Accuracy", acc),
    ("F1_Macro", f1_macro),
    ("F1_Weighted", f1_weighted),
)
for metric_name, metric_value in scalar_metrics:
    print(f"{metric_name}: {metric_value:.4f}")

# NOTE(review): `report` is presumably a multi-line text report — confirm; if so its
# first line ends up on the same line as the label.
print(f"Classification report: {report}")

In [None]:
# Confusion matrix over the three expected labels; passing `labels` fixes the
# row/column order to VALID_LABELS (rows: true label, columns: predicted label).
VALID_LABELS = ['SUPPORTS', 'REFUTES', 'NOT_ENOUGH_INFO']
cm = confusion_matrix(y_true, y_pred, labels=VALID_LABELS)

# One print call, one item per line: heading, label order, then the matrix itself.
print("Confusion Matrix:", f"Labels: {VALID_LABELS}", cm, sep="\n")

In [None]:
from matplotlib import pyplot as plt
from collections import Counter

# Compare the label distribution of the ground truth with that of the predictions.
# The two series are drawn as side-by-side (grouped) bars; drawing both at the same
# x positions would make the second series cover the first.

# Ground truth
gt_counts = Counter(y_true)

# Pred
pred_counts = Counter(y_pred)

# Union of the labels seen in either series, in first-seen order, so that a label
# present in only one of them still gets a slot (Counter returns 0 for missing keys).
plot_labels = list(dict.fromkeys([*gt_counts, *pred_counts]))
positions = list(range(len(plot_labels)))
bar_width = 0.4

fig, ax = plt.subplots()
ax.bar(
    [pos - bar_width / 2 for pos in positions],
    [gt_counts[label] for label in plot_labels],
    width=bar_width,
    color='steelblue',
    label='Ground truth',
)
ax.bar(
    [pos + bar_width / 2 for pos in positions],
    [pred_counts[label] for label in plot_labels],
    width=bar_width,
    color='coral',
    label='Predicted',
)

# Put one tick per label, centred between its two bars.
ax.set_xticks(positions)
ax.set_xticklabels([str(label) for label in plot_labels])
ax.set_xlabel('Label')
ax.set_ylabel('Count')
ax.set_title('Label distribution: ground truth vs. predicted')
ax.legend()

plt.show()

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Per-class precision / recall / F1 / support, one entry per label in VALID_LABELS
# order; zero_division=0 is passed so undefined ratios are reported as 0.
prf_support = precision_recall_fscore_support(
    y_true,
    y_pred,
    labels=VALID_LABELS,
    zero_division=0,
)
precision, recall, f1, support = prf_support

# Assemble the table: the label column first, then one column per returned array.
metric_columns = ('Precision', 'Recall', 'F1-Score', 'Support')
per_class_df = pd.DataFrame(
    {'Class': VALID_LABELS, **dict(zip(metric_columns, prf_support))}
)

print("Per-Class Metrics:", per_class_df.to_string(index=False), sep="\n")

In [None]:
import numpy as np

# Bundle everything the comparison notebook needs into a single dict.
results_dict = dict(
    results=results,
    y_true=y_true,
    y_pred=y_pred,
    accuracy=acc,
    f1_macro=f1_macro,
    f1_weighted=f1_weighted,
    confusion_matrix=cm,
    per_class=per_class_df.to_dict(),
)

# np.save stores the dict as a pickled object array, so reading it back needs
# np.load(path, allow_pickle=True).item() to recover the dict.
EVAL_RESULTS_PATH = 'eval_results_a2.npy'
np.save(EVAL_RESULTS_PATH, results_dict, allow_pickle=True)