In [155]:
import os
import json
from typing import List
from pydantic import BaseModel, Field

class TransactionAnalysisDetail(BaseModel):
    txn_fraud_category: str = Field(...)
    reasoning: str = Field(...)

class TransactionAnalysis(BaseModel):
    txn_id: str = Field(...)
    txn_analysis_detail: TransactionAnalysisDetail = Field(...)

class AnalysisResponse(BaseModel):
    transactions: List[TransactionAnalysis]
    run_number: int
    accuracy_score: float
    duration: float  # Store as float (seconds)
    consistency_score: float = 0.0  # Consistency score

class TransactionRequestDetail(BaseModel):
    card_number: str = Field(...)
    timestamp: str = Field(...)
    amount: float = Field(...)
    merchant: str = Field(...)
    category: str = Field(...)
    location: str = Field(...)

class TransactionRequest(BaseModel):
    txn_id: str = Field(...)
    txn_details: TransactionRequestDetail = Field(...)
    txn_fraud_category: str = Field(...)

class AnalysisRequest(BaseModel):
    transactions: List[TransactionRequest]

results_file_name = "transactions_feed_results_full.json"
if os.path.exists(results_file_name):
    with open(results_file_name, 'r') as rf:
        actual_result = json.load(rf)
        ground_truth_list = AnalysisRequest.model_validate({"transactions": actual_result})
print("ground_truth_list: \n", ground_truth_list)

ground_truth_list: 
 transactions=[TransactionRequest(txn_id='TXN_101', txn_details=TransactionRequestDetail(card_number='1234-5678-9012-3456', timestamp='2023-10-27T09:00:00Z', amount=4.5, merchant='Daily Grind Coffee', category='Food & Drink', location='London, UK'), txn_fraud_category='legitimate'), TransactionRequest(txn_id='TXN_104', txn_details=TransactionRequestDetail(card_number='1234-5678-9012-3456', timestamp='2023-10-23T08:15:00Z', amount=2.5, merchant='TFL', category='Transport', location='London, UK'), txn_fraud_category='legitimate'), TransactionRequest(txn_id='TXN_105', txn_details=TransactionRequestDetail(card_number='1234-5678-9012-3456', timestamp='2023-10-23T18:30:00Z', amount=15.75, merchant='Tesco', category='Groceries', location='London, UK'), txn_fraud_category='legitimate'), TransactionRequest(txn_id='TXN_106', txn_details=TransactionRequestDetail(card_number='1234-5678-9012-3456', timestamp='2023-10-24T12:45:00Z', amount=8.99, merchant='Chicken World', category

In [156]:
def calculate_accuracy_score(model_result: AnalysisResponse) -> AnalysisResponse:
    model_predictions = model_result.transactions
    truth_lookup = {txn.txn_id: txn for txn in ground_truth_list.transactions}

    correct_count = 0
    total_count = 0

    for prediction in model_predictions:
        txn_id = prediction.txn_id
        predicted_label = prediction.txn_analysis_detail.txn_fraud_category

        if txn_id in truth_lookup:
            actual_label = truth_lookup[txn_id].txn_fraud_category
            if predicted_label.lower() == actual_label.lower():
                correct_count += 1
                print(f"✅ {txn_id}: Match!")
            else:
                print(f"❌ {txn_id}: Mismatch (Model: {predicted_label}, Actual: {actual_label})")
            total_count += 1
        else:
            print(f"⚠️ Warning: {txn_id} not in ground truth!")

    accuracy = (correct_count / total_count) * 100
    # print(f"\nFinal Accuracy: {accuracy}%")
    model_result.accuracy_score = round(accuracy, 2)
    return model_result

In [163]:
def evaluate_benchmarks(model_runs: dict):
    evaluation_results = []
    
    for model, runs in model_runs.items():
        print(f"\n{'='*60}")
        print(f"Evaluating model: {model}")
        print(f"{'='*60}")
        
        total_accuracy = sum(run.accuracy_score for run in runs)
        total_duration = sum(run.duration for run in runs)
        
        avg_accuracy = total_accuracy / len(runs)
        avg_duration = total_duration / len(runs)
        
        # Calculate consistency: check if predictions are the same across all runs
        txn_predictions = {}  # {txn_id: [prediction1, prediction2, ...]}
        
        for run in runs:
            for txn in run.transactions:
                txn_id = txn.txn_id
                prediction = txn.txn_analysis_detail.txn_fraud_category.lower()
                
                if txn_id not in txn_predictions:
                    txn_predictions[txn_id] = []
                txn_predictions[txn_id].append(prediction)
        
        # Print per-transaction consistency
        print("\nTransaction-level Consistency:")
        consistent_count = 0
        for txn_id, predictions in txn_predictions.items():
            unique_predictions = set(predictions)
            is_consistent = len(unique_predictions) == 1
            
            if is_consistent:
                consistent_count += 1
                print(f"  ✅ {txn_id}: Consistent ({predictions[0]}) across all {len(runs)} runs")
            else:
                print(f"  ❌ {txn_id}: Inconsistent - predictions varied: {predictions}")
        
        consistency_score = (consistent_count / len(txn_predictions)) * 100
        
        model_eval = {
            "model": model,
            "avg_duration_seconds": round(avg_duration, 2),
            "avg_accuracy": round(avg_accuracy, 2),
            "consistency_score": round(consistency_score, 2)
        }
        evaluation_results.append(model_eval)
        print(f"\nOverall Consistency: {consistency_score:.2f}% ({consistent_count}/{len(txn_predictions)} transactions)")
        print(f"Average Accuracy: {avg_accuracy:.2f}%")
        print(f"Average Duration: {avg_duration:.2f}s")
    
    return evaluation_results

In [158]:
import os
import json

txn_file_name= "transactions_feed_full.json"
if os.path.exists(txn_file_name):
    with open(txn_file_name, 'r') as f:
        txns = json.load(f)
    serialized_txn = json.dumps(txns, indent=2)


In [159]:
import ollama

models_to_evaluate = ['llama3.2:latest',
                      'phi3:mini',
                      'mistral:7b'
                      ]

print("Checking available models...")
available_models = [mod['model'] for mod in ollama.list()['models']]
available_models

Checking available models...


['llama3.2:latest',
 'storyteller:latest',
 'mistral:7b',
 'llama3.1:latest',
 'phi3:mini',
 'llama3.2:3b']

In [160]:
system_prompt = "You are a Financial Fraud Investigator. Analyse the transactions submitted and categorize as Legitimate or Suspicious. As output, for each transaction, give Category (as txn_fraud_category) and Reasoning (as reasoning) in JSON"

payload = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": serialized_txn}
]

options = {"temperature": 0.1, "top_p": 0.95}

In [164]:
model_runs = {}

for model in models_to_evaluate:
    if model not in available_models:
        print(f"\nPulling model '{model}'...")
        ollama.pull(model)
    else:
        print(f"Model '{model}' available, running benchmark...\n")
        
        model_runs[model] = []
        for run_num in range(3):
            result = ollama.chat(
                model=model,
                messages=payload,
                stream=False,
                options=options,
                format=AnalysisResponse.model_json_schema()
            )
            
            analysis_response = AnalysisResponse.model_validate_json(result.message.content)
            analysis_response = calculate_accuracy_score(analysis_response)
            analysis_response.duration = result['total_duration'] / 1_000_000_000  # Convert to seconds
            analysis_response.run_number = run_num + 1
            
            print(f"Run {run_num + 1}: Time={analysis_response.duration:.2f}s, Accuracy={analysis_response.accuracy_score}%\n")
            
            model_runs[model].append(analysis_response)

evaluation_results = evaluate_benchmarks(model_runs)

# Print summary
print("\n" + "="*60)
print("FINAL BENCHMARK RESULTS")
print("="*60)
for result in evaluation_results:
    print(f"\nModel: {result['model']}")
    print(f"  Average Accuracy: {result['avg_accuracy']}%")
    print(f"  Average Duration: {result['avg_duration_seconds']}s")
    print(f"  Consistency Score: {result['consistency_score']}%")

Model 'llama3.2:latest' available, running benchmark...

✅ TXN_101: Match!
❌ TXN_104: Mismatch (Model: Suspicious, Actual: legitimate)
✅ TXN_105: Match!
❌ TXN_106: Mismatch (Model: Suspicious, Actual: legitimate)
✅ TXN_107: Match!
❌ TXN_108: Mismatch (Model: Suspicious, Actual: legitimate)
❌ TXN_109: Mismatch (Model: Suspicious, Actual: legitimate)
✅ TXN_110: Match!
❌ TXN_111: Mismatch (Model: Suspicious, Actual: legitimate)
✅ TXN_112: Match!
✅ TXN_102: Match!
❌ TXN_103: Mismatch (Model: Legitimate, Actual: suspicious)
✅ TXN_113: Match!
✅ TXN_114: Match!
✅ TXN_115: Match!
✅ TXN_116: Match!
❌ TXN_117: Mismatch (Model: Legitimate, Actual: suspicious)
❌ TXN_118: Mismatch (Model: Suspicious, Actual: legitimate)
❌ TXN_119: Mismatch (Model: Legitimate, Actual: suspicious)
✅ TXN_120: Match!
Run 1: Time=58.52s, Accuracy=55.0%

✅ TXN_101: Match!
❌ TXN_104: Mismatch (Model: Suspicious, Actual: legitimate)
✅ TXN_105: Match!
❌ TXN_106: Mismatch (Model: Suspicious, Actual: legitimate)
✅ TXN_107: Ma

In [165]:
import pandas as pd

# Ensure all rows are displayed
pd.set_option('display.max_rows', None)

# Create detailed dataframe with individual runs
detailed_data = []

for model, runs in model_runs.items():
    # Add individual run rows
    for run in runs:
        detailed_data.append({
            'model': model,
            'run': f'Run {run.run_number}',
            'accuracy': run.accuracy_score,
            'duration_seconds': round(run.duration, 2),
            'consistency': '-'  # Individual runs don't have consistency
        })
    
    # Add average row for this model
    eval_result = next(r for r in evaluation_results if r['model'] == model)
    detailed_data.append({
        'model': model,
        'run': 'AVERAGE',
        'accuracy': eval_result['avg_accuracy'],
        'duration_seconds': eval_result['avg_duration_seconds'],
        'consistency': eval_result['consistency_score']
    })

# Create DataFrame
df_detailed = pd.DataFrame(detailed_data)
print(f"Total rows: {len(df_detailed)}")
df_detailed

Total rows: 12


Unnamed: 0,model,run,accuracy,duration_seconds,consistency
0,llama3.2:latest,Run 1,55.0,58.52,-
1,llama3.2:latest,Run 2,60.0,52.7,-
2,llama3.2:latest,Run 3,55.0,60.85,-
3,llama3.2:latest,AVERAGE,56.67,57.36,95.0
4,phi3:mini,Run 1,73.68,101.65,-
5,phi3:mini,Run 2,73.68,103.9,-
6,phi3:mini,Run 3,77.78,109.36,-
7,phi3:mini,AVERAGE,75.05,104.97,100.0
8,mistral:7b,Run 1,90.0,143.96,-
9,mistral:7b,Run 2,90.0,146.54,-
