In [261]:
from typing import List
from pydantic import BaseModel, Field

class Transaction(BaseModel):
    txn_id:str = Field(...)
    txn_fraud_category:str=Field(...)
    reasoning:str = Field(...)

class TransactionAnalysis(BaseModel):
    transactions : List[Transaction]


In [262]:
def calculate_accuracy_score(model_result)->float:
    print("model_result: \n",model_result)
    results_file_name = "../transactions_feed_results.json"
    if os.path.exists(results_file_name):
        with open(results_file_name, 'r') as rf:
            actual_result=json.load(rf)
    print("actual_result: \n",actual_result)
    # 1. Prepare the Data
    # Assume 'model_output' is the dictionary you received from the LLM
    model_predictions = model_result['transactions']

    # Assume 'ground_truth_list' is your actual_result list
    ground_truth_list = actual_result

    # ---------------------------------------------------------
    # 2. Create the Lookup Dictionary (The Magic Step ✨)
    # We turn the list into a dict keyed by 'txn_id'
    # ---------------------------------------------------------
    truth_lookup = {item['txn_id']: item for item in ground_truth_list}

    # Now truth_lookup looks like:
    # {
    #    'TXN_101': {'txn_id': 'TXN_101', 'txn_fraud_category': 'legitimate', ...},
    #    'TXN_102': {'txn_id': 'TXN_102', 'txn_fraud_category': 'legitimate', ...}
    # }

    # ---------------------------------------------------------
    # 3. Iterate and Compare
    # ---------------------------------------------------------
    correct_count = 0
    total_count = 0

    for prediction in model_predictions:
        txn_id = prediction['txn_id']

        # Get the model's guess
        predicted_label = prediction['txn_fraud_category']

        # Retrieve the TRUTH instantly using the ID
        if txn_id in truth_lookup:
            actual_label = truth_lookup[txn_id]['txn_fraud_category']

            # --- CRITICAL: Case-Insensitive Comparison ---
            # Your model returned "Legitimate" (Title Case)
            # Your data has "legitimate" (Lower Case)
            if predicted_label.lower() == actual_label.lower():
                correct_count += 1
                print(f"✅ {txn_id}: Match!")
            else:
                print(f"❌ {txn_id}: Mismatch (Model: {predicted_label}, Actual: {actual_label})")

            total_count += 1
        else:
            print(f"⚠️ Warning: Model predicted {txn_id}, but it's not in our ground truth!")

    # 4. Final Score
    accuracy = (correct_count / total_count) * 100
    print(f"\nFinal Accuracy: {accuracy}%")
    return round(accuracy,2)

In [263]:
import os
import json

txn_file_name= "../transactions_feed.json"
if os.path.exists(txn_file_name):
    with open(txn_file_name, 'r') as f:
        txns = json.load(f)
    serialized_txn = json.dumps(txns, indent=2)
    # print(serialized_txn)



In [264]:
import ollama
# --- 1. Setup Models ---
models_to_evaluate = [
                        'llama3.2:latest',
                       # 'mistral:7b',
                       #  'phi3:mini'
]
print("Checking available models...")
available_models = [mod['model'] for mod in ollama.list()['models']]
available_models

Checking available models...


['llama3.2:latest',
 'storyteller:latest',
 'mistral:7b',
 'llama3.1:latest',
 'phi3:mini',
 'llama3.2:3b']

In [265]:
system_prompt = "You are a Financial Fraud Investigator. Analyse the transactions submitted and categorize as Legitimate or Suspicious. As output, for each transaction, give Category (as txn_fraud_category) and Reasoning (as reasoning) in JSON"
payload=[
    {"role" :"system",
     "content":system_prompt},
    {"role":"user",
     "content":serialized_txn}
]

# options_map = {
#     "deterministic": {"temperature": 0.1, "top_p": 0.95},
#     # "balanced": {"temperature": 0.5, "top_p": 0.9},
#     # "creative": {"temperature": 0.9, "top_p": 0.85}

options= {
   "temperature": 0.1, "top_p": 0.95}

In [266]:
evaluation_results = []

for model in models_to_evaluate:
    if model not in available_models:
        # print(f"\nModel '{model}' is missing. Pulling now...")
        response = ollama.pull(model, stream=True)
        for progress in response:
            print(f"  {progress.get('status')}", end='\r')
        # print(f"\nFinished pulling {model}")
    else:
        print(f"Model '{model}' is already available, processing the request now.....")
        model_eval={}
        model_eval["model"]=model
        result = ollama.chat(model=model, messages=payload, stream=False, options=options, format="json")
        assistant_msg = result['message']['content']
        # print(f"result type :\n {type(assistant_msg)}")
        # print(f"result :\n {assistant_msg}")
        accuracy_score=calculate_accuracy_score(json.loads(assistant_msg))
        # Convert to seconds first
        duration_ns = result['total_duration']
        total_seconds = duration_ns / 1_000_000_000
        # Calculate minutes and remaining seconds
        minutes, seconds = divmod(total_seconds, 60)
        speed_display = f"{int(minutes)}m {seconds:.2f}s"
        print(f"Time taken: {speed_display}")
        print(f"Accuracy: {accuracy_score}")
        model_eval["duration"]=speed_display
        model_eval["accuracy"]=accuracy_score
        evaluation_results.append(model_eval)


Model 'llama3.2:latest' is already available, processing the request now.....
model_result: 
 {'transactions': [{'txn_id': 'TXN_101', 'txn_fraud_category': 'Legitimate', 'reasoning': 'The merchant and location are consistent with a legitimate transaction.'}, {'txn_id': 'TXN_102', 'txn_fraud_category': 'Suspicious', 'reasoning': 'The location is inconsistent with the merchant, suggesting potential identity theft or fake address.'}]}
actual_result: 
 [{'txn_id': 'TXN_101', 'txn_details': {'card_number': '1234-5678-9012-3456', 'timestamp': '2023-10-27T09:00:00Z', 'amount': 4.5, 'merchant': 'Daily Grind Coffee', 'category': 'Food & Drink', 'location': 'London, UK'}, 'txn_fraud_category': 'legitimate'}, {'txn_id': 'TXN_102', 'txn_details': {'card_number': '1234-5678-9012-3456', 'timestamp': '2023-10-27T09:00:00Z', 'amount': 4.5, 'merchant': 'Daily Grind Coffee', 'category': 'Food & Drink', 'location': 'India, UK'}, 'txn_fraud_category': 'legitimate'}]
✅ TXN_101: Match!
❌ TXN_102: Mismatch (

In [267]:
import pandas as pd

df = pd.DataFrame(evaluation_results)

df.round(2)

Unnamed: 0,model,duration,accuracy
0,llama3.2:latest,0m 19.35s,50.0
