<H2><center>Uncertainty Quantification</center></H2>


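<H2>Description</H2>This notebook presents a method to compute a confidence score from the uncertainty in the predicted risk score. The external data sources (SEC EDGAR, OFAC, WikiData) are treated as deterministic, so the uncertainty is attributed to the LLM-predicted risk score. Note that the pipeline below also re-runs the LLM-based entity extraction on every repetition, so variation in the extracted entities can contribute to the measured spread as well.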


### **Numerical Quantification**
- The LLM prediction is repeated **5 times**, and the **standard deviation** of the scores indicates uncertainty.

### **Scaling Confidence Score**
- Since risk values lie in **[0,1]**, the population standard deviation (**σ**) falls within **0 ≤ σ ≤ 0.5**.
- The confidence score is computed as:
  **Confidence Score = 1 - 2σ**
  
- If all predictions are identical (**σ = 0**), confidence is **1**.
- If predictions vary widely (**σ ≈ 0.5**), confidence is **0**.


In [1]:
import csv
import json
import os
import random
import re
from datetime import datetime, timedelta

import nltk
import numpy as np
import requests
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from thefuzz import fuzz, process
# Imported last on purpose: this rebinds `fuzz`, exactly as in the original cell.
from fuzzywuzzy import fuzz

In [2]:
class EntityExtractor:
    """Extract the transaction id and categorised named entities from free text.

    The extraction is delegated to a chat-completion model reached through an
    OpenAI-compatible endpoint (OpenRouter by default).
    """

    def __init__(self, base_url="https://openrouter.ai/api/v1", api_key=None):
        """Create the API client.

        Args:
            base_url: Base URL of the OpenAI-compatible endpoint.
            api_key: API key for that endpoint. When omitted, the
                ``OPENROUTER_API_KEY`` environment variable is used.

        Raises:
            ValueError: If no API key is supplied and the environment
                variable is not set.
        """
        # Credentials must not live in the notebook source; read them at runtime.
        api_key = api_key or os.environ.get("OPENROUTER_API_KEY")
        if not api_key:
            raise ValueError(
                "OpenRouter API key missing: pass api_key= or set the "
                "OPENROUTER_API_KEY environment variable."
            )
        self.client = OpenAI(base_url=base_url, api_key=api_key)
        self.model = "mistralai/mistral-small-3.1-24b-instruct:free"
        self.max_retries = 5

    @staticmethod
    def _parse_json_response(raw_text):
        """Decode the model reply as JSON; return None when it cannot be decoded.

        The reply is tried verbatim first. If that fails and the reply has more
        than two lines, the first and last lines are dropped (this removes a
        surrounding code fence) and decoding is tried again.
        """
        if not isinstance(raw_text, str):
            return None

        candidates = [raw_text]
        lines = raw_text.splitlines()
        if len(lines) > 2:
            candidates.append("\n".join(lines[1:-1]).strip())

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
        return None

    def extract_entities(self, transaction_text: str) -> list:
        """Ask the model for the entities in ``transaction_text``.

        Args:
            transaction_text: Raw transaction description.

        Returns:
            The decoded JSON reply (expected to be a list of objects) on
            success. If every one of ``self.max_retries`` attempts fails,
            the dict ``{"error": "Failed to parse entity extraction response"}``
            is returned instead, as before.
        """
        prompt_text = f"""
            Extract transaction id and all named entities from the following transaction text and classify each entity into one of these categories:
            Person, Politically Exposed Person, Corporation, Bank, Government Agency, Non-Profit Organization, or Shell Company.
            - If the entity's category is ambiguous, classify it as "Corporation".
            - Do NOT include IBANs, VPNs, IPs, addresses, account numbers, tax IDs, location, cities, or countries.
            - Banks should be extracted separately (e.g., "Swiss Bank", "Cayman National Bank") without IBAN/account numbers.
            - Include people with titles (Mr., Mrs., Dr., etc.) as Person.
            - Output only a JSON list of objects. Each object must have exactly two keys: 
              "entity" (the entity name) and "category" (the classified category).
            - Do not output into code block, print as raw text

            Transaction Text:
            {transaction_text}

            Output Format **(Do not output into code block, print as raw text):**
            Eg: 
            [   {{"Transaction ID": "TXN0234"}},
                {{
                    "entity": "Acme Corp",
                    "category": "Corporation"
                }},
                {{
                    "entity": "SovCo Capital Partners",
                    "category": "Corporation"
                }}
            ]
        """
        for _attempt in range(self.max_retries):
            try:
                completion = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "user",
                            "content": [{"type": "text", "text": prompt_text}]
                        }
                    ]
                )
                extracted_text = completion.choices[0].message.content
            except Exception:
                # The request itself failed (or the reply had no choices);
                # there is nothing to parse for this attempt, so retry.
                continue

            extracted_entities = self._parse_json_response(extracted_text)
            if extracted_entities is not None:
                return extracted_entities

        return {"error": "Failed to parse entity extraction response"}

In [3]:
class CompanyMatcher:
    """Two-stage company-name matcher.

    Names are first shortlisted by TF-IDF cosine similarity over a normalised
    form of each database entry, then the shortlist is re-ranked with
    ``fuzz.ratio`` and the single best candidate is returned.
    """

    def __init__(self, database):
        """Normalise ``database`` and fit the TF-IDF vectoriser on it.

        Args:
            database: Sequence of raw company names to match against.
        """
        self.lemmatizer = WordNetLemmatizer()
        self.raw_database = database
        # Normalised copy of every entry, index-aligned with raw_database.
        self.database = [self.preprocess_text(entry) for entry in database]

        self.vectorizer = TfidfVectorizer()
        # Nothing to fit on when the database is empty.
        self.vectors = self.vectorizer.fit_transform(self.database) if self.database else None

    def preprocess_text(self, text):
        """Return ``text`` lower-cased, stripped of punctuation and of generic
        company suffix words, with each remaining word lemmatised."""
        normalized = re.sub(r'[^a-z0-9\s]', '', text.lower())
        normalized = re.sub(r'\s+', ' ', normalized).strip()

        ignored_words = {"corporation", "limited", "ltd", "solutions", "technologies",
                         "consulting", "consultancy", "services", "systems", "group",
                         "inc", "pvt", "plc", "co"}

        kept = []
        for token in normalized.split():
            if token in ignored_words:
                continue
            kept.append(self.lemmatizer.lemmatize(token))
        return " ".join(kept)

    def get_top_cosine_matches(self, query, top_n=5):
        """Return up to ``top_n`` ``(raw_name, cosine_score)`` pairs, best first.

        An empty database yields an empty list.
        """
        if not self.database:
            return []

        query_vec = self.vectorizer.transform([self.preprocess_text(query)])
        scores = cosine_similarity(query_vec, self.vectors).flatten()

        limit = min(top_n, len(scores))
        ranked = np.argsort(-scores)[:limit]
        return [(self.raw_database[idx], scores[idx]) for idx in ranked]

    def apply_fuzzy_matching(self, query, candidates):
        """Pick the candidate whose normalised name has the highest
        ``fuzz.ratio`` against the normalised query.

        Args:
            query: Name being looked up.
            candidates: ``(name, score)`` pairs; only the name is used.

        Returns:
            ``(best_name, best_ratio)``, or ``(None, None)`` when
            ``candidates`` is empty. Ties keep the earliest candidate.
        """
        if not candidates:
            return None, None

        cleaned_query = self.preprocess_text(query)
        best_match = None
        best_score = None
        for name, _ in candidates:
            ratio = fuzz.ratio(cleaned_query, self.preprocess_text(name))
            # Strict comparison keeps the first of several equal maxima.
            if best_score is None or ratio > best_score:
                best_match, best_score = name, ratio

        return (best_match, best_score)

    def find_best_match(self, query, top_n=5):
        """Shortlist by cosine similarity, then return the best fuzzy match."""
        shortlist = self.get_top_cosine_matches(query, top_n)
        return self.apply_fuzzy_matching(query, shortlist)

In [4]:
class CompanyScreening:
    """Screen a company name against SEC EDGAR, an OFAC list, Wikidata and NewsAPI.

    Each ``check_*`` / ``get_news`` method returns a small dict;
    ``screen_company`` merges them into one record.
    """

    # Seconds to wait for any single external HTTP request before giving up.
    REQUEST_TIMEOUT = 15

    def __init__(self, ofac_list_file, news_api_key=None):
        """Load the OFAC list and build the fuzzy matcher over it.

        Args:
            ofac_list_file: Path to a text file with one sanctioned name per line.
            news_api_key: NewsAPI key. When omitted, the ``NEWS_API_KEY``
                environment variable is used; if neither is available,
                ``get_news`` reports an error instead of calling the API.
        """
        self.SEC_BASE_URL = "https://www.sec.gov/cgi-bin/browse-edgar"
        self.WIKIDATA_URL = "https://query.wikidata.org/sparql"
        self.NEWS_API_URL = "https://newsapi.org/v2/everything"
        # Credentials must not live in the notebook source; read them at runtime.
        self.NEWS_API_KEY = news_api_key or os.environ.get("NEWS_API_KEY")
        self.OFAC_LIST_FILE = ofac_list_file
        with open(self.OFAC_LIST_FILE, "r", encoding="utf-8", errors="replace") as f:
            # Blank lines are not names; skip them so they never become candidates.
            self.ofac_companies = [line.strip().lower() for line in f if line.strip()]
        self.matcher = CompanyMatcher(self.ofac_companies)

    def check_sec_edgar(self, company_name):
        """Look the company up on SEC EDGAR.

        Returns:
            ``{"SEC Registered": "Yes", "CIK": ..., "Recent 8-K Filings": ...}``
            when a CIK is found (at most three 8-K titles from roughly the last
            three years, or the string ``"None"``); ``{"SEC Registered": "No"}``
            when the lookup finds nothing or returns a non-200 status;
            ``{"SEC Registered": "Unknown"}`` when the request itself fails.
        """
        params = {"action": "getcompany", "company": company_name, "output": "atom"}
        headers = {"User-Agent": "XXX (xxx@yyy.com)"}
        try:
            response = requests.get(
                self.SEC_BASE_URL, params=params, headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            return {"SEC Registered": "Unknown"}

        if response.status_code != 200 or "No matching companies" in response.text:
            return {"SEC Registered": "No"}

        soup = BeautifulSoup(response.text, "xml")
        cik_tag = soup.find("cik")
        if cik_tag is None:
            return {"SEC Registered": "No"}

        cik = cik_tag.text.strip()
        three_years_ago = datetime.now() - timedelta(days=3 * 365)
        recent_8k_filings = []

        for filing in soup.find_all("entry"):
            title_tag = filing.find("title")
            updated_tag = filing.find("updated")
            if title_tag is None or updated_tag is None:
                continue  # malformed entry; nothing to classify
            try:
                filing_date = datetime.strptime(updated_tag.text[:10], "%Y-%m-%d")
            except ValueError:
                continue  # unparseable date; skip rather than abort the screen

            if "8-K" in title_tag.text and filing_date >= three_years_ago:
                recent_8k_filings.append(title_tag.text)

        return {
            "SEC Registered": "Yes",
            "CIK": cik,
            "Recent 8-K Filings": recent_8k_filings[:3] if recent_8k_filings else "None"
        }

    def check_ofac_sanctions(self, company_name, threshold=85):
        """Fuzzy-match the company against the OFAC list.

        Args:
            company_name: Name to screen.
            threshold: Minimum match score to report a sanction hit.

        Returns:
            Dict with ``"OFAC Sanctioned"`` (``"Yes"``/``"No"``) and
            ``"Closest OFAC Database Match"`` (the matched name, or ``"None"``).
        """
        match, score = self.matcher.find_best_match(company_name)
        # The matcher yields (None, None) when it has no candidates at all.
        is_hit = score is not None and score >= threshold
        return {
            "OFAC Sanctioned": "Yes" if is_hit else "No",
            "Closest OFAC Database Match": match if is_hit else "None",
        }

    def check_wikidata_scandals(self, company_name):
        """Query Wikidata for a business with this exact English label.

        Returns:
            ``{"Error": "Failed to fetch data"}`` on a failed request,
            ``{"Status": "Not Found in Wikidata"}`` when nothing matches,
            otherwise the entity QID, its description and the first scandal
            binding (name, link) or ``"No known scandals"``.
        """
        # Escape characters that would otherwise terminate or corrupt the
        # SPARQL string literal the name is embedded in.
        safe_name = (
            company_name.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

        query = f"""
        SELECT ?company ?companyLabel ?industry ?industryLabel ?scandal ?scandalLabel ?description WHERE {{
          ?company rdfs:label "{safe_name}"@en.
          ?company wdt:P31 wd:Q4830453.  # Instance of (Business/Company)
          OPTIONAL {{ ?company wdt:P452 ?industry. }}  # Industry type
          OPTIONAL {{ ?company schema:description ?description. FILTER (LANG(?description) = "en") }}

          # Looking for scandals
          OPTIONAL {{ ?company wdt:P793 ?scandal. }}  # Significant events (may include fraud cases, controversies)
          OPTIONAL {{ ?company wdt:P5053 ?scandal. }} # Cause of dissolution (bankruptcy, fraud)
          OPTIONAL {{ ?company wdt:P2416 ?scandal. }} # Scandals (direct connection)

          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        """
        try:
            response = requests.get(
                self.WIKIDATA_URL, params={"query": query, "format": "json"},
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            return {"Error": "Failed to fetch data"}

        if response.status_code != 200:
            return {"Error": "Failed to fetch data"}

        try:
            data = response.json().get("results", {}).get("bindings", [])
        except ValueError:
            return {"Error": "Failed to fetch data"}

        if not data:
            return {"Status": "Not Found in Wikidata"}

        result = data[0]
        company_qid = result["company"]["value"].split("/")[-1]
        description = result.get("description", {}).get("value", "No description available")

        scandals = "No known scandals"
        if "scandal" in result:
            scandal_qid = result["scandal"]["value"].split("/")[-1]
            # Fall back to the QID if the label binding is absent.
            scandal_name = result.get("scandalLabel", {}).get("value", scandal_qid)
            scandals = (scandal_name, f"https://www.wikidata.org/wiki/{scandal_qid}")

        return {
            "Wikidata_QID": company_qid,
            "Description": description,
            "Scandals": scandals,
        }

    def get_news(self, company_name):
        """Fetch recent English headlines that mention the company by name.

        Returns:
            ``{"Recent News": [...]}`` with up to three matching titles,
            ``{"Recent News": "No relevant news found"}`` when none match, or
            ``{"Recent News": "Error fetching news"}`` on any failure
            (including a missing API key).
        """
        if not self.NEWS_API_KEY:
            return {"Recent News": "Error fetching news"}

        # Passing params lets requests URL-encode names with spaces, '&', etc.
        params = {
            "q": company_name,
            "language": "en",
            "sortBy": "publishedAt",
            "apiKey": self.NEWS_API_KEY,
        }
        try:
            response = requests.get(
                self.NEWS_API_URL, params=params, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException:
            return {"Recent News": "Error fetching news"}

        if response.status_code != 200:
            return {"Recent News": "Error fetching news"}

        try:
            articles = response.json().get("articles", [])
        except ValueError:
            return {"Recent News": "Error fetching news"}

        needle = company_name.lower()
        filtered_articles = []
        for article in articles:
            title = article.get("title") or ""  # title can be missing or null
            if needle in title.lower():
                filtered_articles.append(title)

        return {"Recent News": filtered_articles[:3] if filtered_articles else "No relevant news found"}

    def screen_company(self, company_name):
        """Run every check and merge the results into one dict keyed by field name."""
        result = {"Company": company_name}
        result.update(self.check_sec_edgar(company_name))
        result.update(self.check_ofac_sanctions(company_name))
        result.update(self.check_wikidata_scandals(company_name))
        result.update(self.get_news(company_name))
        return result

In [5]:
class TransactionAnalyzer:
    """Run entity extraction on a transaction and screen every corporation found."""

    def __init__(self, ofac_list_file):
        """Build the extractor and the screening helper.

        Args:
            ofac_list_file: Path to the OFAC list passed to ``CompanyScreening``.
        """
        self.entity_extractor = EntityExtractor()
        self.company_screening = CompanyScreening(ofac_list_file)

    def analyze_transaction(self, transaction_text):
        """Extract entities, screen the corporations and summarise the result.

        Args:
            transaction_text: Raw transaction description.

        Returns:
            A JSON string (indent 4) with two keys: ``"corporation_details"``
            (screening record per entity categorised as ``"Corporation"``) and
            ``"transaction_summary"`` (transaction id, entity names, entity
            categories). When extraction fails, the lists are empty and the
            summary carries an additional ``"Extraction Error"`` message.
        """
        extracted = self.entity_extractor.extract_entities(transaction_text)

        # The extractor returns a dict with an "error" key when it gives up;
        # iterating that dict would walk its string keys and crash below.
        extraction_error = None
        if isinstance(extracted, list):
            entities = [item for item in extracted if isinstance(item, dict)]
        elif isinstance(extracted, dict) and "error" in extracted:
            entities = []
            extraction_error = extracted["error"]
        else:
            entities = []
            extraction_error = "Unexpected entity extraction response format"

        transaction_id = next(
            (item["Transaction ID"] for item in entities if "Transaction ID" in item),
            None,
        )

        corporation_details = {"Corporation Details": []}
        for item in entities:
            if item.get("category") == "Corporation" and "entity" in item:
                company_details = self.company_screening.screen_company(item["entity"])
                corporation_details["Corporation Details"].append(company_details)

        transaction_summary = {
            "Transaction ID": transaction_id,
            "Extracted Entities": [item["entity"] for item in entities if "entity" in item],
            "Entity Type": [item["category"] for item in entities if "category" in item],
        }
        if extraction_error is not None:
            transaction_summary["Extraction Error"] = extraction_error

        return json.dumps(
            {"corporation_details": corporation_details, "transaction_summary": transaction_summary},
            indent=4,
        )


In [6]:
class RiskAnalyzer:
    """Ask a hosted chat model for a risk score for a transaction."""

    # Seconds to wait for the completion request before giving up.
    REQUEST_TIMEOUT = 60

    def __init__(self, api_key=None):
        """Prepare the endpoint URL and request headers.

        Args:
            api_key: Together API key. When omitted, the ``TOGETHER_API_KEY``
                environment variable is used.

        Raises:
            ValueError: If no API key is supplied and the environment
                variable is not set.
        """
        # Credentials must not live in the notebook source; read them at runtime.
        self.api_key = api_key or os.environ.get("TOGETHER_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Together API key missing: pass api_key= or set the "
                "TOGETHER_API_KEY environment variable."
            )
        self.api_url = "https://api.together.xyz/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

    def analyze_risk(self, test_input):
        """Request a risk score for ``test_input``.

        Args:
            test_input: Text shown to the model (transaction plus screening data).

        Returns:
            The model's reply text on success. On any failure (transport error,
            non-JSON body, unexpected payload shape) a string starting with
            ``"Error: "`` is returned instead of raising.
        """
        icl_prompt = f"""Input: {test_input}\nOutput (Float): **Output only risk score between 0 and 1 without additional details***
        Output format: 0.xx **Dont print reason. Score each transaction independently**
        """

        data = {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "messages": [{"role": "user", "content": icl_prompt}],
        }

        try:
            response = requests.post(
                self.api_url, headers=self.headers, json=data,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            return f"Error: {exc}"

        # Decode once; the body may not be JSON at all (e.g. a gateway error page).
        try:
            payload = response.json()
        except ValueError:
            return f"Error: non-JSON response (HTTP {response.status_code})"

        try:
            return payload["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            return f"Error: {payload}"


In [7]:
# Example free-text transaction record used as the input for the repeated
# risk-scoring runs in the cells below.
sample_transaction = """ Transaction ID: TXN-2023-5A95

Date: 2023-08-15 14:22:00

Sender:
Name: Global Horizona Consulting LLC
Account: IBAN CH56 0483 5012 3456 7800 9 (Swiss bank)
Address: Rue du Marché 17, Geneva, Switzerland
Notes: Consulting fees for project Aurora

Receiver:
Name: Bright Future Nonprofit Inc
Account: 987654321 (Cayman National Bank, KYJ)
Address: P.O. Box 1234, George Town, Cayman Islands
Tax ID: RY-45678

Amount: $49,850.00 (USD)
Currency Exchange: N/A
Transaction Type: Wire Transfer
Reference: Charitable Donation Ref #DR-2023-0815

Additional Notes:
Urgent transfer approved by Mr. Ali Al-Mansoori (Director).
Linked invoice missing. Processed via intermediary Quantum Holding.

Sender IP: 192.168.09.123 (VPN detected: NordVPN, exit node in Panama)
"""

In [8]:
# Number of repeated end-to-end runs used to estimate the spread of the score.
N_RUNS = 5


def parse_risk_score(response_text):
    """Return the first standalone number in [0, 1] found in ``response_text``.

    Returns None when the text is an ``"Error: ..."`` message from
    ``RiskAnalyzer.analyze_risk`` or contains no usable number. Numbers glued
    to letters or to other dotted tokens (e.g. a model name ending in "v0.1")
    are ignored.
    """
    if not isinstance(response_text, str) or response_text.startswith("Error:"):
        return None
    for token in re.findall(r"(?<![\w.])\d+(?:\.\d+)?(?!\.?\w)", response_text):
        value = float(token)
        if 0.0 <= value <= 1.0:
            return value
    return None


# Build the helpers once: constructing TransactionAnalyzer re-reads the OFAC
# file and refits the matcher, which does not change between runs.
transaction_analyzer = TransactionAnalyzer("ofac_list.txt")
risk_analyzer = RiskAnalyzer()

risk = []
for run in range(N_RUNS):
    # NOTE(review): entity extraction and screening are repeated on every run,
    # as in the original loop, so their variation is part of the measured spread.
    result_json = transaction_analyzer.analyze_transaction(sample_transaction)
    risk_model_input = f"{sample_transaction}\n{result_json}"
    result = risk_analyzer.analyze_risk(risk_model_input)

    score = parse_risk_score(result)
    if score is None:
        # Skip rather than crash or record a bogus value; the sample shrinks.
        print(f"Run {run + 1}: no risk score could be parsed from {result!r}; skipping")
        continue
    risk.append(score)

In [9]:
# Show the raw per-run scores that the confidence estimate is based on.
print("LLM Predicted Risk Scores:", risk)

LLM Predicted Risk Scores: [0.6, 0.55, 0.65, 0.7, 0.72]


In [10]:
# At least two scores are needed for the spread to mean anything; an empty
# list would otherwise produce NaN with only a runtime warning.
if len(risk) < 2:
    raise ValueError(
        f"Need at least 2 risk scores to estimate uncertainty, got {len(risk)}"
    )

# np.std defaults to the population standard deviation (ddof=0); for values in
# [0, 1] it cannot exceed 0.5, which is what makes 1 - 2*sigma land in [0, 1].
std_dev = np.std(risk)
# Clamp in case a score outside [0, 1] slipped in and pushed sigma above 0.5.
confidence_score = float(np.clip(1 - 2 * std_dev, 0.0, 1.0))
print("Standard Deviation:", round(std_dev,2))
print("Confidence Score:", round(confidence_score,2))

Standard Deviation: 0.06
Confidence Score: 0.87
