In [55]:
import requests, json, re
from pymongo import MongoClient
from rapidfuzz import fuzz
from newsapi import NewsApiClient

# Entity Extraction Layer

In [25]:
def generate_prompt(raw_data):
    """
    Generates a few-shot prompt that instructs the LLM to convert raw transaction data
    into a canonical JSON format with additional extraction improvements:
    - Extract extra entities from free-text fields (e.g., additional_notes, raw_content).
    - Normalize entity names (e.g., convert abbreviations like 'Acme Corp' to 'Acme Corporation').
    - Flag uncertain or ambiguous extractions with an explanatory note.
    - Preserve context by indicating which field an entity was extracted from.
    - Provide an extra field "risk_evaluation" where you assess potential risk based on the data.

    The canonical JSON format should follow this structure:
    {
        "transaction_id": <string>,
        "timestamp": <string or null>,
        "sender": {
            "name": <string or null>,
            "account": <string or null>,
            "address": <string or null>,
            "notes": <string or null>,
            "metadata": {}
        },
        "receiver": {
            "name": <string or null>,
            "account": <string or null>,
            "address": <string or null>,
            "tax_id": <string or null>,
            "metadata": {}
        },
        "amount": <string or null>,
        "currency_exchange": <string or null>,
        "transaction_type": <string or null>,
        "reference": <string or null>,
        "additional_notes": [<list of strings>],
        "raw_content": <original input text>,
        "extra_fields": {},
        "risk_evaluation": <string or null>
    }
    """
    few_shot_prompt = f"""
You are an expert data extraction assistant. Your task is to convert the provided transaction data into a canonical JSON format with additional extraction improvements:
- Extract extra entities from free-text fields (e.g., additional_notes, raw_content).
- Normalize entity names (e.g., convert abbreviations like 'Acme Corp' to 'Acme Corporation').
- Flag uncertain or ambiguous extractions with an explanatory note.
- Preserve context by indicating which field an entity was extracted from.
- Provide an extra field "risk_evaluation" where you assess potential risk based on the data.
- Provide a field entities: [] containing all the extracted entities (Entities refer to the sender, receiver, intermediaries and any other individuals mentioned in the transaction data).
- Also note that titles like Mr., Mrs., Dr., etc. should not be included in the extracted names.
- For any individuals found as entities, if their "country", or "phone" or "email" is found in transaction data.
The canonical JSON format should follow this structure:
{{
    "transaction_id": <string>,
    "timestamp": <string or null>,
    "sender": {{
        "name": <string or null>,
        "account": <string or null>,
        "address": <string or null>,
        "notes": <string or null>,
        "metadata": {{}}
    }},
    "receiver": {{
        "name": <string or null>,
        "account": <string or null>,
        "address": <string or null>,
        "tax_id": <string or null>,
        "metadata": {{}}
    }},
    "amount": <string or null>,
    "currency_exchange": <string or null>,
    "transaction_type": <string or null>,
    "reference": <string or null>,
    "additional_notes": [<list of strings>],
    "raw_content": <original input text>,
    "extra_fields": {{}},
    "entities": [],
    "risk_evaluation": <string or null>
}}

Below are examples:

Example 1 (Structured Data):
Input:
{{
    "Transaction ID": "TXN001",
    "Payer Name": "Acme Corp",
    "Receiver Name": "SoVCo Capital Partners",
    "Transaction Details": "Payment for services rendered",
    "Amount": "500000",
    "Receiver Country": "USA"
}}
Output:
{{
    "transaction_id": "TXN001",
    "timestamp": null,
    "sender": {{
        "name": "Acme Corporation",
        "account": null,
        "address": null,
        "notes": null,
        "metadata": {{}}
    }},
    "receiver": {{
        "name": "SoVCo Capital Partners",
        "account": null,
        "address": null,
        "tax_id": null,
        "metadata": {{
            "country": "USA"
        }}
    }},
    "amount": "500000",
    "currency_exchange": null,
    "transaction_type": null,
    "reference": null,
    "additional_notes": [],
    "raw_content": "Original input preserved here",
    "entities": [
        {{
            "name": "Acme Corporation",
            "extracted_from": "Payer Name",
            "type": "company"
        }},
        {{
            "name": "SoVCo Capital Partners",
            "extracted_from": "Receiver Name",
            "type": "company"
        }}
    ],
    "extra_fields": {{
        "Transaction Details": "Payment for services rendered"
    }},
    "risk_evaluation": "Based on known data, SoVCo Capital Partners appears as a standard corporation with no immediate red flags."
}}

Example 2 (Unstructured Data):
Input:
Transaction ID: TXN-2023-5A9B
Date: 2023-08-15 14:22:00
Sender:
 - Name: "Global Horizons Consulting LLC"
 - Account: IBAN CH56 0483 5012 346 7800 9 (Swiss bank)
 - Address: Rue du Marche 17, Geneva, Switzerland
 - Notes: "Consulting fees for project Aurora"
Receiver:
 - Name: "Bright Future Nonprofit Inc"
 - Account: 987654321 (Cayman National Bank, KY)
 - Address: P.O. Box 1234, George Town, Cayman Islands
 - Tax ID: KY-45678
Amount: $49,850.00
Currency Exchange: N/A
Transaction Type: Wire Transfer
Reference: "Charitable Donation - Ref #DR-2023-0815"
Additional Notes:
 - "Urgent transfer approved by Mr. Ali Al-Mansoori (Director, ali.almansoori@globalhorizons.com, USA)."
 - "Linked invoice missing. Processed via intermediary Quantum Holdings Ltd (BVI)."
 - "Sender IP: 192.168.89.123 (VPN detected: NordVPN, exit node in Panama)"
Output:
{{
    "transaction_id": "TXN-2023-5A9B",
    "timestamp": "2023-08-15 14:22:00",
    "sender": {{
        "name": "Global Horizons Consulting LLC",
        "account": "IBAN CH56 0483 5012 346 7800 9",
        "address": "Rue du Marche 17, Geneva, Switzerland",
        "notes": "Consulting fees for project Aurora",
        "metadata": {{
            "account_info": "Swiss bank",
            "extracted_from": "Sender section"
        }}
    }},
    "receiver": {{
        "name": "Bright Future Nonprofit Inc",
        "account": "987654321",
        "address": "P.O. Box 1234, George Town, Cayman Islands",
        "tax_id": "KY-45678",
        "metadata": {{
            "bank": "Cayman National Bank, KY",
            "extracted_from": "Receiver section"
        }}
    }},
    "amount": "$49,850.00",
    "currency_exchange": "N/A",
    "transaction_type": "Wire Transfer",
    "reference": "Charitable Donation - Ref #DR-2023-0815",
    "additional_notes": [
        "Urgent transfer approved by Mr. Ali Al-Mansoori (Director, ali.almansoori@globalhorizons.com, USA).",
        "Linked invoice missing. Processed via intermediary Quantum Holdings Ltd (BVI).",
        "Sender IP: 192.168.89.123 (VPN detected: NordVPN, exit node in Panama)"
    ],
    "raw_content": "Original input preserved here",
    "entities": [
        {{
            "name": "Global Horizons Consulting LLC",
            "extracted_from": "Sender section",
            "type": "company",
            "address": "Rue du Marche 17, Geneva, Switzerland",
            "country": "Switzerland"
        }},
        {{
            "name": "Bright Future Nonprofit Inc",
            "extracted_from": "Receiver section",
            "type": "company",
            "address": "P.O. Box 1234, George Town, Cayman Islands",
            "country": "Cayman Islands",
        }},
        {{
            "name": "Ali Al-Mansoori",
            "extracted_from": "additional_notes",
            "type": "individual"
            "country": "USA",
            "email": "ali.almansoori@globalhorizons.com"
        }},
        {{
            "name": "Quantum Holdings Ltd",
            "extracted_from": "additional_notes",
            "type": "company",
            "country": "BVI (British Virgin Islands)"
        }}
    ],
    "extra_fields": {{}},
    "risk_evaluation": "The transaction involves a potential high-risk element as the additional notes mention a PEP, Mr. Ali Al-Mansoori, and the use of VPN exit nodes. Further investigation is advised."
}}

Now, convert the following input into the canonical JSON format (remember to extract extra entities from free-text fields, normalize names, flag uncertain extractions with a note, and include a risk evaluation. Most importantly, do not and I repeat do not provide any comments inside the generated JSON):
{raw_data}
    """
    return few_shot_prompt

In [None]:
def extract_with_llm(raw_data):
  prompt = generate_prompt(raw_data)
  response = requests.post(
    url="https://openrouter.ai/api/v1/chat/completions",
    headers={
      "Authorization": "Bearer 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",
      "Content-Type": "application/json",
    },
    data=json.dumps({
      "model": "nvidia/llama-3.1-nemotron-70b-instruct:free",
      "messages": [
        {
          "role": "user",
          "content": prompt,
          "temperature":0.1
        }
      ],
    })
  )
  return response

In [4]:
def extract_canonical_json(response_text):
    print(response_text)
    start = response_text.find('{')
    end = response_text.rfind('}')
    if start == -1 or end == -1:
        raise ValueError("No JSON object found in the response.")

    json_text = response_text[start:end+1]

    json_text = json_text.strip()

    try:
        result = json.loads(json_text)
        return result
    except json.JSONDecodeError as e:
        raise ValueError("Failed to parse extracted JSON.") from e

In [5]:
def extract_from_transaction_data(transaction_data):
  retries = 3
  while retries > 0:
    try:
      res = extract_with_llm(transaction_data)
      response_text = res.json()['choices'][0]['message']['content']
      result = extract_canonical_json(response_text)
      break
    except Exception as e:
      retries -= 1
      if retries == 0:
        raise e
      continue
  return result

In [6]:
unstructured_input = """Transaction ID: TXN-2023-5A9B
Date: 2023-08-15 14:22:00
Sender:
 - Name: "Digital Marketing Awards FZ LLC"
 - Account: IBAN CH56 0483 5012 346 7800 9 (Swiss bank)
 - Address: COMPASS BUILDING FDRK 2508, AL SHOHADA ROAD, AL HAMRA INDUSTRIAL ZONE-FZ, RAS AL KHAIMAH, ARE, United Arab Emirates
 - Notes: "Consulting fees for project Aurora led by Sanavbari Nikitenko"
Receiver:
 - Name: "8808 HOLDING LIMITED"
 - Account: 987654321 (HongKong National Bank, Hong Kong)
 - Address: TWC MANAGEMENT LIMITED SUITE D; 19/F RITZ PLAZA122 AUSTIN ROADTSIM SHA TSUI; KOWLOON HONG KONG
 - Tax ID: HK-45678
Amount: $49,860.00
Currency Exchange: N/A
Transaction Type: Wire Transfer
Reference: "Charitable Donation - Ref #DR-2023-0815"
Additional Notes:
 - "Urgent transfer approved by Mr. Trevor Prescod (India, prescod.trevor@gmail.com)."
 - "Transfer backed by Mr. Trevor Squirrell (US, tsquirrell@leg.state.vt.us)."
 - "Will further be taken care of by Mr. Corfiducia Anstalt (Liechtenstein)"
 - "Linked invoice missing. Processed via intermediary Quantum Holdings Ltd (BVI)."
 - Sender IP: 192.168.89.123 (VPN detected: NordVPN, exit node in Panama)"""

In [26]:
extracted_json = extract_from_transaction_data(unstructured_input)

```
{
    "transaction_id": "TXN-2023-5A9B",
    "timestamp": "2023-08-15 14:22:00",
    "sender": {
        "name": "Digital Marketing Awards FZ LLC",
        "account": "IBAN CH56 0483 5012 346 7800 9",
        "address": "COMPASS BUILDING FDRK 2508, AL SHOHADA ROAD, AL HAMRA INDUSTRIAL ZONE-FZ, RAS AL KHAIMAH, ARE, United Arab Emirates",
        "notes": "Consulting fees for project Aurora led by Sanavbari Nikitenko",
        "metadata": {
            "account_info": "Swiss bank",
            "extracted_from": "Sender section"
        }
    },
    "receiver": {
        "name": "8808 Holding Limited",
        "account": "987654321",
        "address": "TWC MANAGEMENT LIMITED SUITE D; 19/F RITZ PLAZA 122 AUSTIN ROAD TSIM SHA TSUI; KOWLOON HONG KONG",
        "tax_id": "HK-45678",
        "metadata": {
            "bank": "HongKong National Bank, Hong Kong",
            "extracted_from": "Receiver section"
        }
    },
    "amount": "$49,860.00",
    "currency_exchange": "N/A",
   

In [27]:
entities = extracted_json['entities']
entities

[{'name': 'Digital Marketing Awards FZ LLC',
  'extracted_from': 'Sender section',
  'type': 'company',
  'address': 'COMPASS BUILDING FDRK 2508, AL SHOHADA ROAD, AL HAMRA INDUSTRIAL ZONE-FZ, RAS AL KHAIMAH, ARE, United Arab Emirates',
  'country': 'United Arab Emirates'},
 {'name': '8808 Holding Limited',
  'extracted_from': 'Receiver section',
  'type': 'company',
  'address': 'TWC MANAGEMENT LIMITED SUITE D; 19/F RITZ PLAZA 122 AUSTIN ROAD TSIM SHA TSUI; KOWLOON HONG KONG',
  'country': 'Hong Kong'},
 {'name': 'Sanavbari Nikitenko',
  'extracted_from': 'Sender Notes',
  'type': 'individual',
  'note': 'Insufficient information for country, phone, or email'},
 {'name': 'Trevor Prescod',
  'extracted_from': 'additional_notes',
  'type': 'individual',
  'country': 'India',
  'email': 'prescod.trevor@gmail.com'},
 {'name': 'Trevor Squirrell',
  'extracted_from': 'additional_notes',
  'type': 'individual',
  'country': 'USA',
  'email': 'tsquirrell@leg.state.vt.us'},
 {'name': 'Corfiduci

# Data Enrichment Layer

In [9]:
client = MongoClient('mongodb://localhost:27017')
db = client['local']

In [51]:
def clean_text(text):
    if not text:
        return ""
    cleaned = re.sub(r"[\"']", "", text)
    return cleaned.strip()

def process_aliases(raw_aliases):
    if not raw_aliases:
        return []
    alias_list = raw_aliases.split(";")
    return [clean_text(alias) for alias in alias_list if alias.strip()]

In [None]:
def entity_db_search(collection_name, search_term, threshold=80):
    collection = db[collection_name]
    search_term = search_term.lower()
    query = { "$text": { "$search": search_term } }
    candidates = list(collection.find(query))
    
    scored_candidates = []
    if collection_name == "PEP":
        for doc in candidates:
            name_value = doc.get("name", "").lower()
            aliases_value = doc.get("aliases", "").lower()
            name_score = fuzz.token_set_ratio(search_term, name_value) if len(name_value)!=0 else -1
            aliases_score = fuzz.token_set_ratio(search_term, aliases_value) if len(aliases_value)!=0 else -1
            best_score = max(name_score, aliases_score)
            if best_score >= threshold:
                scored_candidates.append((doc, best_score))
        
        scored_candidates.sort(key=lambda x : x[1], reverse=True)
        return scored_candidates
    elif collection_name in ["Criminal-entities", "Leaks-intermediaries", "Leaks-others"]:
        for doc in candidates:
            name_value = doc.get("name", "").lower()
            name_score = fuzz.token_set_ratio(search_term, name_value) if len(name_value)!=0 else -1
            if name_score >= threshold:
                scored_candidates.append((doc, name_score))
        
        scored_candidates.sort(key=lambda x : x[1], reverse=True)
        return scored_candidates
    elif collection_name == "Leaks-entities":
        for doc in candidates:
            name_value = doc.get("name","").lower()
            original_name = doc.get("original_name","").lower()
            former_name = doc.get("former_name","").lower()
            name_score = fuzz.token_set_ratio(search_term, name_value) if len(name_value)!=0 else -1
            original_name_score = fuzz.token_set_ratio(search_term, original_name) if len(original_name)!=0 else -1
            former_name_score = fuzz.token_set_ratio(search_term, former_name) if len(former_name)!=0 else -1
            best_score = max(name_score, original_name_score, former_name_score)
            if best_score >= threshold:
                scored_candidates.append((doc, best_score))
        scored_candidates.sort(key=lambda x : x[1], reverse=True)
        return scored_candidates
    elif collection_name == "Consolidated-Sanctions":
        for doc in candidates:
            name = doc.get("name", "")
            aliases = doc.get("aliases", "")
            cleaned_name = clean_text(name)
            cleaned_aliases = process_aliases(aliases)
            cleaned_aliases.append(cleaned_name)
            best_score = -1
            for alias in cleaned_aliases:
                if len(alias)>0:
                    best_score = max(best_score, fuzz.token_set_ratio(search_term, alias))
            if best_score >= threshold:
                scored_candidates.append((doc, best_score))
        scored_candidates.sort(key=lambda x : x[1], reverse=True)
        return scored_candidates

## Search for individuals

In [11]:
entities

[{'name': 'Digital Marketing Awards FZ LLC',
  'extracted_from': 'Sender section',
  'type': 'company'},
 {'name': '8808 Holding Limited',
  'extracted_from': 'Receiver section',
  'type': 'company'},
 {'name': 'Sanavbari Nikitenko',
  'extracted_from': 'Sender Notes',
  'type': 'individual',
  'note': 'Insufficient information for further details'},
 {'name': 'Trevor Prescod',
  'extracted_from': 'Additional Notes',
  'type': 'individual',
  'country': 'India',
  'email': 'prescod.trevor@gmail.com'},
 {'name': 'Trevor Squirrell',
  'extracted_from': 'Additional Notes',
  'type': 'individual',
  'country': 'USA',
  'email': 'tsquirrell@leg.state.vt.us'},
 {'name': 'Corfiducia Anstalt',
  'extracted_from': 'Additional Notes',
  'type': 'individual/entity (unclear)',
  'note': "Unclear if individual or company due to 'Anstalt' (possible company suffix); Country: Liechtenstein",
  'country': 'Liechtenstein'},
 {'name': 'Quantum Holdings Ltd',
  'extracted_from': 'Additional Notes',
  'typ

In [12]:
individuals = []
for x in entities:
    if 'individual' in x['type']:
        individuals.append(x)

In [13]:
individuals

[{'name': 'Sanavbari Nikitenko',
  'extracted_from': 'Sender Notes',
  'type': 'individual',
  'note': 'Insufficient information for further details'},
 {'name': 'Trevor Prescod',
  'extracted_from': 'Additional Notes',
  'type': 'individual',
  'country': 'India',
  'email': 'prescod.trevor@gmail.com'},
 {'name': 'Trevor Squirrell',
  'extracted_from': 'Additional Notes',
  'type': 'individual',
  'country': 'USA',
  'email': 'tsquirrell@leg.state.vt.us'},
 {'name': 'Corfiducia Anstalt',
  'extracted_from': 'Additional Notes',
  'type': 'individual/entity (unclear)',
  'note': "Unclear if individual or company due to 'Anstalt' (possible company suffix); Country: Liechtenstein",
  'country': 'Liechtenstein'}]

In [14]:
import re

def remove_titles(text):
    pattern = r'\b(?:Mr\.?|Mrs\.?|Ms\.?|Miss|Dr\.?|Prof\.?)\s+'
    return re.sub(pattern, '', text, flags=re.IGNORECASE)

### In PEP DB

In [15]:
def gather_pep_evidence(individual):
    fields = individual.keys()
    name = individual['name']
    results = entity_db_search("PEP", name)
    pep_evidence, pep_flag, pep_notes = None, None, None
    if len(results)==0:
        pep_evidence = "Name not found in PEP database"
        pep_flag = 0
    else:
        if 'email' in fields:
            email = individual['email']
            for result in results:
                doc = result[0]
                pep_notes = doc['dataset']
                if 'emails' in doc.keys() and email.lower() in doc['emails'].lower():
                    pep_evidence = 'Individual found in Politically Exposed Persons (PEP) database and email matching'
                    pep_flag = 6
                    break
            else:
                pep_evidence = "Name found in PEP database but email not matching"
                pep_flag = 2
        elif 'phone' in fields:
            phone = individual['phone']
            for result in results:
                doc = result[0]
                if 'phones' in doc.keys() and phone in doc['phones']:
                    pep_evidence = 'Individual found in Politically Exposed Persons (PEP) database and phone matching'
                    pep_flag = 6
                    break
            else:
                pep_evidence = "Name found in PEP database but mobile not matching"
                pep_flag = 3
        elif 'country' in fields:
            country = individual['country']
            for result in results:
                doc = result[0]
                if 'countries' in doc.keys() and country.lower() in doc['countries'].lower():
                    pep_evidence = 'Individual found in Politically Exposed Persons (PEP) database and country matching'
                    pep_flag = 4
                    break
            else:
                pep_evidence = "Name found in PEP database but country not matching"
                pep_flag = 1
        else:
            pep_evidence = "Individual found in Politically Exposed Persons (PEP) database but no additional data such as email, phone, or country to confirm"
            pep_flag = 5

    return pep_evidence, pep_flag, pep_notes

### In Criminal-Entities

In [16]:
def gather_ce_evidence(individual):
    name = individual['name']
    results = entity_db_search("Criminal-entities", name)
    ce_evidence, ce_flag, ce_notes = None, None, None
    if len(results) == 0:
        ce_evidence = "Name not found in Criminal Entities database"
        ce_flag = -1
    elif results[0][1] == 100:
        ce_evidence = "Individual found in Criminal Entities database with a perfect match"
        ce_flag = 3
        ce_notes = results[0][0]['sanctions']
    else:
        ce_evidence = "Individual found in Criminal Entities database but not a perfect name match"
        ce_flag = 2
        ce_notes = results[0][0]['sanctions']
    return ce_evidence, ce_flag, ce_notes

In [120]:
gather_ce_evidence(individuals[0])

('Individual found in Criminal Entities database with a perfect match',
 3,
 '"Red Notice - participation in the activity of a terrorist organization; participation in the activity of an illegal armed formation"')

### In Leaks-intermediaries, Leaks-officers

In [35]:
def gather_leaks_indi_evidence(individual):
    fields = individual.keys()
    name = individual['name']
    for col in ['Leaks-intermediaries', 'Leaks-officers']:
        results = entity_db_search(col, name)
        lei_evidence, lei_flag = None, None
        if len(results)==0:
            lei_evidence = "Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks"
            lei_flag = 0
        else:
            if 'country' in fields:
                sources = ()
                for result in results:
                    doc = result[0]
                    if 'countries' in doc.keys() and individual['country'].lower() in doc['countries'].lower():
                        if col == "Leaks-intermediaries":
                            lei_evidence = f"Individual's name found in {doc["sourceID"]} and is an intermediary currently {doc['status']} in {individual["country"]}"
                        else:
                            lei_evidence = f"Individual's name found in {doc["sourceID"]} and is an officer in {individual["country"]}"
                        lei_flag = 3
                        break
                    else:
                        sources.add(doc["sourceID"])
                if lei_flag!=3:
                    # sources_string = ', '.join(sources)
                    lei_evidence = f"Individual's name found in {', '.join(sources)} but country doesn't match"
                    lei_flag = 2
            else:
                lei_evidence = f"Individual's name found in {doc["sourceID"]} but no additional data such as country to confirm"
                lei_flag = 1
        return lei_evidence, lei_flag

In [29]:
def gather_evidence_on_individual(individual):
    individual['pep_evidence'], individual['pep_flag'], individual['pep_notes'] = gather_pep_evidence(individual)
    individual['ce_evidence'], individual['ce_flag'], individual['ce_notes'] = gather_ce_evidence(individual)
    individual['lei_evidence'], individual['lei_flag'] = gather_leaks_indi_evidence(individual)
    return individual

In [30]:
gather_evidence_on_individual(individuals[0])

{'name': 'Sanavbari Nikitenko',
 'extracted_from': 'Sender Notes',
 'type': 'individual',
 'note': 'Insufficient information for further details',
 'pep_evidence': 'Name not found in PEP database',
 'pep_flag': 0,
 'pep_notes': None,
 'ce_evidence': 'Individual found in Criminal Entities database with a perfect match',
 'ce_flag': 3,
 'ce_notes': '"Red Notice - participation in the activity of a terrorist organization; participation in the activity of an illegal armed formation"',
 'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
 'lei_flag': 0}

In [31]:
gather_evidence_on_individual(individuals[1])

{'name': 'Trevor Prescod',
 'extracted_from': 'Additional Notes',
 'type': 'individual',
 'country': 'India',
 'email': 'prescod.trevor@gmail.com',
 'pep_evidence': 'Name found in PEP database but email not matching',
 'pep_flag': 2,
 'pep_notes': 'Every Politician',
 'ce_evidence': 'Name not found in Criminal Entities database',
 'ce_flag': -1,
 'ce_notes': None,
 'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
 'lei_flag': 0}

In [32]:
gather_evidence_on_individual(individuals[2])

{'name': 'Trevor Squirrell',
 'extracted_from': 'Additional Notes',
 'type': 'individual',
 'country': 'USA',
 'email': 'tsquirrell@leg.state.vt.us',
 'pep_evidence': 'Individual found in Politically Exposed Persons (PEP) database and email matching',
 'pep_flag': 6,
 'pep_notes': 'US Legislators by Plural (formerly OpenStates)',
 'ce_evidence': 'Name not found in Criminal Entities database',
 'ce_flag': -1,
 'ce_notes': None,
 'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
 'lei_flag': 0}

In [36]:
gather_evidence_on_individual(individuals[3])

{'name': 'Corfiducia Anstalt',
 'extracted_from': 'Additional Notes',
 'type': 'individual/entity (unclear)',
 'note': "Unclear if individual or company due to 'Anstalt' (possible company suffix); Country: Liechtenstein",
 'country': 'Liechtenstein',
 'pep_evidence': 'Name not found in PEP database',
 'pep_flag': 0,
 'pep_notes': None,
 'ce_evidence': 'Name not found in Criminal Entities database',
 'ce_flag': -1,
 'ce_notes': None,
 'lei_evidence': "Individual's name found in Panama Papers and is an intermediary currently ACTIVE in Liechtenstein",
 'lei_flag': 3}

In [34]:
individuals

[{'name': 'Sanavbari Nikitenko',
  'extracted_from': 'Sender Notes',
  'type': 'individual',
  'note': 'Insufficient information for further details',
  'pep_evidence': 'Name not found in PEP database',
  'pep_flag': 0,
  'pep_notes': None,
  'ce_evidence': 'Individual found in Criminal Entities database with a perfect match',
  'ce_flag': 3,
  'ce_notes': '"Red Notice - participation in the activity of a terrorist organization; participation in the activity of an illegal armed formation"',
  'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
  'lei_flag': 0},
 {'name': 'Trevor Prescod',
  'extracted_from': 'Additional Notes',
  'type': 'individual',
  'country': 'India',
  'email': 'prescod.trevor@gmail.com',
  'pep_evidence': 'Name found in PEP database but email not matching',
  'pep_flag': 2,
  'pep_notes': 'Every Politician',
  'ce_evidence': 'Name not found in Criminal Entities database',
  'ce_flag': -1,
  'ce_notes': None,
  '

## Companies search

In [23]:
companies = []
for x in entities:
    if 'company' in x['type']:
        companies.append(x.copy())
companies

[{'name': 'Digital Marketing Awards FZ LLC',
  'extracted_from': 'Sender section',
  'type': 'company'},
 {'name': '8808 Holding Limited',
  'extracted_from': 'Receiver section',
  'type': 'company'},
 {'name': 'Quantum Holdings Ltd',
  'extracted_from': 'Additional Notes',
  'type': 'company',
  'note': 'Identified as intermediary, jurisdiction: BVI'}]

#### In Leaks

In [45]:
def gather_leaks_company_evidence(company):
    name = company['name']
    fields = company.keys()
    # Leaks-entities
    results = entity_db_search("Leaks-entities", name)
    leaks_ent_evidence, leaks_ent_flag = None, None
    if len(results)==0:
        leaks_ent_evidence = 'Company not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks'
        leaks_ent_flag = 0
    else:
        sources = ()
        if 'country' in fields:
            for result in results:
                doc = result[0]
                if 'countries' in doc.keys() and company['country'].lower() in doc['countries'].lower():
                    leaks_ent_evidence = f'Company name found in {doc["sourceID"]} and is registered as {doc['status']} in {doc["countries"]}'
                    leaks_ent_flag = 2
                    break
                else:
                    sources.add(doc["sourceID"])
            if leaks_ent_flag!=2:
                leaks_ent_evidence = f'Company name found in {', '.join(sources)} but country does not match'
                leaks_ent_flag = 1
        if 'address' in fields:
            for result in results:
                doc = result[0]
                if 'addresses' in doc.keys() and company['address'].lower() in doc['addresses'].lower():
                    leaks_ent_evidence = f'Company name found in {doc["sourceID"]}, address matches and is registered as {doc['status']} in {doc["countries"]}'
                    leaks_ent_flag = 3
                    break
                else:
                    sources.add(doc["sourceID"])
            if leaks_ent_flag!=3:
                leaks_ent_evidence = f'Company name found in {', '.join(sources)} but address does not match'
                leaks_ent_flag = 1
        else:
            leaks_ent_evidence = f'Company name found in {', '.join(sources)} but no additional data such as address to confirm'
            leaks_ent_flag = 1
    # Leaks-intermediaries
    results = entity_db_search("Leaks-intermediaries", name)
    leaks_inter_evidence, leaks_inter_flag = None, None
    if len(results)==0:
        leaks_inter_evidence = 'Company not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks'
        leaks_inter_flag = 0
    else:
        sources = ()
        if 'country' in fields:
            for result in results:
                doc = result[0]
                if 'countries' in doc.keys() and company['country'].lower() in doc['countries'].lower():
                    leaks_inter_evidence = f'Company name found in {doc["sourceID"]} and is an intermediary currently {doc['status']} in {doc["countries"]}'
                    leaks_inter_flag = 2
                    break
                else:
                    sources.add(doc["sourceID"])
            if leaks_inter_flag!=2:
                leaks_inter_evidence = f'Company name found in {', '.join(sources)} but country does not match'
                leaks_inter_flag = 1
        else:
            leaks_inter_evidence = f'Company name found in {', '.join(sources)} but no additional data such as country to confirm'
            leaks_inter_flag = 1
    # Leaks-others
    results = entity_db_search("Leaks-others", name)
    leaks_others_evidence, leaks_others_flag = None, None
    if len(results)==0:
        leaks_others_evidence = 'Company not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks'
        leaks_others_flag = 0
    else:
        sources = ()
        if 'country' in fields:
            for result in results:
                doc = result[0]
                if 'countries' in doc.keys() and company['country'].lower() in doc['countries'].lower():
                    leaks_others_evidence = f'Company name found in {doc["sourceID"]} and is an intermediary currently {doc['status']} in {doc["countries"]}'
                    leaks_others_flag = 2
                    break
                else:
                    sources.add(doc["sourceID"])
            if leaks_others_flag!=2:
                leaks_others_evidence = f'Company name found in {', '.join(sources)} but country does not match'
                leaks_others_flag = 1
        else:
            leaks_others_evidence = f'Company name found in {', '.join(sources)} but no additional data such as country to confirm'
            leaks_others_flag = 1
    return leaks_inter_evidence, leaks_inter_flag, leaks_ent_evidence, leaks_ent_flag, leaks_others_evidence, leaks_others_flag

In [46]:
companies

[{'name': 'Digital Marketing Awards FZ LLC',
  'extracted_from': 'Sender section',
  'type': 'company'},
 {'name': '8808 Holding Limited',
  'extracted_from': 'Receiver section',
  'type': 'company'},
 {'name': 'Quantum Holdings Ltd',
  'extracted_from': 'Additional Notes',
  'type': 'company',
  'note': 'Identified as intermediary, jurisdiction: BVI'}]

In [47]:
gather_leaks_company_evidence(companies[0])

('Company not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
 0,
 'Company name found in  but no additional data such as address to confirm',
 1,
 'Company not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
 0)

#### In Consolidated-sanctions

In [53]:
def gather_cs_evidence(company):
    fields = company.keys()
    name = company['name']
    results = entity_db_search("Consolidated-Sanctions", name)
    cs_evidence, cs_flag, cs_notes = None, None, None
    if len(results) == 0:
        cs_evidence = "Company not found in Consolidated Sanctions database"
        cs_flag = 0
    elif results[0][1] == 100:
        cs_evidence = f"Company found in Consolidated Sanctions database with a perfect match and is sanctioned under {results[0][0]['dataset']}"
        cs_flag = 3
        cs_notes = results[0][0]['sanctions']
    else:
        if 'country' in fields:
            for result in results:
                doc = result[0]
                if 'countries' in doc.keys() and company['country'].lower() in doc['countries'].lower():
                    cs_evidence = f"Company found in Consolidated Sanctions database but not a perfect name match and is sanctioned under {doc['dataset']}"
                    cs_flag = 1
                    cs_notes = doc['sanctions']
                    break
            else:
                cs_evidence = f"Company found in Consolidated Sanctions database but not a perfect name match and also country doesn't match"
                cs_flag = 0
    return cs_evidence, cs_flag, cs_notes

#### In Criminal-entities

In [54]:
def gather_ce_evidence_on_company(company):
    name = company['name']
    results = entity_db_search("Criminal-entities", name)
    ce_evidence, ce_flag, ce_notes = None, None, None
    if len(results) == 0:
        ce_evidence = "Name not found in Criminal Entities database"
        ce_flag = -1
    elif results[0][1] == 100:
        ce_evidence = "Company found in Criminal Entities database with a perfect match"
        ce_flag = 3
        ce_notes = results[0][0]['sanctions']
    else:
        ce_evidence = "Company found in Criminal Entities database but not a perfect name match"
        ce_flag = 2
        ce_notes = results[0][0]['sanctions']
    return ce_evidence, ce_flag, ce_notes

#### News

In [80]:
def gather_news(company):
    newsapi = NewsApiClient(api_key='0969fe7923f74db2ad9610a14a742866')
    all_articles = newsapi.get_everything(q=company['name'],
                                      language='en',
                                      sort_by='publishedAt')
    content = ""
    articlesLimit = 2
    for article in all_articles['articles']:
        if articlesLimit > 0:
            content += f"Title: {article['title']}\nDescription: {article['description']}\n\n"
        articlesLimit -= 1
    if len(content) == 0:
        content = "No news articles found for the company"
    return content

In [81]:
def gather_evidence_on_company(company):
    company['leaks_inter_evidence'], company['leaks_inter_flag'], company['leaks_ent_evidence'], company['leaks_ent_flag'], company['leaks_others_evidence'], company['leaks_others_flag'] = gather_leaks_company_evidence(company)
    company['cs_evidence'], company['cs_flag'], company['cs_notes'] = gather_cs_evidence(company)
    company['ce_evidence'], company['ce_flag'], company['ce_notes'] = gather_ce_evidence_on_company(company)
    company['news_content'] = gather_news(company)
    return company

In [82]:
for company in companies:
    gather_evidence_on_company(company)

In [71]:
individuals

[{'name': 'Sanavbari Nikitenko',
  'extracted_from': 'Sender Notes',
  'type': 'individual',
  'note': 'Insufficient information for further details',
  'pep_evidence': 'Name not found in PEP database',
  'pep_flag': 0,
  'pep_notes': None,
  'ce_evidence': 'Individual found in Criminal Entities database with a perfect match',
  'ce_flag': 3,
  'ce_notes': '"Red Notice - participation in the activity of a terrorist organization; participation in the activity of an illegal armed formation"',
  'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
  'lei_flag': 0},
 {'name': 'Trevor Prescod',
  'extracted_from': 'Additional Notes',
  'type': 'individual',
  'country': 'India',
  'email': 'prescod.trevor@gmail.com',
  'pep_evidence': 'Name found in PEP database but email not matching',
  'pep_flag': 2,
  'pep_notes': 'Every Politician',
  'ce_evidence': 'Name not found in Criminal Entities database',
  'ce_flag': -1,
  'ce_notes': None,
  '

In [83]:
companies

[{'name': 'Digital Marketing Awards FZ LLC',
  'extracted_from': 'Sender section',
  'type': 'company',
  'leaks_inter_evidence': 'Company not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
  'leaks_inter_flag': 0,
  'leaks_ent_evidence': 'Company name found in  but no additional data such as address to confirm',
  'leaks_ent_flag': 1,
  'leaks_others_evidence': 'Company not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
  'leaks_others_flag': 0,
  'cs_evidence': 'Company not found in Consolidated Sanctions database',
  'cs_flag': 0,
  'cs_notes': None,
  'ce_evidence': 'Company found in Criminal Entities database with a perfect match',
  'ce_flag': 3,
  'ce_notes': 'Reciprocal - 2024-08-23',
  'news_content': 'No news articles found for the company'},
 {'name': '8808 Holding Limited',
  'extracted_from': 'Receiver section',
  'type': 'company',
  'leaks_inter_evidence': 'Company name found in  but no additional data such as country to co

In [84]:
final_entities = individuals + companies
final_entities

[{'name': 'Sanavbari Nikitenko',
  'extracted_from': 'Sender Notes',
  'type': 'individual',
  'note': 'Insufficient information for further details',
  'pep_evidence': 'Name not found in PEP database',
  'pep_flag': 0,
  'pep_notes': None,
  'ce_evidence': 'Individual found in Criminal Entities database with a perfect match',
  'ce_flag': 3,
  'ce_notes': '"Red Notice - participation in the activity of a terrorist organization; participation in the activity of an illegal armed formation"',
  'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
  'lei_flag': 0},
 {'name': 'Trevor Prescod',
  'extracted_from': 'Additional Notes',
  'type': 'individual',
  'country': 'India',
  'email': 'prescod.trevor@gmail.com',
  'pep_evidence': 'Name found in PEP database but email not matching',
  'pep_flag': 2,
  'pep_notes': 'Every Politician',
  'ce_evidence': 'Name not found in Criminal Entities database',
  'ce_flag': -1,
  'ce_notes': None,
  '

In [85]:
keys_to_remove = {'extracted_from', 'type', 'address', 'country', 'email', 'phone', 'note'}
for index, ent in enumerate(final_entities):
    final_entities[index] = {k : v for k,v in ent.items() if k not in keys_to_remove}
final_entities

[{'name': 'Sanavbari Nikitenko',
  'pep_evidence': 'Name not found in PEP database',
  'pep_flag': 0,
  'pep_notes': None,
  'ce_evidence': 'Individual found in Criminal Entities database with a perfect match',
  'ce_flag': 3,
  'ce_notes': '"Red Notice - participation in the activity of a terrorist organization; participation in the activity of an illegal armed formation"',
  'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
  'lei_flag': 0},
 {'name': 'Trevor Prescod',
  'pep_evidence': 'Name found in PEP database but email not matching',
  'pep_flag': 2,
  'pep_notes': 'Every Politician',
  'ce_evidence': 'Name not found in Criminal Entities database',
  'ce_flag': -1,
  'ce_notes': None,
  'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
  'lei_flag': 0},
 {'name': 'Trevor Squirrell',
  'pep_evidence': 'Individual found in Politically Exposed Persons (PEP) database and email m

In [116]:
final_entities

[{'name': 'Sanavbari Nikitenko',
  'pep_evidence': 'Name not found in PEP database',
  'pep_flag': 0,
  'pep_notes': None,
  'ce_evidence': 'Individual found in Criminal Entities database with a perfect match',
  'ce_flag': 3,
  'ce_notes': '"Red Notice - participation in the activity of a terrorist organization; participation in the activity of an illegal armed formation"',
  'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
  'lei_flag': 0},
 {'name': 'Trevor Prescod',
  'pep_evidence': 'Name found in PEP database but email not matching',
  'pep_flag': 2,
  'pep_notes': 'Every Politician',
  'ce_evidence': 'Name not found in Criminal Entities database',
  'ce_flag': -1,
  'ce_notes': None,
  'lei_evidence': 'Individual not found in any of the Panama Papers, Paradise Papers, or Bahamas Leaks',
  'lei_flag': 0},
 {'name': 'Trevor Squirrell',
  'pep_evidence': 'Individual found in Politically Exposed Persons (PEP) database and email m

# Classification and Risk Score Calculation

In [117]:
import json

def generate_classification_prompt(extracted_entities, transaction_data):
    """
    Generates a prompt for an LLM to classify entities based on provided transaction data
    and an enriched entities list.
    
    Parameters:
      extracted_entities (list): A list of dictionaries containing enriched evidence data.
      transaction_data (str): A string with the original transaction data.
    
    Returns:
      str: A prompt for the LLM.
    """
    # Convert the enriched entities list to a JSON-formatted string for readability
    entities_json = json.dumps(extracted_entities, indent=2)
    
    prompt = f"""
You are an expert in financial entity classification and risk assessment. I will provide you with the original transaction data along with an enriched entities list. Based on the evidence provided, please classify each entity as follows:

- For *companies*, choose one of the following classifications (only one allowed): 
  - "Shell company"
  - "Corporation"
  - "NGO"

- For *individuals*, you may assign one or both classifications:
  - "PEP"
  - "Criminal"
  - "Intermediary"

For each entity, provide a clear, concise justification that explains:
- Which evidence was used (e.g., evidence from the PEP database, Criminal Entities database, offshore leaks, consolidated sanctions, news content, etc.)
- How the risk flags (such as pep_flag, ce_flag, lei_flag for individuals; leaks_inter_flag, leaks_ent_flag, leaks_others_flag, cs_flag, ce_flag for companies) influenced your decision
- Any relevant insights drawn from the news content (for companies) or additional notes

Your task is to classify each entity based on the evidence provided and output the final result in a JSON list. Each element of the JSON list must be a dictionary with exactly the following keys:
- "name": the entity's name.
- "type": the classification result.
   - For companies, choose one of: "Shell company", "Corporation", or "NGO".
   - For individuals, you may assign one or both classifications: "PEP" and/or "Criminal". If both apply, output them as a comma-separated string.
- "justification": a detailed explanation of the classification decision, referencing the evidence provided (e.g., evidence from PEP database, Criminal Entities database, offshore leaks, consolidated sanctions, news content, etc.) and how the respective risk flags influenced the decision.

For overall risk assessment, you need to provide two fields (combined for all the entities and transaction):
- "risk_score": a risk score has to be returned based on the evidence and the type of entity, transaction intermediaries, or companies. Only give the combined risk score, not the reasoning, not the various risk scores 
    The complete risk score framework is below.
      1.Sanctions Risk: Calculate based on the number and severity of sanctions listed for each company
        a.50 if both companies have sanctions
        b.30 if either company has sanctions
        c.10 otherwise
      2.Regulatory Risk: Assign a score (1-10) based on available data
      3.Financial Risk: Assign a score (1-10) based on available data
      4.Reputation Risk: Assign a score (1-10) based on available data
      Total Risk Score (Get a combined risk score based on the above 4 factors, between 0 and 1): Calculate the sum of all risk scores
- "confidence_score": Only return the score, not the reasoning behind it. Evaluate the reliability of the risk assessment based on:
    1.Availability of relevant data
    2.Recency of data sources
    3.Reliability of data sources

Risk Scoring Breakdown

1. Regulatory or Legal Risks: Score based on indications of lawsuits, investigations, regulatory actions, fines, compliance failures, or SEC-related scrutiny.
2. Financial Instability: Score based on contextual analysis of financial distress, bankruptcy risks, debt burdens, or cash flow issues.
3. Market Reputation Risks: Score based on insights from semantic search regarding past controversies, negative public sentiment, media scrutiny, or brand damage.

Output structure:
[[{{
    "name": <entity_name1>,
    "type": <entity_type1>,
    "justification": <detailed_justification_for_classification_of_entity_1>
}},
{{
    "name": <entity_name2>,
    "type": <entity_type2>,
    "justification": <detailed_justification_for_classification_of_entity_2>s
}}],
{{
    "risk_score": <risk_score>,
    "confidence_score": <confidence_score>,
    "risk_score_justification": <justification for risk score>,
    "confidence_score_justification": <justification for confidence score>
}}
]


Below is the original transaction data:
---------------------------------------------------
{transaction_data.strip()}
---------------------------------------------------

Below is the enriched entities list:
{extracted_entities}

*Task:*
1. For each entity in the enriched entities list:
   - If the entity represents an individual, classify them as "PEP", "Criminal", or both (if applicable).
   - If the entity represents a company, classify it as either "Shell company", "Corporation", or "NGO".
2. For each classification, provide a detailed justification that references the evidence fields provided.
3. Output the final classification and justification for each entity in a structured format.

Please provide the final classification and justification for each entity.
    """
    return prompt

In [118]:
print(generate_classification_prompt(final_entities, unstructured_input))


You are an expert in financial entity classification and risk assessment. I will provide you with the original transaction data along with an enriched entities list. Based on the evidence provided, please classify each entity as follows:

- For *companies*, choose one of the following classifications (only one allowed): 
  - "Shell company"
  - "Corporation"
  - "NGO"

- For *individuals*, you may assign one or both classifications:
  - "PEP"
  - "Criminal"
  - "Intermediary"

For each entity, provide a clear, concise justification that explains:
- Which evidence was used (e.g., evidence from the PEP database, Criminal Entities database, offshore leaks, consolidated sanctions, news content, etc.)
- How the risk flags (such as pep_flag, ce_flag, lei_flag for individuals; leaks_inter_flag, leaks_ent_flag, leaks_others_flag, cs_flag, ce_flag for companies) influenced your decision
- Any relevant insights drawn from the news content (for companies) or additional notes

Your task is to cla

In [122]:
def classify_with_llm(transaction_data, enriched_entities):
    prompt = generate_classification_prompt(enriched_entities, transaction_data)
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
        "Authorization": "Bearer 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",
        "Content-Type": "application/json",
        },
        data=json.dumps({
        "model": "nvidia/llama-3.1-nemotron-70b-instruct:free",
        "messages": [
            {
            "role": "user",
            "content": prompt,
            "temperature":0.1
            }
        ],
        })
    )
    return response

In [130]:
res = classify_with_llm(unstructured_input, final_entities)

In [131]:
print(res.json())

{'error': {'message': 'Provider returned error', 'code': 503, 'metadata': {'raw': '{"message":"status 429: err fallback request failed","object":"error","Type":"APITimeoutError","code":429}\n', 'provider_name': 'Targon'}}, 'user_id': 'user_2tls2NvtzOa2rB1RE0God5AIOmV'}


In [124]:
print(res.json()['choices'][0]['message']['content'])

**Entity Classifications and Justifications**
```json
[
  {
    "name": "Sanavbari Nikitenko",
    "type": "Criminal",
    "justification": "Classification as 'Criminal' is based on the presence of a perfect match in the Criminal Entities database (ce_flag: 3), indicating involvement in a terrorist organization and an illegal armed formation. The absence from the PEP database (pep_flag: 0) and lack of matching email in PEP records (pep_evidence) do not outweigh the strong evidence of criminal activity."
  },
  {
    "name": "Trevor Prescod",
    "type": "PEP",
    "justification": "Despite the email mismatch (pep_evidence), the presence in the PEP database (pep_flag: 2) with notes 'Every Politician' suggests a Politically Exposed Person. The absence from the Criminal Entities database (ce_flag: -1) and lack of other adverse evidence support this classification."
  },
  {
    "name": "Trevor Squirrell",
    "type": "PEP",
    "justification": "A match in the PEP database with a matching

In [125]:
def extract_json_objects(text):
    pattern = r"```json\s*(.*?)\s*```"
    json_strs = re.findall(pattern, text, re.DOTALL)
    json_objects = []
    for js in json_strs:
        try:
            json_objects.append(json.loads(js))
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)
    return json_objects

In [98]:
def extract_json_from_backticks(response_text):
    pattern = r"```(.*?)```"
    match = re.search(pattern, response_text, re.DOTALL)
    
    if match:
        json_str = match.group(1).strip()
        try:
            parsed_json = json.loads(json_str)
            return parsed_json
        except json.JSONDecodeError as e:
            print("Error parsing JSON:", e)
            return None
    else:
        print("No JSON block found between triple backticks.")
        return None

In [126]:
x = extract_json_objects(res.json()['choices'][0]['message']['content'])

In [129]:
x[1]

{'risk_score': 0.73,
 'confidence_score': 0.82,
 'risk_score_justification': "High risk due to multiple entities classified as 'Shell company' and individuals with 'Criminal' or 'PEP' designations, indicating potential for illicit financial activities, money laundering, or politically exposed transactions. Sanctions Risk (30, as at least one company has a sanctions-related flag), Regulatory Risk (8, due to the presence of PEPs and potential shell companies), Financial Risk (9, considering the large transaction amount and use of intermediaries), and Reputation Risk (6, from the overall entity classifications and transaction nature) contribute to this score.",
 'confidence_score_justification': 'High confidence due to the availability of detailed entity information, recent data sources (e.g., 2023 and 2024 entries in databases), and the reliability of the sources used (official databases and news content). However, the confidence is not maximal (100%) due to some incomplete matches in le

In [113]:
[{'name': 'Digital Marketing Awards FZ LLC', 'extracted_from': 'Sender section', 'type': 'company', 'address': 'COMPASS BUILDING FDRK 2508, AL SHOHADA ROAD, AL HAMRA INDUSTRIAL ZONE-FZ, RAS AL KHAIMAH, ARE, United Arab Emirates', 'country': 'United Arab Emirates'}, {'name': '8808 Holding Limited', 'extracted_from': 'Receiver section', 'type': 'company', 'address': 'TWC MANAGEMENT LIMITED SUITE D; 19/F RITZ PLAZA 122 AUSTIN ROAD TSIM SHA TSUI; KOWLOON HONG KONG', 'country': 'Hong Kong'}, {'name': 'Sanavbari Nikitenko', 'extracted_from': 'Sender Notes', 'type': 'individual', 'note': 'Insufficient information for country, phone, or email'}, {'name': 'Trevor Prescod', 'extracted_from': 'additional_notes', 'type': 'individual', 'country': 'India', 'email': 'prescod.trevor@gmail.com'}, {'name': 'Trevor Squirrell', 'extracted_from': 'additional_notes', 'type': 'individual', 'country': 'USA', 'email': 'tsquirrell@leg.state.vt.us'}, {'name': 'Corfiducia Anstalt', 'extracted_from': 'additional_notes', 'type': 'individual', 'country': 'Liechtenstein', 'note': 'Uncommon surname; potential pseudonym or company misidentified as individual'}, {'name': 'Quantum Holdings Ltd', 'extracted_from': 'additional_notes', 'type': 'company', 'country': 'BVI (British Virgin Islands)'}]

[{'name': 'Digital Marketing Awards FZ LLC',
  'extracted_from': 'Sender section',
  'type': 'company',
  'address': 'COMPASS BUILDING FDRK 2508, AL SHOHADA ROAD, AL HAMRA INDUSTRIAL ZONE-FZ, RAS AL KHAIMAH, ARE, United Arab Emirates',
  'country': 'United Arab Emirates'},
 {'name': '8808 Holding Limited',
  'extracted_from': 'Receiver section',
  'type': 'company',
  'address': 'TWC MANAGEMENT LIMITED SUITE D; 19/F RITZ PLAZA 122 AUSTIN ROAD TSIM SHA TSUI; KOWLOON HONG KONG',
  'country': 'Hong Kong'},
 {'name': 'Sanavbari Nikitenko',
  'extracted_from': 'Sender Notes',
  'type': 'individual',
  'note': 'Insufficient information for country, phone, or email'},
 {'name': 'Trevor Prescod',
  'extracted_from': 'additional_notes',
  'type': 'individual',
  'country': 'India',
  'email': 'prescod.trevor@gmail.com'},
 {'name': 'Trevor Squirrell',
  'extracted_from': 'additional_notes',
  'type': 'individual',
  'country': 'USA',
  'email': 'tsquirrell@leg.state.vt.us'},
 {'name': 'Corfiduci

In [114]:
[[{'name': 'Sanavbari Nikitenko', 'type': 'Criminal', 'justification': "Classification as 'Criminal' is based on the presence of a perfect match in the Criminal Entities database (ce_flag: 3) with notes indicating 'Red Notice - participation in the activity of a terrorist organization; participation in the activity of an illegal armed formation'. The absence from the PEP database (pep_flag: 0) and lack of matching email do not impact this classification due to the strong evidence from the Criminal Entities database.", 'risk_score': 0.8, 'confidence_score': 0.9}, {'name': 'Trevor Prescod', 'type': 'PEP', 'justification': " Classified as 'PEP' due to the name being found in the PEP database (pep_flag: 2), despite the email not matching. The absence from the Criminal Entities database (ce_flag: -1) and lack of other adverse evidence support this singular classification.", 'risk_score': 0.3, 'confidence_score': 0.8}, {'name': 'Trevor Squirrell', 'type': 'PEP', 'justification': "Confirmed 'PEP' classification based on a match in the PEP database with an email match (pep_flag: 6), indicating a verified Politically Exposed Person. No adverse findings in other databases support this classification.", 'risk_score': 0.2, 'confidence_score': 0.95}, {'name': 'Corfiducia Anstalt', 'type': 'Intermediary', 'justification': "Though classified as an individual, the name's presence in the Panama Papers as an ACTIVE intermediary in Liechtenstein (lei_flag: 3) and the note suggesting a potential pseudonym or misidentified company, leads to an 'Intermediary' classification. Lack of PEP or Criminal database matches (pep_flag: 0, ce_flag: -1) supports this singular focus on intermediary activity.", 'risk_score': 0.5, 'confidence_score': 0.7}, {'name': 'Digital Marketing Awards FZ LLC', 'type': 'Shell company', 'justification': 'Evidence from the Paradise Papers (leaks_ent_flag: 1) and a perfect match in the Criminal Entities database (ce_flag: 3) with reciprocal notice, alongside the lack of news content, suggests characteristics of a shell company. The absence from Consolidated Sanctions (cs_flag: 0) does not outweigh these indicators.', 'risk_score': 0.7, 'confidence_score': 0.85}, {'name': '8808 Holding Limited', 'type': 'Shell company', 'justification': "Classification as a 'Shell company' is supported by its presence in multiple leaks (leaks_inter_flag: 2, leaks_ent_flag: 1, leaks_others_flag: 1) and a near-match in the Criminal Entities database (ce_flag: 2). The company's role as an intermediary in Hong Kong further solidifies this classification.", 'risk_score': 0.85, 'confidence_score': 0.9}, {'name': 'Quantum Holdings Ltd', 'type': 'Shell company', 'justification': "Presence in various leaks (leaks_inter_flag: 1, leaks_ent_flag: 1, leaks_others_flag: 1) and a near-match in the Criminal Entities database (ce_flag: 2), coupled with the lack of confirming address data, points towards a 'Shell company' classification. News content, while present, does not directly relate to Quantum Holdings Ltd's activities, reducing its impact on classification.", 'risk_score': 0.75, 'confidence_score': 0.8}]]

[[{'name': 'Sanavbari Nikitenko',
   'type': 'Criminal',
   'justification': "Classification as 'Criminal' is based on the presence of a perfect match in the Criminal Entities database (ce_flag: 3) with notes indicating 'Red Notice - participation in the activity of a terrorist organization; participation in the activity of an illegal armed formation'. The absence from the PEP database (pep_flag: 0) and lack of matching email do not impact this classification due to the strong evidence from the Criminal Entities database.",
   'risk_score': 0.8,
   'confidence_score': 0.9},
  {'name': 'Trevor Prescod',
   'type': 'PEP',
   'justification': " Classified as 'PEP' due to the name being found in the PEP database (pep_flag: 2), despite the email not matching. The absence from the Criminal Entities database (ce_flag: -1) and lack of other adverse evidence support this singular classification.",
   'risk_score': 0.3,
   'confidence_score': 0.8},
  {'name': 'Trevor Squirrell',
   'type': 'PEP'

In [115]:
[[{'name': 'Sanavbari Nikitenko', 'type': 'Criminal', 'justification': "Classification as 'Criminal' is based on the individual's presence in the Criminal Entities database with a perfect match (ce_flag: 3), indicating involvement in a terrorist organization and an illegal armed formation. The absence from the PEP database (pep_flag: 0) and lack of matching email do not outweigh the severe criminal evidence."}, {'name': 'Trevor Prescod', 'type': 'PEP', 'justification': " Classified as 'PEP' due to the name match in the PEP database (pep_flag: 2), despite the email not matching. The individual's absence from the Criminal Entities database (ce_flag: -1) and lack of other adverse evidence support this classification."}, {'name': 'Trevor Squirrell', 'type': 'PEP', 'justification': "Confirmed 'PEP' classification based on a matching name and email in the PEP database (pep_flag: 6), indicating a clear connection to politically exposed persons. No adverse evidence from other databases (ce_flag: -1, lei_flag: 0) supports this singular classification."}, {'name': 'Corfiducia Anstalt', 'type': 'Intermediary', 'justification': "Although classified as an individual, 'Corfiducia Anstalt' is likely a misidentified company or pseudonym (note: uncommon surname). The presence in Panama Papers as an ACTIVE intermediary in Liechtenstein (lei_flag: 3) and lack of PEP or Criminal database matches (pep_flag: 0, ce_flag: -1) suggest an 'Intermediary' role, but with caution due to potential misidentification."}, {'name': 'Digital Marketing Awards FZ LLC', 'type': 'Shell company', 'justification': "Evidence suggests 'Shell company' classification due to the company's presence in multiple leaks databases (leaks_ent_flag: 1), albeit with an address mismatch, and a perfect match in the Criminal Entities database (ce_flag: 3). The absence from Consolidated Sanctions (cs_flag: 0) does not outweigh the cumulative evidence indicating potential shell company activities."}, {'name': '8808 Holding Limited', 'type': 'Shell company', 'justification': "Classified as a 'Shell company' due to extensive presence in various leaks databases (leaks_ent_flag: 1, leaks_inter_flag: 2), indicating a high likelihood of shell company activities. Although the address does not match in some records, and there's a non-perfect match in the Consolidated Sanctions database (cs_flag: 0), the cumulative evidence from leaks databases and a non-perfect match in the Criminal Entities database (ce_flag: 2) supports this classification."}, {'name': 'Quantum Holdings Ltd', 'type': 'Shell company', 'justification': "Evidence from multiple leaks databases (leaks_ent_flag: 1, leaks_inter_flag: 1) with a non-perfect match in the Criminal Entities database (ce_flag: 2) and absence from Consolidated Sanctions (cs_flag: 0) suggests 'Shell company' classification. The presence in news content related to stock holdings does not necessarily contradict this, as shell companies can engage in legitimate financial activities."}]]

[[{'name': 'Sanavbari Nikitenko',
   'type': 'Criminal',
   'justification': "Classification as 'Criminal' is based on the individual's presence in the Criminal Entities database with a perfect match (ce_flag: 3), indicating involvement in a terrorist organization and an illegal armed formation. The absence from the PEP database (pep_flag: 0) and lack of matching email do not outweigh the severe criminal evidence."},
  {'name': 'Trevor Prescod',
   'type': 'PEP',
   'justification': " Classified as 'PEP' due to the name match in the PEP database (pep_flag: 2), despite the email not matching. The individual's absence from the Criminal Entities database (ce_flag: -1) and lack of other adverse evidence support this classification."},
  {'name': 'Trevor Squirrell',
   'type': 'PEP',
   'justification': "Confirmed 'PEP' classification based on a matching name and email in the PEP database (pep_flag: 6), indicating a clear connection to politically exposed persons. No adverse evidence from o

In [132]:
result_json = {}
result_json['Transaction ID'] = extracted_json['transaction_id']
result_json['Extracted Entity'] = [x['name'] for x in x[0]]
result_json['Entity Types'] = [x['type'] for x in x[0]]
result_json['Risk Score'] = x[1]['risk_score']
result_json['Confidence Score'] = x[1]['confidence_score']
result_json['Supporting Evidence'] = [x['justification'] for x in x[0]]
result_json['Reason'] = extracted_json['risk_evaluation']

In [133]:
result_json

{'Transaction ID': 'TXN-2023-5A9B',
 'Extracted Entity': ['Sanavbari Nikitenko',
  'Trevor Prescod',
  'Trevor Squirrell',
  'Corfiducia Anstalt',
  'Digital Marketing Awards FZ LLC',
  '8808 Holding Limited',
  'Quantum Holdings Ltd'],
 'Entity Types': ['Criminal',
  'PEP',
  'PEP',
  'Criminal, PEP',
  'Shell company',
  'Shell company',
  'Shell company'],
 'Risk Score': 0.73,
 'Confidence Score': 0.82,
 'Supporting Evidence': ["Classification as 'Criminal' is based on the presence of a perfect match in the Criminal Entities database (ce_flag: 3), indicating involvement in a terrorist organization and an illegal armed formation. The absence from the PEP database (pep_flag: 0) and lack of matching email in PEP records (pep_evidence) do not outweigh the strong evidence of criminal activity.",
  "Despite the email mismatch (pep_evidence), the presence in the PEP database (pep_flag: 2) with notes 'Every Politician' suggests a Politically Exposed Person. The absence from the Criminal Ent