In [None]:
# Optional one-time setup (uncomment to run; %pip installs into the kernel's environment):
# %pip install spacy ipython regex thefuzz pandas numpy
# !python -m spacy download en_core_web_sm

In [1]:
import re
import spacy
from IPython.display import display, Markdown

# Load spaCy's English model once at module level; lemmatize_text() below
# reuses this shared `nlp` pipeline object on every call.
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    """Normalize a string for matching.

    Lowercases the input, deletes every character that is neither a word
    character (letters, digits, underscore) nor whitespace, and trims
    leading/trailing whitespace. Interior whitespace is left untouched.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text.lower()).strip()

def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    The text is run through the module-level `nlp` pipeline and the
    `lemma_` of each resulting token is joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [2]:
from typing import List, Dict, Tuple
import csv
def get_ofac_sanctions_list(file_path: str = "sdn.csv") -> Tuple[List[str], Dict[str, Dict]]:
    """
    Parse an OFAC SDN CSV file into a flat name list and a record lookup.

    Rows are read positionally (no header row is skipped); only the first
    12 columns are used and rows with fewer columns are ignored. Every field
    is stripped of surrounding whitespace so that a padded null marker such
    as "-0- " is recognised as null.

    Args:
        file_path: Path to OFAC SDN CSV file

    Returns:
        Tuple of (names_list, full_records_dict):
        - names_list: each kept row's SDN_Name followed by any aliases
          extracted from its Remarks, in file order (duplicates are kept).
        - full_records_dict: one dict per kept row keyed by ent_num, holding
          the 12 positional fields plus an "aliases" list. A later row with
          the same ent_num replaces the earlier one.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Positional column names, per dat_spec.txt
    field_names = (
        "ent_num", "SDN_Name", "SDN_Type", "Program", "Title", "Call_Sign",
        "Vess_type", "Tonnage", "GRT", "Vess_flag", "Vess_owner", "Remarks",
    )
    null_marker = "-0-"

    names = []
    records = {}

    # newline='' hands newline handling to the csv module, as its documentation
    # requires, so quoted fields containing line breaks are parsed correctly.
    with open(file_path, mode='r', encoding='utf-8', newline='') as csvfile:
        # Using csv.reader to handle quoted fields properly
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')

        for row in reader:
            if len(row) < len(field_names):  # Minimum expected columns
                continue

            # zip() stops after the 12 named columns; strip() removes padding
            # so comparisons against the null marker are reliable.
            record = {key: value.strip() for key, value in zip(field_names, row)}
            record["aliases"] = []

            # Skip rows whose name is the null marker
            if record["SDN_Name"] == null_marker:
                continue

            names.append(record["SDN_Name"])
            records[record["ent_num"]] = record

            remarks = record["Remarks"]
            if remarks != null_marker:
                # "Strong" aliases: a.k.a. followed by a single-quoted name
                strong_aliases = re.findall(r"a\.k\.a\.\s+'([^']+)'", remarks)
                # "Weak" aliases: any double-quoted text in the remarks
                weak_aliases = re.findall(r'"([^"]+)"', remarks)

                record["aliases"] = strong_aliases + weak_aliases
                names.extend(record["aliases"])

    return names, records
# Keep only the flat name list (element 0 of the returned tuple); the
# per-entity record dictionary is discarded. Reads the default "sdn.csv" path.
SANCTIONS_LIST = get_ofac_sanctions_list()[0]

In [3]:
from thefuzz import fuzz
class FuzzyMatcher:
    """Fuzzy name matcher over a pre-normalized sanctions list.

    Every sanctions name is normalized once at construction time with
    clean_text() followed by lemmatize_text(); queries receive the same
    normalization inside find_matches(), so callers pass raw names.
    """

    # Column order of the DataFrame returned by find_matches(). Fixing it
    # gives an empty result the same schema as a non-empty one.
    RESULT_COLUMNS = ["sanctioned_name", "query", "score", "match_type"]

    def __init__(self, sanctions_list):
        """Normalize and store each name from `sanctions_list` (iterable of str)."""
        self.sanctions = [lemmatize_text(clean_text(name)) for name in sanctions_list]

    def find_matches(self, query, threshold=85):
        """Find potential sanctions matches

        Args:
            query: Raw name to screen; it is normalized here.
            threshold: Minimum token_sort_ratio score for a name to be reported.

        Returns:
            pandas.DataFrame with the columns in RESULT_COLUMNS, one row per
            sanctions name scoring at or above `threshold`. The frame is empty
            (but still carries those columns) when nothing matches.
        """
        # Local import: the notebook imports pandas in a later cell, so this
        # keeps the class usable as soon as its own cell has run.
        import pandas as pd

        query = lemmatize_text(clean_text(query))
        matches = []
        for sanctioned_name in self.sanctions:
            score = fuzz.token_sort_ratio(query, sanctioned_name)
            if score < threshold:
                continue
            matches.append({
                "sanctioned_name": sanctioned_name,
                "query": query,
                "score": score,
                "match_type": self._get_match_type(score)
            })
        return pd.DataFrame(matches, columns=self.RESULT_COLUMNS)

    @staticmethod
    def _get_match_type(score):
        """Map a numeric score to a label using the 95 / 85 / 70 cut-offs."""
        if score >= 95:
            return "Exact Match"
        if score >= 85:
            return "Strong Match"
        if score >= 70:
            return "Possible Match"
        return "No Match"

# Initialize the shared matcher used by screen_transaction_entity().
# Construction normalizes every name in SANCTIONS_LIST once, up front.
matcher = FuzzyMatcher(SANCTIONS_LIST)

In [78]:
def screen_transaction_entity(entity_name):
    """Screen one entity name against the sanctions matcher.

    Args:
        entity_name: Raw entity name (str). It is passed to the matcher
            unmodified because find_matches() already applies clean_text()
            and lemmatize_text(); normalizing here as well would run the
            pipeline twice and could make the query diverge from the
            once-normalized sanctions list.

    Returns:
        dict with:
        - "Risk Score": highest match score as a plain int capped at 100,
          or 0 when the matcher returns no rows.
        - "Flags": "SANCTION_MATCH" when the score is above 0, else "NO_MATCH".
    """
    matches = matcher.find_matches(entity_name)

    ofac_risk_score = 0
    if not matches.empty:
        # Best score across all candidates; int() avoids leaking a numpy scalar.
        ofac_risk_score = int(min(100, matches["score"].max()))

    return {
        "Risk Score": ofac_risk_score,
        "Flags": "SANCTION_MATCH" if ofac_risk_score > 0 else "NO_MATCH"
    }

In [22]:
import pandas as pd
import numpy as np

In [56]:
# Load the transactions to screen; header=0 takes column names from the first CSV row.
transaction_data=pd.read_csv("structured_dataset.csv",header=0)
# Alternative input (disabled): a larger generated dataset read the same way.
#transaction_data=pd.read_csv("structured_large_generated_dataset.csv",header=0)
# Preview the first rows to confirm the file parsed as expected.
transaction_data.head()

Unnamed: 0,Transaction ID,Payer Name,Receiver Name,Transaction Details,Amount,Receiver Country
0,TXN001,Acme Corp,SovCo Capital Partners,Payment for services rendered,$500000,USA
1,TXN002,Global Health Foundation,Save the Children,Grant disbursement,$2000000,UK
2,TXN003,XYZ Ltd,ABC GmbH,Purchase of office supplies,$15000,Germany
3,TXN004,Green Earth Org,CCMI,Environmental project funding,$750000,Cayman Islands
4,TKN005,Oceanic Holdings LLC,Alias Chiriamas,Offshore Investment,$5000000,Panama


In [24]:
# Sanity check: (row count, column count) of the loaded transactions.
transaction_data.shape

(7, 6)

In [None]:
#Ensure the column is reset
#transaction_data['OFAC Risk Score'] = None

# Populate the new column with computed values
# transaction_data['OFAC Risk Score'] = transaction_data.apply(
#     lambda row: (screen_transaction_entity(row['Payer Name'])['Risk Score'] + screen_transaction_entity(row['Receiver Name'])['Risk Score']), axis=1
# )
# transaction_data


In [85]:
# Create (or reset) the two result columns with None so that re-running this
# cell starts from empty values before they are filled in below.
transaction_data['OFAC Risk Score'] = None
transaction_data['OFAC Risk Details'] = None

# Define a function to process each row
def process_row(row):
    """Screen both parties of one transaction row.

    Each of the payer and receiver names is screened exactly once. Returns a
    pandas Series holding the sum of the two risk scores and a text summary
    of both flags.
    """
    payer = screen_transaction_entity(row['Payer Name'])
    receiver = screen_transaction_entity(row['Receiver Name'])

    total_score = payer['Risk Score'] + receiver['Risk Score']
    details = 'Payer:' + payer['Flags'] + ' ' + 'Receiver:' + receiver['Flags']

    return pd.Series({
        'OFAC Risk Score': total_score,
        'OFAC Risk Details': details,
    })

# Apply process_row to every row (axis=1) and write the two returned fields
# into the matching columns of the dataframe.
transaction_data[['OFAC Risk Score', 'OFAC Risk Details']] = transaction_data.apply(process_row, axis=1)

# Display the screened transactions as the cell output.
transaction_data

Unnamed: 0,Transaction ID,Payer Name,Receiver Name,Transaction Details,Amount,Receiver Country,OFAC Risk Score,OFAC Risk Details
0,TXN001,Acme Corp,SovCo Capital Partners,Payment for services rendered,$500000,USA,0,Payer:NO_MATCH Receiver:NO_MATCH
1,TXN002,Global Health Foundation,Save the Children,Grant disbursement,$2000000,UK,0,Payer:NO_MATCH Receiver:NO_MATCH
2,TXN003,XYZ Ltd,ABC GmbH,Purchase of office supplies,$15000,Germany,0,Payer:NO_MATCH Receiver:NO_MATCH
3,TXN004,Green Earth Org,CCMI,Environmental project funding,$750000,Cayman Islands,0,Payer:NO_MATCH Receiver:NO_MATCH
4,TKN005,Oceanic Holdings LLC,Alias Chiriamas,Offshore Investment,$5000000,Panama,0,Payer:NO_MATCH Receiver:NO_MATCH
5,TXN9354,CASA CUBA,Apple,Project funding,$320000,Japan,86,Payer:SANCTION_MATCH Receiver:NO_MATCH
6,TXN8326,AEROCARIBBEAN AIRLINE,AEROCARIBEAN AIR,Project funding,$2730000,Cuba,186,Payer:SANCTION_MATCH Receiver:SANCTION_MATCH


In [86]:
# Save the screened DataFrame to a CSV file; index=False omits the row index
# so it is not written out as an extra column.
transaction_data.to_csv('transaction_data_with_ofac_risk.csv', index=False)