In [59]:
pip install fuzzywuzzy



In [60]:
import spacy
import re
import pandas as pd
from transformers import pipeline
import os
import json
from fuzzywuzzy import fuzz, process

# Loading the pre-trained model

In [61]:
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


# Cleaning entities returned by the model

In [62]:

def clean_entities(entities):
    cleaned_entities = []

    for entity in entities:
        entity = re.sub(r'\b[A-Z]\b', '', entity)
        entity = re.sub(r'\s+', ' ', entity).strip()
        words = entity.split()
        cleaned_entity = []
        for word in words:
            if len(word) > 1 or word in {"EU", "USD"}:
                cleaned_entity.append(word)
        cleaned_text = " ".join(cleaned_entity)
        if cleaned_text and cleaned_text not in cleaned_entities:
            cleaned_entities.append(cleaned_text)

    return cleaned_entities

# Extract with the model

In [63]:
def extract_spacy_entities_by_category(text):
    doc = nlp(text)
    result = {"ORG": [], "PERSON": [], "GPE": [], "MONEY": []}

    entity_map = {
        "I-ORG": "ORG",
        "I-PER": "PERSON",
        "I-LOC": "GPE",
        "I-MISC": "MONEY"
    }
    current_entity = ""
    current_label = ""
    prev_label = None

    for token in doc:
        entity = token["entity"]
        word = token["word"]
        if entity in entity_map:
            category = entity_map[entity]
            if word.startswith("##"):
                word = word[2:]
            if prev_label == entity:
                current_entity += " " + word
            else:
                if current_entity and current_label:
                    result[current_label].append(current_entity.strip())
                current_entity = word
                current_label = category
            prev_label = entity
        else:
            if current_entity and current_label:
                result[current_label].append(current_entity.strip())
            current_entity = ""
            current_label = ""
            prev_label = None
    if current_entity and current_label:
        result[current_label].append(current_entity.strip())
    for key in result:
        result[key] = clean_entities(list(set(result[key])))

    return result

In [64]:
def extract_custom_entities_by_category(text):
    result = {"ORG": [], "PERSON": []}
    pattern_blocked_person = re.compile(r"(Blocked Person [A-Z])")
    pattern_entity = re.compile(r"(Entity [A-Z])")

    matches_blocked_person = pattern_blocked_person.findall(text)
    matches_entity = pattern_entity.findall(text)

    result["PERSON"].extend(matches_blocked_person)
    result["ORG"].extend(matches_entity)

    for key in result:
        result[key] = list(set(result[key]))
    return result

In [65]:
def combine_entities(text):
    spacy_ents = extract_spacy_entities_by_category(text)
    custom_ents = extract_custom_entities_by_category(text)

    combined = {}
    for category in ["ORG", "PERSON", "GPE", "MONEY"]:
        combined[category] = list(set(spacy_ents.get(category, []) + custom_ents.get(category, [])))
    return combined

In [66]:
def cluster_entities(entities, threshold=80):
    unique_entities = list(set(entities))
    clusters = {}

    while unique_entities:
        base = unique_entities.pop(0)
        group = [base]
        for entity in unique_entities[:]:
            if fuzz.ratio(base.lower(), entity.lower()) >= threshold:
                group.append(entity)
                unique_entities.remove(entity)
        clusters[base] = group

    return clusters

In [67]:
def standardize_entities(clusters):
    standardized = []
    for group in clusters.values():
        representative = max(group, key=len)
        standardized.append(representative)
    return standardized

In [68]:
def process_text_by_category(text):
    combined = combine_entities(text)
    final_results = {}
    for category, ents in combined.items():
        clusters = cluster_entities(ents)
        final_results[category] = standardize_entities(clusters)
    return final_results

# Extract entities if JSON

In [69]:
def extract_entities_json(text):
    entities = {}

    # Regular expressions for extracting different fields
    patterns = {
        "Transaction ID": r"Transaction ID:\s*(\S+)",
        "Date": r"Date:\s*([\d-]+ \d{2}:\d{2}:\d{2})",
        "Sender Name": r"Sender:\s*(?:- )?Name:\s*\"?([^\n\"]+)\"?",
        "Sender Account": r"Sender:.*?Account:\s*([\w\s\(\)-]+)",
        "Sender Address": r"Sender:.*?Address:\s*([^\n]+)",
        "Sender Beneficiary": r"Beneficiary Owner:\s*\"([^\"]+)\"",
        "Sender Notes": r"Sender:.*?Notes:\s*\"([^\"]+)\"",
        "Receiver Name": r"Receiver:\s*(?:- )?Name:\s*\"?([^\n\"]+)\"?",
        "Receiver Account": r"Receiver:.*?Account:\s*([\w\s\(\)-]+)",
        "Receiver Address": r"Receiver:.*?Address:\s*([^\n]+)",
        "Receiver Tax ID": r"Receiver:.*?Tax ID:\s*([\w-]+)",
        "Receiver Registration": r"Receiver:.*?Registration:\s*([^\n]+)",
        "Amount": r"Amount:\s*\$([\d,\.]+)",
        "Currency Exchange": r"Currency Exchange:\s*([^\n]+)",
        "Transaction Type": r"Transaction Type:\s*([^\n]+)",
        "Reference": r"Reference:\s*\"([^\"]+)\"",
        "Additional Notes": r"Additional Notes:\s*(?:-\s*\"([^\"]+)\"\s*)+"
    }

    # Extract data using regex
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        entities[key] = [match.group(1) if match else ""]

    return entities

# Split each data point

In [70]:
def process_file(file_path):
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == ".txt":
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        transactions = text.split("---")
    elif file_extension==".csv":
        transactions = pd.read_csv(file_path,delimiter = ",")
    return transactions,file_extension

# Identifies if semi-structured or paragraph

In [71]:
def is_json(text):
    structured_patterns = [
        r"Transaction ID:\s*\S+",
        r"Sender:\s*- Name:",
        r"Receiver:\s*- Name:",
        r"Amount:\s*\$?\d+",
    ]

    for pattern in structured_patterns:
        if re.search(pattern, text):
            return True
    return False

In [72]:
def parse_transaction_file(file_path):
    transactions,extension = process_file(file_path)
    processed_df = None

    json_data_list = []
    para_data_list = []
    if extension == ".txt":
        for j, transaction in enumerate(transactions):
            if is_json(transaction):
                results = extract_entities_json(transaction)
                data = {key: ', '.join(value) for key, value in results.items()}
                json_data_list.append(data)
                df = pd.DataFrame(json_data_list)
                df.replace("", pd.NA, inplace=True)
                processed_df = df.dropna(axis=1, how='all')
                processed_df.to_csv('semi_structured.csv', index=False)
            else:
                results = process_text_by_category(transaction)
                data = {key: ', '.join(value) for key, value in results.items()}
                para_data_list.append(data)
                processed_df = pd.DataFrame(para_data_list)
                processed_df.to_csv('unstructured.csv', index=False)
    else:
        processed_df = transactions
    processed_df = processed_df.fillna("NaN")
    return processed_df

# Structured input

In [73]:
df_unstructured = parse_transaction_file("/content/transactions.csv")
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)
pd.set_option("display.colheader_justify", "center")
pd.set_option("display.max_colwidth", None)
print(df_unstructured.head())

  Transaction ID         Date                   Sender Name                             Sender Account                              Sender Address                    Receiver Name                    Receiver Account                         Receiver Address                Amount  Transaction Type
0  TXN-2023-5A9B  2023-08-15 14:22:00  Global Horizons Consulting LLC      IBAN CH56 0483 5012 3456 7800 9 (Swiss bank)  Rue du Marché 17, Geneva, Switzerland  Bright Future Nonprofit Inc  987654321 (Cayman National Bank, KY)  P.O. Box 1237, George Town, Cayman Islands   49850.0   Wire Transfer 
1  TXN-2023-7C8D  2023-09-10 11:30:00              Sunrise Trading Co  IBAN DE89 3704 0044 0532 0130 00 (Deutsche Bank)      Friedrichstr. 12, Berlin, Germany              Hope Foundation                  1122334455 (HSBC UK)               10 Downing Street, London, UK  105000.5   Wire Transfer 
2  TXN-2023-9X0Y  2023-11-20 16:45:00            FutureTech Solutions   IBAN US29 1234 5678 9101 1121 (Bank o

# Semi-structured input

In [74]:
df = parse_transaction_file("/content/entity_semistructured.txt")
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.max_colwidth", None)
print(df.head())

  Transaction ID         Date                   Sender Name           Sender Beneficiary        Receiver Name            Amount       Currency Exchange      Transaction Type                   Reference                                                    Additional Notes                               
0  TXN-2023-7C2D  2023-08-15 14:25:00            Quantum Holdings Ltd   Maria Gonzalez       Golden Sands Trading FZE  950,000.00  EUR -> USD (Rate: 1.12)           SWIFT     Commodity Trade Settlement – Contract #DX-889           Invoice attached: 'Oil Equipment Procurement' (PDF missing metadata).
1  TXN-2023-5A9B  2023-08-15 14:22:00  Global Horizons Consulting LLC              NaN    Bright Future Nonprofit Inc   49,850.00                        N/A   Wire Transfer         Charitable Donation - Ref #DR-2023-0815  Linked invoice missing. Processed via intermediary Quantum Holdings Ltd (BVI).


# Unstructured input

In [75]:
df = parse_transaction_file("/content/entity_unstructured.txt")
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.max_colwidth", None)
print(df.head())

                      ORG                           PERSON                 GPE            MONEY
0                                    Entity B  Blocked Person X                                
1  Alice Smith, John, SB Bank, Barclay London                Do  United Kingdom, New York      
2                           Google LLC Morgan                                                  
