In [1]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [2]:
pip install openpyxl



In [10]:
import pandas as pd
import spacy
import chardet
import re
from sklearn.ensemble import IsolationForest
import requests

def detect_encoding(file_path, default="utf-8"):
    """Guess the text encoding of a file so it can be opened without UnicodeDecodeError.

    The whole file is read as bytes and handed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.
        default: Encoding name returned when chardet reports no encoding
            (its result may carry ``None``, e.g. for empty input).

    Returns:
        The detected encoding name, or ``default`` when none was detected.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        raw_data = f.read()

    # Detection does not need the file handle, so it runs after the file is closed.
    result = chardet.detect(raw_data)

    # Without this fallback a None result would make open() silently use the
    # platform's default encoding, which differs between machines.
    return result.get("encoding") or default

def extract_transactions(file_path):
    """Extract one structured transaction record per non-blank line of a text file.

    Each line is scanned with regular expressions:

    * amount -- a number whose integer part is either comma-grouped
      ("1,234,567") or plain ("5000"), with an optional decimal part. A number
      prefixed with "$" is preferred; otherwise the first standalone number on
      the line is used. Tokens that are part of a longer dotted/comma'd number
      (e.g. "1.234.567" or "12.05.2023") are rejected rather than truncated.
    * payer / receiver -- the first and second runs of capitalised words.
      This is a heuristic: any capitalised word qualifies.

    Args:
        file_path: Path of the unstructured text file.

    Returns:
        pandas.DataFrame with columns ``transaction_details``, ``payers_name``,
        ``receiver_name`` and ``amount``. ``amount`` is None/NaN when no amount
        was recognised; names default to "Unknown". The columns are present even
        when the file contains no transactions.

    Raises:
        OSError: If the file cannot be opened.
    """
    columns = ["transaction_details", "payers_name", "receiver_name", "amount"]

    # Comma is the only thousands separator; "." only introduces decimals.
    number = r"(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?"
    # A match must not stop in the middle of a longer numeric token.
    not_mid_number = r"(?![.,]?\d)"
    dollar_amount = re.compile(r"\$\s?(" + number + r")" + not_mid_number)
    bare_amount = re.compile(r"(?<![\d.,])(" + number + r")" + not_mid_number)
    name_pattern = re.compile(r"\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b")

    encoding = detect_encoding(file_path)

    transactions = []
    with open(file_path, "r", encoding=encoding, errors="replace") as f:
        for line in f:
            details = line.strip()
            if not details:
                # Blank lines carry no transaction; skip them instead of
                # emitting an all-"Unknown" row.
                continue

            # Prefer an explicitly currency-marked value over reference
            # numbers or dates that may appear earlier on the line.
            amount_match = dollar_amount.search(details) or bare_amount.search(details)
            names = name_pattern.findall(details)

            amount = float(amount_match.group(1).replace(",", "")) if amount_match else None
            payer = names[0] if len(names) > 0 else "Unknown"
            receiver = names[1] if len(names) > 1 else "Unknown"

            transactions.append({
                "transaction_details": details,
                "payers_name": payer,
                "receiver_name": receiver,
                "amount": amount
            })

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(transactions, columns=columns)

def extract_entities(text, nlp):
    """Return the text of every ORG or PERSON entity found in ``text``.

    Args:
        text: String to analyse.
        nlp: Callable (a loaded SpaCy pipeline) that turns ``text`` into a
            document exposing ``.ents``; each entity has ``.text`` and ``.label_``.

    Returns:
        List of entity strings in document order (duplicates are kept).
    """
    wanted_labels = ["ORG", "PERSON"]
    found = []
    for span in nlp(text).ents:
        if span.label_ in wanted_labels:
            found.append(span.text)
    return found

def enrich_entity(name):
    """Look up an entity in the OpenCorporates company-search API.

    Args:
        name: Entity name to search for.

    Returns:
        dict -- the decoded JSON body on HTTP 200. Otherwise a dict with
        ``name`` and ``error`` keys: "No data found" for a non-200 status,
        "API request failed" when the request raised or the body was not
        valid JSON. A JSON body that is not an object is wrapped as
        ``{"name": name, "results": <body>}`` so callers can always treat the
        result as a dict.
    """
    search_url = "https://api.opencorporates.com/v0.4/companies/search"
    try:
        # Pass the query via ``params`` so requests URL-encodes it; names
        # containing "&", "#" or "?" would otherwise corrupt the query string.
        response = requests.get(search_url, params={"q": name}, timeout=10)
        if response.status_code != 200:
            return {"name": name, "error": "No data found"}
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers JSON decode failures, which some requests versions
        # do not raise as a RequestException.
        return {"name": name, "error": "API request failed"}

    if not isinstance(payload, dict):
        return {"name": name, "results": payload}
    return payload

def detect_anomalies(data, contamination=0.05, random_state=42):
    """Flag unusual transaction amounts with an Isolation Forest.

    The model is fitted on the ``amount`` column alone; missing amounts are
    treated as 0 for fitting.

    Args:
        data: DataFrame with an ``amount`` column. It is modified in place.
        contamination: Expected share of outliers, forwarded to IsolationForest.
        random_state: Seed forwarded to IsolationForest so repeated runs flag
            the same rows. Pass None for unseeded behaviour.

    Returns:
        The same DataFrame object with an added ``anomaly_score`` column.
        Despite its name the column holds the ``fit_predict`` labels
        (-1 for outliers, 1 for inliers), not a continuous score.
    """
    if data.empty:
        # fit_predict cannot be called with zero samples; still add the column
        # so downstream code sees a consistent schema.
        data['anomaly_score'] = pd.Series(index=data.index, dtype="int64")
        return data

    model = IsolationForest(contamination=contamination, random_state=random_state)
    data['anomaly_score'] = model.fit_predict(data[['amount']].fillna(0))
    return data

def classify_entity(name):
    """Classify an entity name by keyword.

    Keywords are compared case-insensitively against whole words of the name,
    so "acme llc" is a corporation while "Corporal Klinger" is not (a plain
    substring test would match "Corp" inside "Corporal"). Categories are
    checked in the order listed and the first hit wins.

    Args:
        name: Entity name. Non-string or blank values are classified "unknown".

    Returns:
        One of "corporation", "non-profit", "government" or "unknown".
    """
    categories = {
        # Long forms are listed explicitly because matching is whole-word.
        "corporation": ["Inc", "Incorporated", "Corp", "Corporation", "LLC", "Capital", "Partners"],
        "non-profit": ["Foundation", "Charity", "Save", "Children"],
        "government": ["Department", "Agency", "Bureau"]
    }
    if not isinstance(name, str) or not name.strip():
        return "unknown"

    # Splitting on non-letters also handles punctuation such as "Inc." or "Corp,".
    words = set(re.findall(r"[a-z]+", name.lower()))
    for category, keywords in categories.items():
        if any(keyword.lower() in words for keyword in keywords):
            return category
    return "unknown"

def assign_risk_score(entity_data):
    """Compute an additive risk score for one entity record.

    Args:
        entity_data: Mapping-like object supporting ``.get``. The keys
            ``category`` and ``anomaly_score`` are read; both are optional.

    Returns:
        int -- 5 points when ``category`` equals "shell company", plus
        3 points when ``anomaly_score`` equals -1; 0 when neither applies.
    """
    # NOTE(review): "shell company" is not among the categories returned by
    # classify_entity in this file -- confirm where that label is produced.
    is_shell_company = entity_data.get("category", "") == "shell company"
    is_flagged_outlier = entity_data.get("anomaly_score", 0) == -1

    shell_points = 5 if is_shell_company else 0
    outlier_points = 3 if is_flagged_outlier else 0
    return shell_points + outlier_points

def _excel_safe(value):
    """Turn nested containers into JSON text; openpyxl cannot store dict/list cells."""
    if isinstance(value, (dict, list, tuple, set)):
        import json  # local import: only needed when an API payload is nested
        return json.dumps(value, default=str)
    return value


def main(file_path, output_file="Risk_analysis_unstructured_output.xlsx"):
    """Process an unstructured transaction file and write a risk report to Excel.

    Pipeline:
      1. Parse transactions from ``file_path``.
      2. Flag amount anomalies once per transaction.
      3. Collect entities per transaction (SpaCy ORG/PERSON plus the
         regex-derived payer/receiver), de-duplicated, without the "Unknown"
         placeholder.
      4. Enrich and classify each distinct entity.
      5. Explode to one row per (transaction, entity), merge the enrichment,
         and compute ``risk_score`` per row, at which point both ``category``
         and ``anomaly_score`` are available.
      6. Write the result to ``output_file``.

    Args:
        file_path: Path of the unstructured text file.
        output_file: Destination .xlsx path.

    Returns:
        The final DataFrame (also written to ``output_file``). When the input
        holds no transactions, the empty frame is returned and no file is
        written.
    """
    print(f"Processing file: {file_path}")

    # Load English NLP model
    nlp = spacy.load("en_core_web_sm")

    # Extract transactions from the text file
    df = extract_transactions(file_path)
    if df.empty:
        print("No transactions found; nothing to save")
        return df

    # Detect anomalies before exploding: after the explode a transaction
    # appears once per entity, which would weight it several times in the model.
    df = detect_anomalies(df)

    # Extract entities from transaction details
    entity_lists = []
    for details, payer, receiver in zip(df['transaction_details'],
                                        df['payers_name'],
                                        df['receiver_name']):
        candidates = extract_entities(str(details), nlp) + [payer, receiver]
        # dict.fromkeys removes duplicates while keeping first-seen order.
        entity_lists.append([entity for entity in dict.fromkeys(candidates)
                             if entity and entity != "Unknown"])
    df['entities'] = pd.Series(entity_lists, index=df.index)

    # Enrich entity information (one API call per distinct entity)
    enriched_data = []
    for entity in df['entities'].explode().dropna().unique():
        entity_info = enrich_entity(entity)
        if not isinstance(entity_info, dict):
            entity_info = {"results": entity_info}
        entity_info['name'] = entity
        entity_info['category'] = classify_entity(entity)
        enriched_data.append(entity_info)

    # Merge enriched data with original dataset
    if enriched_data:
        enriched_df = pd.DataFrame(enriched_data)
    else:
        # Keep the merge key present even when no entity was found.
        enriched_df = pd.DataFrame(columns=['name', 'category'])
    final_df = df.explode('entities').merge(enriched_df, left_on='entities',
                                            right_on='name', how='left')

    # Score each row now that category and anomaly_score are both available.
    final_df['risk_score'] = [assign_risk_score(record)
                              for record in final_df.to_dict('records')]

    # Nested API payloads must be flattened to text before writing to Excel.
    for column in final_df.select_dtypes(include='object').columns:
        final_df[column] = final_df[column].map(_excel_safe)

    # Save output to an Excel file
    final_df.to_excel(output_file, index=False)
    print(f"Processed data saved to {output_file}")
    return final_df

# Entry point: run the pipeline on the sample file when this module is executed
# as __main__ (the case for a notebook cell or a direct script run).
if __name__ == "__main__":
    main(file_path="/content/sample_data/unstructured_text")


Processing file: /content/sample_data/unstructured_text
Processed data saved to Risk_analysis_unstructured_output.xlsx
