In [None]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
pip install openpyxl



In [None]:
import pandas as pd
import spacy
from sklearn.ensemble import IsolationForest
import requests

def _parse_amount(value):
    """Convert one raw ``Amount`` cell to a float; return NaN when it cannot be parsed."""
    nan = float("nan")

    # Cells that are already numbers pass through unchanged. Round-tripping them
    # through str() would drop the sign and mangle scientific notation
    # (e.g. 1e-05 -> "1e-05" -> "105"). bool is excluded: it is not an amount.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return float(value)
    if value is None:
        return nan

    text = str(value)
    digits = "0123456789"

    # Keep only digits and the two separator characters (currency symbols,
    # spaces, letters, ... are discarded).
    kept = "".join(ch for ch in text if ch in digits + ".,")
    if not kept:
        return nan

    # A minus sign that appears before the first digit marks a negative amount.
    first_digit = next((i for i, ch in enumerate(text) if ch in digits), len(text))
    negative = "-" in text[:first_digit]

    if "," in kept and "." in kept:
        # Both separators present: the one that occurs last is the decimal mark,
        # the other one is a thousands separator.
        if kept.rfind(",") > kept.rfind("."):
            kept = kept.replace(".", "").replace(",", ".")
        else:
            kept = kept.replace(",", "")
    elif kept.count(",") > 1:
        # Several commas and no dot: they can only be thousands separators.
        kept = kept.replace(",", "")
    elif kept.count(".") > 1:
        # Several dots and no comma: likewise thousands separators.
        kept = kept.replace(".", "")
    else:
        # A single comma is treated as a decimal comma.
        kept = kept.replace(",", ".")

    try:
        number = float(kept)
    except ValueError:
        return nan
    return -number if negative else number


def load_data(file_path):
    """Read the transactions workbook and normalise its ``Amount`` column to floats.

    Args:
        file_path: Path of the Excel file, read with the ``openpyxl`` engine.

    Returns:
        The loaded DataFrame with ``Amount`` converted to float. Cells that
        cannot be interpreted as a number become NaN.

    Raises:
        KeyError: If the workbook has no ``Amount`` column.
    """
    df = pd.read_excel(file_path, engine="openpyxl")

    # Clean the Amount column cell by cell: strips currency symbols and text,
    # keeps the sign, and resolves thousands/decimal separators.
    df["Amount"] = df["Amount"].map(_parse_amount).astype(float)

    return df

def extract_entities(text, nlp):
    """Collect the text of every ORG or PERSON entity found in ``text``.

    Args:
        text: The text to analyse.
        nlp: Callable that takes the text and returns an object whose ``ents``
            attribute is iterable; each entity exposes ``text`` and ``label_``.

    Returns:
        A list of entity texts, in the order they appear in ``ents``.
    """
    wanted_labels = ["ORG", "PERSON"]
    found = []
    for span in nlp(text).ents:
        if span.label_ in wanted_labels:
            found.append(span.text)
    return found

def enrich_entity(name):
    """Look up ``name`` in the OpenCorporates company search API.

    Args:
        name: Entity name to search for.

    Returns:
        The decoded JSON response (a dict) when the request succeeds with
        HTTP 200. Otherwise a dict with the keys ``name`` and ``error``:
        ``"No data found"`` for a non-200 status or a payload that is not a
        JSON object, ``"API request failed"`` when the request raised or the
        body was not valid JSON.
    """
    search_url = "https://api.opencorporates.com/v0.4/companies/search"
    try:
        # Pass the query through ``params`` so requests URL-encodes it; names
        # containing spaces, "&" or "#" would otherwise corrupt the query string.
        response = requests.get(search_url, params={"q": name}, timeout=10)
        if response.status_code != 200:
            return {"name": name, "error": "No data found"}
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON on requests versions
        # whose decode error is not a RequestException.
        print(f"Error fetching data for {name}: {e}")
        return {"name": name, "error": "API request failed"}

    # Callers add keys to the result, so always hand back a dict.
    if not isinstance(payload, dict):
        return {"name": name, "error": "No data found"}
    return payload

def detect_anomalies(data, contamination=0.05, random_state=42):
    """Flag unusual ``Amount`` values with an IsolationForest.

    The result is written to a new ``anomaly_score`` column on ``data`` (the
    frame is modified in place and also returned). The column holds the labels
    returned by ``fit_predict``; downstream code treats ``-1`` as anomalous.
    Rows whose ``Amount`` is missing are left out of the fit and get NaN.

    Args:
        data: DataFrame with a numeric ``Amount`` column.
        contamination: Passed to ``IsolationForest``.
        random_state: Seed passed to ``IsolationForest`` so repeated runs give
            the same labels.

    Returns:
        ``data`` with the ``anomaly_score`` column added.
    """
    # Positional boolean mask, so it is safe even if the index has duplicates.
    has_amount = data['Amount'].notna().to_numpy()

    # Nothing to fit on (empty frame or every amount missing).
    if not has_amount.any():
        data['anomaly_score'] = float('nan')
        return data

    model = IsolationForest(contamination=contamination, random_state=random_state)
    labels = model.fit_predict(data.loc[has_amount, ['Amount']])

    if has_amount.all():
        data['anomaly_score'] = labels
    else:
        # Rows without an amount cannot be scored; leave them as NaN.
        data['anomaly_score'] = float('nan')
        data.loc[has_amount, 'anomaly_score'] = labels
    return data

def classify_entity(name):
    """Assign a coarse category to an entity name by keyword lookup.

    Categories are tried in the order corporation, non-profit, government, and
    the first one with a keyword occurring in ``name`` wins. Matching is a
    case-sensitive substring test.

    Args:
        name: Entity name. Non-string values (e.g. numeric spreadsheet cells)
            are converted with ``str``; ``None`` is treated as an empty name.

    Returns:
        ``"corporation"``, ``"non-profit"``, ``"government"`` or ``"unknown"``.
    """
    categories = {"corporation": ["Inc", "Corp", "LLC", "Capital", "Partners"],
                  "non-profit": ["Foundation", "Charity", "Save", "Children"],
                  "government": ["Department", "Agency", "Bureau"]}

    # ``word in name`` raises TypeError for ints/floats/None, so normalise first.
    if name is None:
        name = ""
    elif not isinstance(name, str):
        name = str(name)

    for category, keywords in categories.items():
        if any(word in name for word in keywords):
            return category
    return "unknown"

def assign_risk_score(entity_data):
    """Compute an additive risk score from an entity record.

    Args:
        entity_data: Mapping supporting ``.get``; the keys ``category`` and
            ``anomaly_score`` are read when present.

    Returns:
        An int: 5 points when the category is ``"shell company"``, plus
        3 points when the anomaly score equals ``-1``.
    """
    is_shell = entity_data.get("category", "") == "shell company"
    is_anomalous = entity_data.get("anomaly_score", 0) == -1
    return (5 if is_shell else 0) + (3 if is_anomalous else 0)

def main(file_path, output_path="Risk_analysis_structured_output.xlsx"):
    """Run the full pipeline: load, score, extract entities, enrich, export.

    Steps:
        1. Load the workbook and clean the ``Amount`` column.
        2. Run anomaly detection once per transaction.
        3. Extract ORG/PERSON entities from ``transaction_details`` and add the
           payer and receiver names.
        4. Enrich and classify every distinct entity.
        5. Join the entity information back onto one row per
           (transaction, entity) pair, compute the risk score and write the
           result to Excel.

    Args:
        file_path: Path of the input Excel workbook.
        output_path: Where the resulting workbook is written.
    """
    nlp = spacy.load("en_core_web_sm")
    df = load_data(file_path)

    # Score transactions before they are exploded per entity. Fitting on the
    # exploded frame would count a transaction once per entity it mentions, and
    # the risk score below needs ``anomaly_score`` to exist already.
    df = detect_anomalies(df)

    df['entities'] = df.apply(
        lambda row: extract_entities(str(row['transaction_details']), nlp)
        + [row['Payers_name'], row['Receiver_Name']],
        axis=1,
    )

    enriched_data = []
    for entity in df['entities'].explode().dropna().unique():
        entity_info = enrich_entity(entity)
        # The API payload is decoded JSON; keep going if it is not an object.
        if not isinstance(entity_info, dict):
            entity_info = {}
        entity_info['name'] = entity
        entity_info['category'] = classify_entity(entity)
        enriched_data.append(entity_info)

    enriched_df = pd.DataFrame(enriched_data)
    if enriched_df.empty:
        # Keep the merge key available when no entity was found at all.
        enriched_df = pd.DataFrame(columns=['name', 'category'])

    final_df = df.explode('entities').merge(enriched_df, left_on='entities', right_on='name', how='left')

    # Risk is computed per merged row so that both the entity category and the
    # transaction's anomaly flag are visible to assign_risk_score.
    final_df['risk_score'] = [assign_risk_score(record) for record in final_df.to_dict('records')]

    final_df.to_excel(output_path, index=False)
    print(f"Processed data saved to {output_path}")

# Entry point: run the pipeline on the sample dataset when executed directly.
if __name__ == "__main__":
    main(file_path="/content/sample_data/dataset.xlsx")

Processed data saved to Risk_analysis_structured_output.xlsx
