# Final Code

In [55]:
import re
import pandas as pd
from itertools import combinations

# =========================================================
# 1  NEGATION LIST
# =========================================================
# Regex cues that mark a preceding negation. They are searched in lowercased
# text, so every alternative is written in lowercase.
# Contractions accept a straight ('), a typographic (’) or a missing apostrophe,
# so "didn't", "didn’t" and "didnt" are all recognised.
NEGATIONS = [
    # standalone negators
    r"\bnot\b", r"\bno\b", r"\bnever\b", r"\bwithout\b", r"\bcannot\b",
    # past / present auxiliaries
    r"\bdidn['’]?t\b", r"\bwasn['’]?t\b", r"\bweren['’]?t\b", r"\bisn['’]?t\b",
    r"\baren['’]?t\b", r"\bhasn['’]?t\b", r"\bhaven['’]?t\b", r"\bhadn['’]?t\b",
    r"\bdoesn['’]?t\b", r"\bdon['’]?t\b",
    # modals
    r"\bcouldn['’]?t\b", r"\bwouldn['’]?t\b", r"\bshouldn['’]?t\b",
    r"\bcan['’]?t\b", r"\bwon['’]?t\b",
]

# =========================================================
# 2  FACT PATTERN REGEX TEMPLATES
#     ({{0,{gap}}} = placeholder for configurable gap)
# =========================================================
# Maps a theme name to a list of regex *templates*.
# Each template is a str.format() string: the doubled braces in `.{{0,{gap}}}`
# come out of format(gap=N) as literal braces, giving `.{0,N}` -- i.e. "up to N
# arbitrary characters" between the anchor words on either side.
# The templates carry no \b anchors, so an anchor word can also match inside a
# longer word. Everything is lowercase because the detector lowercases its input.
FACT_PATTERNS = {
    # Cash deposits that are repeated, split, or kept under a reporting threshold.
    "structuring": [
        r"multiple.{{0,{gap}}}cash.{{0,{gap}}}deposit(s)?",
        r"structur(ed|ing|es)?.{{0,{gap}}}(deposit|transaction|activity|pattern)(s)?",
        r"below.{{0,{gap}}}report(ing|ed)?.{{0,{gap}}}threshold(s)?",
        r"split.{{0,{gap}}}(deposit|transaction)(s)?",
        r"under.{{0,{gap}}}(10k|10000)",
        r"frequent.{{0,{gap}}}small.{{0,{gap}}}deposit(s)?",
        r"avoid.{{0,{gap}}}ctr",
        r"aware.{{0,{gap}}}report(ing)?.{{0,{gap}}}(limit|threshold)"
    ],
    # Wires/transfers described as international, foreign, offshore, cross-border or overseas.
    "foreign_transfer": [
        r"(international|foreign).{{0,{gap}}}(wire|transfer)(s)?",
        r"offshore.{{0,{gap}}}account(s)?",
        r"cross.{{0,{gap}}}border.{{0,{gap}}}(transaction|transfer)(s)?",
        r"foreign.{{0,{gap}}}beneficiar(y|ies)",
        r"(fund|money|amount)(s)?.{{0,{gap}}}(sent|wired|transferred).{{0,{gap}}}(overseas|abroad)",
        r"(receive|received|incoming).{{0,{gap}}}(fund|wire|transfer)(s)?.{{0,{gap}}}(foreign|international)"
    ],
    # Funds whose origin is unknown, unexplained or undocumented.
    # Several entries contain a negation word ("not", "no") as part of the phrase itself.
    "unknown_source": [
        r"unknown.{{0,{gap}}}(origin|source)",
        r"unexplained.{{0,{gap}}}(fund|transaction)(s)?",
        r"unable.{{0,{gap}}}(provide|verify).{{0,{gap}}}(document|info)(s|ation)?",
        r"customer.{{0,{gap}}}could.{{0,{gap}}}not.{{0,{gap}}}explain",
        r"source.{{0,{gap}}}of.{{0,{gap}}}fund(s)?.{{0,{gap}}}(unclear|unknown|unverifiable)",
        r"no.{{0,{gap}}}(support|supporting).{{0,{gap}}}document(s)?",
        r"activity.{{0,{gap}}}inconsistent.{{0,{gap}}}(occupation|employment)",
        r"no.{{0,{gap}}}legitimate.{{0,{gap}}}(reason|purpose)"
    ],
    # Activity by, for, or involving someone other than the account holder.
    "third_party": [
        r"third.{{0,{gap}}}part(y|ies)",
        r"fund(s)?.{{0,{gap}}}from.{{0,{gap}}}unrelated.{{0,{gap}}}(account|entity)(s)?",
        r"acting.{{0,{gap}}}on.{{0,{gap}}}behalf.{{0,{gap}}}(of|for)",
        r"(money|fund)(s)?.{{0,{gap}}}transfer(red|ring)?.{{0,{gap}}}to.{{0,{gap}}}unrelated",
        r"account.{{0,{gap}}}used.{{0,{gap}}}for.{{0,{gap}}}another.{{0,{gap}}}(person|individual|entity)",
        r"deposit.{{0,{gap}}}made.{{0,{gap}}}by.{{0,{gap}}}(different|unknown).{{0,{gap}}}(person|individual)"
    ],
    # Funds moved out soon after arriving: same-day, back-to-back, near-zero balance.
    "rapid_movement": [
        r"(immediate|same.{{0,{gap}}}day).{{0,{gap}}}(withdrawal|transfer)(s)?",
        r"rapid.{{0,{gap}}}(inflow|outflow)",
        r"fund(s)?.{{0,{gap}}}(moved|transferred).{{0,{gap}}}(shortly|quickly)",
        r"(zero|near).{{0,{gap}}}balance.{{0,{gap}}}(maintain|kept)",
        r"quick.{{0,{gap}}}movement.{{0,{gap}}}of.{{0,{gap}}}fund(s)?",
        r"(back.{{0,{gap}}}to.{{0,{gap}}}back).{{0,{gap}}}(transaction|transfer)(s)?"
    ],
    # Demeanor cues: nervous, reluctant, evasive, or asking to split a transaction.
    # In the second entry the trailing (info|information|details) group is optional.
    "customer_behavior": [
        r"customer.{{0,{gap}}}(appear|seem)(ed)?.{{0,{gap}}}(nervous|anxious)",
        r"(reluctant|refused|unwilling).{{0,{gap}}}to.{{0,{gap}}}(provide|share|discuss).{{0,{gap}}}(info|information|details)?",
        r"(became|appeared).{{0,{gap}}}(defensive|agitated|evasive)",
        r"avoid(ed|ing).{{0,{gap}}}(question|eye contact)",
        r"request(ed)?.{{0,{gap}}}to.{{0,{gap}}}split.{{0,{gap}}}(transaction|deposit)(s)?"
    ],
    # Elderly customer plus scam cues: coaching, unfamiliar helpers, online contacts, gift cards.
    "elder_fraud": [
        r"elder(ly)?.{{0,{gap}}}(customer|person|client)",
        r"online.{{0,{gap}}}(relationship|friend|acquaintance)",
        r"instruct(ed)?.{{0,{gap}}}by.{{0,{gap}}}(unknown|unfamiliar).{{0,{gap}}}(person|individual)",
        r"possible.{{0,{gap}}}(scam|fraud)",
        r"suspect(ed)?.{{0,{gap}}}elder.{{0,{gap}}}(abuse|fraud)",
        r"assist(ed)?.{{0,{gap}}}by.{{0,{gap}}}(unfamiliar|unknown).{{0,{gap}}}(person|party)",
        r"customer.{{0,{gap}}}(appear|seem)(ed)?.{{0,{gap}}}(confused|uncertain)",
        r"coach(ed)?.{{0,{gap}}}by.{{0,{gap}}}(phone|telephone)",
        r"sending.{{0,{gap}}}(gift.{{0,{gap}}}card|giftcard)(s)?"
    ],
    # Activity that departs from the account's purpose, profile or earlier pattern.
    # In the last entry the trailing (activity|transactions) group is optional.
    "unusual_activity": [
        r"activity.{{0,{gap}}}(inconsistent|not consistent).{{0,{gap}}}(purpose|profile)",
        r"sudden.{{0,{gap}}}(increase|change).{{0,{gap}}}(activity|volume)",
        r"new.{{0,{gap}}}pattern.{{0,{gap}}}(deposit|transaction)(s)?",
        r"unusual.{{0,{gap}}}(account|transaction|pattern)(s)?",
        r"large.{{0,{gap}}}round.{{0,{gap}}}dollar.{{0,{gap}}}(transaction|deposit)(s)?",
        r"(dormant|inactive).{{0,{gap}}}account.{{0,{gap}}}(now|became).{{0,{gap}}}active",
        r"unexplained.{{0,{gap}}}(surge|spike).{{0,{gap}}}(activity|transactions)?"
    ],
    # Generic suspicion vocabulary. The first four entries are single words with
    # no gap placeholder, so they are unaffected by the gap setting.
    "suspicious_language": [
        # the optional suffix group means the bare stem "suspiciou" is enough to match
        r"suspiciou(s|sly)?",
        r"unusual(ly)?",
        r"inconsisten(t|cy)",
        r"concern(ed|ing)",
        r"questionable.{{0,{gap}}}(transaction|activity)(s)?",
        r"possible.{{0,{gap}}}(structuring|scam|fraud)",
        r"believe.{{0,{gap}}}suspicious",
        r"potential.{{0,{gap}}}(scam|fraud)",
        r"red.{{0,{gap}}}flag(s)?"
    ]
}

# =========================================================
# 3  NEGATION CHECK
# =========================================================
def has_negation_nearby(text, match_start, window=10, negations=None):
    """
    Report whether a negation cue appears shortly before a match.

    The look-back context is the `window` characters that precede
    `match_start`, trimmed in two ways before the negation regexes are applied:

    * If the window opens in the middle of a word, that leading word fragment
      is dropped. Otherwise the cut itself acts as a word boundary and
      fragments such as "no " (from "piano ") are mistaken for a negation.
    * Only the text after the last '.', '!' or '?' is kept, so a negation that
      closes the previous sentence does not negate the current one. This is a
      simple punctuation heuristic, not real sentence segmentation.

    Parameters
    ----------
    text : str
        Full text that was searched.
    match_start : int
        Index in `text` where the pattern match begins.
    window : int, default 10
        Number of characters before `match_start` to inspect.
    negations : iterable of str, optional
        Regex patterns that signal a negation. Defaults to the module-level
        NEGATIONS list.

    Returns
    -------
    bool
        True if any negation pattern is found in the trimmed context.
    """
    if negations is None:
        negations = NEGATIONS

    start_idx = max(0, match_start - window)
    context = text[start_idx:match_start].lower()
    if not context:
        return False

    # Window opened mid-word: discard the partial word instead of matching on it.
    if start_idx > 0 and re.match(r"\w", text[start_idx - 1]) and re.match(r"\w", context):
        context = re.sub(r"^\w+", "", context)

    # Keep only the part that belongs to the sentence containing the match.
    context = re.split(r"[.!?]", context)[-1]

    return any(re.search(neg, context) for neg in negations)

# =========================================================
# 4  FACT-PATTERN DETECTION
# =========================================================
def detect_fact_patterns(text, fact_patterns=FACT_PATTERNS, max_gap=10, negation_window=10):
    """
    Detect AML fact patterns and return 'Yes'/'No' flags per theme.

    A theme is flagged 'Yes' when at least one of its patterns matches and the
    match is not negated. A match counts as negated when either:

    * a negation cue from NEGATIONS sits within `negation_window` characters
      before the start of the match (see has_negation_nearby), or
    * a negation cue would have to be swallowed by one of the pattern's
      `.{0,gap}` wildcards. For example "customer did not appear nervous"
      no longer satisfies customer ... appear ... nervous. Negation words
      spelled out in the pattern itself ("could ... not ... explain") are
      literals, not wildcards, and still match.

    Parameters
    ----------
    text : object
        Narrative to scan. It is converted with str() and lowercased.
    fact_patterns : dict[str, list[str]]
        Theme name -> regex templates containing the `{gap}` placeholder.
    max_gap : int, default 10
        Maximum number of characters each `.{0,gap}` wildcard may span.
    negation_window : int, default 10
        Characters inspected before each match for a preceding negation.

    Returns
    -------
    dict[str, str]
        One 'Yes' or 'No' per theme, in the order of `fact_patterns`.
    """
    text = str(text).lower()

    # What every `.{{0,{gap}}}` in a template turns into after formatting.
    plain_gap = ".{{0,{}}}".format(max_gap)
    if NEGATIONS:
        # Same wildcard, but a character is only consumed when no negation cue
        # starts at that position, so the gap cannot silently skip over one.
        guarded_gap = "(?:(?!{}).){{0,{}}}".format("|".join(NEGATIONS), max_gap)
    else:
        # An empty alternation would make the lookahead reject every character.
        guarded_gap = plain_gap

    results = {}
    for theme, templates in fact_patterns.items():
        found = False
        for template in templates:
            pattern = template.format(gap=max_gap).replace(plain_gap, guarded_gap)
            # any() stops at the first match that is not preceded by a negation
            if any(
                not has_negation_nearby(text, m.start(), window=negation_window)
                for m in re.finditer(pattern, text, re.IGNORECASE)
            ):
                found = True
                break
        results[theme] = "Yes" if found else "No"
    return results

# =========================================================
# 5  CO-OCCURRENCE INVESTIGATION RATE (works with Yes/No)
# =========================================================
def _flag_is_set(value):
    """Return True only for an affirmative flag: 'yes' (any case) or a truthy, non-missing value."""
    if isinstance(value, str):
        return value.strip().lower() == 'yes'
    if pd.isna(value):
        return False
    return bool(value)


def cooccurrence_investigation_rate_overall(df, pattern_cols, target_col='investigated', combo_size=2):
    """
    Compute investigation rates for pattern co-occurrences, normalized over total records.
    Works with 'Yes'/'No' or 1/0 input columns.

    For every combination of `combo_size` columns from `pattern_cols`, the rows
    where all of those flags are set are selected and `target_col` is summed
    over them. The rate divides that sum by the number of rows in `df` (not by
    the number of rows in the combination).

    Flags are normalised value by value, independent of the column dtype:
    'yes' in any letter case and truthy numbers/booleans count as set.
    'no', zero, missing values and any other string count as not set, so an
    unexpected or missing flag can never be counted as a co-occurrence.

    Parameters
    ----------
    df : pandas.DataFrame
        Input records; it is not modified.
    pattern_cols : iterable of str
        Names of the flag columns to combine.
    target_col : str, default 'investigated'
        Column summed over the rows of each combination.
    combo_size : int, default 2
        Number of flag columns per combination.

    Returns
    -------
    pandas.DataFrame
        Columns 'patterns', 'total_occurrences', 'investigated_with_combo' and
        'investigation_rate_overall' (rounded to 4 decimals), sorted by rate in
        descending order. Ties keep the order in which combinations were
        generated. When no combination can be formed the frame is empty but
        still has these columns.
    """
    pattern_cols = list(pattern_cols)
    result_columns = [
        'patterns', 'total_occurrences',
        'investigated_with_combo', 'investigation_rate_overall',
    ]

    # Boolean view of the flag columns; built separately so `df` stays untouched.
    flags = pd.DataFrame(
        {col: df[col].map(_flag_is_set).astype(bool).to_numpy() for col in pattern_cols},
        index=df.index,
    )
    target = df[target_col]
    total_records = len(df)

    results = []
    for combo in combinations(pattern_cols, combo_size):
        mask = flags[list(combo)].all(axis=1).to_numpy(dtype=bool)
        investigated_count = target[mask].sum()
        investigation_rate = investigated_count / total_records if total_records > 0 else 0

        results.append({
            'patterns': ' + '.join(combo),
            'total_occurrences': int(mask.sum()),
            'investigated_with_combo': investigated_count,
            'investigation_rate_overall': round(float(investigation_rate), 4)
        })

    # Passing `columns` keeps the sort key present even when `results` is empty.
    result_df = pd.DataFrame(results, columns=result_columns).sort_values(
        by='investigation_rate_overall', ascending=False, kind='stable'
    ).reset_index(drop=True)
    return result_df

# =========================================================
# 6  EXAMPLE USAGE (END TO END)
# =========================================================

# Six sample narratives with a 0/1 "investigated" outcome for each.
data = {
    "UTR_text": [
        "Customer made multiple cash deposits below reporting threshold. Funds were transferred to offshore account in Dubai",
        "Funds were transferred to offshore account in Dubai.",
        "Customer did not appear nervous.Customer made multiple cash deposits.",
        "Elderly client being coached by phone to send gift cards.",
        "Frequent small deposits possibly structuring to avoid CTR.",
        "No suspicious or unusual activity observed.Frequent small deposits possibly structuring to avoid CTR."
    ],
    "investigated": [1, 1, 0, 1, 0, 1]
}

df = pd.DataFrame(data)

# Flag every narrative (gap of 8 characters), keeping the raw per-row dict in
# "patterns" and widening it into one Yes/No column per theme.
df["patterns"] = [detect_fact_patterns(narrative, max_gap=8) for narrative in df["UTR_text"]]
expanded = pd.json_normalize(df["patterns"])
pattern_cols = expanded.columns.tolist()
df = pd.concat([df, expanded], axis=1)

print("\n=== DETECTED PATTERNS (Yes/No) ===")
print(df[["UTR_text", *pattern_cols]])

# Pairwise co-occurrence of the theme flags against the investigation outcome.
pair_rates = cooccurrence_investigation_rate_overall(df, pattern_cols, combo_size=2)

print("\n=== GLOBAL INVESTIGATION RATES (Denominator = Total Records) ===")
print(pair_rates)



=== DETECTED PATTERNS (Yes/No) ===
                                            UTR_text structuring  \
0  Customer made multiple cash deposits below rep...         Yes   
1  Funds were transferred to offshore account in ...          No   
2  Customer did not appear nervous.Customer made ...         Yes   
3  Elderly client being coached by phone to send ...          No   
4  Frequent small deposits possibly structuring t...         Yes   
5  No suspicious or unusual activity observed.Fre...         Yes   

  foreign_transfer unknown_source third_party rapid_movement  \
0              Yes             No          No             No   
1              Yes             No          No             No   
2               No             No          No             No   
3               No             No          No             No   
4               No             No          No             No   
5               No             No          No             No   

  customer_behavior elder_fraud unusua

In [56]:
df

Unnamed: 0,UTR_text,investigated,patterns,structuring,foreign_transfer,unknown_source,third_party,rapid_movement,customer_behavior,elder_fraud,unusual_activity,suspicious_language
0,Customer made multiple cash deposits below rep...,1,"{'structuring': 'Yes', 'foreign_transfer': 'Ye...",Yes,Yes,No,No,No,No,No,No,No
1,Funds were transferred to offshore account in ...,1,"{'structuring': 'No', 'foreign_transfer': 'Yes...",No,Yes,No,No,No,No,No,No,No
2,Customer did not appear nervous.Customer made ...,0,"{'structuring': 'Yes', 'foreign_transfer': 'No...",Yes,No,No,No,No,No,No,No,No
3,Elderly client being coached by phone to send ...,1,"{'structuring': 'No', 'foreign_transfer': 'No'...",No,No,No,No,No,No,Yes,No,No
4,Frequent small deposits possibly structuring t...,0,"{'structuring': 'Yes', 'foreign_transfer': 'No...",Yes,No,No,No,No,No,No,No,No
5,No suspicious or unusual activity observed.Fre...,1,"{'structuring': 'Yes', 'foreign_transfer': 'No...",Yes,No,No,No,No,No,No,No,Yes


In [57]:
pair_rates

Unnamed: 0,patterns,total_occurrences,investigated_with_combo,investigation_rate_overall
0,structuring + foreign_transfer,1,1,0.1667
1,structuring + suspicious_language,1,1,0.1667
2,rapid_movement + elder_fraud,0,0,0.0
3,third_party + rapid_movement,0,0,0.0
4,third_party + customer_behavior,0,0,0.0
5,third_party + elder_fraud,0,0,0.0
6,third_party + unusual_activity,0,0,0.0
7,third_party + suspicious_language,0,0,0.0
8,rapid_movement + customer_behavior,0,0,0.0
9,rapid_movement + unusual_activity,0,0,0.0
