In [22]:
import numpy as np
import pandas as pd

# The three target classes. _apply_label_noise draws its replacement label
# from this list (excluding the current label).
OUTCOMES = ["cancelled", "undersold", "successful"]

# Categorical domains (keep consistent with the tree).
# NOTE: element order matters for WEATHER_LEVELS, EVENT_TYPES, DAY_OF_WEEK and
# MOON_PHASES -- generate_outdoor_event_dataset pairs each list positionally
# with a hard-coded probability vector (`p=[...]`). LOGO_COLORS is sampled
# without a `p=` argument.
WEATHER_LEVELS = ["clear", "light_rain", "heavy_rain", "storm"]
EVENT_TYPES = ["concert", "sports", "festival", "community_event", "exhibition"]
DAY_OF_WEEK = ["weekday", "friday", "weekend"]
LOGO_COLORS = ["red", "blue", "green", "black", "white", "yellow"]
MOON_PHASES = ["new", "half", "full"]

def _clip01(x):
    return np.clip(x, 0.0, 1.0)

def _determine_outcome(row):
    """
    Implements the EXACT causal decision tree described in the previous message.
    Outcomes: cancelled / undersold / successful
    """
    w = row["weather_forecast_severity"]
    et = row["event_type"]
    org = row["organizer_success_rate"]
    ts = row["ticket_sales_ratio"]
    lead = row["days_between_announcement_and_event"]
    acc = row["venue_accessibility_score"]
    pricep = row["ticket_price_percentile"]
    comp = row["competing_major_events_count"]
    dow = row["day_of_week"]
    tour = row["city_tourism_seasonality_index"]

    # --- 1) Safety / feasibility gate ---
    # Node A
    if w == "storm":
        return "cancelled"

    # Node B / D
    if w == "heavy_rain":
        if et in ["festival", "community_event"]:
            # Node D
            if org < 0.5:
                return "cancelled"
            # else go to Node C

    # --- 2) Demand gate ---
    # Node C
    if ts < 0.35:
        # --- 3) Low-sales pathway ---
        # Node E
        if lead < 7:
            # Node H
            if acc < 4:
                # Node I
                if pricep > 80:
                    return "undersold"
                else:
                    # Node J
                    if comp >= 2:
                        return "undersold"
                    else:
                        # Node K (re-check)
                        if w == "heavy_rain" and et in ["festival", "community_event"]:
                            return "cancelled"
                        else:
                            return "undersold"
            else:
                return "undersold"
        else:
            return "undersold"

    elif 0.35 <= ts <= 0.75:
        # --- 4) Mid-sales pathway ---
        # Node F
        if dow in ["weekend", "friday"]:
            # Node L
            if tour > 0:
                # Node N
                if org >= 0.6:
                    return "successful"
                else:
                    return "undersold"
            else:
                # Node M
                if comp >= 1:
                    return "undersold"
                else:
                    # Node N
                    if org >= 0.6:
                        return "successful"
                    else:
                        return "undersold"
        else:
            # Node M
            if comp >= 1:
                return "undersold"
            else:
                # Node N
                if org >= 0.6:
                    return "successful"
                else:
                    return "undersold"

    else:  # ts > 0.75
        # --- 5) High-sales pathway ---
        # Node G
        if acc >= 4:
            # Node O
            if et in ["sports", "concert"]:
                return "successful"
            else:
                # Node Q
                if w == "heavy_rain" and et in ["festival", "community_event"]:
                    return "undersold"
                else:
                    return "successful"
        else:
            # Node P
            if dow == "weekday":
                return "undersold"
            else:
                # Node O
                if et in ["sports", "concert"]:
                    return "successful"
                else:
                    # Node Q
                    if w == "heavy_rain" and et in ["festival", "community_event"]:
                        return "undersold"
                    else:
                        return "successful"

def _apply_label_noise(y, rng, noise_rate=0.03):
    """
    Return a copy of `y` (as an object array) in which each entry has been
    swapped, with probability `noise_rate`, for one of the *other* classes in
    OUTCOMES. `rng` is the numpy Generator supplying all random draws.
    """
    noisy = np.array(y, dtype=object)
    selected = rng.random(len(noisy)) < noise_rate
    # One uniform draw per label decides who gets corrupted; the replacement
    # class for each selected position is then drawn in index order.
    for pos in np.where(selected)[0]:
        current = noisy[pos]
        alternatives = [cls for cls in OUTCOMES if cls != current]
        noisy[pos] = rng.choice(alternatives)
    return noisy

def generate_outdoor_event_dataset(n=5000, seed=0, noise_rate=0.03):
    """
    Build a synthetic outdoor-event dataset labelled by `_determine_outcome`.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    seed : int
        Seed for `np.random.default_rng`; the same (n, seed, noise_rate)
        reproduces the same frame.
    noise_rate : float
        Per-row probability that the tree's label is replaced by a different
        class (see `_apply_label_noise`).

    Returns
    -------
    pandas.DataFrame
        Thirteen feature columns plus the label column "outcome". Labels are
        computed from the *rounded* feature values stored in the frame, so
        every non-noisy label is reproducible from the exported columns.
    """
    rng = np.random.default_rng(seed)

    # --- Generate features (distributions chosen to be plausible; the TREE defines the labels) ---
    weather = rng.choice(WEATHER_LEVELS, size=n, p=[0.55, 0.25, 0.15, 0.05])
    event_type = rng.choice(EVENT_TYPES, size=n, p=[0.25, 0.20, 0.25, 0.20, 0.10])
    day_of_week = rng.choice(DAY_OF_WEEK, size=n, p=[0.55, 0.15, 0.30])

    # Numeric features
    ticket_sales_ratio = _clip01(rng.beta(2.2, 2.0, size=n))  # center-ish, [0,1]
    organizer_success_rate = _clip01(rng.beta(5.0, 2.2, size=n))  # skew higher
    days_between_announcement_and_event = np.clip(
        rng.lognormal(mean=np.log(20), sigma=0.6, size=n).round().astype(int),
        0, 180
    )

    venue_accessibility_score = np.clip(rng.normal(loc=6.0, scale=2.0, size=n), 1, 10)
    ticket_price_percentile = np.clip(rng.normal(loc=55, scale=25, size=n), 0, 100)
    competing_major_events_count = np.clip(rng.poisson(lam=0.7, size=n), 0, 6)
    city_tourism_seasonality_index = np.clip(rng.normal(loc=0.0, scale=0.5, size=n), -1, 1)

    # Almost-irrelevant / weakly-related (generated independently)
    organizer_logo_primary_color = rng.choice(LOGO_COLORS, size=n)
    number_of_food_vendors_booked = np.clip(rng.poisson(lam=12, size=n), 0, 80)
    moon_phase = rng.choice(MOON_PHASES, size=n, p=[0.33, 0.34, 0.33])

    df = pd.DataFrame({
        "weather_forecast_severity": weather,
        "ticket_sales_ratio": np.round(ticket_sales_ratio, 2),
        "event_type": event_type,
        "days_between_announcement_and_event": days_between_announcement_and_event,
        "organizer_success_rate": np.round(organizer_success_rate, 2),
        "ticket_price_percentile": np.round(ticket_price_percentile, 0),
        "day_of_week": day_of_week,
        "venue_accessibility_score": np.round(venue_accessibility_score, 0),
        "competing_major_events_count": competing_major_events_count,
        "city_tourism_seasonality_index": np.round(city_tourism_seasonality_index, 2),
        "organizer_logo_primary_color": organizer_logo_primary_color,
        "number_of_food_vendors_booked": number_of_food_vendors_booked,
        "moon_phase": moon_phase,
    })

    # --- Apply the exact tree to create the label ---
    # Iterate plain dict records rather than df.iterrows(): iterrows builds a
    # pandas Series per row (slow, and it upcasts mixed dtypes), whereas the
    # tree only needs key lookups. No random draws happen here, so the RNG
    # stream -- and therefore the output for a given seed -- is unchanged.
    y = [_determine_outcome(record) for record in df.to_dict(orient="records")]

    # --- Add stochastic label noise at the requested `noise_rate` ---
    y = _apply_label_noise(y, rng, noise_rate=noise_rate)
    df["outcome"] = y

    return df

if __name__ == "__main__":
    # Demo run: 20k rows with 5% label noise, then a preview followed by the
    # normalised class frequencies (one print call, newline-separated).
    df = generate_outdoor_event_dataset(n=20000, seed=42, noise_rate=0.05)
    print(
        df.head(10),
        "\nOutcome distribution:",
        df["outcome"].value_counts(normalize=True).round(3),
        sep="\n",
    )


  weather_forecast_severity  ticket_sales_ratio       event_type  \
0                light_rain                0.49  community_event   
1                     clear                0.29       exhibition   
2                heavy_rain                0.39          concert   
3                light_rain                0.61          concert   
4                     clear                0.74         festival   
5                     storm                0.40           sports   
6                light_rain                0.20         festival   
7                light_rain                0.14         festival   
8                     clear                0.51  community_event   
9                     clear                0.57          concert   

   days_between_announcement_and_event  organizer_success_rate  \
0                                   10                    0.80   
1                                   32                    0.65   
2                                   38               

In [23]:
# Write whatever frame is currently bound to `df` to outdoor.csv, omitting the
# index column. NOTE(review): presumably the outdoor-event frame built by the
# preceding cell -- confirm the cell execution order before relying on this file.
df.to_csv('outdoor.csv',index=False)

In [15]:
# Scratch check of two-decimal rounding with the built-in round().
round(0.918,2)

0.92

In [26]:
import numpy as np
import pandas as pd

def synthesize_elderly_living_arrangement(
    n: int = 5000,
    seed: int = 42,
    add_label_noise: float = 0.08,   # chance to randomly flip label to a neighboring plausible class
    add_feature_noise: float = 0.05, # small stochasticity inside the tree gates
) -> pd.DataFrame:
    """
    Generate a synthetic dataset of elderly living arrangements.

    Features are sampled first, then each row is labelled by a hand-written
    decision tree with one of "Living alone", "With another elderly" or
    "With younger caretaker".

    Parameters
    ----------
    n : number of rows to generate.
    seed : seed for the single `np.random.default_rng` used for every draw.
    add_label_noise : per-row probability that the tree's label is replaced
        by one of the other two classes (fixed, label-specific weights).
    add_feature_noise : per-row probability that a borderline case is allowed
        to be promoted into the high-need / moderate-need branch.

    Returns
    -------
    DataFrame with 13 feature columns and the label column
    "LivingArrangement". Note that `iadl`, `frailty` and `spouse_present`
    influence the features/labels but are NOT exported as columns.

    All randomness comes from one generator consumed in statement order, and
    some draws are conditional on the data (see `maybe`). Reordering or
    vectorising any sampling step therefore changes the output for a seed.
    """
    rng = np.random.default_rng(seed)

    # --- Helper samplers ---
    def sample_categorical(choices, probs, size):
        # Weights are renormalised, so `probs` need not sum exactly to 1.
        return rng.choice(choices, size=size, p=np.array(probs) / np.sum(probs))

    # Age group
    age_group = sample_categorical(
        ["65-74", "75-84", "85+"],
        [0.50, 0.35, 0.15],
        n
    )

    # Marital status (age-dependent)
    # Sampled one row at a time because the category weights depend on the row's age group.
    marital = np.empty(n, dtype=object)
    for i, ag in enumerate(age_group):
        if ag == "65-74":
            marital[i] = sample_categorical(
                ["married", "widowed", "divorced", "never_married"], [0.62, 0.20, 0.12, 0.06], 1
            )[0]
        elif ag == "75-84":
            marital[i] = sample_categorical(
                ["married", "widowed", "divorced", "never_married"], [0.48, 0.38, 0.10, 0.04], 1
            )[0]
        else:  # 85+
            marital[i] = sample_categorical(
                ["married", "widowed", "divorced", "never_married"], [0.25, 0.65, 0.07, 0.03], 1
            )[0]

    # 1 when married, else 0; used by the tree but not exported (MaritalStatus is).
    spouse_present = (marital == "married").astype(int)

    # Adult children nearby (depends on urban/rural mildly)
    urban = sample_categorical(["urban", "rural"], [0.78, 0.22], n)
    # Both binomial vectors are always drawn in full; np.where then picks per row.
    adult_child_nearby = np.where(
        urban == "urban",
        rng.binomial(1, 0.62, n),
        rng.binomial(1, 0.70, n)
    )

    # Income tier
    income_tier = sample_categorical(["low", "mid", "high"], [0.45, 0.45, 0.10], n)

    # Housing type (depends on income a bit)
    # NOTE: the category order differs between tiers; each weight list is
    # positional with respect to the category list on the same line.
    housing = np.empty(n, dtype=object)
    for i, tier in enumerate(income_tier):
        if tier == "low":
            housing[i] = sample_categorical(
                ["public_housing", "rental", "owned", "senior_housing"], [0.55, 0.25, 0.12, 0.08], 1
            )[0]
        elif tier == "mid":
            housing[i] = sample_categorical(
                ["owned", "public_housing", "rental", "senior_housing"], [0.55, 0.18, 0.22, 0.05], 1
            )[0]
        else:  # high
            housing[i] = sample_categorical(
                ["owned", "rental", "senior_housing", "public_housing"], [0.72, 0.20, 0.07, 0.01], 1
            )[0]

    # Care need variables: ADL (0-6), IADL (0-8), Functional status
    # Generate a latent "frailty" from age group, then map to ADL/IADL/Functional.
    frailty_base = np.where(age_group == "65-74", 0.15, np.where(age_group == "75-84", 0.35, 0.65))
    frailty = np.clip(rng.normal(frailty_base, 0.20), 0, 1)

    # ADL as binomial with frailty
    adl = rng.binomial(6, np.clip(frailty, 0.02, 0.98), n)
    # IADL slightly higher sensitivity than ADL
    iadl = rng.binomial(8, np.clip(frailty + 0.10, 0.02, 0.98), n)

    functional = np.empty(n, dtype=object)
    for i in range(n):
        # Probabilities tied to frailty and ADL
        f = frailty[i]
        a = adl[i]
        p_dep = np.clip(0.05 + 0.10 * (a >= 4) + 0.55 * f, 0, 0.90)
        p_need = np.clip(0.20 + 0.25 * (a >= 2) + 0.25 * f, 0, 0.95)
        # Normalize for {independent, needs_assistance, dependent}
        p_dependent = min(p_dep, 0.85)
        p_needs = min(max(p_need - p_dependent, 0.05), 0.90)
        p_ind = max(1.0 - p_dependent - p_needs, 0.02)
        probs = np.array([p_ind, p_needs, p_dependent])
        # The floors above can push the total past 1, hence the renormalisation.
        probs = probs / probs.sum()
        functional[i] = rng.choice(["independent", "needs_assistance", "dependent"], p=probs)

    # Recent hospitalization (depends on frailty/ADL)
    recent_hosp = rng.binomial(
        1,
        np.clip(0.10 + 0.08 * (adl >= 2) + 0.25 * frailty, 0.05, 0.65),
        n
    )

    # Less relevant / plausible but weak features
    self_rated_health = np.empty(n, dtype=object)
    for i in range(n):
        # correlated with frailty but noisy
        f = frailty[i]
        probs = np.array([max(0.10, 1.0 - 1.2 * f), 0.25 + 0.6 * f, 0.10 + 0.6 * f])
        probs = np.clip(probs, 0.02, None)
        probs = probs / probs.sum()
        self_rated_health[i] = rng.choice(["good", "fair", "poor"], p=probs)

    # Almost irrelevant but plausible
    social_club = sample_categorical(["weekly", "monthly", "rarely"], [0.22, 0.28, 0.50], n)
    pet_owner = rng.binomial(1, 0.18, n)
    tv_genre = sample_categorical(["news", "drama", "variety", "sports"], [0.45, 0.25, 0.20, 0.10], n)

    # --- Decision tree labeling logic (from earlier message) ---
    labels = np.empty(n, dtype=object)

    def maybe(p):
        # Bernoulli(p) trial; consumes exactly one uniform draw from `rng`.
        return rng.random() < p

    for i in range(n):
        ADL = adl[i]
        IADL = iadl[i]
        Functional = functional[i]
        RecentHosp = recent_hosp[i]
        SpousePresent = spouse_present[i]
        AdultChildNearby = adult_child_nearby[i]
        IncomeTier = income_tier[i]
        HousingType = housing[i]

        # Node 1: High care need?
        high_need = (Functional == "dependent") or (ADL >= 4)
        if maybe(add_feature_noise):
            # small random perturbation to avoid hard boundaries
            # (short-circuit: the inner maybe(0.4) is only drawn for
            # not-yet-high-need rows with ADL == 3 and a recent hospitalization)
            high_need = high_need or (ADL == 3 and RecentHosp == 1 and maybe(0.4))

        if high_need:
            # Node 2: Younger caretaker availability?
            if AdultChildNearby == 1:
                label = "With younger caretaker"
            else:
                # Node 3: Same-age cohabitation possible?
                if SpousePresent == 1:
                    label = "With another elderly"
                else:
                    # Node 4: proxy caretaker via resources / senior housing
                    if (IncomeTier == "high") or (HousingType == "senior_housing"):
                        label = "With younger caretaker"
                    else:
                        label = "With another elderly"
        else:
            # Node 5: Moderate care need / transitional risk?
            moderate_need = (Functional == "needs_assistance") or (ADL in [2, 3]) or (IADL >= 4) or (RecentHosp == 1)
            if maybe(add_feature_noise):
                # Same short-circuit pattern as Node 1: maybe(0.35) is drawn
                # only when the row is not already moderate-need.
                moderate_need = moderate_need or (ADL == 1 and IADL >= 5 and maybe(0.35))

            if moderate_need:
                # Node 6: Spouse present?
                if SpousePresent == 1:
                    label = "With another elderly"
                else:
                    # Node 7: Adult child nearby?
                    if AdultChildNearby == 1:
                        label = "With younger caretaker"
                    else:
                        # Node 8: Housing/income pushes away from living alone
                        if (HousingType in ["senior_housing", "public_housing"]) or (IncomeTier == "low"):
                            label = "With another elderly"
                        else:
                            label = "Living alone"
            else:
                # Node 9: Low care need
                if SpousePresent == 1:
                    label = "With another elderly"
                else:
                    # Node 10: Independent and no spouse
                    if AdultChildNearby == 1 and ((age_group[i] == "85+") or (RecentHosp == 1)):
                        label = "With younger caretaker"
                    else:
                        label = "Living alone"

        # Optional: label noise (flip to a plausible alternative)
        # The replacement is always one of the other two classes, with weights
        # that depend on the label being replaced.
        if maybe(add_label_noise):
            if label == "Living alone":
                label = rng.choice(["With another elderly", "With younger caretaker"], p=[0.75, 0.25])
            elif label == "With another elderly":
                label = rng.choice(["Living alone", "With younger caretaker"], p=[0.55, 0.45])
            else:  # With younger caretaker
                label = rng.choice(["With another elderly", "Living alone"], p=[0.85, 0.15])

        labels[i] = label

    df = pd.DataFrame({
        # Highly relevant
        "FunctionalIndependence": functional,
        "ADL_Limitations": adl,
        "MaritalStatus": marital,
        "AdultChildrenNearby": adult_child_nearby,
        "RecentHospitalization12m": recent_hosp,

        # Less relevant
        "IncomeTier": income_tier,
        "HousingType": housing,
        "SelfRatedHealth": self_rated_health,
        "AgeGroup": age_group,
        "UrbanRural": urban,

        # Almost irrelevant but plausible
        "SocialClubParticipation": social_club,
        "PetOwnership": pet_owner,
        "PreferredTVGenre": tv_genre,


        # Label
        "LivingArrangement": labels
    })

    return df


if __name__ == "__main__":
    # Demo run: 1,000 rows, preview plus normalised class shares, then export.
    df = synthesize_elderly_living_arrangement(n=1000, seed=7)
    print(
        df.head(),
        "\nClass distribution:",
        df["LivingArrangement"].value_counts(normalize=True).round(3),
        sep="\n",
    )
    df.to_csv("elderly_living_arrangement_synth.csv", index=False)
    print("\nSaved to elderly_living_arrangement_synth.csv")


  FunctionalIndependence  ADL_Limitations MaritalStatus  AdultChildrenNearby  \
0            independent                1      divorced                    1   
1       needs_assistance                2       widowed                    0   
2              dependent                2       widowed                    1   
3            independent                0       married                    0   
4       needs_assistance                0       married                    0   

   RecentHospitalization12m IncomeTier     HousingType SelfRatedHealth  \
0                         0        mid           owned            good   
1                         0       high           owned            poor   
2                         0        mid  public_housing            poor   
3                         1        low  public_housing            poor   
4                         0        mid           owned            good   

  AgeGroup UrbanRural SocialClubParticipation  PetOwnership PreferredTVGen

In [34]:
import numpy as np
import pandas as pd

def synthesize_microloan_fraud(
    n: int = 10000,
    seed: int = 42,
    # label noise: flip a fraction of labels after rule-based assignment
    label_flip_prob: float = 0.03,
    # feature noise: jitter numeric features + randomly flip some binary/cat values
    feature_noise_prob: float = 0.05,
) -> pd.DataFrame:
    """
    Generate a synthetic micro-loan fraud dataset labelled by a fixed rule tree.

    Pipeline: (1) sample features, with a 25% "suspicious" sub-population that
    skews the highly relevant ones; (2) perturb features with probability
    driven by `feature_noise_prob`; (3) label every row "Fraud"/"Non-fraud"
    from the *perturbed* features; (4) flip each label with probability
    `label_flip_prob`; (5) assemble the frame.

    Parameters
    ----------
    n : number of rows.
    seed : seed for the single `np.random.default_rng` behind every draw.
    label_flip_prob : per-row probability of inverting the final label.
    feature_noise_prob : base per-row probability for each feature
        perturbation (scaled by 0.6 for the geo flag and 0.35 for categoricals).

    Returns
    -------
    DataFrame with 13 feature columns and a "label" column. The identity score
    and loan/income ratio are rounded (4 and 3 decimals) only on export; the
    tree sees the unrounded values.

    Notes
    -----
    The employment tie-breaker treats "freelancer" as the risky gig-style
    category. The earlier check tested for "gig", a value this generator never
    emits, so that branch could only fire for "unemployed". "gig" is still
    accepted for compatibility.
    """
    rng = np.random.default_rng(seed)

    # -------------------------
    # 1) Generate base features
    # -------------------------
    # Highly relevant
    # Identity score: mixture to create both good and bad identities
    mix = rng.random(n) < 0.25  # 25% suspicious population
    identity_score = np.where(
        mix,
        rng.beta(2.0, 5.5, n),   # lower scores
        rng.beta(6.5, 2.0, n)    # higher scores
    )
    identity_score = np.clip(identity_score, 0.0, 1.0)

    # Contact overlap: mostly 0, some >0; more likely in suspicious population
    contact_overlap = np.where(
        mix,
        rng.poisson(0.6, n),
        rng.poisson(0.05, n)
    )
    contact_overlap = np.clip(contact_overlap, 0, 10).astype(int)

    # Geo-IP consistency: 1 usually, but suspicious population more likely inconsistent
    geo_ip_consistent = np.where(
        mix,
        (rng.random(n) > 0.35).astype(int),  # 65% consistent
        (rng.random(n) > 0.08).astype(int)   # 92% consistent
    ).astype(int)

    # Device-account uniqueness count: suspicious has higher count
    device_account_uniqueness_30d = np.where(
        mix,
        rng.poisson(2.2, n),
        rng.poisson(0.3, n)
    )
    device_account_uniqueness_30d = np.clip(device_account_uniqueness_30d, 0, 20).astype(int)

    # Application velocity (24h): suspicious higher
    app_velocity_24h = np.where(
        mix,
        rng.poisson(1.6, n),
        rng.poisson(0.15, n)
    )
    app_velocity_24h = np.clip(app_velocity_24h, 0, 10).astype(int)

    # Less relevant
    loan_income_ratio = np.where(
        rng.random(n) < 0.15,
        rng.uniform(2.0, 5.0, n),
        rng.uniform(0.2, 2.5, n)
    )

    bank_account_age_months = np.where(
        rng.random(n) < 0.18,
        rng.integers(0, 3, n),          # new-ish
        rng.integers(3, 72, n)          # older
    ).astype(int)

    employment_types = np.array(["salaried", "freelancer", "self_employed", "unemployed"])
    employment_type = rng.choice(
        employment_types,
        size=n,
        p=[0.52, 0.22, 0.18, 0.08]
    )

    time_buckets = np.array(["daytime", "late_night", "early_morning"])
    time_of_application = rng.choice(time_buckets, size=n, p=[0.72, 0.18, 0.10])

    emergency_contacts = np.clip(rng.poisson(2.0, n), 0, 8).astype(int)

    # Almost irrelevant but seemingly related
    os_choices = np.array(["Android", "iOS"])
    language_choices = np.array(["English", "Spanish", "Chinese", "Hindi", "other"])
    mobile_os = rng.choice(os_choices, size=n, p=[0.78, 0.22])
    loan_tenure_months = rng.choice(
        np.array([1, 2, 3, 6, 9, 12]), size=n, p=[0.15, 0.15, 0.20, 0.25, 0.10, 0.15]
    ).astype(int)
    preferred_language = rng.choice(language_choices, size=n, p=[0.55, 0.13, 0.08, 0.12, 0.12])

    # -----------------------------------
    # 2) Add feature noise (optional)
    # -----------------------------------
    # Noise is applied BEFORE labelling, so labels follow the noisy features.
    # Numeric jitter
    jitter_mask = rng.random(n) < feature_noise_prob
    identity_score[jitter_mask] = np.clip(
        identity_score[jitter_mask] + rng.normal(0, 0.06, jitter_mask.sum()), 0, 1
    )

    jitter_mask = rng.random(n) < feature_noise_prob
    loan_income_ratio[jitter_mask] = np.clip(
        loan_income_ratio[jitter_mask] + rng.normal(0, 0.25, jitter_mask.sum()), 0.05, 6.0
    )

    # Flip some binary
    flip_mask = rng.random(n) < (feature_noise_prob * 0.6)
    geo_ip_consistent[flip_mask] = 1 - geo_ip_consistent[flip_mask]

    # Perturb some counts
    def perturb_counts(arr, p=feature_noise_prob, max_add=2, max_val=None):
        """Add a uniform integer in [-max_add, max_add] to a random subset, then clip to [0, max_val]."""
        m = rng.random(arr.shape[0]) < p
        delta = rng.integers(-max_add, max_add + 1, m.sum())
        out = arr.copy()
        out[m] = out[m] + delta
        out = np.clip(out, 0, max_val if max_val is not None else out.max())
        return out.astype(int)

    contact_overlap = perturb_counts(contact_overlap, p=feature_noise_prob, max_add=1, max_val=10)
    device_account_uniqueness_30d = perturb_counts(device_account_uniqueness_30d, p=feature_noise_prob, max_add=2, max_val=20)
    app_velocity_24h = perturb_counts(app_velocity_24h, p=feature_noise_prob, max_add=1, max_val=10)
    bank_account_age_months = perturb_counts(bank_account_age_months, p=feature_noise_prob, max_add=2, max_val=120)
    emergency_contacts = perturb_counts(emergency_contacts, p=feature_noise_prob, max_add=1, max_val=10)

    # Flip some categorical values (small probability)
    def flip_cats(arr, choices, p=feature_noise_prob * 0.35):
        """Resample a random subset uniformly from `choices` (may redraw the same value)."""
        m = rng.random(arr.shape[0]) < p
        out = arr.copy()
        out[m] = rng.choice(choices, size=m.sum())
        return out

    employment_type = flip_cats(employment_type, employment_types)
    time_of_application = flip_cats(time_of_application, time_buckets)
    mobile_os = flip_cats(mobile_os, os_choices)
    preferred_language = flip_cats(preferred_language, language_choices)

    # -----------------------------------
    # 3) Label generation via the tree
    # -----------------------------------
    # Everything starts as "Non-fraud"; each rule below only promotes to "Fraud".
    label = np.array(["Non-fraud"] * n, dtype=object)

    # Helper booleans
    id_low = identity_score < 0.65
    overlap_pos = contact_overlap > 0
    geo_bad = geo_ip_consistent == 0
    geo_ok = ~geo_bad
    device = device_account_uniqueness_30d

    # Any overlap with a known fraud network => Fraud, on both identity branches.
    label[overlap_pos] = "Fraud"

    # Branch: identity_score < 0.65, overlap == 0
    low = id_low & (~overlap_pos)
    #   geo bad => Fraud
    label[low & geo_bad] = "Fraud"
    #   geo good: device_uniqueness >= 3 => Fraud
    label[low & geo_ok & (device >= 3)] = "Fraud"
    #   geo good, device < 3: velocity >= 2 => Fraud else Non-fraud
    label[low & geo_ok & (device < 3) & (app_velocity_24h >= 2)] = "Fraud"

    # Branch: identity_score >= 0.65, overlap == 0
    high = (~id_low) & (~overlap_pos)
    #   geo bad: device_uniqueness >= 2 => Fraud else Non-fraud
    label[high & geo_bad & (device >= 2)] = "Fraud"
    #   geo good: device_uniqueness >= 3 => Fraud
    label[high & geo_ok & (device >= 3)] = "Fraud"
    #   geo good, device < 3: velocity >= 3 => Fraud else go to tie-breakers
    label[high & geo_ok & (device < 3) & (app_velocity_24h >= 3)] = "Fraud"

    # Tie-breaker zone: rows that passed every hard rule above.
    tie = high & geo_ok & (device < 3) & (app_velocity_24h < 3)

    # (6) loan_income_ratio >= 2.5 -> (7) bank_account_age < 2 months => Fraud
    label[tie & (loan_income_ratio >= 2.5) & (bank_account_age_months < 2)] = "Fraud"

    # (6) loan_income_ratio < 2.5 -> (8) risky employment AND (9) off-hours => Fraud
    # "freelancer" is the gig-style category this generator actually emits;
    # the literal "gig" never occurs but is kept so external data using it still matches.
    emp_risky = np.isin(employment_type, ["unemployed", "freelancer", "gig"])
    nightish = np.isin(time_of_application, ["late_night", "early_morning"])
    label[tie & (loan_income_ratio < 2.5) & emp_risky & nightish] = "Fraud"

    # -----------------------------------
    # 4) Add label noise (flip some labels)
    # -----------------------------------
    flip = rng.random(n) < label_flip_prob
    label[flip] = np.where(label[flip] == "Fraud", "Non-fraud", "Fraud")

    # -----------------------------------
    # 5) Build dataframe
    # -----------------------------------
    df = pd.DataFrame({
        # Highly relevant
        "identity_verification_match_score": identity_score.round(4),
        "contact_overlap_known_fraud_network": contact_overlap,
        "geo_ip_consistency_flag": geo_ip_consistent,  # 1 consistent, 0 inconsistent
        "device_account_uniqueness_30d": device_account_uniqueness_30d,
        "application_velocity_24h": app_velocity_24h,

        # Less relevant
        "loan_amount_income_ratio": loan_income_ratio.round(3),
        "bank_account_age_months": bank_account_age_months,
        "employment_type": employment_type,
        "time_of_application": time_of_application,
        "num_emergency_contacts": emergency_contacts,

        # Almost irrelevant but seemingly related
        "mobile_os": mobile_os,
        "loan_tenure_months": loan_tenure_months,
        "preferred_language": preferred_language,

        # Label
        "label": label
    })

    return df

if __name__ == "__main__":
    # Demo run: 20k rows with 8% label flips and 6% base feature noise,
    # then a five-row preview, the class balance, and a CSV export.
    df = synthesize_microloan_fraud(n=20000, seed=7, label_flip_prob=0.08, feature_noise_prob=0.06)
    print(df.head(5))
    print("\nClass balance:\n", df["label"].value_counts(normalize=True).round(3))
    df.to_csv("microloan_fraud_synth.csv", index=False)


   identity_verification_match_score  contact_overlap_known_fraud_network  \
0                             0.7890                                    0   
1                             0.8195                                    0   
2                             0.8411                                    0   
3                             0.2333                                    1   
4                             0.7578                                    0   

   geo_ip_consistency_flag  device_account_uniqueness_30d  \
0                        1                              0   
1                        1                              0   
2                        0                              0   
3                        0                              2   
4                        0                              0   

   application_velocity_24h  loan_amount_income_ratio  \
0                         0                     4.573   
1                         0                     1.818   

In [9]:
import numpy as np
import pandas as pd

def _microloan_tree_label(
    identity_verification_passed,
    device_fingerprint_match,
    application_velocity_24h,
    num_prior_defaults,
    geo_risk_score,
    income_to_loan_ratio,
    account_age_days,
    previous_loan_count,
    repayment_method,
    application_hour,
):
    """Evaluate the hard fraud decision tree on equally sized feature arrays.

    Tree:
      Node1: identity_verification_passed
       - 0 -> Node2: application_velocity_24h >= 3 => Fraud else Node3
              Node3: num_prior_defaults >= 1 => Fraud else Node3a
              Node3a: geo_risk_score >= 0.7 => Fraud else Non-fraud
       - 1 -> Node4: device_fingerprint_match
          - 0 -> Node5: application_velocity_24h >= 4 => Fraud else Node6
                 Node6: income_to_loan_ratio < 0.8 => Fraud else Node6a
                 Node6a: account_age_days < 14 => Fraud else Non-fraud
          - 1 -> Node7: num_prior_defaults >= 2 => Fraud else Node8
                 Node8: previous_loan_count >= 2 => Non-fraud else Node9
                 Node9: income_to_loan_ratio >= 1.2 => Non-fraud else Node10
                 Node10: repayment_method == cash_agent => Fraud else Node11
                 Node11: hour in [2..5] => Fraud else Non-fraud

    Returns:
        Integer array with 1 for Fraud and 0 for Non-fraud.
    """
    kyc_failed = identity_verification_passed == 0
    kyc_passed = identity_verification_passed == 1

    # KYC failed (Nodes 2, 3, 3a): every test on this chain ends in Fraud when
    # it fires, so the chain collapses to an OR of the three conditions.
    fraud_kyc_failed = kyc_failed & (
        (application_velocity_24h >= 3)
        | (num_prior_defaults >= 1)
        | (geo_risk_score >= 0.7)
    )

    # KYC passed, device mismatch (Nodes 5, 6, 6a): same OR-chain shape.
    device_mismatch = kyc_passed & (device_fingerprint_match == 0)
    fraud_device_mismatch = device_mismatch & (
        (application_velocity_24h >= 4)
        | (income_to_loan_ratio < 0.8)
        | (account_age_days < 14)
    )

    # KYC passed, device match (Nodes 7-11): Nodes 8 and 9 are Non-fraud exits,
    # so Nodes 10/11 are only reached with few prior loans AND a low ratio.
    device_match = kyc_passed & (device_fingerprint_match == 1)
    reaches_node10 = (previous_loan_count < 2) & (income_to_loan_ratio < 1.2)
    night_hours = (application_hour >= 2) & (application_hour <= 5)
    fraud_device_match = device_match & (
        (num_prior_defaults >= 2)
        | (reaches_node10 & ((repayment_method == "cash_agent") | night_hours))
    )

    return (fraud_kyc_failed | fraud_device_mismatch | fraud_device_match).astype(int)


def synthesize_microloan_fraud(n=5000, seed=42, label_noise=0.03, feature_noise=0.05):
    """
    Generates a synthetic micro-loan fraud dataset with:
    - Feature correlations (via shared latent factors)
    - A hard decision tree mapping features -> label
    - Small noise (both feature noise + label flipping)

    Args:
        n: Number of rows to generate.
        seed: Seed for ``np.random.default_rng``; the same arguments always
            produce the same frame.
        label_noise: Probability that each tree label is flipped.
        feature_noise: Noise level applied to the features *before* the tree
            is evaluated. It is used as the Gaussian std of
            ``feature_noise * 0.2`` on ``income_to_loan_ratio`` and
            ``feature_noise * 0.15`` on ``geo_risk_score``, and as a flip
            probability of ``feature_noise * 0.25`` on the two binary flags.

    Returns:
        pandas.DataFrame with 13 feature columns and a ``label`` column holding
        "Fraud" / "Non-fraud".
    """
    # NOTE: the order of the rng draws below determines the output for a given
    # seed; do not reorder them.
    rng = np.random.default_rng(seed)

    # ----------------------------
    # 1) Latent factors (drive correlations)
    # ----------------------------
    # fraud_latent: higher -> more likely fraud-like behaviors
    fraud_latent = rng.normal(0, 1, n)
    # stability_latent: higher -> older accounts, more prior good loans, higher income consistency
    stability_latent = rng.normal(0, 1, n)
    # geo_latent: higher -> riskier region
    geo_latent = rng.normal(0, 1, n)
    # marketing_latent: drives campaign + volume spikes (spurious correlation)
    marketing_latent = rng.normal(0, 1, n)

    # ----------------------------
    # 2) Generate features with correlations
    # ----------------------------
    # Highly relevant
    # num_prior_defaults correlates negatively with stability, positively with fraud_latent
    num_prior_defaults = np.clip(
        rng.poisson(lam=np.exp(0.15 + 0.55 * fraud_latent - 0.35 * stability_latent)),
        0, 8
    ).astype(int)

    # identity_verification_passed: more likely to fail with higher fraud_latent and higher geo risk
    p_kyc_pass = 1 / (1 + np.exp(1.0 * fraud_latent + 0.6 * geo_latent - 0.3 * stability_latent))
    identity_verification_passed = (rng.uniform(0, 1, n) < p_kyc_pass).astype(int)

    # device_fingerprint_match: positively with stability, negatively with fraud_latent; also correlated with KYC pass
    p_device_match = 1 / (1 + np.exp(0.9 * fraud_latent - 0.8 * stability_latent - 0.5 * identity_verification_passed))
    device_fingerprint_match = (rng.uniform(0, 1, n) < p_device_match).astype(int)

    # application_velocity_24h: higher with fraud_latent + marketing_latent + geo_latent; lower with stability
    base_vel = np.exp(0.5 + 0.9 * fraud_latent + 0.4 * marketing_latent + 0.3 * geo_latent - 0.4 * stability_latent)
    application_velocity_24h = np.clip(rng.poisson(lam=base_vel), 0, 20).astype(int)

    # income_to_loan_ratio: higher with stability, lower with fraud_latent (fraud requests relatively more)
    income_to_loan_ratio = np.clip(
        rng.normal(loc=1.2 + 0.45 * stability_latent - 0.35 * fraud_latent, scale=0.25),
        0.05, 4.0
    )

    # Less relevant
    # account_age_days correlates strongly with stability
    account_age_days = np.clip(
        rng.lognormal(mean=3.3 + 0.55 * stability_latent, sigma=0.55),
        1, 3650
    ).astype(int)

    # previous_loan_count correlates with account_age_days + stability; slightly lower with fraud_latent
    prev_count_lambda = np.exp(-0.3 + 0.35 * np.log1p(account_age_days) + 0.45 * stability_latent - 0.25 * fraud_latent)
    previous_loan_count = np.clip(rng.poisson(lam=prev_count_lambda / 5.0), 0, 30).astype(int)

    # employment_type depends on stability (proxy); emitted as the strings
    # "unemployed" / "self-employed" / "salaried" (lowest to highest score)
    emp_score = 0.7 * stability_latent - 0.2 * fraud_latent + rng.normal(0, 0.5, n)
    employment_type = np.where(emp_score > 0.6, "salaried",
                        np.where(emp_score > -0.2, "self-employed", "unemployed"))

    # repayment_method depends weakly on geo + fraud_latent (cash_agent slightly riskier)
    rm_score = 0.35 * fraud_latent + 0.25 * geo_latent + rng.normal(0, 0.6, n)
    repayment_method = np.where(rm_score > 0.7, "cash_agent",
                         np.where(rm_score > -0.1, "e-wallet", "bank_transfer"))

    # geo_risk_score: derived from geo_latent + small noise
    geo_risk_score = 1 / (1 + np.exp(-(0.9 * geo_latent + 0.2 * fraud_latent + rng.normal(0, 0.4, n))))
    geo_risk_score = np.clip(geo_risk_score, 0.0, 1.0)

    # Almost irrelevant but seemingly related
    application_hour = rng.integers(0, 24, n)

    # mobile_os: weak correlation with device match (noisy)
    p_ios = np.clip(0.35 + 0.10 * (device_fingerprint_match - 0.5) + rng.normal(0, 0.05, n), 0.05, 0.95)
    mobile_os = np.where(rng.uniform(0, 1, n) < p_ios, "iOS", "Android")

    # marketing_campaign_id: correlates with marketing_latent (spurious correlation with velocity)
    # create 6 campaigns: digitize against 5 quantile edges yields bins 0..5 -> "C1".."C6"
    campaign_bins = np.digitize(marketing_latent + rng.normal(0, 0.3, n),
                                bins=np.quantile(marketing_latent, [1/6, 2/6, 3/6, 4/6, 5/6]))
    marketing_campaign_id = np.array([f"C{b+1}" for b in campaign_bins])

    # ----------------------------
    # 3) Add small feature noise (measurement noise)
    # ----------------------------
    # Numeric noise
    income_to_loan_ratio = np.clip(
        income_to_loan_ratio + rng.normal(0, feature_noise * 0.2, n),
        0.05, 4.0
    )
    geo_risk_score = np.clip(
        geo_risk_score + rng.normal(0, feature_noise * 0.15, n),
        0.0, 1.0
    )

    # Occasionally flip a small fraction of binary features (sensor / pipeline noise)
    def flip_binary(x, p):
        flip = rng.uniform(0, 1, x.shape[0]) < p
        x2 = x.copy()
        x2[flip] = 1 - x2[flip]
        return x2

    identity_verification_passed = flip_binary(identity_verification_passed, feature_noise * 0.25)
    device_fingerprint_match = flip_binary(device_fingerprint_match, feature_noise * 0.25)

    # ----------------------------
    # 4) Decision tree label (hard rules), evaluated on the noisy features so
    #    the label is consistent with the columns that end up in the frame
    # ----------------------------
    label = _microloan_tree_label(
        identity_verification_passed=identity_verification_passed,
        device_fingerprint_match=device_fingerprint_match,
        application_velocity_24h=application_velocity_24h,
        num_prior_defaults=num_prior_defaults,
        geo_risk_score=geo_risk_score,
        income_to_loan_ratio=income_to_loan_ratio,
        account_age_days=account_age_days,
        previous_loan_count=previous_loan_count,
        repayment_method=repayment_method,
        application_hour=application_hour,
    )  # 1=Fraud, 0=Non-fraud

    # ----------------------------
    # 5) Small label noise (flip a small fraction)
    # ----------------------------
    flip = rng.uniform(0, 1, n) < label_noise
    label_noisy = label.copy()
    label_noisy[flip] = 1 - label_noisy[flip]

    # ----------------------------
    # 6) Assemble dataframe
    # ----------------------------
    df = pd.DataFrame({
        # Highly relevant
        "num_prior_defaults": num_prior_defaults,
        "identity_verification_passed": identity_verification_passed,
        "device_fingerprint_match": device_fingerprint_match,
        "application_velocity_24h": application_velocity_24h,
        "income_to_loan_ratio": np.round(income_to_loan_ratio, 3),

        # Less relevant
        "account_age_days": account_age_days,
        "employment_type": employment_type,
        "repayment_method": repayment_method,
        "geo_risk_score": np.round(geo_risk_score, 3),
        "previous_loan_count": previous_loan_count,

        # Almost irrelevant
        "application_hour": application_hour,
        "mobile_os": mobile_os,
        "marketing_campaign_id": marketing_campaign_id,

        # Label
        "label": np.where(label_noisy == 1, "Fraud", "Non-fraud")
    })

    return df

if __name__ == "__main__":
    # Demo run: build a 20k-row sample with label noise only, preview it,
    # report the label distribution, and write the frame to CSV.
    df = synthesize_microloan_fraud(
        n=20000,
        seed=7,
        label_noise=0.05,
        feature_noise=0,
    )
    print(df.head())

    print("\nClass balance:")
    # Relative frequency of each label value, rounded to three decimals.
    class_shares = df["label"].value_counts(normalize=True).round(3)
    print(class_shares)

    df.to_csv("microloan_fraud_synth.csv", index=False)
    print("\nSaved: microloan_fraud_synth.csv")


   num_prior_defaults  identity_verification_passed  device_fingerprint_match  \
0                   1                             0                         1   
1                   3                             0                         0   
2                   0                             1                         0   
3                   1                             1                         1   
4                   1                             1                         1   

   application_velocity_24h  income_to_loan_ratio  account_age_days  \
0                         1                 1.192                29   
1                         2                 0.837                13   
2                         1                 0.659                20   
3                         0                 1.478                26   
4                         0                 2.047                32   

  employment_type repayment_method  geo_risk_score  previous_loan_count  \
0        sa

In [10]:
import pandas as pd

In [11]:
# Load the encoded fraud dataset. Per the cell output in this notebook, the
# file carries an extra "Unnamed: 0" column and integer-coded categorical
# columns (employment_type, repayment_method, mobile_os, marketing_campaign_id).
# NOTE(review): "Unnamed: 0" looks like a saved index — confirm whether it
# should be read with index_col=0.
df = pd.read_csv('fraud_encode.csv')

In [12]:
# Bare expression: shows the loaded frame as the notebook cell output.
df

Unnamed: 0,num_prior_defaults,identity_verification_passed,device_fingerprint_match,application_velocity_24h,income_to_loan_ratio,account_age_days,employment_type,repayment_method,geo_risk_score,previous_loan_count,application_hour,mobile_os,marketing_campaign_id,fraud_label
0,1,1,1,0,2.016,38,1,2,0.702,1,2,1,1,0
1,1,0,0,1,0.953,26,2,1,0.355,1,2,1,5,1
2,4,0,0,1,1.085,39,2,1,0.210,0,1,2,2,1
3,3,1,0,11,0.055,21,3,1,0.225,0,15,2,3,1
4,2,0,0,4,0.853,8,3,2,0.656,0,12,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,0,0,4,0.632,31,2,1,0.698,1,3,1,3,1
996,1,1,1,3,0.776,25,1,2,0.402,0,2,1,1,1
997,0,1,1,1,1.482,96,2,1,0.449,0,3,1,5,0
998,0,1,1,5,2.355,95,1,2,0.578,1,21,2,1,0


In [13]:
import pandas as pd

# Every feature that receives a rank; the order here is the key order of the
# dict returned by rank_features_by_tree.
FEATURES = [
    "identity_verification_passed",
    "application_velocity_24h",
    "num_prior_defaults",
    "geo_risk_score",
    "device_fingerprint_match",
    "income_to_loan_ratio",
    "account_age_days",
    "previous_loan_count",
    "repayment_method",
    "application_hour",
    "employment_type",
    "mobile_os",
    "marketing_campaign_id"
]

# Rank given to features the decision path never consults.
MAX_RANK = 20


def rank_features_by_tree(row, cash_agent_values=("cash_agent",)):
    """
    Rank the features of one row by the order the decision tree consults them.

    The tree is walked from the root; each feature gets rank 1, 2, 3, ... the
    first time a node on the row's path tests it (including the node at which
    the path terminates). Features that the path never tests keep MAX_RANK.

    Args:
        row: Mapping-like row (e.g. a pd.Series) indexed by feature name.
        cash_agent_values: Value, or iterable of values, of
            ``repayment_method`` that count as the cash-agent category at
            Node 10. Defaults to the raw string label; pass the integer code
            instead when the column has been label-encoded, otherwise the
            Node 10 exit can never trigger on encoded data.

    Returns:
        dict {feature_name: rank} with one entry per name in FEATURES.
    """
    # Accept a bare scalar (or a single string) as well as an iterable, so a
    # string is never treated as a collection of characters.
    if isinstance(cash_agent_values, (str, bytes)) or not hasattr(cash_agent_values, "__iter__"):
        cash_agent_values = (cash_agent_values,)

    ranks = {f: MAX_RANK for f in FEATURES}
    current_rank = 1

    def use(feature):
        # Assign the next rank on first use only; a feature tested again
        # further down the path keeps its earlier (better) rank.
        nonlocal current_rank
        if ranks[feature] == MAX_RANK:
            ranks[feature] = current_rank
            current_rank += 1

    # ----- Decision tree traversal -----

    # Node 1
    use("identity_verification_passed")
    if row["identity_verification_passed"] == 0:
        # Node 2
        use("application_velocity_24h")
        if row["application_velocity_24h"] >= 3:
            return ranks
        # Node 3
        use("num_prior_defaults")
        if row["num_prior_defaults"] >= 1:
            return ranks
        # Node 3a (leaf test: either outcome ends the path)
        use("geo_risk_score")
        return ranks

    # identity_verification_passed == 1
    # Node 4
    use("device_fingerprint_match")
    if row["device_fingerprint_match"] == 0:
        # Node 5
        use("application_velocity_24h")
        if row["application_velocity_24h"] >= 4:
            return ranks
        # Node 6
        use("income_to_loan_ratio")
        if row["income_to_loan_ratio"] < 0.8:
            return ranks
        # Node 6a (leaf test: either outcome ends the path)
        use("account_age_days")
        return ranks

    # device_fingerprint_match == 1
    # Node 7
    use("num_prior_defaults")
    if row["num_prior_defaults"] >= 2:
        return ranks
    # Node 8
    use("previous_loan_count")
    if row["previous_loan_count"] >= 2:
        return ranks
    # Node 9
    use("income_to_loan_ratio")
    if row["income_to_loan_ratio"] >= 1.2:
        return ranks
    # Node 10
    use("repayment_method")
    repayment = row["repayment_method"]
    if any(repayment == value for value in cash_agent_values):
        return ranks
    # Node 11 (leaf test: either outcome ends the path)
    use("application_hour")
    return ranks


In [14]:
def compute_instance_feature_ranks(df):
    """
    Build the per-instance feature-rank table for a whole frame.

    Each row of ``df`` is passed to ``rank_features_by_tree``; the resulting
    dicts become the rows of the returned DataFrame, which reuses ``df``'s
    index and has one column per ranked feature.
    """
    per_row_ranks = [rank_features_by_tree(record) for _, record in df.iterrows()]
    return pd.DataFrame(per_row_ranks, index=df.index)


# Example usage: rank the features of every row of the `df` loaded from
# fraud_encode.csv and preview the first rows.
# NOTE(review): the fraud_encode.csv rows displayed in this notebook show
# repayment_method as integer codes, while the tree compares it with the
# string "cash_agent" — confirm the encoding before trusting the Node 10/11
# (repayment_method / application_hour) ranks.
rank_df = compute_instance_feature_ranks(df)

print(rank_df.head())


   identity_verification_passed  application_velocity_24h  num_prior_defaults  \
0                             1                        20                   3   
1                             1                         2                   3   
2                             1                         2                   3   
3                             1                         3                  20   
4                             1                         2                  20   

   geo_risk_score  device_fingerprint_match  income_to_loan_ratio  \
0              20                         2                     5   
1              20                        20                    20   
2              20                        20                    20   
3              20                         2                    20   
4              20                        20                    20   

   account_age_days  previous_loan_count  repayment_method  application_hour  \
0                2

In [15]:
# Persist the per-row ranks. index=False leaves the index out of the file, so
# rows correspond to the source frame by position only.
rank_df.to_csv('fraud_ranking.csv',index=False)