In [85]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## loading data

In [3]:
CONS_PATH = "/uss/hdsi-prismdata/q2-ucsd-consDF.pqt"
ACCT_PATH = "/uss/hdsi-prismdata/q2-ucsd-acctDF.pqt"
TRXN_PATH = "/uss/hdsi-prismdata/q2-ucsd-trxnDF.pqt"
CATMAP_PATH = "/uss/hdsi-prismdata/q2-ucsd-cat-map.csv"

In [4]:
# Load data
consdf = pd.read_parquet(CONS_PATH)
acctdf = pd.read_parquet(ACCT_PATH)
trxndf = pd.read_parquet(TRXN_PATH)
cat_map = pd.read_csv(CATMAP_PATH)

print("consdf:", consdf.shape)
print("acctdf:", acctdf.shape)
print("trxndf:", trxndf.shape)
print("cat_map:", cat_map.shape)

consdf: (15000, 4)
acctdf: (24466, 5)
trxndf: (6407321, 6)
cat_map: (50, 2)


In [5]:
consdf

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0
...,...,...,...,...
14995,14995,2022-03-08,655.0,
14996,14996,2022-01-15,625.0,
14997,14997,2022-01-31,688.0,
14998,14998,2022-03-08,722.0,


In [6]:
acctdf

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.90
...,...,...,...,...,...
24461,11500,24461,CHECKING,2022-03-27,732.75
24462,11615,24462,SAVINGS,2022-03-30,5.00
24463,11615,24463,CHECKING,2022-03-30,1956.46
24464,12210,24464,CHECKING,2022-03-28,2701.51


In [7]:
trxndf

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16
...,...,...,...,...,...,...
6407316,10533,6405304,31,4.96,DEBIT,2022-03-11
6407317,10533,6405305,12,63.48,DEBIT,2022-03-30
6407318,10533,6405306,12,53.99,DEBIT,2022-03-30
6407319,10533,6405307,12,175.98,DEBIT,2022-03-31


## data cleaning/prepping

In [8]:
consdf = consdf.copy()
consdf["evaluation_date"] = pd.to_datetime(consdf["evaluation_date"], errors="coerce")

# drop missing DQ_TARGET
consdf = consdf[consdf["DQ_TARGET"].notna()].copy()
consdf["DQ_TARGET"] = consdf["DQ_TARGET"].astype(int)

acctdf = acctdf.copy()
acctdf["balance_date"] = pd.to_datetime(acctdf["balance_date"], errors="coerce")

trxndf = trxndf.copy()
trxndf["posted_date"] = pd.to_datetime(trxndf["posted_date"], errors="coerce")

# Deduplicate transactions (use this whenever you build transaction features)
trxndf = (
    trxndf.sort_values(["posted_date"])
      .drop_duplicates(subset=["prism_transaction_id"], keep="first")
)


## scoring exclusions

In [9]:
# Accounts: how many accounts + how many balance snapshots
acct_stats = (
    acctdf.groupby("prism_consumer_id")
    .agg(
        n_accounts=("prism_account_id", "nunique"),
        n_balance_days=("balance_date", "nunique"),
        first_balance=("balance_date", "min"),
        last_balance=("balance_date", "max"),
    )
    .reset_index()
)

# Transactions: count + span + credits
tx_stats = (
    trxndf.groupby("prism_consumer_id")
    .agg(
        n_txn=("prism_transaction_id", "count"),
        first_txn=("posted_date", "min"),
        last_txn=("posted_date", "max"),
    )
    .reset_index()
)
tx_stats["txn_span_days"] = (tx_stats["last_txn"] - tx_stats["first_txn"]).dt.days

credit_stats = (
    trxndf.assign(is_credit=(trxndf["credit_or_debit"] == "CREDIT").astype(int))
    .groupby("prism_consumer_id")
    .agg(n_credit=("is_credit", "sum"))
    .reset_index()
)

# Combine into one scoring table (one row per consumer)
scoring = (
    consdf[["prism_consumer_id", "evaluation_date", "DQ_TARGET", "credit_score"]]
    .merge(acct_stats, on="prism_consumer_id", how="left")
    .merge(tx_stats, on="prism_consumer_id", how="left")
    .merge(credit_stats, on="prism_consumer_id", how="left")
)

# Fill missing stats with 0 where appropriate
for col in ["n_accounts", "n_balance_days", "n_txn", "txn_span_days", "n_credit"]:
    if col in scoring.columns:
        scoring[col] = scoring[col].fillna(0)


In [10]:
scoring.head()

Unnamed: 0,prism_consumer_id,evaluation_date,DQ_TARGET,credit_score,n_accounts,n_balance_days,first_balance,last_balance,n_txn,first_txn,last_txn,txn_span_days,n_credit
0,0,2021-09-01,0,726.0,2.0,1.0,2021-08-31,2021-08-31,408.0,2021-03-16,2021-09-11,179.0,38.0
1,1,2021-07-01,0,626.0,2.0,1.0,2021-06-30,2021-06-30,314.0,2021-01-15,2021-07-14,180.0,71.0
2,2,2021-05-01,0,680.0,2.0,1.0,2021-04-30,2021-04-30,448.0,2020-11-04,2021-05-03,180.0,81.0
3,3,2021-03-01,0,734.0,2.0,1.0,2021-02-28,2021-02-28,271.0,2020-09-23,2021-03-22,180.0,51.0
4,4,2021-10-01,0,676.0,2.0,2.0,2020-12-31,2021-09-30,306.0,2020-07-19,2021-06-21,337.0,40.0


In [11]:
# consumers with no accounts
consumers_with_accounts = set(acctdf["prism_consumer_id"].unique())
all_consumers = set(consdf["prism_consumer_id"].unique())

no_account_ids = list(all_consumers - consumers_with_accounts)

print("Total consumers:", len(all_consumers))
print("Consumers with NO accounts:", len(no_account_ids))

Total consumers: 12000
Consumers with NO accounts: 1592


In [12]:
1592/12000

0.13266666666666665

In [13]:
# checking to see how many "no accounts" have transactions
trx_no_account = trxndf[trxndf["prism_consumer_id"].isin(no_account_ids)]

consumers_no_account_with_txn = trx_no_account["prism_consumer_id"].nunique()

print("\nConsumers with NO accounts but WITH transactions:", consumers_no_account_with_txn)


# 3️⃣ Total transaction rows for these consumers
print("Total transaction rows for these consumers:", trx_no_account.shape[0])


# 4️⃣ Show example consumer IDs
print("\nExample consumer IDs (no account but with transactions):")
print(list(trx_no_account["prism_consumer_id"].unique())[:5])


# 5️⃣ Show sample transaction rows
print("\nSample transaction rows:")
display(trx_no_account.head(10))


Consumers with NO accounts but WITH transactions: 1479
Total transaction rows for these consumers: 961436

Example consumer IDs (no account but with transactions):
['1706', '3327', '2042', '1970', '114']

Sample transaction rows:


Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
858236,1706,857660,0,2000.0,DEBIT,2020-11-01
895947,3327,895260,0,300.0,CREDIT,2020-11-01
895953,3327,895266,0,200.0,DEBIT,2020-11-02
895954,3327,895267,0,100.0,DEBIT,2020-11-06
728042,2042,727489,2,500.0,CREDIT,2020-11-12
858237,1706,857661,1,5000.0,DEBIT,2020-11-12
728043,2042,727490,4,25.0,CREDIT,2020-11-18
728044,2042,727491,2,500.0,CREDIT,2020-11-18
728045,2042,727492,4,475.0,CREDIT,2020-11-19
88530,1970,88466,4,0.05,CREDIT,2020-11-20


In [14]:
scoring["n_txn"].describe()


count    12000.000000
mean       427.969500
std        393.403508
min          0.000000
25%        155.000000
50%        337.000000
75%        594.250000
max       8478.000000
Name: n_txn, dtype: float64

In [15]:
scoring["txn_span_days"].describe()


count    12000.000000
mean       153.381167
std         78.184482
min          0.000000
25%         88.000000
50%        178.000000
75%        242.000000
max        801.000000
Name: txn_span_days, dtype: float64

even the low activity consumers have 88 days (2-3 months) of transaction history

In [16]:
scoring["n_credit"].describe()


count    12000.000000
mean        73.214583
std         76.819153
min          0.000000
25%         28.000000
50%         52.000000
75%         93.000000
max       1553.000000
Name: n_credit, dtype: float64

bottom 25% has 28 credit transactions

In [17]:
RULES = {
    "no_accounts_and_no_transactions": 
    (scoring["n_accounts"] < 1) & (scoring["n_txn"] < 1),

    # short behavioral history (less than 30 days)
    "short_txn_history": scoring["txn_span_days"] < 30,
}

# store reasons
for name, mask in RULES.items():
    scoring[name] = mask

# Exclusion flag
scoring["excluded"] = scoring[list(RULES.keys())].any(axis=1)

# Summary
print("Total consumers:", scoring.shape[0])
print("Excluded:", scoring["excluded"].sum())
print("Eligible:", (~scoring["excluded"]).sum())
print("Exclusion rate:", scoring["excluded"].mean())


Total consumers: 12000
Excluded: 898
Eligible: 11102
Exclusion rate: 0.07483333333333334


In [18]:
eligible_ids = scoring.loc[~scoring["excluded"], "prism_consumer_id"]

print("Eligible consumers:", len(eligible_ids))


Eligible consumers: 11102


In [19]:
consdf_eligible = consdf[
    consdf["prism_consumer_id"].isin(eligible_ids)
].copy()

acctdf_eligible = acctdf[
    acctdf["prism_consumer_id"].isin(eligible_ids)
].copy()

trxndf_eligible = trxndf[
    trxndf["prism_consumer_id"].isin(eligible_ids)
].copy()


In [20]:
print("Consumers in consdf_eligible:", consdf_eligible["prism_consumer_id"].nunique())
print("Consumers in acctdf_eligible:", acctdf_eligible["prism_consumer_id"].nunique())
print("Consumers in trxndf_eligible:", trxndf_eligible["prism_consumer_id"].nunique())


Consumers in consdf_eligible: 11102
Consumers in acctdf_eligible: 9659
Consumers in trxndf_eligible: 11102


In [21]:
consdf = consdf_eligible
acctdf = acctdf_eligible
trxndf = trxndf_eligible

## feature engineering

In [22]:
initial_df = (
    acctdf
    .merge(consdf, on='prism_consumer_id', how='inner')
    .groupby(['prism_consumer_id'])
    .agg(
        balance=('balance', 'sum'),
        balance_date=('balance_date', 'max')
    )
    .reset_index()
).merge(trxndf,on='prism_consumer_id')

In [23]:
mapping = dict(zip(cat_map["category_id"], cat_map["category"]))
initial_df["category"] = initial_df["category"].replace(mapping)
monthly_summary=initial_df.copy()
monthly_summary['amount'] = np.where(initial_df['credit_or_debit'] == 'DEBIT', -initial_df['amount'],initial_df['amount'])
monthly_summary['posted_date'] = pd.to_datetime(monthly_summary['posted_date'])
monthly_summary = (
    monthly_summary
    .groupby(['prism_consumer_id', monthly_summary['posted_date'].dt.to_period('M')])
    .agg(
        starting_balance=('balance', 'first'),
        monthly_total=('balance', 'sum'),
        trxndf_count = ('balance', 'count')
    )
    .reset_index()
)
monthly_summary['posted_date'] = monthly_summary['posted_date'].dt.to_timestamp()

In [24]:
monthly_summary = monthly_summary.merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()


In [25]:
# ensure date type
monthly_summary["posted_date"] = pd.to_datetime(monthly_summary["posted_date"])

# sort properly
monthly_summary = monthly_summary.sort_values(["prism_consumer_id", "posted_date"])

# calculate running balance
monthly_summary["monthly_balance"] = (
    monthly_summary["starting_balance"]
    + monthly_summary.groupby("prism_consumer_id")["monthly_total"].cumsum()
)

In [26]:
del_df = monthly_summary[monthly_summary['DQ_TARGET'] == 1]
nondel_df = monthly_summary[monthly_summary['DQ_TARGET'] == 0]
ids_1 = del_df["prism_consumer_id"].dropna().unique()
ids_0 = del_df["prism_consumer_id"].dropna().unique()

In [27]:
mtotal_df = monthly_summary.groupby('prism_consumer_id').agg(
        DQ_TARGET = ('DQ_TARGET', 'first'),
        monthly_mean=('monthly_total', 'mean'),
        monthly_max=('monthly_total', 'max'),
        monthly_min=('monthly_total', 'min'),
        trxndf_count = ('trxndf_count','first'),
        month_count=('monthly_total', 'count')
    )

In [28]:
cd_df = initial_df[['prism_consumer_id','amount','credit_or_debit']].groupby(['prism_consumer_id','credit_or_debit']).sum().reset_index()


In [29]:
cd_df = (
    cd_df
    .pivot_table(
        index='prism_consumer_id',
        columns='credit_or_debit',
        values='amount',
        aggfunc='sum',
        fill_value=0
    )
    .assign(
        credit_debit_ratio=lambda x: x['CREDIT'] / (x['DEBIT'] + 1),
        net_flow=lambda x: x['CREDIT'] - x['DEBIT']
    )
)

In [30]:
cd_df = cd_df.reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()


In [31]:
net_df = initial_df[['prism_consumer_id','posted_date','category','credit_or_debit','amount']].copy()
net_df['amount'] = np.where(net_df['credit_or_debit'] == 'DEBIT', -net_df['amount'],net_df['amount'])
net_df['posted_date'] = pd.to_datetime(net_df['posted_date'])
net_df['month'] = net_df['posted_date'].dt.to_period('M')
mn_df = net_df.groupby(['prism_consumer_id','month']).agg(
        monthly_total=('amount', 'sum'),
        monthly_std =('amount','std')
    ).reset_index()


monthly features

In [32]:
monthly_features = mn_df.groupby(['prism_consumer_id']).agg(
    monthly_net_total=('monthly_total', 'sum'),
    monthly_net_avg=('monthly_total', 'mean'),
    monthly_net_max=('monthly_total', 'max'),
    monthly_net_min=('monthly_total', 'min'),
    monthly_std_avg=('monthly_std', 'mean')
).reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()
monthly_features['prism_consumer_id'] = monthly_features['prism_consumer_id'].astype(int)
mtotal_df = mtotal_df.reset_index()
mtotal_df['prism_consumer_id'] = mtotal_df['prism_consumer_id'].astype(int)
cd_df['prism_consumer_id'] = cd_df['prism_consumer_id'].astype(int)
monthly_features['net_range'] = monthly_features['monthly_net_max'] - monthly_features['monthly_net_min']

In [33]:
initial_df['amount'] = np.where(initial_df['credit_or_debit'] == 'DEBIT', -initial_df['amount'],initial_df['amount'])
cat_df = initial_df.groupby(['prism_consumer_id','category'])['amount'].sum().reset_index()

In [34]:
cat_pivot = (
    cat_df
    .pivot(
        index='prism_consumer_id',
        columns='category',
        values='amount'
    )
    .fillna(0)
)

In [35]:
outflows = cat_pivot.clip(upper=0).abs()
inflows  = cat_pivot.clip(lower=0)

cat_features = pd.DataFrame(index=cat_pivot.index)

cat_features['total_outflows'] = outflows.sum(axis=1)
cat_features['total_inflows']  = inflows.sum(axis=1)
cat_features['net_flow']       = cat_pivot.sum(axis=1)

In [36]:
for col in outflows.columns:
    cat_features[f'{col}_outflow_ratio'] = (
        outflows[col] / (cat_features['total_outflows'] + 1))

In [37]:
# Income reliance
cat_features['paycheck_ratio'] = (
    inflows.get('PAYCHECK', 0) / (cat_features['total_inflows'] + 1)
)

# Cash usage
cat_features['atm_cash_ratio'] = (
    outflows.get('ATM_CASH', 0) / (cat_features['total_outflows'] + 1)
)

# Entertainment vs essentials proxy
cat_features['entertainment_ratio'] = (
    outflows.get('ENTERTAINMENT', 0) / (cat_features['total_outflows'] + 1)
)

# Refund dependence
cat_features['refund_ratio'] = (
    inflows.get('REFUND', 0) / (cat_features['total_inflows'] + 1)
)

In [38]:
outflows = outflows.reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()


In [39]:
cat_features = cat_features.reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()


In [40]:
add_df = cat_features[['prism_consumer_id','refund_ratio','paycheck_ratio']].copy()
add_df['prism_consumer_id'] = add_df['prism_consumer_id'].astype(int)
outflows['prism_consumer_id'] = outflows['prism_consumer_id'].astype(int)
out_df = outflows.copy()

In [41]:
initial_df['amount'] = np.where(initial_df['credit_or_debit'] == 'DEBIT', -initial_df['amount'],initial_df['amount'])
cat_df = initial_df.groupby(['prism_consumer_id','category'])['amount'].mean().reset_index()

In [42]:
cat_pivot = (
    cat_df
    .pivot(
        index='prism_consumer_id',
        columns='category',
        values='amount'
    )
    .fillna(0)
)
cat_pivot.columns = cat_pivot.columns + "_trxnavg"
cat_pivot = cat_pivot.reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()
cat_pivot['prism_consumer_id'] = cat_pivot['prism_consumer_id'].astype(int)

income

In [43]:
mapping = dict(zip(cat_map["category_id"], cat_map["category"]))
trxndf["category"] = trxndf["category"].replace(mapping)

income_categories = [
    'PAYCHECK',
    'DEPOSIT',
    'UNEMPLOYMENT_BENEFITS',
    'OTHER_BENEFITS',
    'PENSION',
    'INVESTMENT_INCOME'
]

income_df = trxndf[
    trxndf['category'].isin(income_categories)
].copy()
income_df['prism_transaction_id'].duplicated().sum()
income_df['posted_date'] = pd.to_datetime(income_df['posted_date'])

In [44]:
income_time = (
    income_df
    .groupby('prism_consumer_id')
    .agg(
        first_income_date=('posted_date', 'min'),
        last_income_date=('posted_date', 'max')
    )
    .reset_index()
)

income_time['income_span_days'] = (
    income_time['last_income_date'] - income_time['first_income_date']
).dt.days

In [45]:
income_df = income_time[['prism_consumer_id','income_span_days']]
income_df['prism_consumer_id'] = income_time['prism_consumer_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_df['prism_consumer_id'] = income_time['prism_consumer_id'].astype(int)


preliminary testing

In [46]:
cat_pivot= cat_pivot.drop(columns='DQ_TARGET')


In [47]:
main_df= monthly_features.merge(mtotal_df,on='prism_consumer_id')
main_df['DQ_TARGET'] = main_df['DQ_TARGET_x']
main_df = main_df.drop(columns=['DQ_TARGET_x','DQ_TARGET_y'])
cd_df = cd_df.drop(columns=['net_flow','DQ_TARGET'])
main_df= main_df.merge(cd_df,on='prism_consumer_id')
main_df= main_df.merge(add_df,on='prism_consumer_id')
main_df= main_df.merge(out_df,on='prism_consumer_id')
main_df= main_df.merge(income_df,on='prism_consumer_id')
main_df= main_df.merge(cat_pivot,on='prism_consumer_id')
main_df

Unnamed: 0,prism_consumer_id,monthly_net_total,monthly_net_avg,monthly_net_max,monthly_net_min,monthly_std_avg,net_range,monthly_mean,monthly_max,monthly_min,...,REFUND_trxnavg,RENT_trxnavg,RISK_CATCH_ALL_trxnavg,RTO_LTO_trxnavg,SELF_TRANSFER_trxnavg,TAX_trxnavg,TIME_OR_STUFF_trxnavg,TRANSPORATION_trxnavg,TRAVEL_trxnavg,UNEMPLOYMENT_BENEFITS_trxnavg
0,0,-521.59,-74.512857,830.73,-2584.24,213.544425,3414.97,1.867299e+04,27231.45,8970.36,...,19.960000,0.000000,0.0,0.0,116.685652,867.840,0.000000,2.480000,54.375000,0.0
1,1,1805.43,257.918571,1109.02,-940.73,292.763392,2049.75,1.481371e+05,208052.46,102375.02,...,2.420000,0.000000,0.0,0.0,233.410256,1162.700,0.000000,25.900000,0.000000,0.0
2,10,-1190.04,-170.005714,431.40,-971.45,260.603079,1402.85,4.015226e+04,60169.52,19781.76,...,18.466000,103.000000,0.0,0.0,237.568750,0.000,0.000000,17.520000,0.000000,0.0
3,100,-4505.77,-750.961667,1276.72,-3332.81,832.186871,4609.53,5.399456e+04,63731.28,45142.99,...,1.468750,0.000000,0.0,0.0,547.296667,0.000,0.000000,0.000000,0.000000,0.0
4,1000,438.08,62.582857,2982.67,-2884.56,1223.790895,5867.23,2.871107e+03,3524.25,476.25,...,1.370000,0.000000,0.0,0.0,828.920370,0.000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,995,21842.68,3120.382857,10212.74,-3244.20,1016.422674,13456.94,2.053630e+06,3162590.31,670852.49,...,1.463333,1134.158000,0.0,0.0,1250.937000,626.432,0.000000,0.000000,0.000000,0.0
9326,996,26713.18,3816.168571,41464.50,-16811.41,2623.765971,58275.91,0.000000e+00,0.00,0.00,...,2.944444,0.000000,0.0,0.0,3532.145542,0.000,0.000000,12.000000,493.909091,0.0
9327,997,-14899.66,-2128.522857,206.99,-3741.45,745.079512,3948.44,4.787867e+06,6756531.35,1404823.35,...,14.420000,0.000000,0.0,0.0,940.464154,2516.000,20.149683,0.000000,0.000000,0.0
9328,998,5507.73,786.818571,3359.83,-1022.35,537.836676,4382.18,7.987620e+05,1116887.94,275774.80,...,9.546429,231.316667,0.0,0.0,759.527885,2991.340,0.000000,12.700000,248.140000,0.0


In [48]:
# columns I will need: credit/debit, amount, posted date, evaluation date, prism consumer id, DQ_TARGET
merged = pd.merge(consdf.dropna(), trxndf, on='prism_consumer_id', how='left')

In [49]:
merged = merged[merged['posted_date'] <= merged['evaluation_date']]
credit_only = merged[merged['credit_or_debit'] == 'CREDIT'].copy()
credit_only['posted_date'] = pd.to_datetime(credit_only['posted_date'])
credit_only['Year-Month'] = credit_only['posted_date'].dt.to_period('M')
debt_only = trxndf[trxndf['credit_or_debit']=='DEBIT']
monthly_inflow = credit_only.groupby(['prism_consumer_id', 'Year-Month'])['amount'].sum().reset_index(name='monthly_inflow')
consdf['Evaluation Month'] = consdf['evaluation_date'].dt.to_period('M')
with_eval_month = pd.merge(consdf, monthly_inflow, on='prism_consumer_id', how='left')

In [50]:
with_eval_month['months_diff'] = (
    (with_eval_month['Evaluation Month'].dt.year - with_eval_month['Year-Month'].dt.year) * 12 +
    (with_eval_month['Evaluation Month'].dt.month - with_eval_month['Year-Month'].dt.month)
)
last_year = with_eval_month[(with_eval_month['months_diff'] >= 1) & (with_eval_month['months_diff'] <= 12)]
sum_yearly_inflow = last_year.groupby('prism_consumer_id')['monthly_inflow'].sum().reset_index(name='avg_yearly_inflow')
year_std = last_year.groupby('prism_consumer_id')['monthly_inflow'].std().reset_index()
year_std.columns = ['prism_consumer_id', 'std_inflow']

In [51]:
# Trend: Is income increasing or decreasing?
def calculate_trend(group):
    if len(group) < 2:
        return 0
    months = group['months_diff'].values
    inflows = group['monthly_inflow'].values
    return np.polyfit(months, inflows, 1)[0]  # slope

trend = last_year.groupby('prism_consumer_id').apply(calculate_trend, include_groups=False).reset_index()
trend.columns = ['prism_consumer_id', 'trend']
num_transactions = last_year.groupby('prism_consumer_id').size().reset_index()
num_transactions.columns = ['prism_consumer_id', 'num_transactions']

In [52]:
debt_only = trxndf[trxndf['credit_or_debit'] == 'DEBIT'].copy()
debt_only['posted_date'] = pd.to_datetime(debt_only['posted_date'])
# debt_only['category'] = debt_only['category'].astype(int)

# debt_with_category = pd.merge(debt_only, cat_map, left_on='category', right_on='category_id', how='left')[['prism_consumer_id',\
#     'prism_transaction_id', 'amount', 'credit_or_debit', 'posted_date', 'category_id', 'category_y']]
debt_with_category = debt_only.rename(columns={'category_y':'category'})
groceries_only = debt_with_category[debt_with_category['category']=='GROCERIES']

debt_with_eval = pd.merge(groceries_only, consdf[['prism_consumer_id', 'evaluation_date']], on='prism_consumer_id', how='left')

# Filter for transactions in the 3 months before evaluation_date
debt_with_eval['months_before_eval'] = (
    (debt_with_eval['evaluation_date'].dt.year - debt_with_eval['posted_date'].dt.year) * 12 +
    (debt_with_eval['evaluation_date'].dt.month - debt_with_eval['posted_date'].dt.month)
)

debt_9m = debt_with_eval[(debt_with_eval['months_before_eval'] >= 0) & 
                          (debt_with_eval['months_before_eval'] < 9)]

# total spend of groceries per consumer over a 9 month window (last 9 months before eval date)
total_spend_groceries_9m = debt_9m.groupby('prism_consumer_id')['amount'].sum().reset_index()
total_spend_groceries_9m.columns = ['prism_consumer_id', 'sum_groceries_9m']

In [53]:
# total spend of dining per consumer over a month window (last month before eval date)
dining_only = debt_with_category[debt_with_category['category']=='FOOD_AND_BEVERAGES']

debt_with_eval_dining = pd.merge(dining_only, consdf[['prism_consumer_id', 'evaluation_date']], on='prism_consumer_id', how='left')

# Filter for transactions in the 6 months before evaluation_date
debt_with_eval_dining['months_before_eval'] = (
    (debt_with_eval_dining['evaluation_date'].dt.year - debt_with_eval_dining['posted_date'].dt.year) * 12 +
    (debt_with_eval_dining['evaluation_date'].dt.month - debt_with_eval_dining['posted_date'].dt.month)
)

debt_6m = debt_with_eval_dining[(debt_with_eval_dining['months_before_eval'] >= 0) & 
                          (debt_with_eval_dining['months_before_eval'] < 6)]

# total spend of groceries per consumer over a 6 month window (last 6 months before eval date)
total_spend_dining_6m = debt_6m.groupby('prism_consumer_id')['amount'].sum().reset_index()
total_spend_dining_6m.columns = ['prism_consumer_id', 'sum_dining_6m']

In [54]:
# merge evaluation date ONCE
tx = debt_with_category.merge(
    consdf[['prism_consumer_id', 'evaluation_date']],
    on='prism_consumer_id',
    how='left'
)

tx = tx[tx['credit_or_debit'] == 'DEBIT']
tx['amount'] = tx['amount'].abs()

# numerator
total_spend_gambling = tx[tx['category'] == 'GAMBLING'].groupby('prism_consumer_id')['amount'].sum()

# denominator
total_spend_all = tx.groupby('prism_consumer_id')['amount'].sum()

pct_spend_gambling = (total_spend_gambling / total_spend_all).fillna(0).reset_index(name='pct_spend_gambling')

In [55]:
essentials = ['RENT', 'MORTGAGE', 'BILLS_UTILITIES', 'ESSENTIAL_SERVICES', 'GROCERIES', 'AUTOMOTIVE', 'TRANSPORTATION', \
'HEALTHCARE_MEDICAL', 'INSURANCE', 'CHILD_DEPENDENTS', 'PETS', 'TAX', 'LOAN', 'AUTO_LOAN', 'DEBT', 'CREDIT_CARD_PAYMENT', \
'EDUCATION', 'LEGAL', 'GOVERNMENT_SERVICES']

total_spend_essentials = tx[tx['category'].isin(essentials)].groupby('prism_consumer_id')['amount'].sum()

pct_spend_essentials = (total_spend_essentials / total_spend_all).reset_index()

pct_spend_essentials = pct_spend_essentials.rename(columns={'amount':'pct_spend_essentials'})

In [56]:
# # change in groceries per consumer from the 3 most recent months to the prior 3-6 months before evaluation date
# lowers AUC from 0.721 to 0.71

# recent 3 months (0–2)
recent_3m = debt_with_eval[(debt_with_eval['months_before_eval'] >= 0) & (debt_with_eval['months_before_eval'] < 3)]

recent_spend = recent_3m.groupby('prism_consumer_id')['amount'].sum().reset_index(name='groceries_0_3m')

# prior 3 months (3–5)
prior_3m = debt_with_eval[(debt_with_eval['months_before_eval'] >= 3) & (debt_with_eval['months_before_eval'] < 6)]

prior_spend = prior_3m.groupby('prism_consumer_id')['amount'].sum().reset_index(name='groceries_3_6m')

# merge and compute delta
delta_groceries_3m = recent_spend.merge(
    prior_spend,
    on='prism_consumer_id',
    how='outer'
).fillna(0)

delta_groceries_3m['delta_groceries_3m'] = delta_groceries_3m['groceries_0_3m'] - delta_groceries_3m['groceries_3_6m']

delta_groceries_3m = delta_groceries_3m[['prism_consumer_id', 'delta_groceries_3m']]

utilities = ['BILLS_UTILITIES', 'ESSENTIAL_SERVICES']

total_spend_utilities = tx[tx['category'].isin(utilities)].groupby('prism_consumer_id')['amount'].sum()

pct_spend_utilities = (total_spend_utilities / total_spend_all).reset_index()

pct_spend_utilities = pct_spend_utilities.rename(columns={'amount':'pct_spend_utilities'})

In [57]:
# has overdraft - 6 months
# Merge evaluation dates with ALL debt transactions
debt_with_eval = pd.merge(
    debt_with_category, 
    consdf[['prism_consumer_id', 'evaluation_date']], 
    on='prism_consumer_id', 
    how='left'
)

# Calculate days before evaluation
debt_with_eval['days_before_eval'] = (
    debt_with_eval['evaluation_date'] - debt_with_eval['posted_date']
).dt.days

# Filter for OVERDRAFT category AND within 6 months
overdraft_6m = debt_with_eval[
    (debt_with_eval['category'] == 'OVERDRAFT') &
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with overdrafts
has_overdraft_6m = overdraft_6m.groupby('prism_consumer_id').size().reset_index(name='overdraft_count')
has_overdraft_6m['has_overdraft_6m'] = 1

has_overdraft_6m = has_overdraft_6m[['prism_consumer_id', 'has_overdraft_6m']]

In [58]:
# has account fees - 6 months
# Merge evaluation dates with ALL debt transactions
debt_with_eval = pd.merge(
    debt_with_category, 
    consdf[['prism_consumer_id', 'evaluation_date']], 
    on='prism_consumer_id', 
    how='left'
)

# Calculate days before evaluation
debt_with_eval['days_before_eval'] = (
    debt_with_eval['evaluation_date'] - debt_with_eval['posted_date']
).dt.days

# Filter for ACCOUNT FEES category AND within 6 months
acct_fees_6m = debt_with_eval[
    (debt_with_eval['category'] == 'ACCOUNT_FEES') &
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with acct fee
has_acct_fee_6m = acct_fees_6m.groupby('prism_consumer_id').size().reset_index(name='acct_fees_count')
has_acct_fee_6m['has_acct_fee_6m'] = 1

has_acct_fee_6m = has_acct_fee_6m[['prism_consumer_id', 'has_acct_fee_6m']]

In [59]:
#atm cash ratio per consumer

debt_with_eval = pd.merge(
    debt_with_category,
    consdf[['prism_consumer_id', 'evaluation_date']],
    on='prism_consumer_id',
    how='left'
)

debt_with_eval['posted_date'] = pd.to_datetime(debt_with_eval['posted_date'])
debt_with_eval['evaluation_date'] = pd.to_datetime(debt_with_eval['evaluation_date'])

debt_with_eval = debt_with_eval[
    debt_with_eval['posted_date'] <= debt_with_eval['evaluation_date']
]

total_debt_spend = debt_with_eval.groupby('prism_consumer_id')['amount'].sum().reset_index(name='total_debit_spend')

In [60]:
atm_cash_spend = (
    debt_with_eval[debt_with_eval['category'] == 'ATM_CASH']
    .groupby('prism_consumer_id')['amount']
    .sum()
    .reset_index(name='atm_cash_spend')
)

atm_cash_ratio = total_debt_spend.merge(atm_cash_spend, on='prism_consumer_id',how='left').fillna(0)
atm_cash_ratio['atm_cash_ratio'] = atm_cash_ratio['atm_cash_spend'] / atm_cash_ratio['total_debit_spend']
atm_cash_ratio['atm_cash_ratio'] = (
    atm_cash_ratio['atm_cash_ratio']
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)


In [61]:
# Merge evaluation dates with ALL debt transactions
debt_with_eval = pd.merge(
    debt_with_category, 
    consdf[['prism_consumer_id', 'evaluation_date']], 
    on='prism_consumer_id', 
    how='left'
)

# Calculate days before evaluation
debt_with_eval['days_before_eval'] = (
    debt_with_eval['evaluation_date'] - debt_with_eval['posted_date']
).dt.days

atm_cash_freq_6m = acct_fees_6m.groupby('prism_consumer_id').size().reset_index(name='atm_cash_freq_6m')

In [62]:
# refund ratio
credit_only = trxndf[trxndf['credit_or_debit']=='CREDIT']
# merged_credit = pd.merge(credit_only, cat_map, left_on='category', right_on='category_id', how='left')[['prism_consumer_id', 'prism_transaction_id', 'amount', \
# 'credit_or_debit', 'posted_date', 'category_id', 'category_y']]
merged_credit = credit_only.rename(columns={'category_y': 'category'})

credit_with_eval = pd.merge(
    merged_credit,
    consdf[['prism_consumer_id', 'evaluation_date']],
    on='prism_consumer_id',
    how='left'
)

credit_with_eval['posted_date'] = pd.to_datetime(credit_with_eval['posted_date'])
credit_with_eval['evaluation_date'] = pd.to_datetime(credit_with_eval['evaluation_date'])

credit_with_eval['days_before_eval'] = (credit_with_eval['evaluation_date'] - credit_with_eval['posted_date']).dt.days
window = credit_with_eval[(credit_with_eval['days_before_eval'] >= 0) & (credit_with_eval['days_before_eval'] <= 180)]

refund = window[window['category']=='REFUND'].groupby('prism_consumer_id')['amount'].sum().reset_index(name='refund_amount')

In [63]:
debit_only = trxndf[trxndf['credit_or_debit'] == 'DEBIT']
# merged_debit = pd.merge(
#     debit_only,
#     cat_map,
#     left_on='category',
#     right_on='category_id',
#     how='left'
# )[[
#     'prism_consumer_id',
#     'prism_transaction_id',
#     'amount',
#     'credit_or_debit',
#     'posted_date',
#     'category_id',
#     'category_y'
# ]]

merged_debit = debit_only.rename(columns={'category_y': 'category'})
debit_with_eval = pd.merge(
    merged_debit,
    consdf[['prism_consumer_id', 'evaluation_date']],
    on='prism_consumer_id',
    how='left'
)

debit_with_eval['posted_date'] = pd.to_datetime(debit_with_eval['posted_date'])
debit_with_eval['evaluation_date'] = pd.to_datetime(debit_with_eval['evaluation_date'])

debit_with_eval['days_before_eval'] = (
    debit_with_eval['evaluation_date'] - debit_with_eval['posted_date']
).dt.days

debit_window = debit_with_eval[
    (debit_with_eval['days_before_eval'] >= 0) &
    (debit_with_eval['days_before_eval'] <= 180)
]

debit_spend = debit_window[
    debit_window['category'] != 'REFUND'
]
denominator = (
    debit_spend
    .groupby('prism_consumer_id')['amount']
    .sum()
    .reset_index(name='total_debit_spend')
)


In [64]:
refund_ratio = denominator.merge(
    refund,
    on='prism_consumer_id',
    how='left'
).fillna(0)

refund_ratio['refund_ratio'] = (
    refund_ratio['refund_amount'] /
    refund_ratio['total_debit_spend']
)

refund_ratio['refund_ratio'] = (
    refund_ratio['refund_ratio']
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)
refund_ratio = refund_ratio[['prism_consumer_id', 'refund_ratio']]

In [65]:
# debt_payment_ratio
# (LOAN + CREDIT_CARD_PAYMENT + AUTO_LOAN + BNPL) / total_debit_spend
categories_of_interest = ['LOAN', 'CREDIT_CARD_PAYMENT', 'AUTO_LOAN', 'BNPL']

summary = (
    debit_with_eval
    .groupby('prism_consumer_id')
    .agg(
        total_debit_spend=('amount', 'sum'),
        debt_spend=('amount', lambda x: x[
            debit_with_eval.loc[x.index, 'category'].isin(categories_of_interest)
        ].sum())
    )
    .reset_index()
)

summary['debt_spend_ratio'] = summary['debt_spend'] / summary['total_debit_spend']

In [66]:
# bnpl usage flag
# Filter for BNPL category AND within 6 months
bnpl_usage_6m = debt_with_eval[
    (debt_with_eval['category'] == 'BNPL') &
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with acct fee
has_bnpl_usage_6m = bnpl_usage_6m.groupby('prism_consumer_id').size().reset_index(name='bnpl_usage_flag')
has_bnpl_usage_6m['bnpl_usage_flag'] = 1

has_bnpl_usage_6m = has_bnpl_usage_6m[['prism_consumer_id', 'bnpl_usage_flag']]

In [67]:
debt_categories = ['LOAN', 'CREDIT_CARD_PAYMENT', 'AUTO_LOAN', 'BNPL']

debt_category_count = (
    debit_with_eval[debit_with_eval['category'].isin(debt_categories)]
    .groupby(['prism_consumer_id', 'category'])['amount']
    .sum()
    .reset_index()
)

# keep only categories with non-zero spend
debt_category_count = debt_category_count[debt_category_count['amount'] != 0]

debt_category_count = (
    debt_category_count
    .groupby('prism_consumer_id')
    .size()
    .reset_index(name='debt_category_count')
)

In [68]:
# discretionary drop flag
discretionary_cat_map = ['ENTERTAINMENT', 'TRAVEL', 'FITNESS']
df = debit_with_eval.copy()
df['month'] = df['posted_date'].dt.to_period('M')
monthly_disc = df[df['category'].isin(discretionary_cat_map)].groupby(['prism_consumer_id', 'month'])['amount'].sum().reset_index()

In [69]:

monthly_disc = monthly_disc.sort_values(['prism_consumer_id', 'month'])
monthly_disc['disc_3m_spend'] = monthly_disc.groupby('prism_consumer_id')['amount'].rolling(3, min_periods=3).sum().reset_index(drop=True)
monthly_disc['prev_disc_3m_spend'] = (
    monthly_disc
    .groupby('prism_consumer_id')['disc_3m_spend']
    .shift(3)
)

In [70]:

DROP_THRESHOLD = 0.30

monthly_disc['discretionary_drop_flag_3m'] = (
    (monthly_disc['prev_disc_3m_spend'] > 0) &
    ((monthly_disc['prev_disc_3m_spend'] - monthly_disc['disc_3m_spend'])
     / monthly_disc['prev_disc_3m_spend'] >= DROP_THRESHOLD)
).astype(int)

discretionary_drop_flag_3m = (
    monthly_disc
    .dropna(subset=['discretionary_drop_flag_3m'])
    .groupby('prism_consumer_id')
    .tail(1)
    [['prism_consumer_id', 'discretionary_drop_flag_3m']]
)

In [71]:
# essential spend volatility in 6 months
# Filter for essentials AND within 6 months
essential_spend_volatility_6m = debt_with_eval[
    (debt_with_eval['category'].isin(essentials)) &
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with acct fee
essential_spend_volatility_6m = essential_spend_volatility_6m.groupby('prism_consumer_id')['amount'].std().reset_index(name='essential_spend_volatility_6m')

essential_spend_volatility_6m = essential_spend_volatility_6m[['prism_consumer_id', 'essential_spend_volatility_6m']]

In [72]:
# child dependents spend sum in 6 months
# Filter for child dependents AND within 6 months
child_dependents_6m = debt_with_eval[
    (debt_with_eval['category']=='CHILD_DEPENDENTS')&
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with child dependents
has_child_deps_6m = bnpl_usage_6m.groupby('prism_consumer_id').size().reset_index(name='child_dependents_6m')
has_child_deps_6m['child_dependents_6m'] = 1

In [73]:

# child dependents spend sum in 6 months
# Filter for essentials AND within 6 months
pets_6m = debt_with_eval[
    (debt_with_eval['category']=='PETS')&
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with child dependents
has_pets_6m = pets_6m.groupby('prism_consumer_id').size().reset_index(name='pets_6m')
has_pets_6m['pets_6m'] = 1


In [74]:
df_eval = pd.merge(consdf, sum_yearly_inflow, on='prism_consumer_id', how='inner')
df_eval = pd.merge(df_eval, year_std, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, trend, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, num_transactions, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, total_spend_groceries_9m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, total_spend_dining_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, pct_spend_gambling, on='prism_consumer_id',how='left')
df_eval = pd.merge(df_eval, pct_spend_essentials, on='prism_consumer_id',how='left')
df_eval = pd.merge(df_eval, delta_groceries_3m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, pct_spend_utilities, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_overdraft_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, atm_cash_ratio, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_acct_fee_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, atm_cash_freq_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, refund_ratio, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, summary, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_bnpl_usage_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, debt_category_count, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, discretionary_drop_flag_3m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, essential_spend_volatility_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_child_deps_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_pets_6m, on='prism_consumer_id', how='left')
df_eval['has_overdraft_6m'] = df_eval['has_overdraft_6m'].fillna(0).astype(int)
df_eval['has_acct_fee_6m'] = df_eval['has_acct_fee_6m'].fillna(0).astype(int)
df_eval['atm_cash_freq_6m'] = df_eval['atm_cash_freq_6m'].fillna(0).astype(int)
df_eval['bnpl_usage_flag'] = df_eval['bnpl_usage_flag'].fillna(0).astype(int)
df_eval['debt_category_count'] = df_eval['debt_category_count'].fillna(0).astype(int)
df_eval['child_dependents_6m'] = df_eval['child_dependents_6m'].fillna(0).astype(int)
df_eval['pets_6m'] = df_eval['pets_6m'].fillna(0).astype(int)

In [76]:
df_eval['prism_consumer_id'] =df_eval['prism_consumer_id'].astype(int)
df_eval = main_df.merge(df_eval,on='prism_consumer_id')

In [77]:

period_cols = [col for col in df_eval.columns 
               if str(df_eval[col].dtype).startswith('period')]

datetime_cols = df_eval.select_dtypes(include=['datetime64[ns]', 'datetimetz']).columns

time_cols = list(datetime_cols) + period_cols
df_eval = df_eval.drop(columns=time_cols)

In [78]:
df_eval = df_eval.drop(columns=['DQ_TARGET_y','DQ_TARGET_x','credit_score'])


## model testing

In [81]:
# Define target
y = df_eval["DQ_TARGET"].astype(int)

# Drop ID + target
X = df_eval.drop(columns=["prism_consumer_id", "DQ_TARGET"], errors="ignore")

# Fill missing values
X = X.fillna(0)

print("Rows:", X.shape[0])
print("Features:", X.shape[1])


Rows: 9060
Features: 137


In [82]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [83]:
def eval_model(name, model, Xtr, ytr, Xte, yte):
    t0 = time.perf_counter()
    model.fit(Xtr, ytr)
    t1 = time.perf_counter()

    # Train AUC
    ytr_prob = model.predict_proba(Xtr)[:, 1]
    train_auc = roc_auc_score(ytr, ytr_prob)

    # Test AUC
    t2 = time.perf_counter()
    yte_prob = model.predict_proba(Xte)[:, 1]
    t3 = time.perf_counter()

    test_auc = roc_auc_score(yte, yte_prob)

    # Classification report
    y_pred = model.predict(Xte)

    print(f"\n{name}")
    print(f"  Train AUC: {train_auc:.4f}")
    print(f"  Test  AUC: {test_auc:.4f}")
    print(f"  Train time: {t1 - t0:.4f}s")
    print(f"  Score time: {t3 - t2:.4f}s")
    print("\nClassification Report:")
    print(classification_report(yte, y_pred))

    return {
        "model": name,
        "train_auc": train_auc,
        "test_auc": test_auc,
        "train_time": t1 - t0,
        "score_time": t3 - t2
    }


In [86]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

logreg = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    random_state=42
)

res_log = eval_model(
    "Logistic Regression (All Features)",
    logreg,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test
)



Logistic Regression (All Features)
  Train AUC: 0.8038
  Test  AUC: 0.7262
  Train time: 11.9225s
  Score time: 0.0013s

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.67      0.79      1653
           1       0.17      0.69      0.27       159

    accuracy                           0.67      1812
   macro avg       0.56      0.68      0.53      1812
weighted avg       0.89      0.67      0.74      1812



In [87]:
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=3,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

res_rf = eval_model(
    "Random Forest (All Features)",
    rf,
    X_train,
    y_train,
    X_test,
    y_test
)



Random Forest (All Features)
  Train AUC: 0.9999
  Test  AUC: 0.7666
  Train time: 2.4462s
  Score time: 0.1363s

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      1653
           1       0.38      0.07      0.12       159

    accuracy                           0.91      1812
   macro avg       0.65      0.53      0.53      1812
weighted avg       0.87      0.91      0.88      1812



In [88]:
xgb = XGBClassifier(
    n_estimators=600,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bytree=0.85,
    min_child_weight=3,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

res_xgb = eval_model(
    "XGBoost (All Features)",
    xgb,
    X_train,
    y_train,
    X_test,
    y_test
)



XGBoost (All Features)
  Train AUC: 1.0000
  Test  AUC: 0.7602
  Train time: 3.3778s
  Score time: 0.0136s

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      1653
           1       0.40      0.06      0.11       159

    accuracy                           0.91      1812
   macro avg       0.66      0.53      0.53      1812
weighted avg       0.87      0.91      0.88      1812



In [89]:
results_df = pd.DataFrame([res_log, res_rf, res_xgb])
results_df.sort_values("test_auc", ascending=False)


Unnamed: 0,model,train_auc,test_auc,train_time,score_time
1,Random Forest (All Features),0.999937,0.766576,2.446192,0.136329
2,XGBoost (All Features),0.999993,0.760196,3.377812,0.013586
0,Logistic Regression (All Features),0.803794,0.726189,11.922532,0.001262


## feature selection

In [90]:
# Target
y = df_eval["DQ_TARGET"].astype(int)

# Features
X = df_eval.drop(columns=["prism_consumer_id", "DQ_TARGET"], errors="ignore")
X = X.fillna(0)

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [91]:
selector_model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

selector_model.fit(X_train, y_train)

# Get feature importances
importances = pd.Series(
    selector_model.feature_importances_,
    index=X_train.columns
)

# Select top 50
top_50 = importances.sort_values(ascending=False).head(50).index.tolist()

print("Top 10 Features:")
print(importances.sort_values(ascending=False).head(10))


Top 10 Features:
month_count            0.113312
num_transactions       0.058769
monthly_max            0.015436
monthly_min            0.013476
CREDIT_CARD_PAYMENT    0.013407
OVERDRAFT              0.012026
refund_ratio_x         0.011274
monthly_mean           0.011003
INSURANCE              0.010725
LEGAL_trxnavg          0.010128
dtype: float32


In [92]:
X_train_50 = X_train[top_50]
X_test_50  = X_test[top_50]


In [93]:
def eval_model(name, model, Xtr, ytr, Xte, yte):
    t0 = time.perf_counter()
    model.fit(Xtr, ytr)
    t1 = time.perf_counter()

    ytr_prob = model.predict_proba(Xtr)[:, 1]
    train_auc = roc_auc_score(ytr, ytr_prob)

    t2 = time.perf_counter()
    yte_prob = model.predict_proba(Xte)[:, 1]
    t3 = time.perf_counter()

    test_auc = roc_auc_score(yte, yte_prob)

    y_pred = model.predict(Xte)

    print(f"\n{name}")
    print(f"  Train AUC: {train_auc:.4f}")
    print(f"  Test  AUC: {test_auc:.4f}")
    print(f"  Overfit gap: {train_auc - test_auc:.4f}")
    print(f"  Train time: {t1 - t0:.4f}s")
    print(f"  Score time: {t3 - t2:.4f}s")
    print("\nClassification Report:")
    print(classification_report(yte, y_pred))

    return {
        "model": name,
        "train_auc": train_auc,
        "test_auc": test_auc,
        "gap": train_auc - test_auc
    }


In [94]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_50)
X_test_scaled  = scaler.transform(X_test_50)

logreg = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    random_state=42
)

res_log = eval_model(
    "Logistic Regression (Top 50)",
    logreg,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test
)



Logistic Regression (Top 50)
  Train AUC: 0.7728
  Test  AUC: 0.7181
  Overfit gap: 0.0547
  Train time: 0.0954s
  Score time: 0.0003s

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.64      0.77      1653
           1       0.16      0.70      0.26       159

    accuracy                           0.65      1812
   macro avg       0.56      0.67      0.51      1812
weighted avg       0.89      0.65      0.72      1812



In [95]:
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,              # limit depth to prevent overfit
    min_samples_leaf=5,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

res_rf = eval_model(
    "Random Forest (Top 50)",
    rf,
    X_train_50,
    y_train,
    X_test_50,
    y_test
)



Random Forest (Top 50)
  Train AUC: 0.9513
  Test  AUC: 0.7640
  Overfit gap: 0.1873
  Train time: 1.4773s
  Score time: 0.1364s

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.85      0.89      1653
           1       0.22      0.43      0.29       159

    accuracy                           0.82      1812
   macro avg       0.58      0.64      0.59      1812
weighted avg       0.88      0.82      0.84      1812



In [96]:
xgb = XGBClassifier(
    n_estimators=600,
    max_depth=4,              # shallower to reduce overfit
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0.2,
    reg_alpha=0.3,
    reg_lambda=1.5,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

res_xgb = eval_model(
    "XGBoost (Top 50)",
    xgb,
    X_train_50,
    y_train,
    X_test_50,
    y_test
)



XGBoost (Top 50)
  Train AUC: 0.9803
  Test  AUC: 0.7705
  Overfit gap: 0.2098
  Train time: 0.8632s
  Score time: 0.0068s

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      1653
           1       0.50      0.07      0.12       159

    accuracy                           0.91      1812
   macro avg       0.71      0.53      0.54      1812
weighted avg       0.88      0.91      0.88      1812



In [97]:
results_df = pd.DataFrame([res_log, res_rf, res_xgb])
results_df.sort_values("test_auc", ascending=False)


Unnamed: 0,model,train_auc,test_auc,gap
2,XGBoost (Top 50),0.980257,0.770476,0.209781
1,Random Forest (Top 50),0.951332,0.764023,0.187309
0,Logistic Regression (Top 50),0.772781,0.718085,0.054697


In [98]:
# --- Split data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Forward Selection ---
selected_features = []
remaining_features = list(X.columns)
target_feature_count = 50

for i in range(target_feature_count):
    best_auc = 0
    best_feat = None
    
    for feat in remaining_features:
        current_feats = selected_features + [feat]
        
        model = XGBClassifier(
            n_estimators=200,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="auc",
            random_state=42
        )
        model.fit(X_train[current_feats], y_train)
        
        y_probs = model.predict_proba(X_test[current_feats])[:, 1]
        auc = roc_auc_score(y_test, y_probs)
        
        if auc > best_auc:
            best_auc = auc
            best_feat = feat
    
    if best_feat is None:
        print("No improvement; stopping early.")
        break
    
    selected_features.append(best_feat)
    remaining_features.remove(best_feat)
    print(f"Step {i+1}: Added feature '{best_feat}' with AUC={best_auc:.4f}")

# --- Final Selected Features ---
print("Top 50 Selected Features:", selected_features)

# --- Train Final XGBoost Model ---
final_model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    use_label_encoder=False,
    random_state=42
)
final_model.fit(X_train[selected_features], y_train)

y_probs = final_model.predict_proba(X_test[selected_features])[:, 1]
final_auc = roc_auc_score(y_test, y_probs)
print("Final Test ROC-AUC:", final_auc)

Step 1: Added feature 'num_transactions' with AUC=0.6969
Step 2: Added feature 'monthly_min' with AUC=0.7316
Step 3: Added feature 'debt_category_count' with AUC=0.7390
Step 4: Added feature 'AUTOMOTIVE_trxnavg' with AUC=0.7521
Step 5: Added feature 'pct_spend_gambling' with AUC=0.7676
Step 6: Added feature 'GROCERIES' with AUC=0.7767
Step 7: Added feature 'INSURANCE_trxnavg' with AUC=0.7767
Step 8: Added feature 'TRAVEL' with AUC=0.7792
Step 9: Added feature 'ENTERTAINMENT_trxnavg' with AUC=0.7881
Step 10: Added feature 'paycheck_ratio' with AUC=0.7939
Step 11: Added feature 'OVERDRAFT_trxnavg' with AUC=0.7953
Step 12: Added feature 'FITNESS_trxnavg' with AUC=0.7955
Step 13: Added feature 'TAX' with AUC=0.7988
Step 14: Added feature 'LOAN' with AUC=0.7969
Step 15: Added feature 'month_count' with AUC=0.8032
Step 16: Added feature 'atm_cash_spend' with AUC=0.8038
Step 17: Added feature 'refund_ratio_x' with AUC=0.8070
Step 18: Added feature 'OVERDRAFT' with AUC=0.8124
Step 19: Added fe

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Final Test ROC-AUC: 0.8051151517918632


In [34]:
from sklearn.model_selection import train_test_split

# Features
X = df_feat.drop(columns=["prism_consumer_id", "evaluation_date", "DQ_TARGET"])
y = df_feat["DQ_TARGET"].astype(int)

# Stratified split (important for class imbalance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (8881, 184)
Test size: (2221, 184)


In [35]:
import time
from sklearn.metrics import roc_auc_score, classification_report

def eval_model(name, model, Xtr, ytr, Xte, yte):
    t0 = time.perf_counter()
    model.fit(Xtr, ytr)
    t1 = time.perf_counter()

    # Train AUC
    y_train_prob = model.predict_proba(Xtr)[:, 1]
    train_auc = roc_auc_score(ytr, y_train_prob)

    t2 = time.perf_counter()
    y_test_prob = model.predict_proba(Xte)[:, 1]
    t3 = time.perf_counter()

    test_auc = roc_auc_score(yte, y_test_prob)

    print(f"\n{name}")
    print(f"  Train AUC: {train_auc:.4f}")
    print(f"  Test  AUC: {test_auc:.4f}")
    print(f"  Train time: {t1-t0:.4f}s")
    print(f"  Score time: {t3-t2:.4f}s")
    print("\nClassification Report (Test):")
    print(classification_report(yte, model.predict(Xte)))

    return {
        "model": name,
        "train_auc": train_auc,
        "test_auc": test_auc,
        "train_time": t1-t0,
        "score_time": t3-t2
    }


In [36]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=2000, n_jobs=-1)

results_logreg = eval_model(
    "Logistic Regression",
    logreg,
    X_train, y_train,
    X_test, y_test
)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression
  Train AUC: 0.6811
  Test  AUC: 0.6628
  Train time: 4.0574s
  Score time: 0.0126s

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2039
           1       0.36      0.03      0.05       182

    accuracy                           0.92      2221
   macro avg       0.64      0.51      0.50      2221
weighted avg       0.87      0.92      0.88      2221



In [37]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    random_state=42,
    n_jobs=-1
)

results_rf = eval_model(
    "Random Forest",
    rf,
    X_train, y_train,
    X_test, y_test
)




Random Forest
  Train AUC: 0.9638
  Test  AUC: 0.7876
  Train time: 1.4046s
  Score time: 0.0827s

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2039
           1       0.00      0.00      0.00       182

    accuracy                           0.92      2221
   macro avg       0.46      0.50      0.48      2221
weighted avg       0.84      0.92      0.88      2221



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

results_xgb = eval_model(
    "XGBoost",
    xgb,
    X_train, y_train,
    X_test, y_test
)



XGBoost
  Train AUC: 0.9977
  Test  AUC: 0.7860
  Train time: 2.4419s
  Score time: 0.0151s

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      2039
           1       0.59      0.09      0.16       182

    accuracy                           0.92      2221
   macro avg       0.76      0.54      0.56      2221
weighted avg       0.90      0.92      0.89      2221



In [39]:
import pandas as pd

results_df = pd.DataFrame([
    results_logreg,
    results_rf,
    results_xgb
])

results_df.sort_values("test_auc", ascending=False)


Unnamed: 0,model,train_auc,test_auc,train_time,score_time
1,Random Forest,0.963814,0.787576,1.404599,0.082664
2,XGBoost,0.997664,0.786032,2.441929,0.015106
0,Logistic Regression,0.681137,0.662838,4.057393,0.01258


In [49]:
from sklearn.model_selection import train_test_split

X = df_feat.drop(columns=["prism_consumer_id", "evaluation_date", "DQ_TARGET"])
y = df_feat["DQ_TARGET"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [50]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

selector_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("select", SelectKBest(score_func=mutual_info_classif, k=50)),
])

selector_pipe.fit(X_train, y_train)

Xtr_50 = selector_pipe.transform(X_train)
Xte_50 = selector_pipe.transform(X_test)

print("Top 50 feature matrix shape:", Xtr_50.shape)


Top 50 feature matrix shape: (8881, 50)


In [51]:
import time
from sklearn.metrics import roc_auc_score, classification_report

def evaluate_model_np(name, model, Xtr, ytr, Xte, yte):
    t0 = time.perf_counter()
    model.fit(Xtr, ytr)
    t1 = time.perf_counter()

    train_probs = model.predict_proba(Xtr)[:, 1]
    test_probs  = model.predict_proba(Xte)[:, 1]

    train_auc = roc_auc_score(ytr, train_probs)
    test_auc  = roc_auc_score(yte, test_probs)

    print(f"\n{name} (Top 50)")
    print(f"Train AUC: {train_auc:.4f}")
    print(f"Test  AUC: {test_auc:.4f}")
    print("\nClassification Report (Test, threshold=0.5):")
    print(classification_report(yte, model.predict(Xte), zero_division=0))

    return {
        "model": name,
        "train_auc": train_auc,
        "test_auc": test_auc
    }


In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
Xtr_scaled = scaler.fit_transform(Xtr_50)
Xte_scaled = scaler.transform(Xte_50)

logreg = LogisticRegression(
    max_iter=3000,
    class_weight="balanced",
    n_jobs=-1
)

results_log = evaluate_model_np(
    "Logistic Regression",
    logreg,
    Xtr_scaled, y_train,
    Xte_scaled, y_test
)



Logistic Regression (Top 50)
Train AUC: 0.7296
Test  AUC: 0.7400

Classification Report (Test, threshold=0.5):
              precision    recall  f1-score   support

           0       0.97      0.64      0.77      2039
           1       0.16      0.74      0.26       182

    accuracy                           0.65      2221
   macro avg       0.56      0.69      0.51      2221
weighted avg       0.90      0.65      0.73      2221



In [53]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

results_rf = evaluate_model_np(
    "Random Forest",
    rf,
    Xtr_50, y_train,
    Xte_50, y_test
)



Random Forest (Top 50)
Train AUC: 0.9294
Test  AUC: 0.7775

Classification Report (Test, threshold=0.5):
              precision    recall  f1-score   support

           0       0.95      0.86      0.90      2039
           1       0.24      0.50      0.32       182

    accuracy                           0.83      2221
   macro avg       0.59      0.68      0.61      2221
weighted avg       0.89      0.83      0.85      2221



In [54]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

results_xgb = evaluate_model_np(
    "XGBoost",
    xgb,
    Xtr_50, y_train,
    Xte_50, y_test
)



XGBoost (Top 50)
Train AUC: 0.9850
Test  AUC: 0.7621

Classification Report (Test, threshold=0.5):
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      2039
           1       0.40      0.07      0.11       182

    accuracy                           0.92      2221
   macro avg       0.66      0.53      0.53      2221
weighted avg       0.88      0.92      0.89      2221



### income

In [23]:
income_df = (
    trxndf_eligible[trxndf_eligible["credit_or_debit"] == "CREDIT"]
    .groupby("prism_consumer_id")
    .agg(
        total_income=("amount", "sum"),
        avg_income=("amount", "mean"),
        median_income=("amount", "median"),
        max_income=("amount", "max"),
        income_txn_count=("amount", "count"),
        income_std=("amount", "std")
    )
    .reset_index()
)

income_df["income_cv"] = (
    income_df["income_std"] / income_df["avg_income"]
)


### category

In [24]:
trx_cat = trxndf_eligible.copy()

# ensure numeric category id exists
trx_cat["category_id"] = pd.to_numeric(trx_cat["category"], errors="coerce").astype("Int64")

cat_tmp = cat_map.copy()
cat_tmp["category_id"] = pd.to_numeric(cat_tmp["category_id"], errors="coerce").astype("Int64")

trx_cat = trx_cat.merge(
    cat_tmp[["category_id", "category"]].rename(columns={"category": "category_name"}),
    on="category_id",
    how="left",
    validate="m:1"
)

trx_cat

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date,category_id,category_name
0,3761,1336892,14,8.70,DEBIT,2020-07-06,14,FOOD_AND_BEVERAGES
1,1607,311813,17,8.86,DEBIT,2020-07-06,17,AUTOMOTIVE
2,3761,1336893,14,11.19,DEBIT,2020-07-06,14,FOOD_AND_BEVERAGES
3,3761,1336894,14,8.72,DEBIT,2020-07-06,14,FOOD_AND_BEVERAGES
4,1607,311812,20,21.99,DEBIT,2020-07-06,20,ENTERTAINMENT
...,...,...,...,...,...,...,...,...
5130548,8495,1659534,14,10.01,DEBIT,2024-04-03,14,FOOD_AND_BEVERAGES
5130549,8495,1659533,14,7.42,DEBIT,2024-04-03,14,FOOD_AND_BEVERAGES
5130550,8495,1659505,18,31.67,DEBIT,2024-04-03,18,GROCERIES
5130551,8495,1659532,14,7.98,DEBIT,2024-04-03,14,FOOD_AND_BEVERAGES


In [25]:
# Only spending
spend_df = trx_cat[trx_cat["credit_or_debit"] == "DEBIT"].copy()

# Total spend per category per consumer
cat_spend = (
    spend_df
    .groupby(["prism_consumer_id", "category_name"])["amount"]
    .sum()
    .unstack(fill_value=0)
    .add_prefix("spend_")
    .reset_index()
)


In [26]:
cat_count = (
    spend_df
    .groupby(["prism_consumer_id", "category_name"])["amount"]
    .count()
    .unstack(fill_value=0)
    .add_prefix("txn_count_")
    .reset_index()
)


In [27]:
total_spend = (
    spend_df
    .groupby("prism_consumer_id")["amount"]
    .sum()
    .rename("total_spend")
    .reset_index()
)


In [168]:
cat_spend = cat_spend.merge(total_spend, on="prism_consumer_id", how="left")

# Create ratio features
spend_cols = [c for c in cat_spend.columns if c.startswith("spend_")]

for col in spend_cols:
    cat_spend[f"{col}_ratio"] = (
        cat_spend[col] / cat_spend["total_spend"]
    ).fillna(0)


In [169]:
overdraft_df = (
    spend_df[spend_df["category_name"] == "OVERDRAFT"]
    .groupby("prism_consumer_id")
    .agg(
        overdraft_count=("amount", "count"),
        overdraft_total=("amount", "sum")
    )
    .reset_index()
)


In [170]:
overdraft_df["has_overdraft"] = (
    overdraft_df["overdraft_count"] > 0
).astype(int)


In [171]:
income_cat_df = trx_cat[trx_cat["credit_or_debit"] == "CREDIT"]

income_by_type = (
    income_cat_df
    .groupby(["prism_consumer_id", "category_name"])["amount"]
    .sum()
    .unstack(fill_value=0)
    .add_prefix("income_")
    .reset_index()
)


In [172]:
category_features = cat_spend.merge(cat_count, on="prism_consumer_id", how="left")
category_features = category_features.merge(overdraft_df, on="prism_consumer_id", how="left")
category_features = category_features.merge(income_by_type, on="prism_consumer_id", how="left")

category_features = category_features.fillna(0)


In [173]:
trx_cat["posted_date"] = pd.to_datetime(trx_cat["posted_date"])
trx_cat["year_month"] = trx_cat["posted_date"].dt.to_period("M")


In [174]:
monthly_spend = (
    trx_cat[trx_cat["credit_or_debit"] == "DEBIT"]
    .groupby(["prism_consumer_id", "year_month", "category_name"])["amount"]
    .sum()
    .reset_index()
)


In [175]:
monthly_spend = monthly_spend.sort_values(
    ["prism_consumer_id", "category_name", "year_month"]
)

monthly_spend["t"] = (
    monthly_spend.groupby(["prism_consumer_id", "category_name"])
    .cumcount()
)


In [176]:
def compute_slope(df):
    n = len(df)
    sum_t = df["t"].sum()
    sum_y = df["amount"].sum()
    sum_tt = (df["t"] ** 2).sum()
    sum_ty = (df["t"] * df["amount"]).sum()

    denom = (n * sum_tt - sum_t**2)

    if denom == 0:
        return 0.0

    return (n * sum_ty - sum_t * sum_y) / denom


trend_df = (
    monthly_spend
    .groupby(["prism_consumer_id", "category_name"])
    .apply(compute_slope)
    .reset_index(name="spend_trend")
)


  .apply(compute_slope)


In [177]:
trend_wide = (
    trend_df
    .pivot(index="prism_consumer_id", 
           columns="category_name", 
           values="spend_trend")
    .add_prefix("trend_")
    .reset_index()
)

trend_wide = trend_wide.fillna(0)


In [178]:
cutoff_date = trx_cat["posted_date"].max() - pd.Timedelta(days=90)

recent_spend = (
    trx_cat[
        (trx_cat["credit_or_debit"] == "DEBIT") &
        (trx_cat["posted_date"] >= cutoff_date)
    ]
    .groupby("prism_consumer_id")["amount"]
    .sum()
    .rename("recent_spend_90d")
    .reset_index()
)

older_spend = (
    trx_cat[
        (trx_cat["credit_or_debit"] == "DEBIT") &
        (trx_cat["posted_date"] < cutoff_date)
    ]
    .groupby("prism_consumer_id")["amount"]
    .sum()
    .rename("older_spend")
    .reset_index()
)

momentum_df = recent_spend.merge(
    older_spend,
    on="prism_consumer_id",
    how="left"
).fillna(0)

momentum_df["spend_momentum"] = (
    momentum_df["recent_spend_90d"] /
    (momentum_df["older_spend"] + 1)
)


In [179]:
category_features = category_features.merge(
    trend_wide,
    on="prism_consumer_id",
    how="left"
)

category_features = category_features.merge(
    momentum_df,
    on="prism_consumer_id",
    how="left"
)

category_features = category_features.fillna(0)


### balance

In [182]:
tx = trxndf_eligible.copy()

tx["posted_date"] = pd.to_datetime(tx["posted_date"])

tx["signed_amount"] = tx["amount"].where(
    tx["credit_or_debit"] == "CREDIT",
    -tx["amount"]
)

tx = tx.sort_values(["prism_consumer_id", "posted_date"])
tx["running_balance"] = (
    tx.groupby("prism_consumer_id")["signed_amount"]
    .cumsum()
)


In [183]:
tx["year_month"] = tx["posted_date"].dt.to_period("M")

monthly_balance = (
    tx.groupby(["prism_consumer_id", "year_month"])["running_balance"]
    .mean()
    .reset_index()
)


In [184]:
monthly_balance = monthly_balance.sort_values(
    ["prism_consumer_id", "year_month"]
)

monthly_balance["t"] = (
    monthly_balance.groupby("prism_consumer_id")
    .cumcount()
)


In [185]:
def compute_slope(df):
    n = len(df)
    sum_t = df["t"].sum()
    sum_y = df["running_balance"].sum()
    sum_tt = (df["t"] ** 2).sum()
    sum_ty = (df["t"] * df["running_balance"]).sum()

    denom = (n * sum_tt - sum_t**2)

    if denom == 0:
        return 0.0

    return (n * sum_ty - sum_t * sum_y) / denom


balance_trend = (
    monthly_balance
    .groupby("prism_consumer_id")
    .apply(compute_slope)
    .reset_index(name="balance_trend")
)


  .apply(compute_slope)


In [186]:
balance_volatility = (
    monthly_balance
    .groupby("prism_consumer_id")["running_balance"]
    .std()
    .reset_index(name="balance_volatility")
)


In [187]:
negative_balance = (
    tx.groupby("prism_consumer_id")["running_balance"]
    .apply(lambda x: (x < 0).mean())
    .reset_index(name="pct_time_negative_balance")
)


In [188]:
balance_features = balance_trend.merge(
    balance_volatility,
    on="prism_consumer_id",
    how="left"
)

balance_features = balance_features.merge(
    negative_balance,
    on="prism_consumer_id",
    how="left"
).fillna(0)


In [190]:
# Use trx_cat (already has posted_date cleaned)
income_tx = trx_cat[trx_cat["credit_or_debit"] == "CREDIT"].copy()

income_tx["posted_date"] = pd.to_datetime(income_tx["posted_date"])
income_tx["year_month"] = income_tx["posted_date"].dt.to_period("M")

monthly_income = (
    income_tx
    .groupby(["prism_consumer_id", "year_month"])["amount"]
    .sum()
    .reset_index()
)


In [191]:
income_mean = (
    monthly_income
    .groupby("prism_consumer_id")["amount"]
    .mean()
    .reset_index(name="avg_monthly_income")
)

income_std = (
    monthly_income
    .groupby("prism_consumer_id")["amount"]
    .std()
    .reset_index(name="income_volatility")
)

income_stability = income_mean.merge(
    income_std,
    on="prism_consumer_id",
    how="left"
)

income_stability = income_stability.fillna(0)


In [192]:
income_stability["income_cv"] = (
    income_stability["income_volatility"] /
    (income_stability["avg_monthly_income"] + 1)
)


In [193]:
monthly_income = monthly_income.sort_values(
    ["prism_consumer_id", "year_month"]
)

monthly_income["t"] = (
    monthly_income.groupby("prism_consumer_id")
    .cumcount()
)


In [194]:
def compute_income_slope(df):
    n = len(df)
    sum_t = df["t"].sum()
    sum_y = df["amount"].sum()
    sum_tt = (df["t"] ** 2).sum()
    sum_ty = (df["t"] * df["amount"]).sum()

    denom = (n * sum_tt - sum_t**2)

    if denom == 0:
        return 0.0

    return (n * sum_ty - sum_t * sum_y) / denom


income_trend = (
    monthly_income
    .groupby("prism_consumer_id")
    .apply(compute_income_slope)
    .reset_index(name="income_trend")
)


  .apply(compute_income_slope)


In [195]:
income_stability = income_stability.merge(
    income_trend,
    on="prism_consumer_id",
    how="left"
).fillna(0)


In [196]:
features_df = income_df.merge(category_features, on="prism_consumer_id", how="left")
features_df = features_df.merge(income_stability, on="prism_consumer_id", how="left")
features_df = features_df.merge(balance_features, on="prism_consumer_id", how="left")

features_df = features_df.fillna(0)


In [197]:
model_df = consdf_eligible.merge(
    features_df,
    on="prism_consumer_id",
    how="left"
).fillna(0)


In [198]:
y = model_df["DQ_TARGET"].astype(int)

X = model_df.drop(
    columns=[
        "prism_consumer_id",
        "evaluation_date",
        "DQ_TARGET",
        "credit_score"   # IMPORTANT
    ],
    errors="ignore"
)


In [199]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=42,
    stratify=y
)


In [200]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [201]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    random_state=42
)

logreg.fit(X_train_scaled, y_train)


In [202]:
from sklearn.metrics import roc_auc_score

# Train AUC
train_probs = logreg.predict_proba(X_train_scaled)[:, 1]
train_auc = roc_auc_score(y_train, train_probs)

# Test AUC
test_probs = logreg.predict_proba(X_test_scaled)[:, 1]
test_auc = roc_auc_score(y_test, test_probs)

print("Train ROC-AUC:", round(train_auc, 4))
print("Test  ROC-AUC:", round(test_auc, 4))


Train ROC-AUC: 0.8927
Test  ROC-AUC: 0.6728


In [203]:
from sklearn.metrics import classification_report

y_pred = logreg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.74      0.83      8154
           1       0.15      0.53      0.24       728

    accuracy                           0.72      8882
   macro avg       0.55      0.64      0.53      8882
weighted avg       0.88      0.72      0.78      8882



In [5]:
# Ensure datetime
trxndf = trxndf.copy()
trxndf["posted_date"] = pd.to_datetime(trxndf["posted_date"], errors="coerce")

# Drop duplicated prism_transaction_id (keep first chronologically per consumer when possible)
# (Your original notebook deduped by prism_transaction_id only; we keep that behavior but stabilize by posted_date.)
trxndf = trxndf.sort_values(["posted_date"])
trxndf = trxndf.drop_duplicates(subset=["prism_transaction_id"], keep="first")

# Map category_id -> category_name (keep original numeric category_id in `category`)
cat_map = cat_map.rename(columns={"category_id": "category", "category": "category_name"})
mapping = dict(zip(cat_map["category"], cat_map["category_name"]))
trxndf["category_name"] = trxndf["category"].map(mapping)

print("trxndf after dedupe:", trxndf.shape)
print("category_name nulls:", trxndf["category_name"].isna().mean())

trxndf after dedupe: (6405309, 7)
category_name nulls: 0.0


In [6]:
def build_features_categories_income(consdf: pd.DataFrame, trxndf: pd.DataFrame) -> pd.DataFrame:
    """Income + paycheck + income volatility + recent income + income category ratios (from features_categories.ipynb)."""
    income_categories = [
        "PAYCHECK",
        "DEPOSIT",
        "UNEMPLOYMENT_BENEFITS",
        "OTHER_BENEFITS",
        "PENSION",
        "INVESTMENT_INCOME",
    ]

    income_df = trxndf[trxndf["category_name"].isin(income_categories)].copy()
    income_df["posted_date"] = pd.to_datetime(income_df["posted_date"], errors="coerce")

    # If duplicates exist at consumer+transaction, drop to avoid double-counting
    income_df = income_df.sort_values("posted_date").drop_duplicates(
        subset=["prism_consumer_id", "prism_transaction_id"], keep="first"
    )

    income_core = (
        income_df.groupby("prism_consumer_id")
        .agg(
            total_income=("amount", "sum"),
            avg_income=("amount", "mean"),
            median_income=("amount", "median"),
            max_income=("amount", "max"),
            income_txn_count=("amount", "count"),
        )
        .reset_index()
    )

    # Ratios of income sources (counts)
    category_counts = (
        income_df.groupby(["prism_consumer_id", "category_name"])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )

    category_ratios = category_counts.merge(
        income_core[["prism_consumer_id", "income_txn_count"]],
        on="prism_consumer_id",
        how="left",
    )

    for cat in income_categories:
        if cat in category_ratios.columns:
            category_ratios[f"{cat.lower()}_ratio"] = category_ratios[cat] / (category_ratios["income_txn_count"] + 1e-9)

    category_ratios = category_ratios[
        ["prism_consumer_id"] + [c for c in category_ratios.columns if c.endswith("_ratio")]
    ]

    income_vol = (
        income_df.groupby("prism_consumer_id")
        .agg(income_std=("amount", "std"))
        .reset_index()
    )

    # income_cv = std / avg
    avg_income_map = income_core.set_index("prism_consumer_id")["avg_income"]
    income_vol["income_cv"] = income_vol.set_index("prism_consumer_id")["income_std"] / (avg_income_map + 1e-9)
    income_vol = income_vol.reset_index()

    paycheck_df = income_df[income_df["category_name"] == "PAYCHECK"]
    paycheck_features = (
        paycheck_df.groupby("prism_consumer_id")
        .agg(
            paycheck_count=("amount", "count"),
            avg_paycheck=("amount", "mean"),
            std_paycheck=("amount", "std"),
            max_paycheck=("amount", "max"),
        )
        .reset_index()
    )

    income_time = (
        income_df.groupby("prism_consumer_id")
        .agg(first_income_date=("posted_date", "min"), last_income_date=("posted_date", "max"))
        .reset_index()
    )
    income_time["income_span_days"] = (income_time["last_income_date"] - income_time["first_income_date"]).dt.days

    latest_date = income_df["posted_date"].max()
    if pd.isna(latest_date):
        recent_income = pd.DataFrame(columns=["prism_consumer_id", "income_last_90d", "txn_last_90d"])
    else:
        recent_income = (
            income_df[income_df["posted_date"] >= latest_date - pd.Timedelta(days=90)]
            .groupby("prism_consumer_id")
            .agg(income_last_90d=("amount", "sum"), txn_last_90d=("amount", "count"))
            .reset_index()
        )

    # Return ONLY engineered features (keep key). Do NOT include target columns here.
    out = (
        consdf[["prism_consumer_id"]]
        .merge(income_core, on="prism_consumer_id", how="left")
        .merge(income_vol, on="prism_consumer_id", how="left")
        .merge(paycheck_features, on="prism_consumer_id", how="left")
        .merge(category_ratios, on="prism_consumer_id", how="left")
        .merge(income_time[["prism_consumer_id", "income_span_days"]], on="prism_consumer_id", how="left")
        .merge(recent_income, on="prism_consumer_id", how="left")
    )

    return out.fillna(0)


def build_ellie_w5_features(consdf: pd.DataFrame, acctdf: pd.DataFrame, trxndf: pd.DataFrame) -> pd.DataFrame:
    """Monthly stability + credit/debit aggregates + category outflow ratios + category transaction averages (from ellie_w5.ipynb)."""
    # --- Initial consumer-level balance snapshot (sum across accounts) ---
    acct_tmp = acctdf.copy()
    acct_tmp["balance_date"] = pd.to_datetime(acct_tmp["balance_date"], errors="coerce")

    acct_agg = (
        acct_tmp.groupby("prism_consumer_id")
        .agg(balance=("balance", "sum"), balance_date=("balance_date", "max"))
        .reset_index()
    )

    initial_df = acct_agg.merge(trxndf, on="prism_consumer_id", how="inner").copy()

    # Signed amount
    initial_df["signed_amount"] = np.where(
        initial_df["credit_or_debit"].eq("DEBIT"), -initial_df["amount"], initial_df["amount"]
    )
    initial_df["posted_date"] = pd.to_datetime(initial_df["posted_date"], errors="coerce")

    # --- Credit/Debit aggregates ---
    cd_df = (
        initial_df[["prism_consumer_id", "amount", "credit_or_debit"]]
        .groupby(["prism_consumer_id", "credit_or_debit"])["amount"]
        .sum()
        .reset_index()
        .pivot_table(index="prism_consumer_id", columns="credit_or_debit", values="amount", aggfunc="sum", fill_value=0)
    )
    cd_df["credit_debit_ratio"] = cd_df.get("CREDIT", 0) / (cd_df.get("DEBIT", 0) + 1)
    cd_df["net_flow_cd"] = cd_df.get("CREDIT", 0) - cd_df.get("DEBIT", 0)
    cd_df = cd_df.reset_index()

    # --- Monthly stability features (based on signed transactions) ---
    net_df = initial_df[["prism_consumer_id", "posted_date", "signed_amount"]].copy()
    net_df["month"] = net_df["posted_date"].dt.to_period("M")

    mn_df = (
        net_df.groupby(["prism_consumer_id", "month"])
        .agg(monthly_total=("signed_amount", "sum"), monthly_std=("signed_amount", "std"))
        .reset_index()
    )

    monthly_features = (
        mn_df.groupby("prism_consumer_id")
        .agg(
            monthly_net_total=("monthly_total", "sum"),
            monthly_net_avg=("monthly_total", "mean"),
            monthly_net_max=("monthly_total", "max"),
            monthly_net_min=("monthly_total", "min"),
            monthly_std_avg=("monthly_std", "mean"),
        )
        .reset_index()
    )
    monthly_features["net_range"] = monthly_features["monthly_net_max"] - monthly_features["monthly_net_min"]

    # A simpler monthly summary (matches your mtotal_df spirit)
    mtotal_df = (
        mn_df.groupby("prism_consumer_id")
        .agg(
            monthly_mean=("monthly_total", "mean"),
            monthly_max=("monthly_total", "max"),
            monthly_min=("monthly_total", "min"),
            month_count=("monthly_total", "count"),
        )
        .reset_index()
    )

    # --- Category sums pivot (signed) ---
    # Use category_name already attached to trxndf (we add it in earlier cells)
    cat_df = initial_df.groupby(["prism_consumer_id", "category_name"])["signed_amount"].sum().reset_index()
    cat_pivot_sum = cat_df.pivot(index="prism_consumer_id", columns="category_name", values="signed_amount").fillna(0)

    outflows = cat_pivot_sum.clip(upper=0).abs()
    inflows = cat_pivot_sum.clip(lower=0)

    cat_features = pd.DataFrame(index=cat_pivot_sum.index)
    cat_features["total_outflows"] = outflows.sum(axis=1)
    cat_features["total_inflows"] = inflows.sum(axis=1)
    cat_features["net_flow_cat"] = cat_pivot_sum.sum(axis=1)

    # Outflow ratios for each category (matches your loop)
    for col in outflows.columns:
        cat_features[f"{col}_outflow_ratio"] = outflows[col] / (cat_features["total_outflows"] + 1)

    # Key hand-picked ratios (as in your notebook)
    cat_features["paycheck_ratio"] = inflows.get("PAYCHECK", 0) / (cat_features["total_inflows"] + 1)
    cat_features["atm_cash_ratio"] = outflows.get("ATM_CASH", 0) / (cat_features["total_outflows"] + 1)
    cat_features["entertainment_ratio"] = outflows.get("ENTERTAINMENT", 0) / (cat_features["total_outflows"] + 1)
    cat_features["refund_ratio"] = inflows.get("REFUND", 0) / (cat_features["total_inflows"] + 1)

    cat_features = cat_features.reset_index().rename(columns={"index": "prism_consumer_id"})

    # --- Category transaction averages (trxnavg) ---
    # In your notebook you created cat_pivot with columns + "_trxnavg"
    cat_avg_df = (
        initial_df.groupby(["prism_consumer_id", "category_name"])["signed_amount"]
        .mean()
        .reset_index()
    )
    cat_pivot_avg = (
        cat_avg_df.pivot(index="prism_consumer_id", columns="category_name", values="signed_amount")
        .fillna(0)
    )
    cat_pivot_avg.columns = [f"{c}_trxnavg" for c in cat_pivot_avg.columns]
    cat_pivot_avg = cat_pivot_avg.reset_index()

    # Merge all w5 feature blocks (keyed by prism_consumer_id)
    w5 = (
        consdf[["prism_consumer_id"]]
        .merge(cd_df, on="prism_consumer_id", how="left")
        .merge(monthly_features, on="prism_consumer_id", how="left")
        .merge(mtotal_df, on="prism_consumer_id", how="left")
        .merge(cat_features, on="prism_consumer_id", how="left")
        .merge(cat_pivot_avg, on="prism_consumer_id", how="left")
    )

    # Prefix columns (except key) to avoid collisions with base notebook features
    keep = ["prism_consumer_id"]
    rename = {c: f"w5_{c}" for c in w5.columns if c not in keep}
    w5 = w5.rename(columns=rename)

    return w5.fillna(0)