In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## loading data

In [2]:
CONS_PATH = "/uss/hdsi-prismdata/q2-ucsd-consDF.pqt"
ACCT_PATH = "/uss/hdsi-prismdata/q2-ucsd-acctDF.pqt"
TRXN_PATH = "/uss/hdsi-prismdata/q2-ucsd-trxnDF.pqt"
CATMAP_PATH = "/uss/hdsi-prismdata/q2-ucsd-cat-map.csv"

In [87]:
# Load data
consdf = pd.read_parquet(CONS_PATH)
acctdf = pd.read_parquet(ACCT_PATH)
trxndf = pd.read_parquet(TRXN_PATH)
cat_map = pd.read_csv(CATMAP_PATH)

print("consdf:", consdf.shape)
print("acctdf:", acctdf.shape)
print("trxndf:", trxndf.shape)
print("cat_map:", cat_map.shape)

consdf: (15000, 4)
acctdf: (24466, 5)
trxndf: (6407321, 6)
cat_map: (50, 2)


In [88]:
consdf

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0
...,...,...,...,...
14995,14995,2022-03-08,655.0,
14996,14996,2022-01-15,625.0,
14997,14997,2022-01-31,688.0,
14998,14998,2022-03-08,722.0,


In [89]:
acctdf

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.90
...,...,...,...,...,...
24461,11500,24461,CHECKING,2022-03-27,732.75
24462,11615,24462,SAVINGS,2022-03-30,5.00
24463,11615,24463,CHECKING,2022-03-30,1956.46
24464,12210,24464,CHECKING,2022-03-28,2701.51


In [90]:
trxndf

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16
...,...,...,...,...,...,...
6407316,10533,6405304,31,4.96,DEBIT,2022-03-11
6407317,10533,6405305,12,63.48,DEBIT,2022-03-30
6407318,10533,6405306,12,53.99,DEBIT,2022-03-30
6407319,10533,6405307,12,175.98,DEBIT,2022-03-31


## data cleaning/prepping

In [92]:
consdf = consdf.copy()
consdf["evaluation_date"] = pd.to_datetime(consdf["evaluation_date"], errors="coerce")

# drop missing DQ_TARGET
consdf = consdf[consdf["DQ_TARGET"].notna()].copy()
consdf["DQ_TARGET"] = consdf["DQ_TARGET"].astype(int)

acctdf = acctdf.copy()
acctdf["balance_date"] = pd.to_datetime(acctdf["balance_date"], errors="coerce")

trxndf = trxndf.copy()
trxndf["posted_date"] = pd.to_datetime(trxndf["posted_date"], errors="coerce")

# Deduplicate transactions (use this whenever you build transaction features)
trxndf = (
    trxndf.sort_values(["posted_date"])
      .drop_duplicates(subset=["prism_transaction_id"], keep="first")
)


## scoring exclusions

In [93]:
# Accounts: how many accounts + how many balance snapshots
acct_stats = (
    acctdf.groupby("prism_consumer_id")
    .agg(
        n_accounts=("prism_account_id", "nunique"),
        n_balance_days=("balance_date", "nunique"),
        first_balance=("balance_date", "min"),
        last_balance=("balance_date", "max"),
    )
    .reset_index()
)

# Transactions: count + span + credits
tx_stats = (
    trxndf.groupby("prism_consumer_id")
    .agg(
        n_txn=("prism_transaction_id", "count"),
        first_txn=("posted_date", "min"),
        last_txn=("posted_date", "max"),
    )
    .reset_index()
)
tx_stats["txn_span_days"] = (tx_stats["last_txn"] - tx_stats["first_txn"]).dt.days

credit_stats = (
    trxndf.assign(is_credit=(trxndf["credit_or_debit"] == "CREDIT").astype(int))
    .groupby("prism_consumer_id")
    .agg(n_credit=("is_credit", "sum"))
    .reset_index()
)

# Combine into one scoring table (one row per consumer)
scoring = (
    consdf[["prism_consumer_id", "evaluation_date", "DQ_TARGET", "credit_score"]]
    .merge(acct_stats, on="prism_consumer_id", how="left")
    .merge(tx_stats, on="prism_consumer_id", how="left")
    .merge(credit_stats, on="prism_consumer_id", how="left")
)

# Fill missing stats with 0 where appropriate
for col in ["n_accounts", "n_balance_days", "n_txn", "txn_span_days", "n_credit"]:
    if col in scoring.columns:
        scoring[col] = scoring[col].fillna(0)


In [94]:
scoring.head()

Unnamed: 0,prism_consumer_id,evaluation_date,DQ_TARGET,credit_score,n_accounts,n_balance_days,first_balance,last_balance,n_txn,first_txn,last_txn,txn_span_days,n_credit
0,0,2021-09-01,0,726.0,2.0,1.0,2021-08-31,2021-08-31,408.0,2021-03-16,2021-09-11,179.0,38.0
1,1,2021-07-01,0,626.0,2.0,1.0,2021-06-30,2021-06-30,314.0,2021-01-15,2021-07-14,180.0,71.0
2,2,2021-05-01,0,680.0,2.0,1.0,2021-04-30,2021-04-30,448.0,2020-11-04,2021-05-03,180.0,81.0
3,3,2021-03-01,0,734.0,2.0,1.0,2021-02-28,2021-02-28,271.0,2020-09-23,2021-03-22,180.0,51.0
4,4,2021-10-01,0,676.0,2.0,2.0,2020-12-31,2021-09-30,306.0,2020-07-19,2021-06-21,337.0,40.0


In [95]:
# consumers with no accounts
consumers_with_accounts = set(acctdf["prism_consumer_id"].unique())
all_consumers = set(consdf["prism_consumer_id"].unique())

no_account_ids = list(all_consumers - consumers_with_accounts)

print("Total consumers:", len(all_consumers))
print("Consumers with NO accounts:", len(no_account_ids))

Total consumers: 12000
Consumers with NO accounts: 1592


In [96]:
1592/12000

0.13266666666666665

In [97]:
# checking to see how many "no accounts" have transactions
trx_no_account = trxndf[trxndf["prism_consumer_id"].isin(no_account_ids)]

consumers_no_account_with_txn = trx_no_account["prism_consumer_id"].nunique()

print("\nConsumers with NO accounts but WITH transactions:", consumers_no_account_with_txn)


# 3️⃣ Total transaction rows for these consumers
print("Total transaction rows for these consumers:", trx_no_account.shape[0])


# 4️⃣ Show example consumer IDs
print("\nExample consumer IDs (no account but with transactions):")
print(list(trx_no_account["prism_consumer_id"].unique())[:5])


# 5️⃣ Show sample transaction rows
print("\nSample transaction rows:")
display(trx_no_account.head(10))


Consumers with NO accounts but WITH transactions: 1479
Total transaction rows for these consumers: 961436

Example consumer IDs (no account but with transactions):
['1706', '3327', '2042', '1970', '114']

Sample transaction rows:


Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
858236,1706,857660,0,2000.0,DEBIT,2020-11-01
895947,3327,895260,0,300.0,CREDIT,2020-11-01
895953,3327,895266,0,200.0,DEBIT,2020-11-02
895954,3327,895267,0,100.0,DEBIT,2020-11-06
728042,2042,727489,2,500.0,CREDIT,2020-11-12
858237,1706,857661,1,5000.0,DEBIT,2020-11-12
728043,2042,727490,4,25.0,CREDIT,2020-11-18
728044,2042,727491,2,500.0,CREDIT,2020-11-18
728045,2042,727492,4,475.0,CREDIT,2020-11-19
88530,1970,88466,4,0.05,CREDIT,2020-11-20


In [98]:
scoring["n_txn"].describe()


count    12000.000000
mean       427.969500
std        393.403508
min          0.000000
25%        155.000000
50%        337.000000
75%        594.250000
max       8478.000000
Name: n_txn, dtype: float64

In [99]:
scoring["txn_span_days"].describe()


count    12000.000000
mean       153.381167
std         78.184482
min          0.000000
25%         88.000000
50%        178.000000
75%        242.000000
max        801.000000
Name: txn_span_days, dtype: float64

even the low activity consumers have 88 days (2-3 months) of transaction history

In [100]:
scoring["n_credit"].describe()


count    12000.000000
mean        73.214583
std         76.819153
min          0.000000
25%         28.000000
50%         52.000000
75%         93.000000
max       1553.000000
Name: n_credit, dtype: float64

bottom 25% has 28 credit transactions

In [101]:
RULES = {
    "no_accounts_and_no_transactions": 
    (scoring["n_accounts"] < 1) & (scoring["n_txn"] < 1),

    # short behavioral history (less than 30 days)
    "short_txn_history": scoring["txn_span_days"] < 30,
}

# store reasons
for name, mask in RULES.items():
    scoring[name] = mask

# Exclusion flag
scoring["excluded"] = scoring[list(RULES.keys())].any(axis=1)

# Summary
print("Total consumers:", scoring.shape[0])
print("Excluded:", scoring["excluded"].sum())
print("Eligible:", (~scoring["excluded"]).sum())
print("Exclusion rate:", scoring["excluded"].mean())


Total consumers: 12000
Excluded: 898
Eligible: 11102
Exclusion rate: 0.07483333333333334


In [102]:
eligible_ids = scoring.loc[~scoring["excluded"], "prism_consumer_id"]

print("Eligible consumers:", len(eligible_ids))


Eligible consumers: 11102


In [103]:
consdf_eligible = consdf[
    consdf["prism_consumer_id"].isin(eligible_ids)
].copy()

acctdf_eligible = acctdf[
    acctdf["prism_consumer_id"].isin(eligible_ids)
].copy()

trxndf_eligible = trxndf[
    trxndf["prism_consumer_id"].isin(eligible_ids)
].copy()


In [104]:
print("Consumers in consdf_eligible:", consdf_eligible["prism_consumer_id"].nunique())
print("Consumers in acctdf_eligible:", acctdf_eligible["prism_consumer_id"].nunique())
print("Consumers in trxndf_eligible:", trxndf_eligible["prism_consumer_id"].nunique())


Consumers in consdf_eligible: 11102
Consumers in acctdf_eligible: 9659
Consumers in trxndf_eligible: 11102


In [105]:
# consdf = consdf_eligible
# acctdf = acctdf_eligible
# trxndf = trxndf_eligible

## feature engineering

In [106]:
initial_df = (
    acctdf
    .merge(consdf, on='prism_consumer_id', how='inner')
    .groupby(['prism_consumer_id'])
    .agg(
        balance=('balance', 'sum'),
        balance_date=('balance_date', 'max')
    )
    .reset_index()
).merge(trxndf,on='prism_consumer_id')

In [107]:
mapping = dict(zip(cat_map["category_id"], cat_map["category"]))
initial_df["category"] = initial_df["category"].replace(mapping)
monthly_summary=initial_df.copy()
monthly_summary['amount'] = np.where(initial_df['credit_or_debit'] == 'DEBIT', -initial_df['amount'],initial_df['amount'])
monthly_summary['posted_date'] = pd.to_datetime(monthly_summary['posted_date'])
monthly_summary = (
    monthly_summary
    .groupby(['prism_consumer_id', monthly_summary['posted_date'].dt.to_period('M')])
    .agg(
        starting_balance=('balance', 'first'),
        monthly_total=('balance', 'sum'),
        trxndf_count = ('balance', 'count')
    )
    .reset_index()
)
monthly_summary['posted_date'] = monthly_summary['posted_date'].dt.to_timestamp()

In [108]:
monthly_summary = monthly_summary.merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()


In [109]:
# ensure date type
monthly_summary["posted_date"] = pd.to_datetime(monthly_summary["posted_date"])

# sort properly
monthly_summary = monthly_summary.sort_values(["prism_consumer_id", "posted_date"])

# calculate running balance
monthly_summary["monthly_balance"] = (
    monthly_summary["starting_balance"]
    + monthly_summary.groupby("prism_consumer_id")["monthly_total"].cumsum()
)

In [110]:
del_df = monthly_summary[monthly_summary['DQ_TARGET'] == 1]
nondel_df = monthly_summary[monthly_summary['DQ_TARGET'] == 0]
ids_1 = del_df["prism_consumer_id"].dropna().unique()
ids_0 = del_df["prism_consumer_id"].dropna().unique()

In [111]:
mtotal_df = monthly_summary.groupby('prism_consumer_id').agg(
        DQ_TARGET = ('DQ_TARGET', 'first'),
        monthly_mean=('monthly_total', 'mean'),
        monthly_max=('monthly_total', 'max'),
        monthly_min=('monthly_total', 'min'),
        trxndf_count = ('trxndf_count','first'),
        month_count=('monthly_total', 'count')
    )

In [112]:
cd_df = initial_df[['prism_consumer_id','amount','credit_or_debit']].groupby(['prism_consumer_id','credit_or_debit']).sum().reset_index()


In [113]:
cd_df = (
    cd_df
    .pivot_table(
        index='prism_consumer_id',
        columns='credit_or_debit',
        values='amount',
        aggfunc='sum',
        fill_value=0
    )
    .assign(
        credit_debit_ratio=lambda x: x['CREDIT'] / (x['DEBIT'] + 1),
        net_flow=lambda x: x['CREDIT'] - x['DEBIT']
    )
)

In [114]:
cd_df = cd_df.reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()


In [115]:
net_df = initial_df[['prism_consumer_id','posted_date','category','credit_or_debit','amount']].copy()
net_df['amount'] = np.where(net_df['credit_or_debit'] == 'DEBIT', -net_df['amount'],net_df['amount'])
net_df['posted_date'] = pd.to_datetime(net_df['posted_date'])
net_df['month'] = net_df['posted_date'].dt.to_period('M')
mn_df = net_df.groupby(['prism_consumer_id','month']).agg(
        monthly_total=('amount', 'sum'),
        monthly_std =('amount','std')
    ).reset_index()


monthly features

In [116]:
monthly_features = mn_df.groupby(['prism_consumer_id']).agg(
    monthly_net_total=('monthly_total', 'sum'),
    monthly_net_avg=('monthly_total', 'mean'),
    monthly_net_max=('monthly_total', 'max'),
    monthly_net_min=('monthly_total', 'min'),
    monthly_std_avg=('monthly_std', 'mean')
).reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()
monthly_features['prism_consumer_id'] = monthly_features['prism_consumer_id'].astype(int)
mtotal_df = mtotal_df.reset_index()
mtotal_df['prism_consumer_id'] = mtotal_df['prism_consumer_id'].astype(int)
cd_df['prism_consumer_id'] = cd_df['prism_consumer_id'].astype(int)
monthly_features['net_range'] = monthly_features['monthly_net_max'] - monthly_features['monthly_net_min']

In [117]:
initial_df['amount'] = np.where(initial_df['credit_or_debit'] == 'DEBIT', -initial_df['amount'],initial_df['amount'])
cat_df = initial_df.groupby(['prism_consumer_id','category'])['amount'].sum().reset_index()

In [118]:
cat_pivot = (
    cat_df
    .pivot(
        index='prism_consumer_id',
        columns='category',
        values='amount'
    )
    .fillna(0)
)

In [119]:
outflows = cat_pivot.clip(upper=0).abs()
inflows  = cat_pivot.clip(lower=0)

cat_features = pd.DataFrame(index=cat_pivot.index)

cat_features['total_outflows'] = outflows.sum(axis=1)
cat_features['total_inflows']  = inflows.sum(axis=1)
cat_features['net_flow']       = cat_pivot.sum(axis=1)

In [120]:
for col in outflows.columns:
    cat_features[f'{col}_outflow_ratio'] = (
        outflows[col] / (cat_features['total_outflows'] + 1))

In [121]:
# Income reliance
cat_features['paycheck_ratio'] = (
    inflows.get('PAYCHECK', 0) / (cat_features['total_inflows'] + 1)
)

# Cash usage
cat_features['atm_cash_ratio'] = (
    outflows.get('ATM_CASH', 0) / (cat_features['total_outflows'] + 1)
)

# Entertainment vs essentials proxy
cat_features['entertainment_ratio'] = (
    outflows.get('ENTERTAINMENT', 0) / (cat_features['total_outflows'] + 1)
)

# Refund dependence
cat_features['refund_ratio'] = (
    inflows.get('REFUND', 0) / (cat_features['total_inflows'] + 1)
)

In [122]:
outflows = outflows.reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()


In [123]:
cat_features = cat_features.reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()


In [124]:
add_df = cat_features[['prism_consumer_id','refund_ratio','paycheck_ratio']].copy()
add_df['prism_consumer_id'] = add_df['prism_consumer_id'].astype(int)
outflows['prism_consumer_id'] = outflows['prism_consumer_id'].astype(int)
out_df = outflows.copy()

In [125]:
initial_df['amount'] = np.where(initial_df['credit_or_debit'] == 'DEBIT', -initial_df['amount'],initial_df['amount'])
cat_df = initial_df.groupby(['prism_consumer_id','category'])['amount'].mean().reset_index()

In [126]:
cat_pivot = (
    cat_df
    .pivot(
        index='prism_consumer_id',
        columns='category',
        values='amount'
    )
    .fillna(0)
)
cat_pivot.columns = cat_pivot.columns + "_trxnavg"
cat_pivot = cat_pivot.reset_index().merge(consdf[['prism_consumer_id','DQ_TARGET']],on='prism_consumer_id').dropna()
cat_pivot['prism_consumer_id'] = cat_pivot['prism_consumer_id'].astype(int)

income

In [127]:
mapping = dict(zip(cat_map["category_id"], cat_map["category"]))
trxndf["category"] = trxndf["category"].replace(mapping)

income_categories = [
    'PAYCHECK',
    'DEPOSIT',
    'UNEMPLOYMENT_BENEFITS',
    'OTHER_BENEFITS',
    'PENSION',
    'INVESTMENT_INCOME'
]

income_df = trxndf[
    trxndf['category'].isin(income_categories)
].copy()
income_df['prism_transaction_id'].duplicated().sum()
income_df['posted_date'] = pd.to_datetime(income_df['posted_date'])

In [128]:
income_time = (
    income_df
    .groupby('prism_consumer_id')
    .agg(
        first_income_date=('posted_date', 'min'),
        last_income_date=('posted_date', 'max')
    )
    .reset_index()
)

income_time['income_span_days'] = (
    income_time['last_income_date'] - income_time['first_income_date']
).dt.days

In [129]:
income_df = income_time[['prism_consumer_id','income_span_days']]
income_df['prism_consumer_id'] = income_time['prism_consumer_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_df['prism_consumer_id'] = income_time['prism_consumer_id'].astype(int)


preliminary testing

In [130]:
cat_pivot= cat_pivot.drop(columns='DQ_TARGET')


In [131]:
main_df= monthly_features.merge(mtotal_df,on='prism_consumer_id')
main_df['DQ_TARGET'] = main_df['DQ_TARGET_x']
main_df = main_df.drop(columns=['DQ_TARGET_x','DQ_TARGET_y'])
cd_df = cd_df.drop(columns=['net_flow','DQ_TARGET'])
main_df= main_df.merge(cd_df,on='prism_consumer_id')
main_df= main_df.merge(add_df,on='prism_consumer_id')
main_df= main_df.merge(out_df,on='prism_consumer_id')
main_df= main_df.merge(income_df,on='prism_consumer_id')
main_df= main_df.merge(cat_pivot,on='prism_consumer_id')
main_df

Unnamed: 0,prism_consumer_id,monthly_net_total,monthly_net_avg,monthly_net_max,monthly_net_min,monthly_std_avg,net_range,monthly_mean,monthly_max,monthly_min,...,REFUND_trxnavg,RENT_trxnavg,RISK_CATCH_ALL_trxnavg,RTO_LTO_trxnavg,SELF_TRANSFER_trxnavg,TAX_trxnavg,TIME_OR_STUFF_trxnavg,TRANSPORATION_trxnavg,TRAVEL_trxnavg,UNEMPLOYMENT_BENEFITS_trxnavg
0,0,-521.59,-74.512857,830.73,-2584.24,213.544425,3414.97,1.867299e+04,27231.45,8970.36,...,19.960000,0.000000,0.0,0.0,116.685652,867.840,0.000000,2.480000,54.375000,0.0
1,1,1805.43,257.918571,1109.02,-940.73,292.763392,2049.75,1.481371e+05,208052.46,102375.02,...,2.420000,0.000000,0.0,0.0,233.410256,1162.700,0.000000,25.900000,0.000000,0.0
2,10,-1190.04,-170.005714,431.40,-971.45,260.603079,1402.85,4.015226e+04,60169.52,19781.76,...,18.466000,103.000000,0.0,0.0,237.568750,0.000,0.000000,17.520000,0.000000,0.0
3,100,-4505.77,-750.961667,1276.72,-3332.81,832.186871,4609.53,5.399456e+04,63731.28,45142.99,...,1.468750,0.000000,0.0,0.0,547.296667,0.000,0.000000,0.000000,0.000000,0.0
4,1000,438.08,62.582857,2982.67,-2884.56,1223.790895,5867.23,2.871107e+03,3524.25,476.25,...,1.370000,0.000000,0.0,0.0,828.920370,0.000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9589,995,21842.68,3120.382857,10212.74,-3244.20,1016.422674,13456.94,2.053630e+06,3162590.31,670852.49,...,1.463333,1134.158000,0.0,0.0,1250.937000,626.432,0.000000,0.000000,0.000000,0.0
9590,996,26713.18,3816.168571,41464.50,-16811.41,2623.765971,58275.91,0.000000e+00,0.00,0.00,...,2.944444,0.000000,0.0,0.0,3532.145542,0.000,0.000000,12.000000,493.909091,0.0
9591,997,-14899.66,-2128.522857,206.99,-3741.45,745.079512,3948.44,4.787867e+06,6756531.35,1404823.35,...,14.420000,0.000000,0.0,0.0,940.464154,2516.000,20.149683,0.000000,0.000000,0.0
9592,998,5507.73,786.818571,3359.83,-1022.35,537.836676,4382.18,7.987620e+05,1116887.94,275774.80,...,9.546429,231.316667,0.0,0.0,759.527885,2991.340,0.000000,12.700000,248.140000,0.0


In [132]:
# columns I will need: credit/debit, amount, posted date, evaluation date, prism consumer id, DQ_TARGET
merged = pd.merge(consdf.dropna(), trxndf, on='prism_consumer_id', how='left')

In [133]:
merged = merged[merged['posted_date'] <= merged['evaluation_date']]
credit_only = merged[merged['credit_or_debit'] == 'CREDIT'].copy()
credit_only['posted_date'] = pd.to_datetime(credit_only['posted_date'])
credit_only['Year-Month'] = credit_only['posted_date'].dt.to_period('M')
debt_only = trxndf[trxndf['credit_or_debit']=='DEBIT']
monthly_inflow = credit_only.groupby(['prism_consumer_id', 'Year-Month'])['amount'].sum().reset_index(name='monthly_inflow')
consdf['Evaluation Month'] = consdf['evaluation_date'].dt.to_period('M')
with_eval_month = pd.merge(consdf, monthly_inflow, on='prism_consumer_id', how='left')

In [134]:
with_eval_month['months_diff'] = (
    (with_eval_month['Evaluation Month'].dt.year - with_eval_month['Year-Month'].dt.year) * 12 +
    (with_eval_month['Evaluation Month'].dt.month - with_eval_month['Year-Month'].dt.month)
)
last_year = with_eval_month[(with_eval_month['months_diff'] >= 1) & (with_eval_month['months_diff'] <= 12)]
sum_yearly_inflow = last_year.groupby('prism_consumer_id')['monthly_inflow'].sum().reset_index(name='avg_yearly_inflow')
year_std = last_year.groupby('prism_consumer_id')['monthly_inflow'].std().reset_index()
year_std.columns = ['prism_consumer_id', 'std_inflow']

In [135]:
# Trend: Is income increasing or decreasing?
def calculate_trend(group):
    if len(group) < 2:
        return 0
    months = group['months_diff'].values
    inflows = group['monthly_inflow'].values
    return np.polyfit(months, inflows, 1)[0]  # slope

trend = last_year.groupby('prism_consumer_id').apply(calculate_trend, include_groups=False).reset_index()
trend.columns = ['prism_consumer_id', 'trend']
num_transactions = last_year.groupby('prism_consumer_id').size().reset_index()
num_transactions.columns = ['prism_consumer_id', 'num_transactions']

In [136]:
debt_only = trxndf[trxndf['credit_or_debit'] == 'DEBIT'].copy()
debt_only['posted_date'] = pd.to_datetime(debt_only['posted_date'])
# debt_only['category'] = debt_only['category'].astype(int)

# debt_with_category = pd.merge(debt_only, cat_map, left_on='category', right_on='category_id', how='left')[['prism_consumer_id',\
#     'prism_transaction_id', 'amount', 'credit_or_debit', 'posted_date', 'category_id', 'category_y']]
debt_with_category = debt_only.rename(columns={'category_y':'category'})
groceries_only = debt_with_category[debt_with_category['category']=='GROCERIES']

debt_with_eval = pd.merge(groceries_only, consdf[['prism_consumer_id', 'evaluation_date']], on='prism_consumer_id', how='left')

# Filter for transactions in the 3 months before evaluation_date
debt_with_eval['months_before_eval'] = (
    (debt_with_eval['evaluation_date'].dt.year - debt_with_eval['posted_date'].dt.year) * 12 +
    (debt_with_eval['evaluation_date'].dt.month - debt_with_eval['posted_date'].dt.month)
)

debt_9m = debt_with_eval[(debt_with_eval['months_before_eval'] >= 0) & 
                          (debt_with_eval['months_before_eval'] < 9)]

# total spend of groceries per consumer over a 9 month window (last 9 months before eval date)
total_spend_groceries_9m = debt_9m.groupby('prism_consumer_id')['amount'].sum().reset_index()
total_spend_groceries_9m.columns = ['prism_consumer_id', 'sum_groceries_9m']

In [137]:
# total spend of dining per consumer over a month window (last month before eval date)
dining_only = debt_with_category[debt_with_category['category']=='FOOD_AND_BEVERAGES']

debt_with_eval_dining = pd.merge(dining_only, consdf[['prism_consumer_id', 'evaluation_date']], on='prism_consumer_id', how='left')

# Filter for transactions in the 6 months before evaluation_date
debt_with_eval_dining['months_before_eval'] = (
    (debt_with_eval_dining['evaluation_date'].dt.year - debt_with_eval_dining['posted_date'].dt.year) * 12 +
    (debt_with_eval_dining['evaluation_date'].dt.month - debt_with_eval_dining['posted_date'].dt.month)
)

debt_6m = debt_with_eval_dining[(debt_with_eval_dining['months_before_eval'] >= 0) & 
                          (debt_with_eval_dining['months_before_eval'] < 6)]

# total spend of groceries per consumer over a 6 month window (last 6 months before eval date)
total_spend_dining_6m = debt_6m.groupby('prism_consumer_id')['amount'].sum().reset_index()
total_spend_dining_6m.columns = ['prism_consumer_id', 'sum_dining_6m']

In [138]:
# merge evaluation date ONCE
tx = debt_with_category.merge(
    consdf[['prism_consumer_id', 'evaluation_date']],
    on='prism_consumer_id',
    how='left'
)

tx = tx[tx['credit_or_debit'] == 'DEBIT']
tx['amount'] = tx['amount'].abs()

# numerator
total_spend_gambling = tx[tx['category'] == 'GAMBLING'].groupby('prism_consumer_id')['amount'].sum()

# denominator
total_spend_all = tx.groupby('prism_consumer_id')['amount'].sum()

pct_spend_gambling = (total_spend_gambling / total_spend_all).fillna(0).reset_index(name='pct_spend_gambling')

In [139]:
essentials = ['RENT', 'MORTGAGE', 'BILLS_UTILITIES', 'ESSENTIAL_SERVICES', 'GROCERIES', 'AUTOMOTIVE', 'TRANSPORTATION', \
'HEALTHCARE_MEDICAL', 'INSURANCE', 'CHILD_DEPENDENTS', 'PETS', 'TAX', 'LOAN', 'AUTO_LOAN', 'DEBT', 'CREDIT_CARD_PAYMENT', \
'EDUCATION', 'LEGAL', 'GOVERNMENT_SERVICES']

total_spend_essentials = tx[tx['category'].isin(essentials)].groupby('prism_consumer_id')['amount'].sum()

pct_spend_essentials = (total_spend_essentials / total_spend_all).reset_index()

pct_spend_essentials = pct_spend_essentials.rename(columns={'amount':'pct_spend_essentials'})

In [140]:
# # change in groceries per consumer from the 3 most recent months to the prior 3-6 months before evaluation date
# lowers AUC from 0.721 to 0.71

# recent 3 months (0–2)
recent_3m = debt_with_eval[(debt_with_eval['months_before_eval'] >= 0) & (debt_with_eval['months_before_eval'] < 3)]

recent_spend = recent_3m.groupby('prism_consumer_id')['amount'].sum().reset_index(name='groceries_0_3m')

# prior 3 months (3–5)
prior_3m = debt_with_eval[(debt_with_eval['months_before_eval'] >= 3) & (debt_with_eval['months_before_eval'] < 6)]

prior_spend = prior_3m.groupby('prism_consumer_id')['amount'].sum().reset_index(name='groceries_3_6m')

# merge and compute delta
delta_groceries_3m = recent_spend.merge(
    prior_spend,
    on='prism_consumer_id',
    how='outer'
).fillna(0)

delta_groceries_3m['delta_groceries_3m'] = delta_groceries_3m['groceries_0_3m'] - delta_groceries_3m['groceries_3_6m']

delta_groceries_3m = delta_groceries_3m[['prism_consumer_id', 'delta_groceries_3m']]

utilities = ['BILLS_UTILITIES', 'ESSENTIAL_SERVICES']

total_spend_utilities = tx[tx['category'].isin(utilities)].groupby('prism_consumer_id')['amount'].sum()

pct_spend_utilities = (total_spend_utilities / total_spend_all).reset_index()

pct_spend_utilities = pct_spend_utilities.rename(columns={'amount':'pct_spend_utilities'})

In [141]:
# has overdraft - 6 months
# Merge evaluation dates with ALL debt transactions
debt_with_eval = pd.merge(
    debt_with_category, 
    consdf[['prism_consumer_id', 'evaluation_date']], 
    on='prism_consumer_id', 
    how='left'
)

# Calculate days before evaluation
debt_with_eval['days_before_eval'] = (
    debt_with_eval['evaluation_date'] - debt_with_eval['posted_date']
).dt.days

# Filter for OVERDRAFT category AND within 6 months
overdraft_6m = debt_with_eval[
    (debt_with_eval['category'] == 'OVERDRAFT') &
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with overdrafts
has_overdraft_6m = overdraft_6m.groupby('prism_consumer_id').size().reset_index(name='overdraft_count')
has_overdraft_6m['has_overdraft_6m'] = 1

has_overdraft_6m = has_overdraft_6m[['prism_consumer_id', 'has_overdraft_6m']]

In [142]:
# has account fees - 6 months
# Merge evaluation dates with ALL debt transactions
debt_with_eval = pd.merge(
    debt_with_category, 
    consdf[['prism_consumer_id', 'evaluation_date']], 
    on='prism_consumer_id', 
    how='left'
)

# Calculate days before evaluation
debt_with_eval['days_before_eval'] = (
    debt_with_eval['evaluation_date'] - debt_with_eval['posted_date']
).dt.days

# Filter for ACCOUNT FEES category AND within 6 months
acct_fees_6m = debt_with_eval[
    (debt_with_eval['category'] == 'ACCOUNT_FEES') &
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with acct fee
has_acct_fee_6m = acct_fees_6m.groupby('prism_consumer_id').size().reset_index(name='acct_fees_count')
has_acct_fee_6m['has_acct_fee_6m'] = 1

has_acct_fee_6m = has_acct_fee_6m[['prism_consumer_id', 'has_acct_fee_6m']]

In [143]:
#atm cash ratio per consumer

debt_with_eval = pd.merge(
    debt_with_category,
    consdf[['prism_consumer_id', 'evaluation_date']],
    on='prism_consumer_id',
    how='left'
)

debt_with_eval['posted_date'] = pd.to_datetime(debt_with_eval['posted_date'])
debt_with_eval['evaluation_date'] = pd.to_datetime(debt_with_eval['evaluation_date'])

debt_with_eval = debt_with_eval[
    debt_with_eval['posted_date'] <= debt_with_eval['evaluation_date']
]

total_debt_spend = debt_with_eval.groupby('prism_consumer_id')['amount'].sum().reset_index(name='total_debit_spend')

In [144]:
atm_cash_spend = (
    debt_with_eval[debt_with_eval['category'] == 'ATM_CASH']
    .groupby('prism_consumer_id')['amount']
    .sum()
    .reset_index(name='atm_cash_spend')
)

atm_cash_ratio = total_debt_spend.merge(atm_cash_spend, on='prism_consumer_id',how='left').fillna(0)
atm_cash_ratio['atm_cash_ratio'] = atm_cash_ratio['atm_cash_spend'] / atm_cash_ratio['total_debit_spend']
atm_cash_ratio['atm_cash_ratio'] = (
    atm_cash_ratio['atm_cash_ratio']
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)


In [145]:
# Merge evaluation dates with ALL debt transactions
debt_with_eval = pd.merge(
    debt_with_category, 
    consdf[['prism_consumer_id', 'evaluation_date']], 
    on='prism_consumer_id', 
    how='left'
)

# Calculate days before evaluation
debt_with_eval['days_before_eval'] = (
    debt_with_eval['evaluation_date'] - debt_with_eval['posted_date']
).dt.days

atm_cash_freq_6m = acct_fees_6m.groupby('prism_consumer_id').size().reset_index(name='atm_cash_freq_6m')

In [146]:
# refund ratio
credit_only = trxndf[trxndf['credit_or_debit']=='CREDIT']
# merged_credit = pd.merge(credit_only, cat_map, left_on='category', right_on='category_id', how='left')[['prism_consumer_id', 'prism_transaction_id', 'amount', \
# 'credit_or_debit', 'posted_date', 'category_id', 'category_y']]
merged_credit = credit_only.rename(columns={'category_y': 'category'})

credit_with_eval = pd.merge(
    merged_credit,
    consdf[['prism_consumer_id', 'evaluation_date']],
    on='prism_consumer_id',
    how='left'
)

credit_with_eval['posted_date'] = pd.to_datetime(credit_with_eval['posted_date'])
credit_with_eval['evaluation_date'] = pd.to_datetime(credit_with_eval['evaluation_date'])

credit_with_eval['days_before_eval'] = (credit_with_eval['evaluation_date'] - credit_with_eval['posted_date']).dt.days
window = credit_with_eval[(credit_with_eval['days_before_eval'] >= 0) & (credit_with_eval['days_before_eval'] <= 180)]

refund = window[window['category']=='REFUND'].groupby('prism_consumer_id')['amount'].sum().reset_index(name='refund_amount')

In [147]:
debit_only = trxndf[trxndf['credit_or_debit'] == 'DEBIT']
# merged_debit = pd.merge(
#     debit_only,
#     cat_map,
#     left_on='category',
#     right_on='category_id',
#     how='left'
# )[[
#     'prism_consumer_id',
#     'prism_transaction_id',
#     'amount',
#     'credit_or_debit',
#     'posted_date',
#     'category_id',
#     'category_y'
# ]]

merged_debit = debit_only.rename(columns={'category_y': 'category'})
debit_with_eval = pd.merge(
    merged_debit,
    consdf[['prism_consumer_id', 'evaluation_date']],
    on='prism_consumer_id',
    how='left'
)

debit_with_eval['posted_date'] = pd.to_datetime(debit_with_eval['posted_date'])
debit_with_eval['evaluation_date'] = pd.to_datetime(debit_with_eval['evaluation_date'])

debit_with_eval['days_before_eval'] = (
    debit_with_eval['evaluation_date'] - debit_with_eval['posted_date']
).dt.days

debit_window = debit_with_eval[
    (debit_with_eval['days_before_eval'] >= 0) &
    (debit_with_eval['days_before_eval'] <= 180)
]

debit_spend = debit_window[
    debit_window['category'] != 'REFUND'
]
denominator = (
    debit_spend
    .groupby('prism_consumer_id')['amount']
    .sum()
    .reset_index(name='total_debit_spend')
)


In [148]:
refund_ratio = denominator.merge(
    refund,
    on='prism_consumer_id',
    how='left'
).fillna(0)

refund_ratio['refund_ratio'] = (
    refund_ratio['refund_amount'] /
    refund_ratio['total_debit_spend']
)

refund_ratio['refund_ratio'] = (
    refund_ratio['refund_ratio']
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)
refund_ratio = refund_ratio[['prism_consumer_id', 'refund_ratio']]

In [149]:
# debt_payment_ratio
# (LOAN + CREDIT_CARD_PAYMENT + AUTO_LOAN + BNPL) / total_debit_spend
categories_of_interest = ['LOAN', 'CREDIT_CARD_PAYMENT', 'AUTO_LOAN', 'BNPL']

summary = (
    debit_with_eval
    .groupby('prism_consumer_id')
    .agg(
        total_debit_spend=('amount', 'sum'),
        debt_spend=('amount', lambda x: x[
            debit_with_eval.loc[x.index, 'category'].isin(categories_of_interest)
        ].sum())
    )
    .reset_index()
)

summary['debt_spend_ratio'] = summary['debt_spend'] / summary['total_debit_spend']

In [150]:
# bnpl usage flag
# Filter for BNPL category AND within 6 months
bnpl_usage_6m = debt_with_eval[
    (debt_with_eval['category'] == 'BNPL') &
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with acct fee
has_bnpl_usage_6m = bnpl_usage_6m.groupby('prism_consumer_id').size().reset_index(name='bnpl_usage_flag')
has_bnpl_usage_6m['bnpl_usage_flag'] = 1

has_bnpl_usage_6m = has_bnpl_usage_6m[['prism_consumer_id', 'bnpl_usage_flag']]

In [151]:
debt_categories = ['LOAN', 'CREDIT_CARD_PAYMENT', 'AUTO_LOAN', 'BNPL']

debt_category_count = (
    debit_with_eval[debit_with_eval['category'].isin(debt_categories)]
    .groupby(['prism_consumer_id', 'category'])['amount']
    .sum()
    .reset_index()
)

# keep only categories with non-zero spend
debt_category_count = debt_category_count[debt_category_count['amount'] != 0]

debt_category_count = (
    debt_category_count
    .groupby('prism_consumer_id')
    .size()
    .reset_index(name='debt_category_count')
)

In [152]:
# discretionary drop flag
discretionary_cat_map = ['ENTERTAINMENT', 'TRAVEL', 'FITNESS']
df = debit_with_eval.copy()
df['month'] = df['posted_date'].dt.to_period('M')
monthly_disc = df[df['category'].isin(discretionary_cat_map)].groupby(['prism_consumer_id', 'month'])['amount'].sum().reset_index()

In [153]:

monthly_disc = monthly_disc.sort_values(['prism_consumer_id', 'month'])
monthly_disc['disc_3m_spend'] = monthly_disc.groupby('prism_consumer_id')['amount'].rolling(3, min_periods=3).sum().reset_index(drop=True)
monthly_disc['prev_disc_3m_spend'] = (
    monthly_disc
    .groupby('prism_consumer_id')['disc_3m_spend']
    .shift(3)
)

In [154]:

DROP_THRESHOLD = 0.30

monthly_disc['discretionary_drop_flag_3m'] = (
    (monthly_disc['prev_disc_3m_spend'] > 0) &
    ((monthly_disc['prev_disc_3m_spend'] - monthly_disc['disc_3m_spend'])
     / monthly_disc['prev_disc_3m_spend'] >= DROP_THRESHOLD)
).astype(int)

discretionary_drop_flag_3m = (
    monthly_disc
    .dropna(subset=['discretionary_drop_flag_3m'])
    .groupby('prism_consumer_id')
    .tail(1)
    [['prism_consumer_id', 'discretionary_drop_flag_3m']]
)

In [155]:
# essential spend volatility in 6 months
# Filter for essentials AND within 6 months
essential_spend_volatility_6m = debt_with_eval[
    (debt_with_eval['category'].isin(essentials)) &
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with acct fee
essential_spend_volatility_6m = essential_spend_volatility_6m.groupby('prism_consumer_id')['amount'].std().reset_index(name='essential_spend_volatility_6m')

essential_spend_volatility_6m = essential_spend_volatility_6m[['prism_consumer_id', 'essential_spend_volatility_6m']]

In [156]:
# child dependents spend sum in 6 months
# Filter for child dependents AND within 6 months
child_dependents_6m = debt_with_eval[
    (debt_with_eval['category']=='CHILD_DEPENDENTS')&
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with child dependents
has_child_deps_6m = bnpl_usage_6m.groupby('prism_consumer_id').size().reset_index(name='child_dependents_6m')
has_child_deps_6m['child_dependents_6m'] = 1

In [157]:

# child dependents spend sum in 6 months
# Filter for essentials AND within 6 months
pets_6m = debt_with_eval[
    (debt_with_eval['category']=='PETS')&
    (debt_with_eval['days_before_eval'] >= 0) & 
    (debt_with_eval['days_before_eval'] <= 180)
]

# Group to get consumers with child dependents
has_pets_6m = pets_6m.groupby('prism_consumer_id').size().reset_index(name='pets_6m')
has_pets_6m['pets_6m'] = 1


## prepping model

In [158]:
df_eval = pd.merge(consdf, sum_yearly_inflow, on="prism_consumer_id", how="left")
df_eval = pd.merge(df_eval, year_std, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, trend, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, num_transactions, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, total_spend_groceries_9m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, total_spend_dining_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, pct_spend_gambling, on='prism_consumer_id',how='left')
df_eval = pd.merge(df_eval, pct_spend_essentials, on='prism_consumer_id',how='left')
df_eval = pd.merge(df_eval, delta_groceries_3m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, pct_spend_utilities, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_overdraft_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, atm_cash_ratio, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_acct_fee_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, atm_cash_freq_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, refund_ratio, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, summary, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_bnpl_usage_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, debt_category_count, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, discretionary_drop_flag_3m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, essential_spend_volatility_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_child_deps_6m, on='prism_consumer_id', how='left')
df_eval = pd.merge(df_eval, has_pets_6m, on='prism_consumer_id', how='left')
df_eval['has_overdraft_6m'] = df_eval['has_overdraft_6m'].fillna(0).astype(int)
df_eval['has_acct_fee_6m'] = df_eval['has_acct_fee_6m'].fillna(0).astype(int)
df_eval['atm_cash_freq_6m'] = df_eval['atm_cash_freq_6m'].fillna(0).astype(int)
df_eval['bnpl_usage_flag'] = df_eval['bnpl_usage_flag'].fillna(0).astype(int)
df_eval['debt_category_count'] = df_eval['debt_category_count'].fillna(0).astype(int)
df_eval['child_dependents_6m'] = df_eval['child_dependents_6m'].fillna(0).astype(int)
df_eval['pets_6m'] = df_eval['pets_6m'].fillna(0).astype(int)

In [159]:
df_eval['prism_consumer_id'] =df_eval['prism_consumer_id'].astype(int)
df_eval = main_df.merge(df_eval, on="prism_consumer_id", how="right")


In [163]:

period_cols = [col for col in df_eval.columns 
               if str(df_eval[col].dtype).startswith('period')]

datetime_cols = df_eval.select_dtypes(include=['datetime64[ns]', 'datetimetz']).columns

time_cols = list(datetime_cols) + period_cols
df_eval = df_eval.drop(columns=time_cols)

In [164]:
df_eval = df_eval.drop(columns=['DQ_TARGET_y','DQ_TARGET_x','credit_score'])


In [165]:
# --- dtype alignment for merge key ---
df_eval = df_eval.copy()
scoring_merge = scoring[["prism_consumer_id", "excluded"]].copy()

df_eval["prism_consumer_id"] = df_eval["prism_consumer_id"].astype(str)
scoring_merge["prism_consumer_id"] = scoring_merge["prism_consumer_id"].astype(str)

# --------------------------------------------
# Build master eval table with exclusion flag
# --------------------------------------------
df_eval_master = df_eval.merge(
    scoring[["prism_consumer_id", "excluded"]],
    on="prism_consumer_id",
    how="left"
)

# If a consumer didn't get a scoring row, treat them as excluded by default (conservative)
df_eval_master["excluded"] = df_eval_master["excluded"].fillna(True)

print("df_eval_master rows:", df_eval_master.shape[0])
print("Excluded rows:", df_eval_master["excluded"].sum())
print("Eligible rows:", (~df_eval_master["excluded"]).sum())


df_eval_master rows: 12000
Excluded rows: 898
Eligible rows: 11102


## model testing

In [166]:
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# ----------------------------
# helper: evaluate one model
# ----------------------------
def eval_model_full(name, model, X_train, y_train, X_test, y_test, threshold=0.5):
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    t1 = time.perf_counter()

    # train probs + test probs
    t2 = time.perf_counter()
    ytr_prob = model.predict_proba(X_train)[:, 1]
    yte_prob = model.predict_proba(X_test)[:, 1]
    t3 = time.perf_counter()

    train_auc = roc_auc_score(y_train, ytr_prob)
    test_auc  = roc_auc_score(y_test,  yte_prob)

    # classification report at threshold
    yte_pred = (yte_prob >= threshold).astype(int)

    print(f"\n{name}")
    print(f"  train_auc: {train_auc:.4f} | test_auc: {test_auc:.4f}")
    print(f"  train_time: {t1-t0:.4f}s | score_time: {t3-t2:.4f}s")
    print(classification_report(y_test, yte_pred, digits=4))

    return {
        "model": name,
        "train_auc": train_auc,
        "test_auc": test_auc,
        "train_time": t1 - t0,
        "score_time": t3 - t2
    }


# ----------------------------
# common split function
# ----------------------------
def make_split(df, seed=42, test_size=0.2):
    y = df["DQ_TARGET"].astype(int)
    X = df.drop(columns=["prism_consumer_id", "DQ_TARGET"], errors="ignore")
    return train_test_split(X, y, test_size=test_size, stratify=y, random_state=seed)


# ----------------------------
# define models (robust to NaNs)
# ----------------------------
logreg = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

rf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", RandomForestClassifier(
        n_estimators=400,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced_subsample",
        max_depth=None
    ))
])

xgb = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", XGBClassifier(
        n_estimators=600,
        max_depth=6,
        learning_rate=0.03,
        subsample=0.85,
        colsample_bytree=0.85,
        min_child_weight=3,
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1.0,
        eval_metric="auc",
        tree_method="hist",
        random_state=42
    ))
])

models = [
    ("Logistic Regression", logreg),
    ("Random Forest", rf),
    ("XGBoost", xgb),
]


# ----------------------------
# run comparison: before vs after
# ----------------------------
def run_suite(df, label, threshold=0.5):
    X_train, X_test, y_train, y_test = make_split(df, seed=42, test_size=0.2)
    results = []
    print(f"\n==================== {label} ====================")
    print("Rows:", df.shape[0], "| Pos rate:", df["DQ_TARGET"].mean())
    for name, model in models:
        results.append(eval_model_full(name, model, X_train, y_train, X_test, y_test, threshold=threshold))
    return pd.DataFrame(results)


# --------------------------------------------
# Build BEFORE/AFTER from df_eval_master
# --------------------------------------------
df_before = df_eval_master.drop(columns=["excluded"]).copy()
df_after  = df_eval_master.loc[~df_eval_master["excluded"]].drop(columns=["excluded"]).copy()

print("Rows before exclusions:", df_before.shape[0])
print("Rows after exclusions :", df_after.shape[0])

res_before = run_suite(df_before, "BEFORE EXCLUSIONS", threshold=0.5)
res_after  = run_suite(df_after,  "AFTER EXCLUSIONS",  threshold=0.5)

summary = res_before.merge(res_after, on="model", suffixes=("_before", "_after"))
summary["delta_test_auc"]  = summary["test_auc_after"]  - summary["test_auc_before"]
summary["delta_train_auc"] = summary["train_auc_after"] - summary["train_auc_before"]

print("\n\n======== AUC SUMMARY (Before vs After) ========")
display(summary.sort_values("delta_test_auc", ascending=False))


Rows before exclusions: 12000
Rows after exclusions : 11102

Rows: 12000 | Pos rate: 0.08383333333333333

Logistic Regression
  train_auc: 0.7699 | test_auc: 0.7126
  train_time: 22.8881s | score_time: 0.2932s
              precision    recall  f1-score   support

           0     0.9523    0.6803    0.7936      2199
           1     0.1520    0.6269    0.2447       201

    accuracy                         0.6758      2400
   macro avg     0.5521    0.6536    0.5191      2400
weighted avg     0.8852    0.6758    0.7477      2400


Random Forest
  train_auc: 0.9981 | test_auc: 0.7419
  train_time: 3.4935s | score_time: 0.3120s
              precision    recall  f1-score   support

           0     0.9188    0.9627    0.9403      2199
           1     0.1458    0.0697    0.0943       201

    accuracy                         0.8879      2400
   macro avg     0.5323    0.5162    0.5173      2400
weighted avg     0.8541    0.8879    0.8694      2400


XGBoost
  train_auc: 0.9928 | test_au

Unnamed: 0,model,train_auc_before,test_auc_before,train_time_before,score_time_before,train_auc_after,test_auc_after,train_time_after,score_time_after,delta_test_auc,delta_train_auc
0,Logistic Regression,0.769939,0.712574,22.888135,0.29315,0.780743,0.764935,16.68454,0.289651,0.052361,0.010804
1,Random Forest,0.998068,0.741935,3.493538,0.312002,1.0,0.786926,3.318265,0.298348,0.044991,0.001932
2,XGBoost,0.992816,0.75628,2.80095,0.05014,0.998097,0.794529,2.837834,0.047273,0.038249,0.005281


## feature selection

In [169]:
# Target
y = df_eval["DQ_TARGET"].astype(int)

# Features
X = df_eval.drop(columns=["prism_consumer_id", "DQ_TARGET"], errors="ignore")
X = X.fillna(0)

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [170]:
selector_model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

selector_model.fit(X_train, y_train)

# Get feature importances
importances = pd.Series(
    selector_model.feature_importances_,
    index=X_train.columns
)

# Select top 50
top_50 = importances.sort_values(ascending=False).head(50).index.tolist()

print("Top 10 Features:")
print(importances.sort_values(ascending=False).head(10))


Top 10 Features:
month_count            0.081337
num_transactions       0.064329
monthly_max            0.015243
CREDIT_CARD_PAYMENT    0.013678
debt_category_count    0.013290
monthly_min            0.012292
monthly_mean           0.012073
INSURANCE              0.010893
has_overdraft_6m       0.010556
refund_ratio_x         0.010434
dtype: float32


In [171]:
X_train_50 = X_train[top_50]
X_test_50  = X_test[top_50]


In [172]:
def eval_model(name, model, Xtr, ytr, Xte, yte):
    t0 = time.perf_counter()
    model.fit(Xtr, ytr)
    t1 = time.perf_counter()

    ytr_prob = model.predict_proba(Xtr)[:, 1]
    train_auc = roc_auc_score(ytr, ytr_prob)

    t2 = time.perf_counter()
    yte_prob = model.predict_proba(Xte)[:, 1]
    t3 = time.perf_counter()

    test_auc = roc_auc_score(yte, yte_prob)

    y_pred = model.predict(Xte)

    print(f"\n{name}")
    print(f"  Train AUC: {train_auc:.4f}")
    print(f"  Test  AUC: {test_auc:.4f}")
    print(f"  Overfit gap: {train_auc - test_auc:.4f}")
    print(f"  Train time: {t1 - t0:.4f}s")
    print(f"  Score time: {t3 - t2:.4f}s")
    print("\nClassification Report:")
    print(classification_report(yte, y_pred))

    return {
        "model": name,
        "train_auc": train_auc,
        "test_auc": test_auc,
        "gap": train_auc - test_auc
    }


In [173]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_50)
X_test_scaled  = scaler.transform(X_test_50)

logreg = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    random_state=42
)

res_log = eval_model(
    "Logistic Regression (Top 50)",
    logreg,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test
)



Logistic Regression (Top 50)
  Train AUC: 0.7464
  Test  AUC: 0.7016
  Overfit gap: 0.0448
  Train time: 7.6128s
  Score time: 0.0003s

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.63      0.76      2199
           1       0.14      0.65      0.23       201

    accuracy                           0.63      2400
   macro avg       0.55      0.64      0.50      2400
weighted avg       0.88      0.63      0.72      2400



In [174]:
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,              # limit depth to prevent overfit
    min_samples_leaf=5,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

res_rf = eval_model(
    "Random Forest (Top 50)",
    rf,
    X_train_50,
    y_train,
    X_test_50,
    y_test
)



Random Forest (Top 50)
  Train AUC: 0.9233
  Test  AUC: 0.7501
  Overfit gap: 0.1732
  Train time: 1.5503s
  Score time: 0.1326s

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.82      0.88      2199
           1       0.19      0.46      0.27       201

    accuracy                           0.79      2400
   macro avg       0.57      0.64      0.57      2400
weighted avg       0.88      0.79      0.83      2400



In [175]:
xgb = XGBClassifier(
    n_estimators=600,
    max_depth=4,              # shallower to reduce overfit
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0.2,
    reg_alpha=0.3,
    reg_lambda=1.5,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

res_xgb = eval_model(
    "XGBoost (Top 50)",
    xgb,
    X_train_50,
    y_train,
    X_test_50,
    y_test
)



XGBoost (Top 50)
  Train AUC: 0.9425
  Test  AUC: 0.7500
  Overfit gap: 0.1925
  Train time: 0.9087s
  Score time: 0.0080s

Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2199
           1       0.55      0.05      0.10       201

    accuracy                           0.92      2400
   macro avg       0.74      0.53      0.53      2400
weighted avg       0.89      0.92      0.88      2400



In [176]:
results_df = pd.DataFrame([res_log, res_rf, res_xgb])
results_df.sort_values("test_auc", ascending=False)


Unnamed: 0,model,train_auc,test_auc,gap
1,Random Forest (Top 50),0.92328,0.750083,0.173197
2,XGBoost (Top 50),0.94246,0.750004,0.192456
0,Logistic Regression (Top 50),0.746394,0.701633,0.044761


### summed

In [178]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# ----------------------------
# split helper
# ----------------------------
def make_split(df, seed=42, test_size=0.2):
    y = df["DQ_TARGET"].astype(int)
    X = df.drop(columns=["prism_consumer_id", "DQ_TARGET"], errors="ignore")
    return train_test_split(X, y, test_size=test_size, stratify=y, random_state=seed)

# ----------------------------
# feature selection helper (fit on TRAIN only)
# ----------------------------
def select_top_k_l1(X_train, y_train, X_test, k=50, C=0.2):
    """
    Fits an L1-logistic model on imputed+scaled TRAIN data to pick top-k features by abs coef.
    Returns reduced X_train, X_test, and the selected feature names.
    """
    # 1) impute + scale for selection model
    prep = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False))
    ])
    Xtr_p = prep.fit_transform(X_train)
    Xte_p = prep.transform(X_test)

    # 2) L1 logistic for sparse selection
    sel_model = LogisticRegression(
        penalty="l1",
        solver="liblinear",
        class_weight="balanced",
        C=C,
        max_iter=3000
    )
    sel_model.fit(Xtr_p, y_train)

    # 3) rank features by abs coefficient
    coefs = np.abs(sel_model.coef_).ravel()
    feat_names = np.array(X_train.columns)

    # handle all-zero coef case
    if np.all(coefs == 0):
        # fallback: keep first k columns (better than crashing)
        idx = np.arange(min(k, len(feat_names)))
    else:
        idx = np.argsort(coefs)[::-1][:min(k, len(feat_names))]

    selected = feat_names[idx].tolist()

    # 4) return raw (unscaled) columns subset — your model pipelines still handle impute/scale
    return X_train[selected].copy(), X_test[selected].copy(), selected

# ----------------------------
# eval helper
# ----------------------------
def eval_model_full(name, model, X_train, y_train, X_test, y_test, threshold=0.5):
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    t1 = time.perf_counter()

    t2 = time.perf_counter()
    ytr_prob = model.predict_proba(X_train)[:, 1]
    yte_prob = model.predict_proba(X_test)[:, 1]
    t3 = time.perf_counter()

    train_auc = roc_auc_score(y_train, ytr_prob)
    test_auc  = roc_auc_score(y_test,  yte_prob)

    yte_pred = (yte_prob >= threshold).astype(int)

    print(f"\n{name}")
    print(f"  train_auc: {train_auc:.4f} | test_auc: {test_auc:.4f}")
    print(f"  train_time: {t1-t0:.4f}s | score_time: {t3-t2:.4f}s")
    print(classification_report(y_test, yte_pred, digits=4))

    return {
        "model": name,
        "train_auc": train_auc,
        "test_auc": test_auc,
        "train_time": t1 - t0,
        "score_time": t3 - t2
    }

# ----------------------------
# run one experiment (optionally with feature selection)
# ----------------------------
def run_suite(df, label, models, threshold=0.5, use_fs=False, k=50, fs_C=0.2, seed=42):
    X_train, X_test, y_train, y_test = make_split(df, seed=seed, test_size=0.2)

    selected = None
    if use_fs:
        X_train, X_test, selected = select_top_k_l1(X_train, y_train, X_test, k=k, C=fs_C)
        print(f"\n[{label}] Selected {len(selected)} features (top-{k})")
        # uncomment if you want to see them:
        # print(selected)

    results = []
    print(f"\n==================== {label} ====================")
    print("Rows:", df.shape[0], "| Pos rate:", df["DQ_TARGET"].mean(), "| Features:", X_train.shape[1])

    for name, model in models:
        results.append(eval_model_full(name, model, X_train, y_train, X_test, y_test, threshold=threshold))

    out = pd.DataFrame(results)
    out["label"] = label
    out["use_fs"] = use_fs
    out["k"] = (len(selected) if selected is not None else np.nan)
    return out

# ----------------------------
# build BEFORE/AFTER datasets from df_eval_master
# ----------------------------
df_before = df_eval_master.drop(columns=["excluded"]).copy()
df_after  = df_eval_master.loc[~df_eval_master["excluded"]].drop(columns=["excluded"]).copy()

# ----------------------------
# run 4 experiments
# ----------------------------
res_A = run_suite(df_before, "BEFORE exclusions | ALL features", models, use_fs=False)
res_B = run_suite(df_after,  "AFTER  exclusions | ALL features", models, use_fs=False)

res_C = run_suite(df_before, "BEFORE exclusions | TOP-50 features", models, use_fs=True, k=50, fs_C=0.2)
res_D = run_suite(df_after,  "AFTER  exclusions | TOP-50 features", models, use_fs=True, k=50, fs_C=0.2)

results_4way = pd.concat([res_A, res_B, res_C, res_D], ignore_index=True)

# ----------------------------
# summary table
# ----------------------------
summary = results_4way.pivot_table(
    index="model",
    columns="label",
    values=["test_auc", "train_auc"],
    aggfunc="first"
)

display(summary)



Rows: 12000 | Pos rate: 0.08383333333333333 | Features: 137

Logistic Regression
  train_auc: 0.7699 | test_auc: 0.7126
  train_time: 21.5427s | score_time: 0.1886s
              precision    recall  f1-score   support

           0     0.9523    0.6803    0.7936      2199
           1     0.1520    0.6269    0.2447       201

    accuracy                         0.6758      2400
   macro avg     0.5521    0.6536    0.5191      2400
weighted avg     0.8852    0.6758    0.7477      2400


Random Forest
  train_auc: 0.9981 | test_auc: 0.7419
  train_time: 3.4268s | score_time: 0.2873s
              precision    recall  f1-score   support

           0     0.9188    0.9627    0.9403      2199
           1     0.1458    0.0697    0.0943       201

    accuracy                         0.8879      2400
   macro avg     0.5323    0.5162    0.5173      2400
weighted avg     0.8541    0.8879    0.8694      2400


XGBoost
  train_auc: 0.9928 | test_auc: 0.7563
  train_time: 2.8713s | score_time

Unnamed: 0_level_0,test_auc,test_auc,test_auc,test_auc,train_auc,train_auc,train_auc,train_auc
label,AFTER exclusions | ALL features,AFTER exclusions | TOP-50 features,BEFORE exclusions | ALL features,BEFORE exclusions | TOP-50 features,AFTER exclusions | ALL features,AFTER exclusions | TOP-50 features,BEFORE exclusions | ALL features,BEFORE exclusions | TOP-50 features
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Logistic Regression,0.764935,0.761311,0.712574,0.707737,0.780743,0.770246,0.769939,0.75955
Random Forest,0.786926,0.775646,0.741935,0.732174,1.0,1.0,0.998068,0.998258
XGBoost,0.794529,0.76694,0.75628,0.752911,0.998097,0.993808,0.992816,0.986078


In [177]:
# --- Split data ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Forward Selection ---
selected_features = []
remaining_features = list(X.columns)
target_feature_count = 50

for i in range(target_feature_count):
    best_auc = 0
    best_feat = None
    
    for feat in remaining_features:
        current_feats = selected_features + [feat]
        
        model = XGBClassifier(
            n_estimators=200,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="auc",
            random_state=42
        )
        model.fit(X_train[current_feats], y_train)
        
        y_probs = model.predict_proba(X_test[current_feats])[:, 1]
        auc = roc_auc_score(y_test, y_probs)
        
        if auc > best_auc:
            best_auc = auc
            best_feat = feat
    
    if best_feat is None:
        print("No improvement; stopping early.")
        break
    
    selected_features.append(best_feat)
    remaining_features.remove(best_feat)
    print(f"Step {i+1}: Added feature '{best_feat}' with AUC={best_auc:.4f}")

# --- Final Selected Features ---
print("Top 50 Selected Features:", selected_features)

# --- Train Final XGBoost Model ---
final_model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    use_label_encoder=False,
    random_state=42
)
final_model.fit(X_train[selected_features], y_train)

y_probs = final_model.predict_proba(X_test[selected_features])[:, 1]
final_auc = roc_auc_score(y_test, y_probs)
print("Final Test ROC-AUC:", final_auc)

Step 1: Added feature 'month_count' with AUC=0.7001


KeyboardInterrupt: 

## comparison to: no scoring exclusion