In [1]:
import numpy as np
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
import pickle
import catboost
from functools import reduce

pd.options.display.float_format = '{:,.4f}'.format
from sklearn.preprocessing import StandardScaler

In [2]:
# Load trained CatBoost model
with open("catboost_500_80.pkl", "rb") as f:
    model = pickle.load(f)

In [3]:
acctDF = pd.read_parquet('../../data/q2-ucsd-acctDF.pqt')
acctDF['balance_date'] = pd.to_datetime(acctDF['balance_date'])

consDF_all = pd.read_parquet('../../data/q2-ucsd-consDF.pqt')

consDF = consDF_all.dropna()
trxnDF = pd.read_parquet('../../data/q2-ucsd-trxnDF.pqt')
trxnDF['posted_date'] = pd.to_datetime(trxnDF['posted_date'])

cat_map = pd.read_csv('../../data/q2-ucsd-cat-map.csv')
cat_map = dict(zip(cat_map['category_id'], cat_map['category']))

trxnDF.category = trxnDF.category.replace(cat_map)

In [4]:
def calc_balances_all(acctDF, trxnDF):

    # grabbing checking account
    check_acct_totals = acctDF[acctDF.account_type == 'CHECKING'].groupby(['prism_consumer_id', 'balance_date']).sum()
    check_acct_totals = check_acct_totals.reset_index()
    check_acct_totals = check_acct_totals.drop(axis=1,labels='prism_account_id')
    
    # Merge transactions with account balances
    merged = trxnDF.merge(
        check_acct_totals[['prism_consumer_id', 'balance_date', 'balance']], 
        on='prism_consumer_id', how='left'
    )

    # Identify pre and post transactions
    merged['is_pre'] = merged['posted_date'] <= merged['balance_date']

    # Set adjustment values based on pre/post period
    merged['adjustment'] = 0.0
    merged.loc[merged['is_pre'] & (merged['credit_or_debit'] == 'CREDIT'), 'adjustment'] = -merged['amount']
    merged.loc[merged['is_pre'] & (merged['credit_or_debit'] == 'DEBIT'), 'adjustment'] = merged['amount']
    merged.loc[~merged['is_pre'] & (merged['credit_or_debit'] == 'CREDIT'), 'adjustment'] = merged['amount']
    merged.loc[~merged['is_pre'] & (merged['credit_or_debit'] == 'DEBIT'), 'adjustment'] = -merged['amount']

    # Pre-balance transactions: Sort descending and apply reverse cumsum
    pre_trans = merged[merged['is_pre']].sort_values(by=['prism_consumer_id', 'posted_date'], ascending=[True, False])
    pre_trans['curr_balance'] = pre_trans.groupby('prism_consumer_id')['adjustment'].cumsum() + pre_trans.groupby('prism_consumer_id')['balance'].transform('first')

    # Post-balance transactions: Sort ascending and apply forward cumsum
    post_trans = merged[~merged['is_pre']].sort_values(by=['prism_consumer_id', 'posted_date'], ascending=[True, True])
    post_trans['curr_balance'] = post_trans.groupby('prism_consumer_id')['adjustment'].cumsum() + post_trans.groupby('prism_consumer_id')['balance'].transform('first')

    # Combine results
    result = pd.concat([pre_trans, post_trans]).sort_values(by=['prism_consumer_id', 'posted_date'])

    # result['category'] = result.category.apply(lambda x: cat_dict[x])

    return result[['prism_consumer_id', 'prism_transaction_id', 'category', 'amount', 'credit_or_debit', 'posted_date', 'curr_balance']].sort_values(by='posted_date', ascending=False)

In [5]:
balanceDF = calc_balances_all(acctDF, trxnDF)

In [6]:
def filter_time_window(df, days=None, months=None, years=None):
    def filter_group(group):
        latest_date = group['posted_date'].max()  # Get latest transaction date per consumer
        cutoff_date = latest_date - pd.DateOffset(days=days or 0, months=months or 0, years=years or 0)
        return group[group['posted_date'] >= cutoff_date]  # Filter transactions

    return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)

def compute_balance_delta(balance_filtered, label):
    # Get last balance per consumer
    last_balance = balance_filtered.groupby('prism_consumer_id')['curr_balance'].last().reset_index()
    last_balance = last_balance.rename(columns={'curr_balance': 'curr_balance_last'})
    
    # Get earliest balance in the filtered dataset
    first_balance = balance_filtered.groupby('prism_consumer_id')['curr_balance'].first().reset_index()
    first_balance = first_balance.rename(columns={'curr_balance': 'curr_balance_first'})
    
    # Merge and compute balance delta
    df_merged = last_balance.merge(first_balance, on='prism_consumer_id', how='left')
    df_merged[f'balance_delta_{label}'] = df_merged['curr_balance_last'] - df_merged['curr_balance_first']
    
    # Keep only relevant columns
    return df_merged[['prism_consumer_id', f'balance_delta_{label}']]

def generate_category_features(trxnDF, cat_map, categories):
    """
    Generates transaction-based features for each selected category over multiple time windows.
    
    Parameters:
        trxnDF (pd.DataFrame): DataFrame containing transaction data.
        cat_map (pd.DataFrame): DataFrame mapping category IDs to category names.
        categories (str or list): One or more transaction categories to filter.

    Returns:
        pd.DataFrame: Aggregated features per prism_consumer_id.
    """
    if isinstance(categories, str):
        categories = [categories]

    # trxnDF = trxnDF.merge(cat_map, left_on='category', right_on='category_id', how='left')
    # trxnDF['category'] = trxnDF['category_y']
    # trxnDF = trxnDF.drop(columns=['category_id', 'category_y'])

    trxnDF['posted_date'] = pd.to_datetime(trxnDF['posted_date'])

    time_windows = {
        'overall': None,
        'last_14_days': 14,
        'last_30_days': 30,
        'last_3_months': 90,
        'last_6_months': 180,
        'last_year': 365
    }

    features_dict = {}

    for category in categories:
        # filter transactions for the current category
        filtered_trxn = trxnDF[trxnDF['category'] == category].copy()

        # get last posted date per consumer
        last_posted_dates = filtered_trxn.groupby('prism_consumer_id')['posted_date'].max()

        for window_name, days in time_windows.items():
            if days is None:
                df_time_filtered = filtered_trxn
            else:
                # consumer-specific time filters
                df_time_filtered = filtered_trxn.merge(last_posted_dates, on='prism_consumer_id', suffixes=('', '_latest'))
                df_time_filtered = df_time_filtered[df_time_filtered['posted_date'] >= (df_time_filtered['posted_date_latest'] - pd.Timedelta(days=days))]
                df_time_filtered = df_time_filtered.drop(columns=['posted_date_latest'])

            # aggregate features
            agg_features = df_time_filtered.groupby('prism_consumer_id')['amount'].agg(
                mean='mean',
                median='median',
                std='std',
                max='max',
                min='min',
                count='count'
            )

            # total transaction count per consumer in the time window
            total_trxn_counts = trxnDF.groupby('prism_consumer_id')['amount'].count()

            # compute percentage of transactions in this category for the time window
            percentage_trxn = (agg_features['count'] / total_trxn_counts).fillna(0)
            percentage_trxn = percentage_trxn.rename(f"{category}_{window_name}_percent")

            agg_features = agg_features.rename(columns=lambda x: f"{category}_{window_name}_{x}")

            features_dict[f"{category}_{window_name}"] = pd.concat([agg_features, percentage_trxn], axis=1)


    final_features = pd.concat(features_dict.values(), axis=1).fillna(0)
    return final_features

def compute_threshold_stats(df, thresholds, cat_label):
    # Count total gambling transactions
    counts = df.groupby('prism_consumer_id').size().reset_index(name=f'{cat_label}_count')

    # Check thresholds
    threshold_flags = df.groupby('prism_consumer_id')['amount'].agg(lambda x: [any(x >= t) for t in thresholds]).apply(pd.Series)
    threshold_flags.columns = [f'{cat_label}_over_{t}' for t in thresholds]
    threshold_flags = threshold_flags.astype(bool)  # Convert to True/False

    # Merge counts and flags
    result = counts.merge(threshold_flags, on='prism_consumer_id', how='left')

    return result



In [7]:
# creating relevant outflows df with only expenses
debits_not_expenses = ['SELF_TRANSFER', 'ATM_CASH']
credits_not_income = ['SELF_TRANSFER', 'LOAN', 'REFUND'] # maybe TAX

outflows_agg_df = trxnDF[(trxnDF.credit_or_debit == 'DEBIT') & (~trxnDF['category'].isin(debits_not_expenses))] \
                        .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
outflows_agg_df.columns = ['outflows_amt_' + col for col in outflows_agg_df.columns]

inflows_agg_df = trxnDF[(trxnDF.credit_or_debit == 'CREDIT') & (~trxnDF['category'].isin(credits_not_income))] \
                        .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
inflows_agg_df.columns = ['inflows_amt_' + col for col in inflows_agg_df.columns]

In [8]:
%%time

balance_ftrs = balanceDF.groupby('prism_consumer_id')['curr_balance'].agg(['mean', 'std', 'median', 'min', 'max'])
balance_ftrs.columns = ['balance_' + x for x in balance_ftrs.columns]
balance_last_14_days = filter_time_window(balanceDF, days=14)
balance_last_30_days = filter_time_window(balanceDF, days=30)
balance_last_3_months = filter_time_window(balanceDF, months=1)
balance_last_6_months = filter_time_window(balanceDF, months=6)
balance_last_year = filter_time_window(balanceDF, years=1)

balance_last_14_days_metrics = balance_last_14_days.groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
balance_last_14_days_metrics.columns = ['balance_last_14_days_' + x for x in balance_last_14_days_metrics.columns]

balance_last_30_days_metrics = balance_last_30_days.groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
balance_last_30_days_metrics.columns = ['balance_last_30_days_' + x for x in balance_last_30_days_metrics.columns]

balance_last_3_months_metrics = balance_last_3_months.groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
balance_last_3_months_metrics.columns = ['balance_last_3_months_' + x for x in balance_last_3_months_metrics.columns]

balance_last_6_months_metrics = balance_last_6_months.groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
balance_last_6_months_metrics.columns = ['balance_last_6_months_' + x for x in balance_last_6_months_metrics.columns]

balance_last_year_metrics = balance_last_year.groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
balance_last_year_metrics.columns = ['balance_last_1_year_' + x for x in balance_last_year_metrics.columns]

balance_dfs = [consDF[['prism_consumer_id']], balance_ftrs, balance_last_14_days_metrics, balance_last_30_days_metrics, balance_last_3_months_metrics, balance_last_6_months_metrics, balance_last_year_metrics]
balance_ftrs = reduce(lambda left, right: pd.merge(left, right, on='prism_consumer_id', how='left'), balance_dfs)

  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)


CPU times: user 46 s, sys: 1.95 s, total: 47.9 s
Wall time: 48 s


In [9]:
%%time

# balance_deltas
balance_delta_overall = compute_balance_delta(balanceDF, 'overall')
balance_delta_14d = compute_balance_delta(balance_last_14_days, '14d')
balance_delta_30d = compute_balance_delta(balance_last_30_days, '30d')
balance_delta_3m = compute_balance_delta(balance_last_3_months, '3m')
balance_delta_6m = compute_balance_delta(balance_last_6_months, '6m')
balance_delta_1y = compute_balance_delta(balance_last_year, '1y')


balance_deltas_dfs = [consDF[['prism_consumer_id']], balance_delta_overall, balance_delta_14d, balance_delta_30d, balance_delta_3m, balance_delta_6m, balance_delta_1y]
balance_deltas_ftrs = reduce(lambda left, right: pd.merge(left, right, on='prism_consumer_id', how='left'), balance_deltas_dfs)

CPU times: user 1.81 s, sys: 279 ms, total: 2.08 s
Wall time: 2.08 s


In [10]:
trxn_df_last_14_days = filter_time_window(trxnDF, days=14)
trxn_df_last_30_days = filter_time_window(trxnDF, days=30)
trxn_df_last_3_months = filter_time_window(trxnDF, months=3)
trxn_df_last_6_months = filter_time_window(trxnDF, months=6)
trxn_df_last_year = filter_time_window(trxnDF, years=1)

debits_df = trxnDF[trxnDF.credit_or_debit == 'DEBIT']
credits_df = trxnDF[trxnDF.credit_or_debit == 'CREDIT']

debits_df_last_14_days = trxn_df_last_14_days[trxn_df_last_14_days.credit_or_debit == 'DEBIT']
debits_df_last_30_days = trxn_df_last_30_days[trxn_df_last_30_days.credit_or_debit == 'DEBIT']
debits_df_last_3_months = trxn_df_last_3_months[trxn_df_last_3_months.credit_or_debit == 'DEBIT']
debits_df_last_6_months = trxn_df_last_6_months[trxn_df_last_6_months.credit_or_debit == 'DEBIT']
debits_df_last_year = trxn_df_last_year[trxn_df_last_year.credit_or_debit == 'DEBIT']

credits_df_last_14_days = trxn_df_last_14_days[trxn_df_last_14_days.credit_or_debit == 'CREDIT']
credits_df_last_30_days = trxn_df_last_30_days[trxn_df_last_30_days.credit_or_debit == 'CREDIT']
credits_df_last_3_months = trxn_df_last_3_months[trxn_df_last_3_months.credit_or_debit == 'CREDIT']
credits_df_last_6_months = trxn_df_last_6_months[trxn_df_last_6_months.credit_or_debit == 'CREDIT']
credits_df_last_year = trxn_df_last_year[trxn_df_last_year.credit_or_debit == 'CREDIT']

  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)


In [11]:
%%time

# creating windowed expenses aggregate metrics
outflows_ftrs = debits_df[~debits_df['category'].isin(debits_not_expenses)] \
                        .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
outflows_ftrs.columns = ['outflows_amt_' + col for col in outflows_ftrs.columns]

outflows_last_14_days_agg_df = debits_df_last_14_days[~debits_df_last_14_days['category'].isin(debits_not_expenses)] \
                        .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
outflows_last_14_days_agg_df.columns = ['outflows_amt_last_14_days_' + col for col in outflows_last_14_days_agg_df.columns]

outflows_last_30_days_agg_df = debits_df_last_30_days[~debits_df_last_30_days['category'].isin(debits_not_expenses)] \
                        .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
outflows_last_30_days_agg_df.columns = ['outflows_amt_last_30_days_' + col for col in outflows_last_30_days_agg_df.columns]

outflows_last_3_months_agg_df = debits_df_last_3_months[~debits_df_last_3_months['category'].isin(debits_not_expenses)] \
                        .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
outflows_last_3_months_agg_df.columns = ['outflows_amt_last_3_months_' + col for col in outflows_last_3_months_agg_df.columns]

outflows_last_6_months_agg_df = debits_df_last_6_months[~debits_df_last_6_months['category'].isin(debits_not_expenses)] \
                        .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
outflows_last_6_months_agg_df.columns = ['outflows_amt_last_6_months_' + col for col in outflows_last_6_months_agg_df.columns]

outflows_last_year_agg_df = debits_df_last_year[~debits_df_last_year['category'].isin(debits_not_expenses)] \
                        .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
outflows_last_year_agg_df.columns = ['outflows_amt_last_year_' + col for col in outflows_last_year_agg_df.columns]

outflows_df = [consDF[['prism_consumer_id']], outflows_ftrs, outflows_last_14_days_agg_df, outflows_last_30_days_agg_df, outflows_last_3_months_agg_df, outflows_last_6_months_agg_df, outflows_last_year_agg_df]
outflows_ftrs = reduce(lambda left, right: pd.merge(left, right, on='prism_consumer_id', how='left'), outflows_df)

# inflows
inflows_ftrs = credits_df[~credits_df['category'].isin(credits_not_income)] \
                        .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
inflows_ftrs.columns = ['outflows_amt_' + col for col in inflows_ftrs.columns]

inflows_last_14_days_agg_df = credits_df_last_14_days[~credits_df_last_14_days['category'].isin(credits_not_income)] \
                .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
inflows_last_14_days_agg_df.columns = ['inflows_amt_last_14_days_' + col for col in inflows_last_14_days_agg_df.columns]

inflows_last_30_days_agg_df = credits_df_last_30_days[~credits_df_last_30_days['category'].isin(credits_not_income)] \
                .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
inflows_last_30_days_agg_df.columns = ['inflows_amt_last_30_days_' + col for col in inflows_last_30_days_agg_df.columns]

inflows_last_3_months_agg_df = credits_df_last_3_months[~credits_df_last_3_months['category'].isin(credits_not_income)] \
                .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
inflows_last_3_months_agg_df.columns = ['inflows_amt_last_3_months_' + col for col in inflows_last_3_months_agg_df.columns]

inflows_last_6_months_agg_df = credits_df_last_6_months[~credits_df_last_6_months['category'].isin(credits_not_income)] \
                .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
inflows_last_6_months_agg_df.columns = ['inflows_amt_last_6_months_' + col for col in inflows_last_6_months_agg_df.columns]

inflows_last_year_agg_df = credits_df_last_year[~credits_df_last_year['category'].isin(credits_not_income)] \
                .groupby('prism_consumer_id')['amount'].agg(['mean', 'std', 'median', 'min', 'max'])
inflows_last_year_agg_df.columns = ['inflows_amt_last_year_' + col for col in inflows_last_year_agg_df.columns]

inflows_df = [consDF[['prism_consumer_id']], inflows_ftrs, inflows_last_14_days_agg_df, inflows_last_30_days_agg_df, inflows_last_3_months_agg_df, inflows_last_6_months_agg_df, inflows_last_year_agg_df]
inflows_ftrs = reduce(lambda left, right: pd.merge(left, right, on='prism_consumer_id', how='left'), inflows_df)

CPU times: user 3.41 s, sys: 355 ms, total: 3.76 s
Wall time: 3.76 s


In [12]:
cat_ok = ['SELF_TRANSFER', 'EXTERNAL_TRANSFER', 'DEPOSIT', 'PAYCHECK',
       'MISCELLANEOUS', 'PAYCHECK_PLACEHOLDER', 'REFUND',
       'INVESTMENT_INCOME', 'OTHER_BENEFITS', 
       'SMALL_DOLLAR_ADVANCE', 'TAX', 'LOAN', 'INSURANCE',
       'FOOD_AND_BEVERAGES', 'UNCATEGORIZED', 'GENERAL_MERCHANDISE',
       'AUTOMOTIVE', 'GROCERIES', 'ATM_CASH', 'ENTERTAINMENT', 'TRAVEL',
       'ESSENTIAL_SERVICES', 'ACCOUNT_FEES', 'HOME_IMPROVEMENT',
       'OVERDRAFT', 'CREDIT_CARD_PAYMENT', 'HEALTHCARE_MEDICAL', 'PETS',
       'EDUCATION', 'GIFTS_DONATIONS', 'BILLS_UTILITIES', 'MORTGAGE',
       'RENT', 'BNPL', 'AUTO_LOAN',
       'BANKING_CATCH_ALL', 'DEBT', 'FITNESS', 'TRANSPORATION', 'LEGAL',
       'GOVERNMENT_SERVICES', 'RISK_CATCH_ALL', 'RTO_LTO', 'INVESTMENT',
       'GAMBLING', 'CORPORATE_PAYMENTS', 'TIME_OR_STUFF', 'PENSION']

In [13]:
%%time

category_features = generate_category_features(trxnDF, cat_map, cat_ok)

CPU times: user 1min 38s, sys: 15.3 s, total: 1min 53s
Wall time: 1min 53s


In [14]:
%%time

gambling_df_all = balanceDF[balanceDF['category'] == 'GAMBLING']
gambling_thresholds = [50, 100, 500, 1000]

# Filter for different time periods
gambling_last_month = filter_time_window(gambling_df_all, months=1)
gambling_last_6m = filter_time_window(gambling_df_all, months=6)
gambling_last_year = filter_time_window(gambling_df_all, years=1)

gambling_stats_all = compute_threshold_stats(gambling_df_all, gambling_thresholds, 'all')
gambling_stats_month = compute_threshold_stats(gambling_last_month, gambling_thresholds, '1m')
gambling_stats_6m = compute_threshold_stats(gambling_last_6m, gambling_thresholds, '6m')
gambling_stats_year = compute_threshold_stats(gambling_last_year, gambling_thresholds, '1y')

gambling_df = [consDF[['prism_consumer_id']], gambling_stats_all, gambling_stats_month, gambling_stats_6m, gambling_stats_year]
gambling_ftrs = reduce(lambda left, right: pd.merge(left, right, on='prism_consumer_id', how='left'), gambling_df)


  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)
  return df.groupby('prism_consumer_id', group_keys=False).apply(filter_group)


CPU times: user 3.95 s, sys: 36.4 ms, total: 3.98 s
Wall time: 3.97 s


In [15]:
feature_dfs = [consDF[['prism_consumer_id', 'DQ_TARGET']], balance_ftrs, balance_deltas_ftrs, category_features, outflows_ftrs, inflows_ftrs, inflows_agg_df, outflows_agg_df, gambling_ftrs,]
features_all = reduce(lambda left, right: pd.merge(left, right, on='prism_consumer_id', how='left'), feature_dfs)


In [16]:
with open("catboost_500_80.pkl", "rb") as f:
    model = pickle.load(f)

top_500 = []
with open('500_ftrs.txt', 'r') as f:
    for line in f:
        top_500.append(line.strip())


In [17]:
# top_500

In [18]:
data = features_all.copy().drop(labels='prism_consumer_id', axis=1) 
binary_cols = [col for col in data.columns if data[col].nunique() == 2 and sorted(data[col].unique()) == [0, 1]]
continuous_cols = [col for col in data.columns if col not in binary_cols]
data.fillna(0, inplace=True)


In [19]:
avg_ftrs = data.drop('DQ_TARGET', axis=1).mean()
avg_ftrs.head()


balance_mean      2,418.8572
balance_std       2,383.4863
balance_median    2,183.6499
balance_min      -2,115.7357
balance_max       8,653.8907
dtype: object

In [20]:
scaler = StandardScaler()
data[continuous_cols] = scaler.fit_transform(data[continuous_cols])

data.mean().head()

DQ_TARGET         0.0838
balance_mean     -0.0000
balance_std      -0.0000
balance_median   -0.0000
balance_min      -0.0000
dtype: float64

In [21]:
X = data.drop(columns=["DQ_TARGET"])
y = data['DQ_TARGET']

### Features For User to Edit

- balance (balance_mean)
- monthly rent - last 6 months (RENT_last_6_months_min)
- minimum paycheck in the last quarter (PAYCHECK_last_3_months_min)
- outflow spenditure last quarter (outflows_amt_last_3_months_median)
- num of times gambled in last quarter (GAMBLING_last_3_months_count)

In [22]:
# user inputted features
example_input = {'balance_mean': 8000, 
                 'RENT_last_6_months_min': 700, 
                 'PAYCHECK_last_3_months_min': 8000, 
                 'outflows_amt_last_3_months_median': 0, 
                 'GAMBLING_last_3_months_count': 2}

In [23]:
def gen_user_ftrs(inp, overall):
    for key in inp.keys():
        overall[key] = inp[key]

    result = pd.DataFrame(overall,).T  
    result[continuous_cols] = scaler.transform(result[continuous_cols])

    return result

In [24]:
user_input = gen_user_ftrs(example_input, avg_ftrs)
user_input

Unnamed: 0,balance_mean,balance_std,balance_median,balance_min,balance_max,balance_last_14_days_mean,balance_last_14_days_std,balance_last_14_days_median,balance_last_14_days_min,balance_last_14_days_max,...,6m_count,6m_over_50,6m_over_100,6m_over_500,6m_over_1000,1y_count,1y_over_50,1y_over_100,1y_over_500,1y_over_1000
0,0.3055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
user_input['balance_mean']

0   0.3055
Name: balance_mean, dtype: float64

In [26]:
pred_proba = model.predict_proba(user_input)[:, 1][0]
pred_proba

0.9943242121700075

In [27]:
# Define interactive widgets
widget_layout = widgets.Layout(width='500')

avg_balance = widgets.IntSlider(min=-50000, max=50000, step=100, value=1000, description="Avg Balance", layout=widget_layout)
monthly_rent = widgets.IntSlider(min=0, max=5000, step=100, value=800, description="Monthly Rent", layout=widget_layout)
paycheck = widgets.IntSlider(min=0, max=20000, step=100, value=3500, description="Paycheck (Biweekly)", layout=widget_layout)
debit_outflows = widgets.IntSlider(min=0, max=5000, step=100, value=300, description="Debit Outflows", layout=widget_layout)
gambling_count = widgets.IntSlider(min=0, max=20, step=1, value=0, description="Gambling Transactions", layout=widget_layout)

# Output widget for displaying results
output = widgets.Output()

def gen_user_ftrs(inp, overall):
    for key in inp.keys():
        overall[key] = inp[key]

    result = pd.DataFrame(overall,).T  
    result[continuous_cols] = scaler.transform(result[continuous_cols])

    return result

# Function to predict default probability
def predict_default(avg_balance, monthly_rent, paycheck, debit_outflows, gambling_count):
    with output:
        clear_output(wait=True)  # Clear previous output
        
        user_inputs = {
            "balance_delta": avg_balance,
            "credit_inflow": paycheck,  # Adjust if necessary
            "debit_outflow": debit_outflows,
            "paycheck": paycheck,
            "gambling_amount": gambling_count,
        }

        print("Current Values:", user_inputs)  # Debugging print
        
        model_inputs = gen_user_ftrs(user_inputs, avg_ftrs)  # Ensure this function works correctly
        display(model_inputs)
        display(model_inputs[['RENT_last_6_months_min']])
        dq_proba = model.predict_proba(model_inputs)[0][1]
        
        print(f"Probability Consumer Defaults: {dq_proba:.2%}")


# Display interactive elements
interactive_ui = widgets.interactive_output(predict_default, {
    "avg_balance": avg_balance,
    "monthly_rent": monthly_rent,
    "paycheck": paycheck,
    "debit_outflows": debit_outflows,
    "gambling_count": gambling_count
})

# Button to trigger prediction
predict_button = widgets.Button(description="Predict Default Probability")
def on_button_click(_):
    predict_default(avg_balance.value, monthly_rent.value, paycheck.value, debit_outflows.value, gambling_count.value)

predict_button.on_click(on_button_click)



# Display interactive elements
display(avg_balance, monthly_rent, paycheck, debit_outflows, gambling_count, predict_button, output, interactive_ui)


IntSlider(value=1000, description='Avg Balance', layout=Layout(width='500'), max=50000, min=-50000, step=100)

IntSlider(value=800, description='Monthly Rent', layout=Layout(width='500'), max=5000, step=100)

IntSlider(value=3500, description='Paycheck (Biweekly)', layout=Layout(width='500'), max=20000, step=100)

IntSlider(value=300, description='Debit Outflows', layout=Layout(width='500'), max=5000, step=100)

IntSlider(value=0, description='Gambling Transactions', layout=Layout(width='500'), max=20)

Button(description='Predict Default Probability', style=ButtonStyle())

Output()

Output()

In [28]:
#  k nearest neighbor to find a similar consumer
# similar test

In [29]:
# Define interactive widgets
avg_balance = widgets.IntSlider(min=-5000, max=5000, step=100, value=0, description="Average Balance")
monthly_rent =  widgets.IntSlider(min=-5000, max=5000, step=100, value=0, description="Rent (per month)")
paycheck = widgets.IntSlider(min=0, max=10000, step=500, value=3000, description="Paycheck")
debit_outflows = widgets.IntSlider(min=0, max=10000, step=500, value=1500, description="Debit Outflows (per month)")
gambling_count = widgets.IntSlider(min=0, max=20, step=1, value=0, description="Gambling")

In [30]:
display(avg_balance, monthly_rent, paycheck, debit_outflows, gambling_count)

IntSlider(value=0, description='Average Balance', max=5000, min=-5000, step=100)

IntSlider(value=0, description='Rent (per month)', max=5000, min=-5000, step=100)

IntSlider(value=3000, description='Paycheck', max=10000, step=500)

IntSlider(value=1500, description='Debit Outflows (per month)', max=10000, step=500)

IntSlider(value=0, description='Gambling', max=20)

In [31]:
# Function to predict default probability
def predict_default():
    user_inputs = {
        "balance_delta": balance_delta.value,
        "credit_inflow": credit_inflow.value,
        "debit_outflows": debit_outflows.value,
        "paycheck": paycheck.value,
        "gambling_amount": gambling_amount.value,
    }

    model_inputs = gen_user_ftrs(user_inputs, avg_ftrs)
    dq_proba = model.predict_proba(model_inputs)[0][1]
    print(f"Probability Consumer Defaults: {dq_proba:.2%}")

In [114]:
# Display interactive elements
widgets.interactive(predict_default, 
                    avg_balance=avg_balance,
                    monthly_rent=monthly_rent,
                    paycheck=paycheck,
                    debit_outflows=debit_outflows,
                    gambling_count=gambling_count)


interactive(children=(Output(),), _dom_classes=('widget-interact',))

In [40]:
# INCLUDE SET THRESHOLD TO ALLOW CONSUMER TO DETERMINE WHETHER OR NOT TO EVEN OFFER A LOAN