In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
acctDF = pd.read_parquet('../../data/q2-ucsd-acctDF.pqt')
consDF = pd.read_parquet('../../data/q2-ucsd-consDF.pqt')
cat_map = pd.read_csv('../../data/q2-ucsd-cat-map.csv')
trxnDF = pd.read_parquet('../../data/q2-ucsd-trxnDF.pqt')

In [3]:
acctDF.head()

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.9


In [4]:
consDF.head()

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0


In [6]:
cat_map

Unnamed: 0,category_id,category
0,0,SELF_TRANSFER
1,1,EXTERNAL_TRANSFER
2,2,DEPOSIT
3,3,PAYCHECK
4,4,MISCELLANEOUS
5,5,PAYCHECK_PLACEHOLDER
6,6,REFUND
7,7,INVESTMENT_INCOME
8,8,OTHER_BENEFITS
9,9,UNEMPLOYMENT_BENEFITS


In [7]:
trxnDF.head()

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16


In [20]:
consDF = consDF.dropna()

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score,r2_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [31]:
consDF['evaluation_date'] = pd.to_datetime(consDF['evaluation_date'])

# Create a new feature 'days_since_evaluation' representing the number of days since the evaluation date
consDF['days_since_evaluation'] = (pd.Timestamp('now') - consDF['evaluation_date']).dt.total_seconds()

In [32]:
# Drop the original 'evaluation_date' column
X = consDF.drop(columns=['DQ_TARGET', 'evaluation_date'])
y = consDF['DQ_TARGET']

# Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

In [38]:
# Get feature importances
feature_importances = model.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

most_important_feature = feature_importance_df.iloc[0]
print(f"The most important feature is: {most_important_feature['Feature']} with an importance score of {most_important_feature['Importance']:.4f}")

The most important feature is: prism_consumer_id with an importance score of 0.3850


In [36]:
X_train_single = X_train[[most_important_feature['Feature']]]
X_test_single = X_test[[most_important_feature['Feature']]]

single_feature_model = RandomForestClassifier(random_state=42)
single_feature_model.fit(X_train_single, y_train)

In [39]:
y_pred = single_feature_model.predict(X_test_single)
y_pred_proba = single_feature_model.predict_proba(X_test_single)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
r2 = r2_score(y_test, y_pred)

print(f"Accuracy using the single best feature: {accuracy:.4f}")
print(f"ROC AUC using the single best feature: {roc_auc:.4f}")
print(f"R² score using the single best feature: {r2:.4f}")

Accuracy using the single best feature: 0.8579
ROC AUC using the single best feature: 0.6155
R² score using the single best feature: -0.7950
