In [1]:
from pathlib import Path

import joblib
import pandas as pd

In [2]:
# Load the cleaned training data from disk into a DataFrame
train_csv_path = 'dataset/train_data_cleaned.csv'
train_df = pd.read_csv(train_csv_path)

In [4]:
# Narrow the frame down to the five selected predictor columns plus the target
selected_columns = [
    'credit_score',
    'prev_defaults',
    'default_in_last_6months',
    'net_yearly_income',
    'yearly_debt_payments',
    'credit_card_default',  # target label, kept so X / y can be split off later
]
train_df = train_df[selected_columns]
train_df

Unnamed: 0,credit_score,prev_defaults,default_in_last_6months,net_yearly_income,yearly_debt_payments,credit_card_default
0,544.0,2,1,107934.04,33070.28,1
1,857.0,0,0,109862.62,15329.53,0
2,650.0,0,0,230153.17,48416.60,0
3,754.0,0,0,122325.82,22574.36,0
4,927.0,0,0,387286.00,38282.95,0
...,...,...,...,...,...,...
43503,907.0,0,0,96207.57,11229.54,0
43504,679.0,0,0,383476.74,43369.91,0
43505,727.0,0,0,260052.18,22707.51,0
43506,805.0,0,0,157363.04,20150.10,0


In [5]:
# Separate the label column (y) from the predictor columns (X)
target_column = 'credit_card_default'
y = train_df[target_column]
X = train_df.drop(columns=[target_column])

# Human-readable class labels, in the order of the 0 / 1 target values
target_names = ["no-default", "default"]

print(X.shape, y.shape)

(43508, 5) (43508,)


In [7]:
# Partition the data into training and held-out test sets
from sklearn.model_selection import train_test_split

# No test_size is given, so scikit-learn's default split proportion applies;
# the fixed seed makes the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

X_train

Unnamed: 0,credit_score,prev_defaults,default_in_last_6months,net_yearly_income,yearly_debt_payments
24499,939.0,0,0,125193.81,25006.02
31231,918.0,0,0,108973.86,7885.67
2236,675.0,0,0,105554.43,7990.64
11768,625.0,1,1,188799.59,30306.92
37990,827.0,0,0,570465.22,46915.08
...,...,...,...,...,...
6265,810.0,0,0,58975.54,13896.89
11284,917.0,0,0,240691.17,54907.07
38158,802.0,0,0,110443.29,13738.36
860,747.0,0,0,164539.88,58183.73


In [8]:
# Standardise the features
from sklearn.preprocessing import StandardScaler

# Fit on the training split only, so no statistics from the test split leak in
X_scaler = StandardScaler()
X_scaler.fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)

X_train_scaled

array([[ 1.55299498, -0.22920129, -0.23099881, -0.09796797, -0.39477863],
       [ 1.34417196, -0.22920129, -0.23099881, -0.11857892, -1.38567361],
       [-1.07220875, -0.22920129, -0.23099881, -0.12292405, -1.37959813],
       ...,
       [ 0.19067335, -0.22920129, -0.23099881, -0.11671169, -1.04693047],
       [-0.35624409, -0.22920129, -0.23099881, -0.04797027,  1.52548738],
       [ 1.27456428, -0.22920129, -0.23099881,  0.06722366, -0.05030439]])

In [9]:
#Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Seed the forest (same seed as the train/test split): bootstrap sampling and
# per-split feature selection are random, so without random_state every run
# trains a different model and reports a slightly different score.
rf = RandomForestClassifier(n_estimators=200, random_state=42)

# fit() trains in place and returns the estimator itself, so no rebinding is needed
rf.fit(X_train_scaled, y_train)

# Mean accuracy on the held-out test split
rf.score(X_test_scaled, y_test)

0.9793141491220005

In [11]:
#Save the model
filename = 'saved_models/final_model_trained.joblib'
# joblib.dump opens the target path for writing and does not create missing
# directories, so make sure the output folder exists before dumping.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf, filename)

#Save the scaler
# The forest was trained on scaled features, so the fitted scaler is persisted
# as well and must be applied to any input before calling the saved model.
filename = 'scaler.joblib'
joblib.dump(X_scaler, filename)

['scaler.joblib']