- 최대한 전처리 없이 간단한 모델들로 baseline 모델 구성
- 생각보다 성능이 안나옴 -> preprocessing을 일단 적용

In [1]:
from glob import glob

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data_path = "../take-home-interview/"

In [3]:
app = pd.read_csv(data_path + "loan_application.csv")

### Split

In [5]:
df = app[app['type'] == 'train'].reset_index(drop=True)

## 골고루 분포함을 확인
(df.application_id_current % 7).value_counts().sort_index()

train_index = df.application_id_current % 7 < 2
train_df = df[train_index]
test_df = df[~train_index]

## No preprocessing
- 결측 있는 칼럼 제외
- 스케일링 제외

In [6]:
missing_count = df.isna().sum()
no_missing_columns = missing_count[missing_count == 0].index
train_df_no_missing = train_df[no_missing_columns]
test_df_no_missing = test_df[no_missing_columns]

In [7]:
X_train = train_df_no_missing.drop(['application_id_current', 'target', 'type'], axis=1)
X_test = test_df_no_missing.drop(['application_id_current', 'target', 'type'], axis=1)

y_train = train_df_no_missing['target']
y_test = test_df_no_missing['target']

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
%%time
clf = RandomForestClassifier(max_depth=20)
clf.fit(X_train, y_train)

CPU times: user 11.5 s, sys: 41.7 ms, total: 11.6 s
Wall time: 11.6 s


RandomForestClassifier(max_depth=20)

In [10]:
from sklearn.metrics import confusion_matrix

In [11]:
y_pred_train = clf.predict(X_train)
confusion_matrix(y_train, y_pred_train)

array([[80946,     0],
       [ 4770,  2195]])

In [12]:
y_pred_test = clf.predict(X_test)
confusion_matrix(y_test, y_pred_test)

array([[201740,      0],
       [ 17859,      1]])

**칼럼 줄이기**

In [15]:
## 그냥 앞의 10개만 쓰기

In [27]:
first_10_columns = X_train.columns[:5]

X_train_col_10 = X_train[first_10_columns]
X_test_col_10 = X_test[first_10_columns]

In [31]:
%%time
clf = RandomForestClassifier()
clf.fit(X_train_col_10, y_train)

CPU times: user 8.88 s, sys: 29.6 ms, total: 8.91 s
Wall time: 8.91 s


RandomForestClassifier()

In [32]:
y_pred_train = clf.predict(X_train_col_10)
confusion_matrix(y_train, y_pred_train)

array([[80946,     0],
       [    9,  6956]])

In [33]:
y_pred_test = clf.predict(X_test_col_10)
confusion_matrix(y_test, y_pred_test)

array([[201495,    245],
       [ 17810,     50]])

### feature importance

In [34]:
%%time
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

CPU times: user 13.4 s, sys: 61 ms, total: 13.5 s
Wall time: 13.5 s


RandomForestClassifier()

In [36]:
y_pred_train = clf.predict(X_train)
confusion_matrix(y_train, y_pred_train)

array([[80946,     0],
       [    5,  6960]])

In [45]:
feature_importance = pd.Series(clf.feature_importances_, index=X_train.columns)

In [50]:
importance_top_5_features = feature_importance.nlargest(5).index

In [62]:
%%time
clf = RandomForestClassifier()
clf.fit(X_train[importance_top_5_features], y_train)

CPU times: user 16.3 s, sys: 29.8 ms, total: 16.3 s
Wall time: 16.4 s


RandomForestClassifier()

In [63]:
y_pred_train = clf.predict(X_train[importance_top_5_features])
confusion_matrix(y_train, y_pred_train)

array([[80946,     0],
       [    6,  6959]])

In [64]:
y_pred_test = clf.predict(X_test[importance_top_5_features])
confusion_matrix(y_test, y_pred_test)

array([[201705,     35],
       [ 17849,     11]])

In [65]:
%%time
clf = RandomForestClassifier(max_depth=20)
clf.fit(X_train[importance_top_5_features], y_train)

y_pred_train = clf.predict(X_train[importance_top_5_features])
print(confusion_matrix(y_train, y_pred_train))

y_pred_test = clf.predict(X_test[importance_top_5_features])
print(confusion_matrix(y_test, y_pred_test))

[[80946     0]
 [ 5949  1016]]
[[201736      4]
 [ 17859      1]]
CPU times: user 17.6 s, sys: 23.2 ms, total: 17.6 s
Wall time: 17.6 s


### Hyperparameter tuning

In [68]:
from sklearn.model_selection import RandomizedSearchCV

random_grid = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits




RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [69]:
"""
{'n_estimators': 2000,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}
 """
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

### Preprocessing

In [None]:
feature_cols = [col for col in app.columns if col.startswith("col")]

In [35]:
pd.options.display.max_rows=200
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [38]:
range_df = pd.concat([
    app[feature_cols].min(),
    app[feature_cols].mean(), 
    app[feature_cols].median(), 
    app[feature_cols].max()
], axis=1)
range_df.columns = ['min', 'mean', 'median', 'max']

In [39]:
range_df

Unnamed: 0,min,mean,median,max
col_2,0.0,0.414,0.0,20.0
col_3,25650.0,170116.06,153000.0,117000000.0
col_4,45000.0,587767.414,500211.0,4050000.0
col_5,1615.5,27425.561,25078.5,258025.5
col_6,40500.0,528019.998,450000.0,4050000.0
col_7,0.0,0.021,0.019,0.073
col_8,-25229.0,-16041.249,-15755.0,-7338.0
col_9,-17912.0,64317.231,-1224.0,365243.0
col_10,-24672.0,-4983.594,-4502.0,0.0
col_11,-7197.0,-3002.071,-3252.0,0.0


In [None]:
integer_cols = [col2, ]