In [22]:
import json
import os

import numpy as np
import pandas as pd
import scipy
from sklearn import tree
from sqlalchemy import create_engine


class Config:
    """Database handle plus table, column and file names used by this notebook.

    The engine is created once, when the class body is evaluated, and is
    shared by every query that passes ``con=Config.engine``.
    """

    # Connection URL. Read from the GIVE_ME_SOME_CREDIT_DB_URL environment
    # variable when it is set, so credentials do not have to live in the
    # notebook; otherwise fall back to the original local URL.
    # NOTE(review): the fallback still embeds a password -- drop it once the
    # environment variable is configured wherever this notebook runs.
    engine = create_engine(
        os.environ.get(
            'GIVE_ME_SOME_CREDIT_DB_URL',
            'mysql+pymysql://root:123@localhost:3307/give_me_some_credit',
        )
    )

    # --- training data ---
    # Table the label column is selected from.
    train = 'cs_training'
    # Name of the label column in the `train` table.
    label_field = 'SeriousDlqin2yrs'
    # Table holding the encoded training features (read with `select *`).
    encoded_train = 'cs_training_encoded'
    # Not read anywhere in this section; presumably written by the encoding
    # step -- confirm.
    train_dict_file = r"..\..\encoders\give_me_some_credit\encoded_train_dict.json"
    # TODO: add log function
    log_file = None
    # NOTE(review): this path starts with a bare backslash while the other
    # output paths in this notebook start with ".\" -- confirm it is intended.
    feature_importance_file = r"\feature_importance.json"

    # --- prediction data ---
    # Table the `Id` column is selected from.
    test = 'cs_test'
    # Table holding the encoded test features (read with `select *`).
    encoded_test = 'cs_test_encoded'
    # Not read anywhere in this section; presumably the test-set counterpart
    # of `train_dict_file` -- confirm.
    test_dict_file = r"..\..\encoders\give_me_some_credit\encoded_test_dict.json"


In [23]:
from sklearn.model_selection import train_test_split


def get_train_sample():
    """Load labels and encoded features, then split them 80/20.

    The label column ``Config.label_field`` is read from table
    ``Config.train`` and all columns are read from table
    ``Config.encoded_train``. Both frames are split together by
    ``train_test_split`` with ``train_size=0.8`` and ``random_state=10``.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)`` in the order produced
        by ``train_test_split``.
    """
    # NOTE(review): labels and features come from two separate queries with
    # no ORDER BY; this presumably relies on both tables returning rows in
    # the same order -- confirm.
    engine = Config.engine
    target = pd.read_sql(sql=f"select {Config.label_field} from {Config.train}", con=engine)
    features = pd.read_sql(sql=f"select * from {Config.encoded_train}", con=engine)

    features_fit, features_holdout, target_fit, target_holdout = train_test_split(
        features,
        target,
        train_size=0.8,
        random_state=10,
    )
    return features_fit, features_holdout, target_fit, target_holdout


def get_pred_sample():
    """Load the rows to be scored.

    Returns:
        tuple: ``(ids, features)`` where ``ids`` is the ``Id`` column of
        table ``Config.test`` and ``features`` is every column of table
        ``Config.encoded_test``, both as DataFrames.
    """
    engine = Config.engine
    ids = pd.read_sql(sql=f"select Id from {Config.test}", con=engine)
    features = pd.read_sql(sql=f"select * from {Config.encoded_test}", con=engine)
    return ids, features

In [24]:
# tree model 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import joblib
import pickle


# Hyper-parameter search space for the decision tree grid search.
# Size: 3 * 5 * 3 * 3 * 3 * 1 * 2 = 810 combinations.
# NOTE(review): 'entropy' and 'log_loss' look equivalent per the scikit-learn
# docs, and min_samples_split values far below 2 * min_samples_leaf probably
# never bind -- confirm before trimming the grid.
# 'monotonic_cst' is deliberately not part of the search for now.
param_grid = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=[None, 8, 10, 12, 14],
    min_samples_split=[2, 8, 14],
    min_samples_leaf=[350, 400, 450],
    max_leaf_nodes=[None, 200, 400],
    random_state=[10],
    class_weight=[None, 'balanced'],
)

class ModelConfig:
    """Windows-style relative paths for the artefacts this notebook writes."""

    # joblib dump of the best estimator; written in training, loaded for prediction
    model_file = r".\clf.joblib"
    # pickle of the grid search's cv_results_
    cv_result = r".\cv_result.pkl"
    # pickle of the hold-out metrics dict
    test_metrics = r".\test_metrics.pkl"
    # CSV with the Id and Probability columns produced by prediction
    pred_file = r".\sampleEntry.csv"
    

In [25]:
def trainer():
    """Grid-search a decision tree, persist the artefacts and report hold-out metrics.

    Steps:
      1. Get the train / hold-out split from ``get_train_sample``.
      2. Run a 5-fold ``GridSearchCV`` over ``param_grid`` scored by ROC AUC,
         refitting the best candidate on the whole training part.
      3. Pickle ``cv_results_`` to ``ModelConfig.cv_result`` and dump the best
         estimator to ``ModelConfig.model_file`` with joblib.
      4. Compute AUC, accuracy, precision, recall and F1 on the hold-out part,
         pickle them to ``ModelConfig.test_metrics`` and print them.

    Returns:
        None. Results are written to disk and printed.
    """
    features_fit, features_holdout, target_fit, target_holdout = get_train_sample()

    clf = GridSearchCV(
        estimator=DecisionTreeClassifier(),
        param_grid=param_grid,
        scoring='roc_auc',
        n_jobs=-1,
        refit=True,
        cv=5,
        verbose=3,
        return_train_score=False,
    )
    clf.fit(features_fit, target_fit)

    # Persist the full cross-validation table for later inspection.
    with open(ModelConfig.cv_result, 'wb') as cv_out:
        pickle.dump(clf.cv_results_, cv_out)

    # Report the winning parameters and their mean validation score.
    print(f"best params:\n {clf.best_params_} \n {clf.best_score_}")
    print(f"best socres on validations:\n {clf.best_score_}")

    # Persist the refitted best model.
    best_tree = clf.best_estimator_
    joblib.dump(best_tree, ModelConfig.model_file)

    # Hold-out evaluation. Column 1 of predict_proba is taken as the score;
    # presumably that is the positive class (label 1) -- confirm via classes_.
    positive_scores = [row[1] for row in best_tree.predict_proba(features_holdout)]
    hard_labels = best_tree.predict(features_holdout)

    # The 'accuray' key is kept as-is: it is what gets written to the pickle.
    test_metrics = {
        'auc': roc_auc_score(target_holdout, positive_scores),
        'accuray': accuracy_score(target_holdout, hard_labels),
        'precision': precision_score(target_holdout, hard_labels),
        'recall': recall_score(target_holdout, hard_labels),
        'f1': f1_score(target_holdout, hard_labels),
    }

    with open(ModelConfig.test_metrics, 'wb') as metrics_out:
        pickle.dump(test_metrics, metrics_out)
    print(f"test metrics: \n {test_metrics}")


def predict():
    """Score the rows from ``get_pred_sample`` and write the submission CSV.

    Loads the estimator saved at ``ModelConfig.model_file``, takes column 1
    of its ``predict_proba`` output as ``Probability``, and writes the ``Id``
    and ``Probability`` columns to ``ModelConfig.pred_file`` without the index.

    Returns:
        pandas.DataFrame: the frame that was written, so a caller can inspect
        it (for example ``predict().head()``). The original ended with a bare
        ``id.head(5)`` whose result was discarded inside the function.
    """
    ids, features = get_pred_sample()

    # NOTE: joblib.load unpickles arbitrary objects -- only load model files
    # produced by this project.
    model = joblib.load(ModelConfig.model_file)

    # Column 1 corresponds to the second entry of model.classes_; presumably
    # the positive class (label 1) -- confirm.
    positive_scores = model.predict_proba(features)[:, 1]

    submission = ids.assign(Probability=positive_scores)
    submission.to_csv(ModelConfig.pred_file, index=False)
    return submission



In [26]:
# Entry point: run training, then prediction. The order matters because
# predict() loads the model file that trainer() writes.
if __name__ == "__main__":
    trainer()
    predict()

Fitting 5 folds for each of 810 candidates, totalling 4050 fits
best params:
 {'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'max_leaf_nodes': None, 'min_samples_leaf': 450, 'min_samples_split': 2, 'random_state': 10} 
 0.8537265605451033
best socres on validations:
 0.8537265605451033
test metrics: 
 {'auc': 0.8598003876955467, 'accuray': 0.9372666666666667, 'precision': 0.6033690658499234, 'recall': 0.19533961328705998, 'f1': 0.2951310861423221}
