In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, f1_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.dummy import DummyClassifier


import numpy as np


from datetime import datetime
from tqdm import tqdm
import yaml
import joblib
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import hashlib

# 1. Load Params

In [12]:
# Path of the YAML configuration file (a file path, despite the `_dir` suffix).
params_dir = "config/config.yaml"

In [13]:
def load_params(param_dir):
    """Load the project configuration from a YAML file.

    Parameters
    ----------
    param_dir : str
        Path handed to ``open`` in read mode, i.e. the path of the YAML
        file itself.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` parses from the file content.
    """
    with open(param_dir, 'r') as config_file:
        return yaml.safe_load(config_file)

In [14]:
# Parsed configuration; read as a module-level global by the functions below.
params = load_params(params_dir)

# 2. Load Dataset

In [42]:
# Load the pickled splits pair by pair (x first, then y), in the order
# train -> valid -> test.
x_train, y_train = (
    joblib.load("data/processed/x_train_feng.pkl"),
    joblib.load("data/processed/y_train_feng.pkl"),
)
x_valid, y_valid = (
    joblib.load("data/processed/x_valid_feng.pkl"),
    joblib.load("data/processed/y_valid_feng.pkl"),
)
x_test, y_test = (
    joblib.load("data/processed/x_test_feng.pkl"),
    joblib.load("data/processed/y_test_feng.pkl"),
)

In [43]:
# Quick look at the first rows of the loaded training set.
x_train.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_H,Type_L,Type_M
0,0.645921,0.402959,-0.543586,1.379991,-0.281424,-0.328918,0.814372,-0.656408
1,1.195562,0.74034,1.959851,-1.779083,1.401429,-0.328918,-1.22794,1.523443
2,1.095627,0.74034,-1.042033,0.856832,0.033128,-0.328918,0.814372,-0.656408
3,-0.303458,-0.609183,-0.918822,0.404099,-0.422972,3.040276,-1.22794,-0.656408
4,1.245529,1.280149,-0.095544,0.484585,-0.218513,-0.328918,0.814372,-0.656408


# 3. Create Model 

In [48]:
def create_model_param():
    """Build the hyperparameter search space of every candidate model.

    Returns
    -------
    dict
        Maps an estimator class name to a dict of
        ``{hyperparameter name: list of values to try}``. A fresh dict is
        built on every call.
    """
    return {
        # k-nearest neighbours: neighbourhood size only
        'KNeighborsClassifier': {
            'n_neighbors': [50, 100, 200],
        },
        # logistic regression: penalty type, regularisation strength,
        # iteration budget
        'LogisticRegression': {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1],
            'max_iter': [100, 300, 500],
        },
        # decision tree: minimum samples required to split a node
        'DecisionTreeClassifier': {
            'min_samples_split': [2, 5, 10, 25, 50],
        },
    }

In [52]:
def create_model_object():
    """Instantiate the untuned base estimators.

    Returns
    -------
    list of dict
        One entry per estimator, with keys ``'model_name'`` (the estimator's
        class name) and ``'model_object'`` (the estimator instance), in the
        order k-NN, logistic regression, decision tree.
    """
    print("Creating model objects")

    # Fix the seed of the estimators that have a stochastic fit so that
    # re-running the notebook reproduces the same models. The value matches
    # the random_state already passed to RandomizedSearchCV in this notebook.
    seed = 123

    knn = KNeighborsClassifier()
    lgr = LogisticRegression(solver='liblinear', random_state=seed)
    dt = DecisionTreeClassifier(random_state=seed)

    return [
        {'model_name': estimator.__class__.__name__, 'model_object': estimator}
        for estimator in (knn, lgr, dt)
    ]

In [39]:
# Logistic-regression search space for the single-model trial run below;
# it has no 'penalty' entry, so the estimator's default penalty is used.
lgr_params = {
    'C': [0.01, 0.1],
    'max_iter': [100, 300, 500],
}

In [44]:
# Trial run of the tuning recipe on logistic regression alone:
# randomized search, 5-fold CV, ROC AUC as the selection metric.
model = RandomizedSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_distributions=lgr_params,
    n_iter=5,
    cv=5,
    random_state=123,
    n_jobs=1,
    verbose=10,
    scoring='roc_auc',
)

# Fit the search on the training split
model.fit(x_train, y_train)

# Probability of the second class (column 1) for train, then valid
y_pred_proba_train, y_pred_proba_valid = (
    model.predict_proba(features)[:, 1] for features in (x_train, x_valid)
)

# ROC AUC on both splits
train_score = roc_auc_score(y_train, y_pred_proba_train)
valid_score = roc_auc_score(y_valid, y_pred_proba_valid)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5; 1/5] START C=0.01, max_iter=300........................................
[CV 1/5; 1/5] END .........C=0.01, max_iter=300;, score=0.882 total time=   0.0s
[CV 2/5; 1/5] START C=0.01, max_iter=300........................................
[CV 2/5; 1/5] END .........C=0.01, max_iter=300;, score=0.838 total time=   0.0s
[CV 3/5; 1/5] START C=0.01, max_iter=300........................................
[CV 3/5; 1/5] END .........C=0.01, max_iter=300;, score=0.837 total time=   0.0s
[CV 4/5; 1/5] START C=0.01, max_iter=300........................................
[CV 4/5; 1/5] END .........C=0.01, max_iter=300;, score=0.895 total time=   0.0s
[CV 5/5; 1/5] START C=0.01, max_iter=300........................................
[CV 5/5; 1/5] END .........C=0.01, max_iter=300;, score=0.874 total time=   0.0s
[CV 1/5; 2/5] START C=0.1, max_iter=100.........................................
[CV 1/5; 2/5] END ..........C=0.1, max_iter=100;,

In [61]:
def train_model(return_file=True):
    """Tune every candidate model with a randomized search and persist the results.

    For each entry returned by ``create_model_object()`` a
    ``RandomizedSearchCV`` (5-fold CV, ROC AUC scoring, ``random_state=123``)
    is fitted on the training set using the matching search space from
    ``create_model_param()``. The fitted search object is then scored with
    ROC AUC on the training and validation sets.

    The dataset paths are read from the module-level ``params``
    (``'train_feng_set_path'`` and ``'valid_feng_set_path'``, each indexed
    with ``[0]`` for features and ``[1]`` for labels). The search spaces,
    base models and tuned models are written with ``joblib.dump`` to
    ``params['list_of_param_path']``, ``params['list_of_model_path']`` and
    ``params['list_of_tuned_model_path']``.

    Parameters
    ----------
    return_file : bool, default True
        When true, return the three objects that were dumped.

    Returns
    -------
    tuple or None
        ``(list_of_param, list_of_model, list_of_tuned_model)`` when
        ``return_file`` is true, otherwise ``None``.
        ``list_of_tuned_model`` maps each model name to a dict with keys
        ``'model'``, ``'train_auc'``, ``'valid_auc'`` and ``'best_params'``.
    """
    # Upper bound on the number of sampled candidates per model
    max_search_iter = 5

    # Load dataset
    X_train = joblib.load(params['train_feng_set_path'][0])
    y_train = joblib.load(params['train_feng_set_path'][1])
    X_valid = joblib.load(params['valid_feng_set_path'][0])
    y_valid = joblib.load(params['valid_feng_set_path'][1])

    # Create list of params & models
    list_of_param = create_model_param()
    list_of_model = create_model_object()

    # Tuned models, keyed by model name
    list_of_tuned_model = {}

    for base_model in list_of_model:
        model_name = base_model['model_name']
        # Work on a copy so the base estimator that gets dumped stays unfitted
        model_obj = copy.deepcopy(base_model['model_object'])
        model_param = list_of_param[model_name]

        print('Training model :', model_name)

        # Every search space here is a dict of value lists, so the number of
        # distinct candidates is the product of the list lengths. Asking for
        # more iterations than that makes scikit-learn emit a warning and
        # fall back to the grid size anyway, so cap n_iter up front.
        n_candidates = int(np.prod([len(values) for values in model_param.values()]))
        n_iter = min(max_search_iter, n_candidates)

        model = RandomizedSearchCV(estimator=model_obj,
                                   param_distributions=model_param,
                                   n_iter=n_iter,
                                   cv=5,
                                   random_state=123,
                                   n_jobs=1,
                                   verbose=10,
                                   scoring='roc_auc')

        # Train model
        model.fit(X_train, y_train)

        # Probability of the second class (column 1) on both splits
        y_pred_proba_train = model.predict_proba(X_train)[:, 1]
        y_pred_proba_valid = model.predict_proba(X_valid)[:, 1]

        # Get score
        train_score = roc_auc_score(y_train, y_pred_proba_train)
        valid_score = roc_auc_score(y_valid, y_pred_proba_valid)

        list_of_tuned_model[model_name] = {
            'model': model,
            'train_auc': train_score,
            'valid_auc': valid_score,
            'best_params': model.best_params_
        }

        print("Done training")
        print("")

    # Dump data
    joblib.dump(list_of_param, params['list_of_param_path'])
    joblib.dump(list_of_model, params['list_of_model_path'])
    joblib.dump(list_of_tuned_model, params['list_of_tuned_model_path'])

    if return_file:
        return list_of_param, list_of_model, list_of_tuned_model

In [62]:
# Run the tuning loop; the same three objects are also dumped to disk by train_model().
list_of_param, list_of_model, list_of_tuned_model = train_model()

Creating model objects
Training model : KNeighborsClassifier
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START n_neighbors=50..............................................
[CV 1/5; 1/3] END ...............n_neighbors=50;, score=0.948 total time=   0.1s
[CV 2/5; 1/3] START n_neighbors=50..............................................




[CV 2/5; 1/3] END ...............n_neighbors=50;, score=0.921 total time=   0.1s
[CV 3/5; 1/3] START n_neighbors=50..............................................
[CV 3/5; 1/3] END ...............n_neighbors=50;, score=0.912 total time=   0.2s
[CV 4/5; 1/3] START n_neighbors=50..............................................
[CV 4/5; 1/3] END ...............n_neighbors=50;, score=0.898 total time=   0.0s
[CV 5/5; 1/3] START n_neighbors=50..............................................
[CV 5/5; 1/3] END ...............n_neighbors=50;, score=0.933 total time=   0.0s
[CV 1/5; 2/3] START n_neighbors=100.............................................
[CV 1/5; 2/3] END ..............n_neighbors=100;, score=0.951 total time=   0.1s
[CV 2/5; 2/3] START n_neighbors=100.............................................
[CV 2/5; 2/3] END ..............n_neighbors=100;, score=0.914 total time=   0.1s
[CV 3/5; 2/3] START n_neighbors=100.............................................
[CV 3/5; 2/3] END ..........

In [65]:
def get_best_model(return_file=True):
    """Select the tuned model with the highest validation AUC and persist it.

    Loads the tuned models from ``params['list_of_tuned_model_path']``,
    keeps the entry whose ``'valid_auc'`` is strictly greater than every
    earlier one (so the first entry wins a tie), dumps its ``'model'`` to
    ``params['best_model_path']`` and prints a short summary.

    Parameters
    ----------
    return_file : bool, default True
        When true, return the selected model.

    Returns
    -------
    object or None
        The selected ``'model'`` entry when ``return_file`` is true,
        otherwise ``None``.
    """
    tuned_models = joblib.load(params['list_of_tuned_model_path'])

    # Running champion: (name, model, validation AUC, best params).
    # The sentinel score sits below any value an AUC can take.
    champion = (None, None, -99999, None)
    for candidate_name, candidate in tuned_models.items():
        candidate_auc = candidate['valid_auc']
        if candidate_auc > champion[2]:
            champion = (
                candidate_name,
                candidate['model'],
                candidate_auc,
                candidate['best_params'],
            )

    best_model_name, best_model, best_performance, best_model_param = champion

    # Persist the winner
    joblib.dump(best_model, params['best_model_path'])

    # Summary banner
    separator = '============================================='
    print(separator)
    print('Best model        :', best_model_name)
    print('Metric score      :', best_performance)
    print('Best model params :', best_model_param)
    print(separator)

    if return_file:
        return best_model

In [66]:
# Pick the tuned model with the highest validation AUC (also dumped to disk).
best_model = get_best_model()

Best model        : KNeighborsClassifier
Metric score      : 0.5
Best model params : {'n_neighbors': 50}


In [69]:
# Candidate decision thresholds: 100 evenly spaced values from 0 to 1
# inclusive, so the step is 1/99 (not 0.01).
THRESHOLD = np.linspace(0, 1, 100)

In [85]:
def get_best_threshold(return_file=True):
    """Tune the decision threshold of the best model on the validation set.

    Loads the validation set (``params['valid_feng_set_path']``) and the
    best model (``params['best_model_path']``), converts the predicted
    probability of the second class into labels for every value in the
    module-level ``THRESHOLD``, and keeps the threshold with the highest
    macro-averaged F1 score. The chosen threshold is printed and dumped to
    ``params['best_threshold_path']``.

    When several thresholds reach the same top score, the last one in
    ``THRESHOLD`` is kept.

    Parameters
    ----------
    return_file : bool, default True
        When true, return the selected threshold.

    Returns
    -------
    float or None
        The selected threshold when ``return_file`` is true, otherwise
        ``None``.

    Raises
    ------
    ValueError
        If ``THRESHOLD`` contains no candidate value.
    """
    # Load data & model
    x_valid = joblib.load(params['valid_feng_set_path'][0])
    y_valid = joblib.load(params['valid_feng_set_path'][1])
    best_model = joblib.load(params['best_model_path'])

    # Probability of the second class (column 1)
    y_pred_proba = best_model.predict_proba(x_valid)[:, 1]

    # Track the running maximum directly instead of growing a pandas Series
    # keyed by score: no empty-Series dtype warning, no per-element index
    # enlargement, and the tie-breaking rule becomes explicit.
    best_threshold = None
    best_score = -np.inf

    for threshold_value in THRESHOLD:
        # Label as positive when the probability reaches the threshold
        y_pred = (y_pred_proba >= threshold_value).astype(int)

        # Macro F1 weights both classes equally
        metric_score = f1_score(y_valid, y_pred, average='macro')

        # `>=` lets a later threshold replace an earlier one with the same
        # score, which is what overwriting the score-keyed entry used to do.
        if metric_score >= best_score:
            best_score = metric_score
            best_threshold = threshold_value

    if best_threshold is None:
        raise ValueError("THRESHOLD is empty: no candidate threshold to evaluate")

    print('=============================================')
    print('Best threshold :', best_threshold)
    print('Metric score   :', best_score)
    print('=============================================')

    # Dump file
    joblib.dump(best_threshold, params['best_threshold_path'])

    if return_file:
        return best_threshold

In [86]:
# Tune the decision threshold; the returned value is shown as the cell output.
get_best_threshold()

  metric_threshold = pd.Series([])


Best threshold : 0.595959595959596
Metric score   : 0.4910941475826972


0.595959595959596