# Session 1: All is Function

---

Training PT. Astra Honda Motor with Pacmann AI

Buat semua menjadi fungsi
- `train_model`
- `get_best_model`
- `get_best_threshold`

## Fungsi `create_model_param` dan `create_model_object`
---

- Kita mesti memiliki beberapa model
- Misal
  - KNN
  - Decision Tree
  - Logistic Regression
  - Random Forest

- Kita definisikan parameternya dalam bentuk fungsi

In [1]:
def create_model_param():
    """Create the model objects"""
    knn_params = {
        'n_neighbors': [50, 100, 200],
    }

    dt_params = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, None]
    }
    
    lgr_params = {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1],
        'max_iter': [100, 300, 500]
    }

    rf_params = {
        'n_estimators': [100, 200, 300]
    }

    # Create model params
    list_of_param = {
        'KNeighborsClassifier': knn_params,
        'DecisionTreeClassifier': dt_params,
        'LogisticRegression': lgr_params,
        'RandomForestClassifier': rf_params
    }

    return list_of_param

In [2]:
list_of_param = create_model_param()
list_of_param

{'KNeighborsClassifier': {'n_neighbors': [50, 100, 200]},
 'DecisionTreeClassifier': {'criterion': ['gini', 'entropy'],
  'max_depth': [5, 10, None]},
 'LogisticRegression': {'penalty': ['l1', 'l2'],
  'C': [0.01, 0.1],
  'max_iter': [100, 300, 500]},
 'RandomForestClassifier': {'n_estimators': [100, 200, 300]}}

- Definisikan model

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Buat fungsi untuk membuat model
def create_model_object():
    """Create the model objects"""
    print("Creating model objects")

    # Create model objects
    knn = KNeighborsClassifier()
    lgr = LogisticRegression(solver='liblinear')
    dt = DecisionTreeClassifier(random_state=123)
    rf = RandomForestClassifier(random_state=123)

    # Create list of model
    list_of_model = [
        {'model_name': knn.__class__.__name__, 'model_object': knn},
        {'model_name': lgr.__class__.__name__, 'model_object': lgr},
        {'model_name': dt.__class__.__name__, 'model_object': dt},
        {'model_name': rf.__class__.__name__, 'model_object': rf}
    ]

    return list_of_model

In [5]:
# Panggil
list_of_model = create_model_object()
list_of_model

Creating model objects


[{'model_name': 'KNeighborsClassifier',
  'model_object': KNeighborsClassifier()},
 {'model_name': 'LogisticRegression',
  'model_object': LogisticRegression(solver='liblinear')},
 {'model_name': 'DecisionTreeClassifier',
  'model_object': DecisionTreeClassifier(random_state=123)},
 {'model_name': 'RandomForestClassifier',
  'model_object': RandomForestClassifier(random_state=123)}]

## Fungsi `train_model`
---

- Kita akan melakukan cross-validation pada tahap ini.
- Untuk metriknya, kita pakai ROC AUC score

In [6]:
# Import library
import src.utils as utils
import copy as copy
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

- Dump beberapa hal
  - list parameter model
  - list model object
  - list model yang sudah di tuning

In [7]:
# Load config data
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/machining_maintenance.csv',
 'dataset_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_cols_path': 'data/output/input_cols.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_cols': 'Failure Type',
 'drop_cols': ['Product ID', 'Failure Type'],
 'seed': 123,
 'test_size': 0.2,
 'num_cols': ['Air temperature [K]',
  'Process temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]',
  'Tool wear [min]'],
 'cat_cols': ['Type'],
 'num_imputer_path': 'data/output/num_imputer.pkl',
 'cat_imputer_path': 'data/output/cat_imputer.pkl',
 'scaler_path': 'data/output/scaler.pkl',
 'train_clean_path': 'data/output/X_train_clean.pkl',
 'valid_clean_path': 'data/output/X_valid_clean.pkl',
 'test_clean_path

In [8]:
def train_model():
    # Load dataset
    # Hanya menggunakan data train & valid
    X_train = utils.pickle_load(CONFIG_DATA['train_clean_path'])
    y_train = utils.pickle_load(CONFIG_DATA['train_set_path'][1])
    X_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'])
    y_valid = utils.pickle_load(CONFIG_DATA['valid_set_path'][1])
    
    # Create list of params & models
    list_of_param = create_model_param()
    list_of_model = create_model_object()

    # List of trained model
    list_of_tuned_model = {}

    # Train model
    for base_model in list_of_model:
        # Current condition
        model_name = base_model['model_name']
        model_obj = copy.deepcopy(base_model['model_object'])
        model_param = list_of_param[model_name]

        # Debug message
        print('Training model :', model_name)

        # Create model object
        model = GridSearchCV(estimator = model_obj,
                             param_grid = model_param,
                             cv = 5,
                             verbose=10,
                             scoring = 'roc_auc')
        
        # Train model
        model.fit(X_train, y_train)

        # Predict
        y_pred_proba_train = model.predict_proba(X_train)[:, 1]
        y_pred_proba_valid = model.predict_proba(X_valid)[:, 1]
        
        # Get score
        train_score = roc_auc_score(y_train, y_pred_proba_train)
        valid_score = roc_auc_score(y_valid, y_pred_proba_valid)

        # Append
        list_of_tuned_model[model_name] = {
            'model': model,
            'train_auc': train_score,
            'valid_auc': valid_score,
            'best_params': model.best_params_
        }

        print("Done training")
        print("")

    # Dump data
    utils.pickle_dump(list_of_param, CONFIG_DATA['list_of_param_path'])
    utils.pickle_dump(list_of_model, CONFIG_DATA['list_of_model_path'])
    utils.pickle_dump(list_of_tuned_model, CONFIG_DATA['list_of_tuned_model_path'])

    return list_of_param, list_of_model, list_of_tuned_model    

In [9]:
# Lakukan training
list_of_param, list_of_model, list_of_tuned_model = train_model()

Creating model objects
Training model : KNeighborsClassifier
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START n_neighbors=50..............................................
[CV 1/5; 1/3] END ...............n_neighbors=50;, score=0.897 total time=   0.1s
[CV 2/5; 1/3] START n_neighbors=50..............................................
[CV 2/5; 1/3] END ...............n_neighbors=50;, score=0.961 total time=   0.1s
[CV 3/5; 1/3] START n_neighbors=50..............................................
[CV 3/5; 1/3] END ...............n_neighbors=50;, score=0.943 total time=   0.1s
[CV 4/5; 1/3] START n_neighbors=50..............................................
[CV 4/5; 1/3] END ...............n_neighbors=50;, score=0.936 total time=   0.1s
[CV 5/5; 1/3] START n_neighbors=50..............................................
[CV 5/5; 1/3] END ...............n_neighbors=50;, score=0.959 total time=   0.1s
[CV 1/5; 2/3] START n_neighbors=100..................................

In [10]:
# Print hasil model yang sudah dituning
list_of_tuned_model

{'KNeighborsClassifier': {'model': GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
               param_grid={'n_neighbors': [50, 100, 200]}, scoring='roc_auc',
               verbose=10),
  'train_auc': 0.9700419098282484,
  'valid_auc': 0.9014214008142116,
  'best_params': {'n_neighbors': 50}},
 'LogisticRegression': {'model': GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
               param_grid={'C': [0.01, 0.1], 'max_iter': [100, 300, 500],
                           'penalty': ['l1', 'l2']},
               scoring='roc_auc', verbose=10),
  'train_auc': 0.8974317470594583,
  'valid_auc': 0.8716228719467062,
  'best_params': {'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}},
 'DecisionTreeClassifier': {'model': GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=123),
               param_grid={'criterion': ['gini', 'entropy'],
                           'max_depth': [5, 10, None]},
               scoring='roc_auc', verbose=10),
  'train_auc': 

## Fungsi `get_best_model`
---

- Fungsi untuk mencari model terbaik dari hasil tuning.
- Berarti model yang memiliki ROC AUC yang paling oke
- Dump best_model parameter

In [11]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/machining_maintenance.csv',
 'dataset_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_cols_path': 'data/output/input_cols.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_cols': 'Failure Type',
 'drop_cols': ['Product ID', 'Failure Type'],
 'seed': 123,
 'test_size': 0.2,
 'num_cols': ['Air temperature [K]',
  'Process temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]',
  'Tool wear [min]'],
 'cat_cols': ['Type'],
 'num_imputer_path': 'data/output/num_imputer.pkl',
 'cat_imputer_path': 'data/output/cat_imputer.pkl',
 'scaler_path': 'data/output/scaler.pkl',
 'train_clean_path': 'data/output/X_train_clean.pkl',
 'valid_clean_path': 'data/output/X_valid_clean.pkl',
 'test_clean_path

In [12]:
def get_best_model():
    # Load tuned model
    list_of_tuned_model = utils.pickle_load(CONFIG_DATA['list_of_tuned_model_path'])

    # Get the best model
    best_model_name = None
    best_model = None
    best_performance = -99999
    best_model_param = None

    for model_name, model in list_of_tuned_model.items():
        if model['valid_auc'] > best_performance:
            best_model_name = model_name
            best_model = model['model']
            best_performance = model['valid_auc']
            best_model_param = model['best_params']

    # Dump the best model
    utils.pickle_dump(best_model, CONFIG_DATA['best_model_path'])

    # Print
    print('=============================================')
    print('Best model        :', best_model_name)
    print('Metric score      :', best_performance)
    print('Best model params :', best_model_param)
    print('=============================================')

    return best_model

In [13]:
# Pilih best model
best_model = get_best_model()

Best model        : RandomForestClassifier
Metric score      : 0.9309944022945966
Best model params : {'n_estimators': 200}


In [14]:
best_model

## Fungsi `get_best_threshold` (tambahan)
---

- Output dari klasifikasi bisa kita ibaratkan sebagai probability mendapatkan kelas 1.
- Untuk konversi dari probability menjadi kelas, kita butuh threshold. Contoh
- threshold = 0.3, maka P(y=1) = 0.4 masuk ke kelas 1.
- Threshold ini akan mempengaruhi seberapa besar bertemu kasus False Positive.
- Kita bisa optimalkan nilai `recall`
- Buat fungsi & dump hasil thresholdnya.

In [15]:
# Import library
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score

In [16]:
# Buat distribusi threshold dari 0-1 sebanyak 100 nilai
# karena probability paling kecil adalah 0 dan paling besar adalah 1
THRESHOLD = np.linspace(0, 1, 100)

In [17]:
# Update config
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/machining_maintenance.csv',
 'dataset_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_cols_path': 'data/output/input_cols.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_cols': 'Failure Type',
 'drop_cols': ['Product ID', 'Failure Type'],
 'seed': 123,
 'test_size': 0.2,
 'num_cols': ['Air temperature [K]',
  'Process temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]',
  'Tool wear [min]'],
 'cat_cols': ['Type'],
 'num_imputer_path': 'data/output/num_imputer.pkl',
 'cat_imputer_path': 'data/output/cat_imputer.pkl',
 'scaler_path': 'data/output/scaler.pkl',
 'train_clean_path': 'data/output/X_train_clean.pkl',
 'valid_clean_path': 'data/output/X_valid_clean.pkl',
 'test_clean_path

In [18]:
# Buat fungsi
def get_best_threshold():
    # Load data & model
    X_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'])
    y_valid = utils.pickle_load(CONFIG_DATA['valid_set_path'][1])
    best_model = utils.pickle_load(CONFIG_DATA['best_model_path'])

    # Get the proba pred
    y_pred_proba = best_model.predict_proba(X_valid)[:, 1]

    # Initialize
    metric_threshold = pd.Series([])
    
    # Optimize
    for threshold_value in THRESHOLD:
        # Get predictions
        y_pred = (y_pred_proba >= threshold_value).astype(int)

        # Get the F1 score
        metric_score = recall_score(y_valid, y_pred, average='weighted')

        # Add to the storage
        metric_threshold[metric_score] = threshold_value

    # Find the threshold @max metric score
    metric_score_max_index = metric_threshold.index.max()
    best_threshold = metric_threshold[metric_score_max_index]
    print('=============================================')
    print('Best threshold :', best_threshold)
    print('Metric score   :', metric_score_max_index)
    print('=============================================')
    
    # Dump file
    utils.pickle_dump(best_threshold, CONFIG_DATA['best_threshold_path'])

    return best_threshold

In [19]:
# Panggil fufngsi
get_best_threshold()

Best threshold : 0.48484848484848486
Metric score   : 0.979375


0.48484848484848486

In [21]:
best_model = utils.pickle_load(CONFIG_DATA['best_model_path'])

# Get the proba pred
y_pred_proba = best_model.predict_proba(X_valid)[:, 1]

NameError: name 'X_valid' is not defined

Great! Sekarang tinggal dibuat `.py` file -nya

## Fungsi `split_train_test`
---

In [26]:
from src.data_preprocessing import preprocess_data

data = pd.read_csv('data/output/data.csv')
data_clean = preprocess_data(X=data, types='test', CONFIG_DATA=CONFIG_DATA)

data

ModuleNotFoundError: No module named 'utils'