In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from src import utils
import numpy as np

In [14]:
# Locations of the serialized training features and training labels
X_train_data_path, y_train_data_path = (
    './data/processed/X_train_ros.pkl',
    './data/processed/y_train_ros.pkl',
)

# Load the training features first, then the matching labels
X_train_clean = utils.deserialize_data(X_train_data_path)
y_train_clean = utils.deserialize_data(y_train_data_path)

In [15]:
import warnings

# Silence UserWarning messages attributed to sklearn modules
warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn')

# Fixed seed so the randomized estimators (random forest, decision tree)
# produce the same fitted models on every Restart & Run All
RANDOM_STATE = 42

# Hyperparameter grid for each model
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth'   : [None, 10, 20]
}

param_grid_dt = {
    'max_depth'        : [4, 6],
    'min_samples_split': [10, 15],
    'min_samples_leaf' : [2, 4]
}

# NOTE(review): `algorithm` only selects the neighbor-search strategy; keeping
# it in the grid multiplies the candidate count by four (4 * 2 * 4 = 32).
# Confirm this is intended before trimming it.
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights'    : ['uniform', 'distance'],
    'algorithm'  : ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Unfitted estimator instances, keyed by model name.
# The tree-based estimators are seeded for reproducibility; KNN takes no seed.
models = {
    'RandomForest'      : RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    'DecisionTree'      : DecisionTreeClassifier(random_state=RANDOM_STATE),
    'KNN'               : KNeighborsClassifier()
}

# Hyperparameter grids, keyed by the same model names as `models`
param_grids = {
    'RandomForest': param_grid_rf,
    'DecisionTree': param_grid_dt,
    'KNN'         : param_grid_knn
}

def train_model(model, param_grid, X, y, *, cv=None, scoring=None, n_jobs=-1, verbose=3):
    """
    Tune a model's hyperparameters with a cross-validated grid search.

    Parameters
    model (sklearn estimator) : Estimator whose hyperparameters are searched.
    param_grid (dict)         : Hyperparameter grid handed to GridSearchCV.
    X (array-like)            : Training features.
    y (array-like)            : Training targets.
    cv                        : Cross-validation strategy forwarded to GridSearchCV.
                                None (default) keeps GridSearchCV's own default.
    scoring                   : Metric forwarded to GridSearchCV for ranking candidates.
                                None (default) keeps GridSearchCV's own default.
    n_jobs (int)              : Number of parallel jobs for the search (default -1).
    verbose (int)             : Verbosity level of the search log (default 3).

    Return
    best_model                : The `best_estimator_` of the fitted grid search.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
    )
    grid_search.fit(X, y)

    # Only the winning estimator is handed back; the search object itself
    # (scores, cv_results_) is discarded.
    return grid_search.best_estimator_

# Run one grid search per algorithm on the same training set.
# Kept as three separate statements so an already-trained model stays bound
# even if a later search fails.
trained_rfo = train_model(
    model=models['RandomForest'],
    param_grid=param_grids['RandomForest'],
    X=X_train_clean,
    y=y_train_clean,
)
trained_dtr = train_model(
    model=models['DecisionTree'],
    param_grid=param_grids['DecisionTree'],
    X=X_train_clean,
    y=y_train_clean,
)
trained_knn = train_model(
    model=models['KNN'],
    param_grid=param_grids['KNN'],
    X=X_train_clean,
    y=y_train_clean,
)

    


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [16]:
# Save the trained random forest (best estimator from its grid search)
utils.serialize_data(trained_rfo, 'models/trained_RandomForest.pkl')


['models/trained_RandomForest.pkl']

In [17]:
# Save the trained decision tree (best estimator from its grid search)
utils.serialize_data(trained_dtr, 'models/trained_DecisionTree.pkl')

['models/trained_DecisionTree.pkl']

In [18]:
# Save the trained k-nearest-neighbors model (best estimator from its grid search)
utils.serialize_data(trained_knn, 'models/trained_KNearestNeighbor.pkl')

['models/trained_KNearestNeighbor.pkl']