## 0. Required Libraries

In [1]:

from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor

from sklearn.tree import DecisionTreeRegressor

from xgboost import XGBClassifier

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import joblib
import json
import pandas as pd
import copy
import hashlib
import os

import src.util as util

## 1. Load Configuration File

In [2]:
params = util.load_config()

## 2. Load Dataset

In [3]:
def load_train_feng(params: dict) -> pd.DataFrame:
    # Load train set
    x_train = util.pickle_load(params["train_feng_set_path"][0])
    y_train = util.pickle_load(params["train_feng_set_path"][1])

    return x_train, y_train

def load_valid_feng(params: dict) -> pd.DataFrame:
    # Load valid set
    x_valid = util.pickle_load(params["valid_feng_set_path"][0])
    y_valid = util.pickle_load(params["valid_feng_set_path"][1])

    return x_valid, y_valid

def load_test_feng(params: dict) -> pd.DataFrame:
    # Load test set
    x_test = util.pickle_load(params["test_feng_set_path"][0])
    y_test = util.pickle_load(params["test_feng_set_path"][1])

    return x_test, y_test

In [4]:
def load_dataset(params: dict) -> pd.DataFrame:
    # Debug message
    util.print_debug("Loading dataset.")

    # Load train set
    x_train, y_train = load_train_feng(params)

    # Load valid set
    x_valid, y_valid = load_valid_feng(params)

    # Load test set
    x_test, y_test = load_test_feng(params)

    # Debug message
    util.print_debug("Dataset loaded.")

    # Return the dataset
    return x_train, y_train, x_valid, y_valid, x_test, y_test

## 3. Create Training Log Template

In [5]:
def training_log_template() -> dict:
    # Debug message
    util.print_debug("Creating training log template.")
    
    # Template of training log
    logger = {
        "model_name" : [],
        "model_uid" : [],
        "training_time" : [],
        "training_date" : [],
        "performance" : [],
        "r2_score" : [],
        "data_configurations" : [],
    }

    # Debug message
    util.print_debug("Training log template created.")

    # Return training log template
    return logger

In [6]:
def training_log_updater(current_log: dict, params: dict) -> list:
    # Create copy of current log
    current_log = copy.deepcopy(current_log)

    # Path for training log file
    log_path = params["training_log_path"]

    # Try to load training log file
    try:
        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()

    # If file not found, create a new one
    except FileNotFoundError as fe:
        with open(log_path, "w") as file:
            file.write("[]")
        file.close()

        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()
    
    # Add current log to previous log
    last_log.append(current_log)

    # Save updated log
    with open(log_path, "w") as file:
        json.dump(last_log, file)
        file.close()

    # Return log
    return last_log

## 4. Training and Evaluation

### 4.1. Create Model Object

In [7]:
def create_model_object(params: dict) -> list:
    # Debug message
    util.print_debug("Creating model objects.")

    # Create model objects
    lr = LinearRegression()
    rfr = RandomForestRegressor()
    dct = DecisionTreeRegressor()


    # Create list of model
    list_of_model = [
        { "model_name": lr.__class__.__name__, "model_object": lr, "model_uid": ""},
        { "model_name": rfr.__class__.__name__, "model_object": rfr, "model_uid": ""},
        { "model_name": dct.__class__.__name__, "model_object": dct, "model_uid": ""},
       
    ]

    # Debug message
    util.print_debug("Model objects created.")

    # Return the list of model
    return list_of_model

### 4.2. Training Baseline Model

In [8]:
def train_eval(configuration_model: str, params: dict, hyperparams_model: list = None):
    # Load dataset
    x_train, y_train, \
    x_valid, y_valid, \
    x_test, y_test = load_dataset(params)

    # Variabel to store trained models
    list_of_trained_model = dict()

    # Create log template
    training_log = training_log_template()

    # Training for every data configuration
    for config_data in x_train:
        # Debug message
        util.print_debug("Training model based on configuration data: {}".format(config_data))

        # Create model objects
        if hyperparams_model == None:
            list_of_model = create_model_object(params)
        else:
            list_of_model = copy.deepcopy(hyperparams_model)

        # Variabel to store tained model
        trained_model = list()

        # Load train data based on its configuration
        x_train_data = x_train[config_data]
        y_train_data = y_train[config_data]

        # Train each model by current dataset configuration
        for model in list_of_model:
            # Debug message
            util.print_debug("Training model: {}".format(model["model_name"]))

            # Training
            training_time = util.time_stamp()
            model["model_object"].fit(x_train_data, y_train_data)
            training_time = (util.time_stamp() - training_time).total_seconds()

            # Debug message
            util.print_debug("Evaluating model: {}".format(model["model_name"]))

            # Evaluation
            y_predict = model["model_object"].predict(x_valid)
            #performance = classification_report(y_valid, y_predict, output_dict = True)
            performance = mean_squared_error(y_valid, y_predict)
            r2 = r2_score(y_valid, y_predict)

            # Debug message
            util.print_debug("Logging: {}".format(model["model_name"]))

            # Create UID
            uid = hashlib.md5(str(training_time).encode()).hexdigest()

            # Assign model's UID
            model["model_uid"] = uid

            # Create training log data
            training_log["model_name"].append("{}-{}".format(configuration_model, model["model_name"]))
            training_log["model_uid"].append(uid)
            training_log["training_time"].append(training_time)
            training_log["training_date"].append(util.time_stamp())
            training_log["performance"].append(performance)
            training_log["r2_score"].append(r2)
            training_log["data_configurations"].append(config_data)

            # Collect current trained model
            trained_model.append(copy.deepcopy(model))

            # Debug message
            util.print_debug("Model {} has been trained for configuration data {}.".format(model["model_name"], config_data))
        
        # Collect current trained list of model
        list_of_trained_model[config_data] = copy.deepcopy(trained_model)
    
    # Debug message
    util.print_debug("All combination models and configuration data has been trained.")
    
    # Return list trained model
    return list_of_trained_model, training_log

In [9]:
list_of_trained_model, training_log = train_eval("Baseline", params)

2023-04-15 14:14:39.883328 Loading dataset.
2023-04-15 14:14:39.887686 Dataset loaded.
2023-04-15 14:14:39.887686 Creating training log template.
2023-04-15 14:14:39.887686 Training log template created.
2023-04-15 14:14:39.887686 Training model based on configuration data: Undersampling
2023-04-15 14:14:39.887686 Creating model objects.
2023-04-15 14:14:39.887686 Model objects created.
2023-04-15 14:14:39.887686 Training model: LinearRegression
2023-04-15 14:14:39.893925 Evaluating model: LinearRegression
2023-04-15 14:14:39.894431 Logging: LinearRegression
2023-04-15 14:14:39.895434 Model LinearRegression has been trained for configuration data Undersampling.
2023-04-15 14:14:39.895434 Training model: RandomForestRegressor
2023-04-15 14:14:39.985887 Evaluating model: RandomForestRegressor
2023-04-15 14:14:39.993894 Logging: RandomForestRegressor
2023-04-15 14:14:39.999887 Model RandomForestRegressor has been trained for configuration data Undersampling.
2023-04-15 14:14:39.999887 Tra

### 4.3. Choose Best Performance Baseline Model

In [11]:
def get_production_model(list_of_model, training_log, params):
    # Create copy list of model
    list_of_model = copy.deepcopy(list_of_model)
    
    # Debug message
    util.print_debug("Choosing model by metrics score.")

    # Create required predefined variabel
    curr_production_model = None
    prev_production_model = None
    production_model_log = None

    # Debug message
    util.print_debug("Converting training log type of data from dict to dataframe.")

    # Convert dictionary to pandas for easy operation
    training_log = pd.DataFrame(copy.deepcopy(training_log))

    # Debug message
    util.print_debug("Trying to load previous production model.")

    if os.path.exists(params["production_model_path"]):
        try:
            # Load the previous production model using JSON
            with open(params["production_model_path"], 'r') as f:
                serialized_model = f.read()
                prev_production_model = json.loads(serialized_model)
            util.print_debug("Previous production model loaded.")

        except Exception as e:
            util.print_debug("Failed to load previous production model: {}".format(str(e)))
            prev_production_model = None

    else:
        util.print_debug("No previous production model detected, choosing best model only from current trained model.")
        prev_production_model = None

             
    # If previous production model detected:
    if prev_production_model != None:
        # Debug message
        util.print_debug("Loading validation data.")
        x_valid, y_valid = load_valid_feng(params)
        
        # Debug message
        util.print_debug("Checking compatibilty previous production model's input with current train data's features.")

        # Check list features of previous production model and current dataset
        production_model_features = set(prev_production_model["model_data"]["model_object"].feature_names_in_)
        current_dataset_features = set(x_valid.columns)
        number_of_different_features = len((production_model_features - current_dataset_features) | (current_dataset_features - production_model_features))

        # If feature matched:
        if number_of_different_features == 0:
            # Debug message
            util.print_debug("Features compatible.")

            # Debug message
            util.print_debug("Reassesing previous model performance using current validation data.")

            # Re-predict previous production model to provide valid metrics compared to other current models
            y_pred = prev_production_model["model_data"]["model_object"].predict(x_valid)

            # Re-asses prediction result
            eval_res = classification_report(y_valid, y_pred, output_dict = True)

            # Debug message
            util.print_debug("Assessing complete.")

            # Debug message
            util.print_debug("Storing new metrics data to previous model structure.")

            # Update their performance log
            prev_production_model["model_log"]["performance"] = eval_res
            prev_production_model["model_log"]["r2_score"] = eval_res["r2-score"]

            # Debug message
            util.print_debug("Adding previous model data to current training log and list of model")

            # Added previous production model log to current logs to compere who has the greatest r2 score
            training_log = pd.concat([training_log, pd.DataFrame([prev_production_model["model_log"]])])

            # Added previous production model to current list of models to choose from if it has the greatest r2 score
            list_of_model["prev_production_model"] = [copy.deepcopy(prev_production_model["model_data"])]
        else:
            # To indicate that we are not using previous production model
            prev_production_model = None

            # Debug message
            util.print_debug("Different features between production model with current dataset is detected, ignoring production dataset.")

    # Debug message
    util.print_debug("Sorting training log by r2 macro avg and training time.")

    # Sort training log by MSE, r2 score and training time
    best_model_log = training_log.sort_values(["performance","r2_score", "training_time"], ascending = [True, False, True]).iloc[0]
    
    # Debug message
    util.print_debug("Searching model data based on sorted training log.")

    # Get model object with least MSE and greatest r2 score  by using UID
    for configuration_data in list_of_model:
        for model_data in list_of_model[configuration_data]:
            if model_data["model_uid"] == best_model_log["model_uid"]:
                curr_production_model = dict()
                curr_production_model["model_data"] = copy.deepcopy(model_data)
                curr_production_model["model_log"] = copy.deepcopy(best_model_log.to_dict())
                curr_production_model["model_log"]["model_name"] = "Production-{}".format(curr_production_model["model_data"]["model_name"])
                curr_production_model["model_log"]["training_date"] = str(curr_production_model["model_log"]["training_date"])
                production_model_log = training_log_updater(curr_production_model["model_log"], params)
                break
    
    # In case UID not found
    if curr_production_model == None:
        raise RuntimeError("The best model not found in your list of model.")
    
    # Debug message
    util.print_debug("Model chosen.")

    # Dump chosen production model
    util.pickle_dump(curr_production_model, params["production_model_path"])
    
    # Return current chosen production model, log of production models and current training log
    return curr_production_model, production_model_log, training_log
    

In [12]:
model, production_model_log, training_logs = get_production_model(list_of_trained_model, training_log, params)

2023-04-15 14:15:14.815538 Choosing model by metrics score.
2023-04-15 14:15:14.815538 Converting training log type of data from dict to dataframe.
2023-04-15 14:15:14.816538 Trying to load previous production model.
2023-04-15 14:15:14.825703 Failed to load previous production model: 'charmap' codec can't decode byte 0x81 in position 114: character maps to <undefined>
2023-04-15 14:15:14.825703 Sorting training log by r2 macro avg and training time.
2023-04-15 14:15:14.830002 Searching model data based on sorted training log.
2023-04-15 14:15:14.847929 Model chosen.


## 5. Hyperparameter Tuning

In [18]:
def create_dist_params(model_name: str) -> dict:
    # Define models paramteres


    dist_params_lr = {
        'fit_intercept': [True, False],
        'normalize': [True, False],
        'copy_X': [True, False]
    }
    
    dist_params_rfr = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2']
    }
    
    dist_params_dct = {
        'max_depth': [2, 4, 6, 8],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2']
    }


    # Make all models parameters in to one
    dist_params = {
        "DecisionTreeRegressor": dist_params_dct,
        "RandomForestRegressor": dist_params_rfr,
        "LinearRegression": dist_params_lr
        
    }

    # Return distribution of model parameters
    return dist_params[model_name]

In [19]:
def hyper_params_tuning(model: dict) -> list:
    # Create copy of current best baseline model
    model = copy.deepcopy(model)

    # Create model's parameter distribution
    dist_params = create_dist_params(model["model_data"]["model_name"])

    # Create model object
    model_rsc = RandomizedSearchCV(model["model_data"]["model_object"], dist_params, n_jobs = -1)
    model_data = {
        "model_name": model["model_data"]["model_name"],
        "model_object": model_rsc,
        "model_uid": ""
    }
    
    # Return model object
    return [model_data]

In [20]:
list_of_trained_model, training_log = train_eval("Hyperparams_Tuning", params, hyper_params_tuning(model))

2023-04-15 14:19:08.330910 Loading dataset.
2023-04-15 14:19:08.334950 Dataset loaded.
2023-04-15 14:19:08.334950 Creating training log template.
2023-04-15 14:19:08.334950 Training log template created.
2023-04-15 14:19:08.334950 Training model based on configuration data: Undersampling
2023-04-15 14:19:08.348914 Training model: RandomForestRegressor
2023-04-15 14:20:28.326864 Evaluating model: RandomForestRegressor
2023-04-15 14:20:28.341267 Logging: RandomForestRegressor
2023-04-15 14:20:28.370095 Model RandomForestRegressor has been trained for configuration data Undersampling.
2023-04-15 14:20:28.394446 Training model based on configuration data: Oversampling
2023-04-15 14:20:28.412117 Training model: RandomForestRegressor
2023-04-15 14:20:29.950587 Evaluating model: RandomForestRegressor
2023-04-15 14:20:29.956588 Logging: RandomForestRegressor
2023-04-15 14:20:29.979664 Model RandomForestRegressor has been trained for configuration data Oversampling.
2023-04-15 14:20:30.000665 T

In [21]:
model, production_model_log, training_logs = get_production_model(list_of_trained_model, training_log, params)

2023-04-15 14:20:50.309543 Choosing model by metrics score.
2023-04-15 14:20:50.310545 Converting training log type of data from dict to dataframe.
2023-04-15 14:20:50.311555 Trying to load previous production model.
2023-04-15 14:20:50.324551 Failed to load previous production model: 'charmap' codec can't decode byte 0x81 in position 114: character maps to <undefined>
2023-04-15 14:20:50.326540 Sorting training log by r2 macro avg and training time.
2023-04-15 14:20:50.328506 Searching model data based on sorted training log.
2023-04-15 14:20:50.348604 Model chosen.


## 6. Take a Look at Confusion Matrix

In [13]:
x_valid, y_valid = load_valid_feng(params)

In [14]:
y_pred = model["model_data"]["model_object"].predict(x_valid)

In [15]:
ConfusionMatrixDisplay.from_predictions(y_valid, y_pred)

ValueError: Unknown label type: (1642     0.831966
553      6.240909
1189     4.293156
1145    15.017377
965      1.396418
          ...    
992      2.426187
1433     1.467594
1601    10.622030
1277     3.062086
1950     2.127503
Name: EFConsPerCap, Length: 345, dtype: float64, array([ 1.64189472,  6.30147401,  2.13413277,  5.14406232,  2.30807866,
        3.97771209, 10.58480909,  1.6624049 ,  1.98284472,  0.92135653,
        1.4679926 ,  2.45073189,  7.21479844,  3.33425095,  5.71966222,
        9.78940817,  1.11951202,  1.60512817,  1.66998242,  1.44370112,
        4.89263718,  2.4995924 ,  4.85558378,  1.80765429,  1.64006374,
        7.52041564,  2.13143587,  6.5073347 ,  2.52077334,  1.6643239 ,
        0.91931713,  1.60526201,  2.76069943,  8.1447046 ,  3.7297187 ,
        1.14433117,  3.12229377,  0.58117691,  0.90968781,  2.1424704 ,
        0.97564222,  5.70250929,  2.38329385,  4.69143111,  1.31417459,
        1.1445893 ,  1.94814869,  6.34272353,  0.96978678,  6.19680845,
        1.43293805,  1.75130194,  0.6802435 ,  3.02119674,  3.73394675,
        0.9657133 ,  0.69384739,  1.12524062,  1.36270913,  1.20799645,
        3.56560493,  2.54746497,  1.21229285,  7.42299575,  5.45432758,
        2.41965436,  2.7170888 ,  0.83083389,  5.78381753,  0.94704813,
        3.12229377,  1.43055619,  7.29226105,  1.18172799,  1.96322841,
        2.94765816,  1.76670671,  3.18672042,  5.72795352,  3.28062827,
        4.89263718,  1.34133518,  2.91530623,  7.08622734,  2.50640465,
        0.89712635,  2.61260119,  5.98304428,  1.9067665 ,  5.7723028 ,
        8.83224593,  2.61889082, 10.80778678,  1.36197237,  6.19680845,
        5.31277088,  1.49084231,  5.76263881,  5.52742511,  1.6111836 ,
        1.11640929,  1.37888393,  1.6111836 ,  5.59475372,  1.11147778,
        2.88019331,  2.12246103,  1.25204235,  5.14406232,  5.45432758,
        1.62064134,  1.64078332,  1.20799645,  2.03998453,  5.76263881,
       11.8288133 ,  7.55707027,  1.69359952,  1.25442135,  2.14367889,
        1.11147778,  5.48919939,  1.97295378,  6.27619036,  0.9281833 ,
        3.33301411,  5.86444014,  6.75139305,  1.89990599,  3.02844556,
        2.77010611,  2.34057143,  2.07281282,  2.59518853,  2.20243836,
        5.39358172,  2.1909231 ,  2.24835585,  1.97412751,  2.90351501,
        2.60762743,  9.94666063,  7.55707027,  5.59780066,  1.16357161,
        2.38457425,  1.03521491,  1.20503413,  6.28023204,  4.14982368,
        1.21229285,  2.12246103,  2.48514903,  5.69180078,  2.24070607,
        2.35189993,  1.16305793,  4.43365022,  0.9663293 ,  0.97564222,
        0.91495166,  1.9802765 ,  2.19728638,  5.72795352,  0.96978678,
        2.68067988,  1.43055619,  5.87647502,  5.31782346,  0.85173862,
        5.9316496 ,  7.5934345 ,  0.8341973 ,  2.84624188,  8.1447046 ,
       10.84396207,  0.83067873,  1.63990185,  1.84977657,  3.59072849,
        6.73997544,  9.33085207,  1.51840554,  6.61181675,  7.5934345 ,
        2.74717187,  4.69143111,  8.68086327,  2.47756836,  2.05871168,
        1.02263218,  2.3097027 ,  1.62441698,  1.83559633,  5.32364474,
        0.90968781,  1.49567921,  5.45737827,  8.98259049,  5.90036015,
        1.56713123,  3.92289503,  2.60762743,  2.38329385,  6.327175  ,
        1.39889396,  1.66070906,  5.13418254,  1.7834883 ,  1.08769287,
        5.97966202,  3.83763027,  1.31417459,  5.48919939,  2.65381917,
        2.19431865,  1.32117837,  4.89263718, 10.49682228,  5.75252438,
        3.20757363,  6.80952214,  1.39413671,  5.22748967,  1.95288488,
        5.72795352,  1.18172799,  1.31417459,  1.00395551,  4.13127547,
        2.38934384,  2.23002633,  5.65140498,  2.17232943, 11.90042813,
        2.06572297,  2.74717187,  7.52041564,  0.70298976,  3.09369905,
       10.67942692,  2.12460872,  2.28838324,  6.95655408,  5.6281038 ,
        3.71046377,  4.85558378,  1.67284995,  2.36766112,  5.72198734,
        1.6485111 ,  1.0528694 ,  5.59780066,  1.82780242,  3.15333512,
        1.31603216,  1.12534064, 10.0324481 ,  2.04011644,  7.19700911,
        6.5073347 ,  0.97283367,  3.51100533,  1.26057808,  0.83083389,
        1.60512817,  1.4596551 ,  3.1123878 ,  4.23611949,  5.80695778,
        1.27110699,  2.02993837,  7.99406089,  2.13898415,  2.4216709 ,
        7.53575258,  2.12161535,  1.14433117,  1.54586856,  3.73394675,
        3.97304364,  2.0417353 ,  3.92289503,  3.612034  ,  2.83557762,
        2.15106204,  5.31782346,  1.03941309,  4.37777514,  1.27110699,
        5.85840447,  8.52619227,  1.78044315,  1.97604936,  1.21152569,
        7.19700911,  5.97966202,  6.73997544,  1.18571055,  0.90968781,
        3.13488386,  6.73997544,  1.66853846,  4.65161162,  1.00261982,
        2.94315455,  0.90425415,  1.31603216,  4.10054101,  2.28349677,
        3.22644849,  2.84594303, 13.57174038,  8.37677193,  4.89263718,
        1.03941309,  2.77655961,  7.93585119,  5.13418254,  1.07672724,
        2.15189212,  6.51315457,  1.60526201,  1.58542369,  2.55081207,
        2.13413277,  0.89588823,  1.7381054 ,  2.8446053 ,  1.57544902,
        5.52742511,  1.20799645,  1.95714038,  3.34209825,  5.84498092,
        0.69384739,  0.96684057,  1.70556371,  5.41951795,  7.66792277,
        2.20527088,  1.70960711, 10.67929376,  2.05335728,  2.2430925 ]))

In [None]:
ii