In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [57]:
def tune_and_train_RF(X_train, y_train, n_iter: int = 100, random_state=None):
    """Tune a random-forest regressor with a randomized search.

    Candidate hyper-parameter combinations are drawn from a fixed grid and
    compared with 5-fold cross-validation, using the estimator's default
    scorer (no ``scoring`` is passed) and all available cores
    (``n_jobs=-1``).

    Args:
        X_train: Training features, passed unchanged to ``fit``.
        y_train: Training targets, passed unchanged to ``fit``.
        n_iter: Number of parameter settings the search samples.
        random_state: Seed forwarded to both the forest and the search so
            that repeated runs pick the same candidates and grow the same
            trees. ``None`` (the default) keeps the unseeded behaviour.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    param_dist = {
        # range(5, 500, 100) yields 5, 105, 205, 305, 405.
        'n_estimators': list(range(5, 500, 100)),
        # Depths 3..24, plus None for "no depth limit".
        'max_depth': list(range(3, 25)) + [None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    model = RandomForestRegressor(random_state=random_state)

    search = RandomizedSearchCV(
        model,
        param_dist,
        cv=5,
        n_iter=n_iter,
        n_jobs=-1,
        random_state=random_state,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_

def evaluate_RF(model, X_test, y_test):
    """Print regression metrics for ``model`` and return its predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features handed to ``model.predict``.
        y_test: True target values for ``X_test``.

    Returns:
        A DataFrame with the columns ``y_true``, ``y_pred`` and
        ``residuals`` (``y_test - y_pred``).
    """
    predictions = model.predict(X_test)

    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    # Label/value pairs in the order they are reported.
    report = (
        ('R2 Score', r2),
        ('Mean Absolute Error', mae),
        ('Mean Squared Error', mse),
        ('Root Mean Squared Error', np.sqrt(mse)),
    )
    for label, value in report:
        print(f'{label}: {value}')

    residuals = y_test - predictions
    return pd.DataFrame({
        'y_true': y_test,
        'y_pred': predictions,
        'residuals': residuals,
    })
    

In [36]:
class DataLoader:
    """Load a CSV dataset and prepare train/test feature matrices.

    The intended call order is ``load_data`` -> ``train_test_split`` ->
    ``normalization``. Each step stores its results as attributes on the
    instance; the later steps raise ``ValueError`` when run too early.
    Re-running an earlier step clears the results of the later ones so that
    stale arrays from a previous split can never be paired with new targets.
    """

    def __init__(self, data_path: str, target: str):
        """Remember the data location and target column; nothing is read yet.

        Args:
            data_path: Location handed to ``pandas.read_csv``.
            target: Name of the column to predict.
        """
        self.data_path = data_path
        self.target = target
        self.df = None
        self.X = None
        self.y = None

        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        self.scaler = None
        self.X_train_norm = None
        self.X_test_norm = None

    def _split_data(self):
        """Separate the loaded frame into features ``X`` and target ``y``."""
        self.X = self.df.drop(self.target, axis=1)
        self.y = self.df[self.target]

    def _reset_normalization(self):
        """Forget the fitted scaler and the scaled feature arrays."""
        self.scaler = None
        self.X_train_norm = None
        self.X_test_norm = None

    def _reset_split(self):
        """Forget the train/test partition and everything derived from it."""
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self._reset_normalization()

    def load_data(self, index_col: int = None):
        """Read the CSV at ``data_path`` and populate ``df``, ``X`` and ``y``.

        Args:
            index_col: Forwarded to ``pandas.read_csv`` as ``index_col``.
        """
        self.df = pd.read_csv(self.data_path, index_col=index_col)
        self._split_data()
        # A split made on previously loaded data no longer matches X / y.
        self._reset_split()

    def train_test_split(self, test_size: float = 0.2, random_state=None):
        """Partition ``X`` / ``y`` into train and test subsets.

        Args:
            test_size: Forwarded to scikit-learn's ``train_test_split``.
            random_state: Seed forwarded to scikit-learn's
                ``train_test_split`` for a reproducible partition. ``None``
                (the default) keeps the unseeded behaviour.

        Raises:
            ValueError: If ``load_data`` has not been called yet.
        """
        if self.X is None or self.y is None:
            raise ValueError("X and y should not be None. Please load data first.")
        # Inside a method body this name resolves to the module-level
        # scikit-learn function, not to this method.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state
        )
        # Scaled arrays from an earlier split would not line up with the new
        # y_train / y_test, so drop them until normalization() is re-run.
        self._reset_normalization()

    def normalization(self, scaling_method: str = 'MinMax'):
        """Scale the features, fitting the scaler on the training rows only.

        Args:
            scaling_method: ``'MinMax'`` for ``MinMaxScaler`` or
                ``'Standard'`` for ``StandardScaler``.

        Raises:
            ValueError: If the data has not been split yet, or if
                ``scaling_method`` is not one of the two supported names.
        """
        if self.X_train is None or self.X_test is None:
            raise ValueError("X_train and X_test should not be None. Please split data first.")
        if scaling_method == 'MinMax':
            self.scaler = MinMaxScaler()
        elif scaling_method == 'Standard':
            self.scaler = StandardScaler()
        else:
            raise ValueError("scaling_method should be either 'MinMax' or 'Standard'.")

        # Fit on train only, then reuse the fitted scaler for the test rows.
        self.X_train_norm = self.scaler.fit_transform(self.X_train)
        self.X_test_norm = self.scaler.transform(self.X_test)

In [45]:
class Modelling(DataLoader):
    """``DataLoader`` that can additionally hold on to a fitted estimator."""

    def __init__(self, data_path: str, target: str):
        """Initialise the inherited loader state with no estimator attached.

        Args:
            data_path: Forwarded to ``DataLoader.__init__``.
            target: Forwarded to ``DataLoader.__init__``.
        """
        super().__init__(data_path=data_path, target=target)
        self.estimator = None

    def get_trained_model(self, model: object):
        """Store ``model`` on the instance as ``self.estimator``.

        Despite the ``get_`` prefix, nothing is returned; the method only
        records the object it is given.
        """
        self.estimator = model

In [54]:
# Path to the raw dataset, relative to the notebook's working directory.
FILENAME = './data/carotenoid_production.csv'

# Prepare the data: read the CSV, split it with test_size=0.2, then min-max
# scale the features (the scaler is fitted on the training rows only).
m = Modelling(data_path=FILENAME, target='prod')
m.load_data()
m.train_test_split(0.2)
m.normalization('MinMax')

# Randomized hyper-parameter search on the scaled training set; `rf` is the
# best estimator it returns.
rf = tune_and_train_RF(X_train=m.X_train_norm, y_train=m.y_train)

In [62]:
# Score the tuned forest on the held-out split: prints the metrics and keeps
# the per-sample predictions and residuals in `results_rf`.
results_rf = evaluate_RF(model=rf, X_test=m.X_test_norm, y_test=m.y_test)

R2 Score: 0.3926841360020088
Mean Absolute Error: 1.1195310161847074
Mean Squared Error: 2.257247412005278
Root Mean Squared Error: 1.5024138617588956
