 #########################################################################################################################
## PHASE 6: Modeling

**Description:** In this phase, the data scientist selects a suitable model, trains it on the data, and tests its performance.

**Inputs:**

* Engineered or preprocessed data: This input is the data prepared in the previous phase.
* Model selection criteria: This input consists of the criteria used to select a suitable model, such as accuracy, interpretability, and scalability.

**Outputs:**

* Trained model: The output of this phase is the trained model that can be used to make predictions or generate insights.

 #########################################################################################################################
## PHASE 7: Model Evaluation

**Description:** In this phase, the data scientist evaluates the performance of the model and fine-tunes its parameters to improve its accuracy and robustness.

**Inputs:**

* Trained model: This input is the model trained in the previous phase.
    Evaluation criteria: This input consists of the criteria used to evaluate the performance of the model. Classification models use metrics such as: accuracy, precision, recall, F1 score and ROC-AUC. Regression models use metrics such as: mean squared error, mean absolute error, negative mean absolute error, negative mean absolute error, mean absolute percentile error, r2, explained variance score, etc.

**Outputs:**

* Improved model: The output of this phase is the model that has been fine-tuned to achieve better performance.

 #########################################################################################################################

In [1]:
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {
            'normalize': [True, False]
        }
    },
    'Lasso Regression': {
        'model': Lasso(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10]
        }
    },
    'Ridge Regression': {
        'model': Ridge(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10]
        }
    },
    'Elastic Net Regression': {
        'model': ElasticNet(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10],
            'l1_ratio': [0.25, 0.5, 0.75]
        }
    },
    'Support Vector Regression': {
        'model': SVR(),
        'params': {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': [0.1, 1, 10],
            'epsilon': [0.01, 0.1, 1]
        }
    },
    'Gradient Boosting Regression': {
        'model': GradientBoostingRegressor(),
        'params': {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.01, 0.1, 1],
            'max_depth': [3, 5, 7],
            'subsample': [0.5, 0.75, 1]
        }
    },
    'AdaBoost Regression': {
        'model': AdaBoostRegressor(),
        'params': {
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.01, 0.1, 1],
            'loss': ['linear', 'square', 'exponential']
        }
    },
    'XGBoost Regression': {
        'model': XGBRegressor(),
        'params': {
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.01, 0.001],
            'n_estimators': [50, 100, 150]
        }
    },
    'Random Forest Regression': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [3, 5, 7],
            'max_features': ['sqrt', 'log2']
        }
    },
    'Extra Trees Regression': {
        'model': ExtraTreesRegressor(),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [3, 5, 7],
            'max_features': ['sqrt', 'log2']
        }
    }
}

NameError: name 'LinearRegression' is not defined

In [4]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


# assume X_train, y_train, X_valid, y_valid, X_test, y_test are pandas dataframes
preprocessing_pipeline = Pipeline([
    # add your preprocessing steps here
])

X_train_transformed = preprocessing_pipeline.fit_transform(X_train, y_train)
X_valid_transformed = preprocessing_pipeline.transform(X_valid)
X_test_transformed = preprocessing_pipeline.transform(X_test)

ValueError: not enough values to unpack (expected 2, got 0)

## Define models dictionary

In [5]:
models = {}

# Linear Regression
models['Linear Regression'] = {
    'model': LinearRegression(),
    'params': {
        'normalize': [True, False]
    }
}

# Lasso Regression
models['Lasso Regression'] = {
    'model': Lasso(),
    'params': {
        'alpha': [0.1, 0.5, 1, 2, 5, 10],
        'normalize': [True, False],
        'max_iter': [1000, 5000, 10000]
    }
}

# Ridge Regression
models['Ridge Regression'] = {
    'model': Ridge(),
    'params': {
        'alpha': [0.1, 0.5, 1, 2, 5, 10],
        'normalize': [True, False],
        'max_iter': [1000, 5000, 10000]
    }
}

# Support Vector Regression
models['SVR'] = {
    'model': SVR(),
    'params': {
        'C': [0.1, 0.5, 1, 2, 5, 10],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto']
    }
}

# Random Forest Regression
models['Random Forest Regression'] = {
    'model': RandomForestRegressor(),
    'params': {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [3, 5, 7, 9],
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [1, 2, 3],
        'bootstrap': [True, False]
    }
}

# XGBoost Regression
models['XGBoost Regression'] = {
    'model': XGBRegressor(),
    'params': {
        'learning_rate': [0.05, 0.1, 0.15],
        'max_depth': [3, 5, 7, 9],
        'n_estimators': [50, 100, 150, 200],
        'objective': ['reg:squarederror']
    }
}

NameError: name 'XGBRegressor' is not defined

## Let's first try a models dictionary with 1 model and only 4 possible combinations of hyperparameters

In [None]:
models = {}
# XGBoost Regression
models['XGBoost Regression'] = {
    'model': XGBRegressor(),
    'params': {
        'learning_rate': [0.05],
        'max_depth': [3, 5, 7, 9],
        'n_estimators': [200],
        'objective': ['reg:squarederror']
    }
}

## Loop each model to train, tune, and retrain final model with best parameters and joined training+validation datasets

In [None]:
for model_name, model in models.items():
    gs = GridSearchCV(model['model'], model['params'], cv=5,
                      scoring='neg_mean_squared_error', return_train_score=True,
                      n_jobs=-1, verbose=1, refit=True)
    gs.fit(X_train_transformed, y_train,
           eval_set=[(X_valid_transformed, y_valid)],
           early_stopping_rounds=10)

    # Get the best estimator and store it
    best_model = gs.best_estimator_
    model['best_model'] = best_model

    # Join training and validation datasets for retraining best model with best parameters
    X_train_valid = pd.concat([X_train, X_valid], axis=0)
    y_train_valid = pd.concat([y_train, y_valid], axis=0)
    
    # Retrain best model with joined training+validation dataset using best hyperparameters
    best_model.fit(X_train_valid, y_train_valid)
    
    # Store the final best model retrained with joined training+validation dataset and best hyperparameters
    model['final_best_model'] = best_model
    
    # Evaluate the model on the test set and store the predictions and metric score
    y_test_pred = best_model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    model['test_predictions'] = y_test_pred
    model['test_mse'] = test_mse

## 

## Print the metrics yielded by each model in sorted order

In [None]:
# Print the test mean squared error for each model (not ordered)
for model_name, model in models.items():
    print(f'{model_name} test mean squared error: {model["test_mse"]:.2f}')

# Print the top models with the lowest test mean squared error (in ascending order)
sorted_models = sorted(models.items(), key=lambda x: x[1]['test_mse'])
print('\nTop Models:')
for i in range(len(models)):
    model_name, model = sorted_models[i]
    print(f'{i+1}: {model_name} test mean squared error: {model["test_mse"]:.2f}')