# ___

# [ Machine Learning in Geosciences ]

**Department of Applied Geoinformatics and Cartography, Charles University** 

*Lukas Brodsky lukas.brodsky@natur.cuni.cz*

    
___


# Machine Learning Project!

Step 5 and 6

Goal: This notebook demonstrates the **selected model fitting and fine-tuning** steps for machine learning algorithms.  

Content: **Select model and fine tune** the model

    5.1/ Select model, train it and evaluate
    5.2/ Cross-validate
    
    6.1/ Fine-tune the model with grid search
    6.2/ and randomized search
    
    6.3/ Analyze the best model
    6.4/ and evaluate it on the test data set. 
___    

## Setup environment

In [None]:
# Common imports
import numpy as np
import os

# add more based on the topic of the lab

# to make this notebook's output stable across runs
# (seeds NumPy's global random number generator)
np.random.seed(42)

# plotting: draw figures inline in the notebook and set default font sizes
# for axis labels and tick labels
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Path to the current lab directory - set this individually on your machine!
# TODO: point PROJECT_DIR at your own project folder.
PROJECT_DIR = "./"
# This only reports whether the directory exists; it does not stop the notebook.
if os.path.isdir(PROJECT_DIR): 
    print('Ok continue.')
else: 
    print('Nok, set correct path to your project directory!')


### Loading data

In [None]:
import pandas as pd

# directory that holds the forest fires data set (a sub-folder of PROJECT_DIR)
forest_path = os.path.join(PROJECT_DIR, "forest_fires")

# function to read the csv file 
def load_local_data(data_path, csv_file):
    """Read a CSV file from a local directory into a pandas DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that contains the file.
    csv_file : str
        File name of the CSV inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, read with the default ``pd.read_csv`` settings.
    """
    return pd.read_csv(os.path.join(data_path, csv_file))

# load the forest fires table from "forestfires.csv" inside forest_path
fires = load_local_data(forest_path, "forestfires.csv")

# show the header and the first rows as a quick sanity check
fires.head()

### Splitting data to train and test

In [None]:
# random split: hold out 30 % of the rows as the test set;
# the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(fires, test_size=0.3, random_state=42)

In [None]:
# inspect which container type the split returned for the training part
type(train_set)

In [None]:
# size of the training split (about 70 % of the rows, given test_size=0.3)
train_set.shape

In [None]:
# size of the held-out test split (about 30 % of the rows, given test_size=0.3)
test_set.shape

In [None]:
# convert the selected attributes to NumPy ndarrays
# X_train: the ten predictor columns as a float64 array
X_train = np.array(train_set[['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']], 
                   dtype=np.float64)
# y_train: the target column 'area' as a flat float64 vector
y_train = np.array(train_set[['area']].values.ravel(), dtype=np.float64) 

# .values gives the selected column as a NumPy array (shape: (n, 1))
# .ravel() then flattens that array to shape (n,), i.e. a 1-D target vector

In [None]:
# same conversion for the test split: identical predictor columns and target
X_test = np.array(test_set[['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']], 
                   dtype=np.float64)
y_test = np.array(test_set[['area']].values.ravel(), dtype=np.float64) 

In [None]:
# confirm the element type of the test features (requested as np.float64)
X_test.dtype

In [None]:
# confirm the element type of the test targets (requested as np.float64)
y_test.dtype

## Select and train a model 

### Decision tree (default param) 
Use a non-linear model - (assumption: the data relation is non-linear) 

In [None]:
from sklearn.tree import DecisionTreeRegressor

# all hyperparameters left at their defaults; random_state fixed for repeatable results
tree_reg = DecisionTreeRegressor(random_state=42)

In [None]:
# first fit with the default parameters, to check that everything works
tree_reg.fit(X_train, y_train)

In [None]:
# predict the target ('area') for the test samples
fires_predictions = tree_reg.predict(X_test)

In [None]:
# Select our metrics

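# list the names of the built-in scorers with:
# from sklearn.metrics import get_scorer_names
# sorted(get_scorer_names())
# (older scikit-learn versions exposed them as sorted(metrics.SCORERS.keys()))
# or see https://scikit-learn.org/stable/modules/model_evaluation.html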

In [None]:
# the first model evaluation (RMSE)
from sklearn.metrics import mean_squared_error

tree_mse = mean_squared_error(y_test, fires_predictions)
# the square root brings the error back to the units of the target
tree_rmse = np.sqrt(tree_mse)
print(round(tree_rmse, 2))

In [None]:
from sklearn.metrics import mean_absolute_error

# MAE of the default tree on the test set; this is the metric used for tuning below
tree_mae = mean_absolute_error(y_test, fires_predictions)
print(round(tree_mae, 2))

### Fine-tune the model!

In [None]:
from sklearn.model_selection import cross_val_score

# Decision Tree regressor: 10-fold cross-validation on the training data
scores = cross_val_score(tree_reg, X_train, y_train,
                         scoring="neg_mean_absolute_error", cv=10)
# the scorer returns the negated MAE (so that higher is better);
# flip the sign to get the ordinary, positive MAE per fold
tree_mae_scores = (-scores)

In [None]:
def display_scores(scores):
    """Print the mean and the standard deviation of cross-validation scores.

    Parameters
    ----------
    scores : array-like with ``mean()`` and ``std()`` methods
        Per-fold scores (here: MAE values), e.g. a NumPy array.

    Both statistics are rounded to two decimals before printing.
    """
    mean_mae = round(scores.mean(), 2)
    print("Mean MAE:", mean_mae)
    std_mae = round(scores.std(), 2)
    print("Standard deviation:", std_mae)

display_scores(tree_mae_scores)

### Grid Search

In [None]:
# Grid Search Cross-validation
from sklearn.model_selection import GridSearchCV

# the model hyper-parameters 
# help: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html 
# DecisionTreeRegressor(splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, ...)

param_grid = [
    # try more combinations of hyperparameters
    # 4 * 4 * 2 = 32 combinations
     {'max_depth': [3, 4, 5, 10], 'min_samples_split': [3, 4, 5, 10], 
     'splitter': ['random', 'best']}
  ]

# grid search application
tree_reg = DecisionTreeRegressor(random_state=42)

# train across 5 folds, that's a total of 32 * 5 = 160 rounds of training
# (plus one final refit of the best combination on the whole training set)
grid_search = GridSearchCV(tree_reg, param_grid, cv=5, scoring="neg_mean_absolute_error",
                            return_train_score=True)


In [None]:
# Beware: this step may take a long time!
grid_search.fit(X_train, y_train)

In [None]:
# hyperparameter combination with the best mean cross-validated score
grid_search.best_params_

In [None]:
# Let's look at the score of each hyperparameter combination tested during the grid search:

In [None]:
# grid_search.cv_results_

In [None]:
cvres = grid_search.cv_results_
# mean_test_score holds the negated MAE (scoring="neg_mean_absolute_error"),
# so negate it again to print the positive MAE next to each parameter set
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(-mean_score, params)
    
# TODO: store the model with min. MAE
# (grid_search.best_estimator_ already holds the model refitted with the best combination)

### Randomized Search

In [None]:
# Beware: this can run for a long time!

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# randint's upper bound is exclusive, so both parameters are drawn from the integers 3..9
param_distribs = {
        'max_depth': randint(low=3, high=10),
        'min_samples_split': randint(low=3, high=10),
    }

tree_reg = DecisionTreeRegressor(random_state=42)
# 10 sampled combinations, each scored with 10-fold CV = 100 rounds of training
# (plus one final refit of the best combination)
rnd_search = RandomizedSearchCV(tree_reg, param_distributions=param_distribs,
                                n_iter=10, cv=10, scoring="neg_mean_absolute_error", random_state=42) 

rnd_search.fit(X_train, y_train)

In [None]:
# the tree refitted with the best sampled hyperparameter combination
rnd_search.best_estimator_

In [None]:
cvres = rnd_search.cv_results_
# scores are negated MAE values; negate again to print the positive MAE per sampled combination
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(-mean_score, params)

In [None]:
# Select the best grid-search model and evaluate it with the test set!
sel_model = grid_search.best_estimator_
sel_predictions = sel_model.predict(X_test)
# test-set MAE, rounded to two decimals
print('MAE: {}'.format(round(mean_absolute_error(y_test,  sel_predictions), 2)))

In [None]:
# Select final model (best of the randomized search) and evaluate it with the test set!
# NOTE(review): sel_model / sel_predictions are rebound here, replacing the grid-search model.
# NOTE(review): choosing between the two tuned models by their test-set MAE would use the
# test set for model selection - compare them by cross-validation score instead.
sel_model = rnd_search.best_estimator_
sel_predictions = sel_model.predict(X_test)
print('MAE: {}'.format(round(mean_absolute_error(y_test,  sel_predictions), 2)))