In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import joblib
# Restore the arrays that were persisted with %store in another notebook/kernel.
# NOTE: keep comments on their own lines; text after a %store argument would be
# parsed as part of the magic's arguments.
%store -r X_features_full
%store -r X_features_poly_full
%store -r y_labels_full

# Short aliases used by every cell below. Each object is indexed with [0] and
# [1] later on; [0] is what models are fitted on and [1] is what they are
# scored on (presumably train / validation splits -- confirm in the notebook
# that stored them).
X_features = X_features_full
X_features_poly = X_features_poly_full
y_labels = y_labels_full
from sklearn.metrics import mean_absolute_percentage_error


In [2]:
from sklearn.tree import DecisionTreeRegressor
# Not used in this cell; the import is kept in case later cells rely on it.
from sklearn.metrics import mean_squared_error

# Baseline model: a single decision tree fitted on X_features[0] and scored on
# X_features[1]. random_state is pinned so that Restart & Run All reproduces
# the same tree (scikit-learn permutes candidate features at every split, so
# ties between equally good splits are otherwise broken randomly).
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_features[0], y_labels[0])

# Score in the original target space: both the labels and the predictions are
# mapped back with exp(t**2) - 0.0012 (presumably the inverse of the transform
# applied when the labels were built -- confirm against the preprocessing).
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012
visc_predictions = np.exp(np.power(tree_reg.predict(X_features[1]), 2)) - 0.0012

# The metric is a mean absolute percentage error, returned as a fraction
# (0.05 == 5 %), not a mean squared error.
tree_mape = mean_absolute_percentage_error(visc_true, visc_predictions)

# Legacy names kept so any cell that still reads them keeps working.
# NOTE: tree_mse holds a MAPE, and tree_rmse is the square root of a MAPE,
# which is not an RMSE -- prefer tree_mape in new code.
tree_mse = tree_mape
tree_rmse = np.sqrt(tree_mse)
tree_mape

0.05333497005753779

In [3]:
# Persist the fitted baseline tree to the working directory; joblib.dump
# returns the list of file names it wrote, which becomes the cell output.
joblib.dump(value=tree_reg, filename="tree_regressor.pkl")

['tree_regressor.pkl']

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest (100 trees) fitted on X_features[0] and scored on
# X_features[1]. Bootstrapping and feature sub-sampling are stochastic, so
# random_state is pinned to make the reported score reproducible.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_features[0], y_labels[0])

# Score in the original target space: both the labels and the predictions are
# mapped back with exp(t**2) - 0.0012 (presumably the inverse of the transform
# applied when the labels were built -- confirm against the preprocessing).
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012
visc_predictions = np.exp(np.power(forest_reg.predict(X_features[1]), 2)) - 0.0012

# The metric is a mean absolute percentage error, returned as a fraction
# (0.05 == 5 %), not a mean squared error.
forest_mape = mean_absolute_percentage_error(visc_true, visc_predictions)

# Legacy names kept so any cell that still reads them keeps working.
# NOTE: forest_mse holds a MAPE, and forest_rmse is the square root of a MAPE,
# which is not an RMSE -- prefer forest_mape in new code.
forest_mse = forest_mape
forest_rmse = np.sqrt(forest_mse)
forest_mape

0.03582726686915107

In [5]:
# Persist the fitted baseline forest to the working directory; joblib.dump
# returns the list of file names it wrote, which becomes the cell output.
joblib.dump(value=forest_reg, filename="forest_regressor.pkl")

['forest_regressor.pkl']

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit

# Build ONE fixed train/validation split instead of k-fold CV.
# PredefinedSplit semantics: test_fold == -1 keeps a row out of every test
# fold (it is always trained on); test_fold == 0 puts the row in test fold 0.
# So rows of X_features[0] are always training rows and rows of X_features[1]
# form the single validation fold.
split_index = [-1]*len(X_features[0]) + [0]*len(X_features[1])
X = np.concatenate((X_features[0], X_features[1]), axis=0)
y = np.concatenate((y_labels[0], y_labels[1]), axis=0)
pds = PredefinedSplit(test_fold=split_index)

param_grid = [
    # bootstrap left at its default: 3 x 2 = 6 candidates
    {'n_estimators': [50, 100, 150], 'max_features': [4, 5]},
    # same values with bootstrapping disabled: 6 more candidates
    {'bootstrap': [False], 'n_estimators': [50, 100, 150], 'max_features': [4, 5]},
]

# NOTE: this rebinds forest_reg, replacing any earlier model held under that name.
# random_state is pinned so candidate scores are comparable and reproducible.
forest_reg = RandomForestRegressor(random_state=42)

# pds yields exactly one split, so the search performs 12 candidates x 1 split
# = 12 fits, followed by one refit of the best candidate on ALL of X, y
# (GridSearchCV's default refit behaviour). best_estimator_ has therefore seen
# the validation rows; scoring it on X_features[1] afterwards is optimistic.
# Candidates are ranked by MSE on the labels as stored (y is not mapped back to
# the original target space), unlike the MAPE reported for the baseline models.
grid_search = GridSearchCV(forest_reg, param_grid, cv=pds,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X, y)

In [7]:
# Display the winning forest configuration. best_estimator_ is the model that
# grid_search refit on the full X, y passed to fit(), not only the training rows.
grid_search.best_estimator_

In [8]:
# Persist the grid-search winner to the working directory; joblib.dump
# returns the list of file names it wrote, which becomes the cell output.
joblib.dump(value=grid_search.best_estimator_, filename="optimized_forest_regressor.pkl")

['optimized_forest_regressor.pkl']

In [9]:
# PredefinedSplit is imported here as well so this cell does not depend on an
# earlier cell having been run just for its import.
from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from scipy.stats import randint

# Same single fixed split as the grid search: rows of X_features[0] are marked
# -1 (always trained on), rows of X_features[1] form validation fold 0.
split_index = [-1]*len(X_features[0]) + [0]*len(X_features[1])
X = np.concatenate((X_features[0], X_features[1]), axis=0)
y = np.concatenate((y_labels[0], y_labels[1]), axis=0)
pds = PredefinedSplit(test_fold=split_index)

# scipy.stats.randint samples from [low, high): the upper bound is EXCLUSIVE.
# The previous randint(low=5, high=6) therefore always produced 5, so
# max_features was never actually searched. The range below covers {4, 5},
# the same values the grid search explores.
param_distribs = {
        'n_estimators': randint(low=50, high=300),   # 50 .. 299
        'max_features': randint(low=4, high=6),      # 4 or 5
    }

# NOTE: this rebinds forest_reg, replacing any earlier model held under that name.
# Both the forest and the sampler are seeded so the 50 draws, and the model
# fitted for each draw, are reproducible under Restart & Run All.
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=50, cv=pds, scoring='neg_mean_squared_error',
                                random_state=42)
# 50 sampled candidates x 1 predefined split = 50 fits, plus the final refit of
# the best candidate on all of X, y.
rnd_search.fit(X, y)

In [10]:
# Display the winning forest configuration. best_estimator_ is the model that
# rnd_search refit on the full X, y passed to fit(), not only the training rows.
rnd_search.best_estimator_

In [11]:
# Persist the randomized-search winner to the working directory; joblib.dump
# returns the list of file names it wrote, which becomes the cell output.
joblib.dump(value=rnd_search.best_estimator_, filename="_randomly_optimized_forest_regressor.pkl")

['_randomly_optimized_forest_regressor.pkl']

In [12]:
# Decision tree on the polynomial feature set: fitted on X_features_poly[0],
# scored on X_features_poly[1]. random_state is pinned so that Restart & Run
# All reproduces the same tree (ties between equally good splits are otherwise
# broken randomly).
poly_tree_reg = DecisionTreeRegressor(random_state=42)
poly_tree_reg.fit(X_features_poly[0], y_labels[0])

# Score in the original target space: both the labels and the predictions are
# mapped back with exp(t**2) - 0.0012 (presumably the inverse of the transform
# applied when the labels were built -- confirm against the preprocessing).
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012
visc_predictions = np.exp(np.power(poly_tree_reg.predict(X_features_poly[1]), 2)) - 0.0012

# The metric is a mean absolute percentage error, returned as a fraction
# (0.05 == 5 %), not a mean squared error.
poly_tree_mape = mean_absolute_percentage_error(visc_true, visc_predictions)

# Legacy names kept so any cell that still reads them keeps working.
# NOTE: poly_tree_mse holds a MAPE, and poly_tree_rmse is the square root of a
# MAPE, which is not an RMSE -- prefer poly_tree_mape in new code.
poly_tree_mse = poly_tree_mape
poly_tree_rmse = np.sqrt(poly_tree_mse)
poly_tree_mape

0.05154851003161185

In [13]:
# Persist the polynomial-feature tree to the working directory; joblib.dump
# returns the list of file names it wrote, which becomes the cell output.
joblib.dump(value=poly_tree_reg, filename="poly_tree_regressor.pkl")

['poly_tree_regressor.pkl']

In [14]:
# Random forest (100 trees) on the polynomial feature set: fitted on
# X_features_poly[0], scored on X_features_poly[1]. Bootstrapping and feature
# sub-sampling are stochastic, so random_state is pinned to make the reported
# score reproducible.
poly_forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
poly_forest_reg.fit(X_features_poly[0], y_labels[0])

# Score in the original target space: both the labels and the predictions are
# mapped back with exp(t**2) - 0.0012 (presumably the inverse of the transform
# applied when the labels were built -- confirm against the preprocessing).
visc_true = np.exp(np.power(y_labels[1], 2)) - 0.0012
visc_predictions = np.exp(np.power(poly_forest_reg.predict(X_features_poly[1]), 2)) - 0.0012

# The metric is a mean absolute percentage error, returned as a fraction
# (0.05 == 5 %), not a mean squared error.
poly_forest_mape = mean_absolute_percentage_error(visc_true, visc_predictions)

# Legacy names kept so any cell that still reads them keeps working.
# NOTE: poly_forest_mse holds a MAPE, and poly_forest_rmse is the square root
# of a MAPE, which is not an RMSE -- prefer poly_forest_mape in new code.
poly_forest_mse = poly_forest_mape
poly_forest_rmse = np.sqrt(poly_forest_mse)
poly_forest_mape

0.03604442899722421

In [15]:
# Persist the polynomial-feature forest to the working directory; joblib.dump
# returns the list of file names it wrote, which becomes the cell output.
joblib.dump(value=poly_forest_reg, filename="poly_forest_regressor.pkl")

['poly_forest_regressor.pkl']