In [7]:
# imports: this cell will have all the imports used in this notebook
import random
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import bernoulli
from bitstring import BitArray

# sklearn imports
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split as split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

# sklearn genetics
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Integer, Categorical, Continuous
from sklearn_genetic.plots import plot_fitness_evolution, plot_search_space
from sklearn_genetic.callbacks import LogbookSaver, ProgressBar
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor

# importing the minmaxscaler to normalize data between 0 and 1
from sklearn.preprocessing import MinMaxScaler

# keres imports
from keras.layers import LSTM, Input, Dense, Dropout, Activation
from keras.models import Model
from keras.models import Sequential

# Deap for genetic algorithm imports
from deap import base, creator, tools, algorithms

# seeding to get reproducible results with Keras and numpy
from numpy.random import seed
import tensorflow

seed(1)
tensorflow.random.set_seed(2)

%matplotlib inline

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Global matplotlib styling. External TeX rendering is switched OFF here
# (labels go through matplotlib's own text engine); the effective font
# family is sans-serif/Helvetica, with cmr10 registered as the serif face.
plt.rcParams.update({
    "text.usetex": False,
    "font.serif": ["cmr10"],
    "font.family": "sans-serif",
    "font.sans-serif": ["Helvetica"],
    "font.size": 15.0,
    "font.weight": "normal",
    "legend.fontsize": 12.0,
    "axes.grid": False,
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

In [None]:
# Load the model inputs from CSV into a DataFrame and preview the first ten rows.
df_inp = pd.read_csv(filepath_or_buffer="inputs_sample_m.csv")
df_inp.head(10)

In [None]:
# Load the model outputs (regression targets) from CSV and preview ten rows.
df_out = pd.read_csv(filepath_or_buffer="output_sample_m.csv")
df_out.head(10)

In [None]:
# One histogram per input column, to eyeball the raw feature distributions.
lst = df_inp.columns.tolist()
df_inp[lst].hist(figsize=(15, 10))

In [None]:
%matplotlib inline
from subprocess import check_output
%config Completer.use_jedi = False

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import  mean_squared_error, r2_score

Data cleaning process

In [None]:
# Per input column: does it contain at least one missing value?
df_inp.isnull().any()

In [None]:
# Per input column: number of missing values.
df_inp.isnull().sum()

In [None]:
# Per output column: does it contain at least one missing value?
df_out.isnull().any()

In [None]:
# Per output column: number of missing values.
df_out.isnull().sum()

In [None]:
# Number of input rows that exactly repeat an earlier row.
df_inp.duplicated(keep="first").sum()

In [None]:
# Number of output rows that exactly repeat an earlier row.
df_out.duplicated(keep="first").sum()

In [None]:
# Drop incomplete samples from inputs and outputs *together*.
# The two frames are later used as X and y for the same train/test split, so
# row i of one must keep describing the same sample as row i of the other.
# Filtering each frame on its own NaNs would remove different rows from each
# and silently shift the pairing (or leave the frames with unequal lengths).
# Assumes both CSVs list the samples in the same order -- TODO confirm.
rows_complete = df_inp.notna().all(axis=1) & df_out.notna().all(axis=1)
df_inp = df_inp.loc[rows_complete]
df_out = df_out.loc[rows_complete]

In [None]:
# Re-check after cleaning: missing values remaining per input column.
df_inp.isnull().sum()

In [None]:
# Re-check after cleaning: missing values remaining per output column.
df_out.isnull().sum()

In [None]:
# Feature table and target table used by the modelling cells
# (plain aliases of the cleaned frames, not copies).
X, y_opt = df_inp, df_out

In [None]:
# (rows, columns) of the feature table, then of the target table.
tuple(frame.shape for frame in (X, y_opt))

In [None]:
# Candidate feature scalings, each fitted on the full cleaned input table.
# They are built from df_inp (the frame X was assigned from) rather than from
# X itself: X is rebound to the selected scaled array at the end of this cell,
# so reading X here would feed already-scaled data back in on a re-run and
# mislabel it as 'Unscaled data'.
# NOTE(review): the scalers see every row before the train/test split is made,
# so test-set statistics leak into the features -- consider fitting on the
# training rows only.
distributions = [
    ('Unscaled data', df_inp),
    ('Data after standard scaling',
        StandardScaler().fit_transform(df_inp)),
    ('Data after min-max scaling',
        MinMaxScaler().fit_transform(df_inp)),
    ('Data after max-abs scaling',
        MaxAbsScaler().fit_transform(df_inp)),
    ('Data after robust scaling',
        RobustScaler(quantile_range=(25, 75)).fit_transform(df_inp)),
    ('Data after power transformation (Yeo-Johnson)',
     PowerTransformer(method='yeo-johnson').fit_transform(df_inp)),
#     ('Data after power transformation (Box-Cox)',
#      PowerTransformer(method='box-cox').fit_transform(df_inp)),
    ('Data after quantile transformation (uniform pdf)',
        QuantileTransformer(output_distribution='uniform')
        .fit_transform(df_inp)),
    ('Data after quantile transformation (gaussian pdf)',
        QuantileTransformer(output_distribution='normal')
        .fit_transform(df_inp)),
    ('Data after sample-wise L2 normalizing',
        Normalizer().fit_transform(df_inp)),
]

# scale the output between 0 and 1 for the colorbar
# NOTE(review): `y` is not read by any cell visible in this notebook; the
# train/test split uses the unscaled `y_opt`, although later axis labels say
# "Scaled" -- confirm which target was intended.
y = minmax_scale(y_opt)

# Index into `distributions` of the scaling used for modelling
# (1 = standard scaling). From here on X is the transformed array.
item_idx = 1
title, X = distributions[item_idx]
print(title)

Splitting into testing and training sets

In [None]:
# Hold out 33% of the samples for testing.
# random_state is pinned so this cell yields the same partition every time it
# runs, independent of how much of the global NumPy RNG earlier cells used.
X_train, X_test, y_train, y_test = split(X, y_opt, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Using GA (genetic-algorithm hyperparameter search with GASearchCV) on a random forest

In [None]:
# Random-forest regressor whose hyperparameters are tuned by GASearchCV.
pipe = RandomForestRegressor()

# Search space: one entry per RandomForestRegressor parameter to evolve.
param_grid = dict(
    min_weight_fraction_leaf=Continuous(0.01, 0.5, distribution='log-uniform'),
    bootstrap=Categorical([True, False]),
    max_depth=Integer(2, 30),
    max_leaf_nodes=Integer(2, 35),
    n_estimators=Integer(100, 300),
)

# Evolution settings, collected in one place and unpacked into GASearchCV.
ga_options = dict(
    cv=10,
    scoring="r2",
    population_size=15,
    generations=20,
    tournament_size=3,
    elitism=True,
    keep_top_k=4,
    crossover_probability=0.9,
    mutation_probability=0.05,
    criteria="max",
    algorithm="eaMuCommaLambda",
    n_jobs=-1,
)
evolved_estimator = GASearchCV(estimator=pipe, param_grid=param_grid, **ga_options)

# Callbacks: save the logbook to disk and show a progress bar while fitting.
callbacks = [
    LogbookSaver(checkpoint_path="./logbook.pkl"),
    ProgressBar(),
]

# Run the search on the training split, then score on the held-out split.
evolved_estimator.fit(X_train, y_train, callbacks=callbacks)
y_predict_ga = evolved_estimator.predict(X_test)
r_squared = r2_score(y_test, y_predict_ga)

print(evolved_estimator.best_params_)
print("r-squared: ", f"{r_squared:.2f}")
print("Best k solutions: ", evolved_estimator.hof)

In [None]:
# Plot how the "fitness" metric evolved during the search fitted above.
plot = plot_fitness_evolution(
    evolved_estimator,
    metric="fitness",
)

In [None]:
# Disable external TeX rendering, then draw the sampled hyperparameter space.
plt.rcParams["text.usetex"] = False
plot_search_space(evolved_estimator)

In [None]:
# Quick parity check: held-out targets against the tuned model's predictions,
# zoomed to the hard-coded axis window below.
parity_ax = plt.figure(figsize=[6,5]).add_subplot(111)
parity_ax.scatter(y_test, y_predict_ga, s=90)
parity_ax.set_xlim(left=0.86)
parity_ax.set_ylim(bottom=0.88, top=0.96)

In [None]:
# Two-panel summary of the tuned model on the held-out split:
# left = parity scatter titled with R^2, right = residual histogram with a
# fitted normal curve, titled with the RMSE.
# NOTE: usetex=True needs a working LaTeX installation and stays in effect for
# later cells until it is switched off again.
plt.rc("text", usetex=True)
import scipy
from scipy.stats import norm  # `norm` was used below without being imported

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[8,5], dpi=80)

fig.suptitle(r'Model Performance of Hydrogen')
ax1.scatter(y_test, y_predict_ga, s=120, alpha=0.5)
ax1.set_xlim(left=0.86)
ax1.set_ylim(bottom=0.88, top=0.96)
ax1.set_xlabel(r'Scaled Actual Output')
ax1.set_ylabel(r'Scaled Predicted Output')
ax1.set_title(r'$R^2 = {:.4g}$'.format(r2_score(y_test, y_predict_ga)))

# Residuals as one flat array. y_test comes out of the split of a DataFrame
# while the predictions are a NumPy array; subtracting them directly makes
# pandas try to align the array with the frame's columns, which fails or
# mis-broadcasts. Flattening both sides gives an element-wise difference.
data_res = np.asarray(y_test).ravel() - np.asarray(y_predict_ga).ravel()

# best fit of data (only residuals inside the plotted window, > -0.08)
(mu, sigma) = norm.fit(data_res[data_res > -0.08])

# the histogram of the data
n, bins, patches = ax2.hist(data_res, 30, density=True, alpha=0.5)

# add a 'best fit' line, drawn explicitly on the histogram axes
best_fit_line = norm.pdf(bins, mu, sigma)
l = ax2.plot(bins, best_fit_line, 'r--', linewidth=2)


ax2.set_xlim(left=-0.08)
ax2.set_xlabel(r'Residual')
ax2.set_ylabel(r'Frequency')
# RMSE as sqrt(MSE): the `squared=False` keyword is absent from recent
# scikit-learn releases, and this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_predict_ga))
ax2.set_title(r'$\sigma = {:.4f}$'.format(rmse))

fig.tight_layout()

MLP (MLPRegressor tuned with the same genetic-algorithm search)

In [None]:
from sklearn import neural_network

# Multi-layer perceptron regressor tuned with the same GASearchCV set-up.
pipe = neural_network.MLPRegressor()

# Search space: one entry per MLPRegressor parameter to evolve.
# NOTE(review): `random_state` is part of the search space, so the GA also
# picks the network's seed -- confirm this is intended rather than fixing it.
param_grid = dict(
    solver=Categorical(['lbfgs', 'sgd', 'adam']),
    max_iter=Integer(500,1500),
    alpha=Continuous(10.0**(-7), 10.0**(-1), distribution='log-uniform'),
    hidden_layer_sizes=Integer(5, 12),
    random_state=Integer(0, 9),
)

# Evolution settings, collected in one place and unpacked into GASearchCV.
ga_options = dict(
    cv=10,
    scoring="r2",
    population_size=15,
    generations=20,
    tournament_size=3,
    elitism=True,
    keep_top_k=4,
    crossover_probability=0.9,
    mutation_probability=0.05,
    criteria="max",
    algorithm="eaMuCommaLambda",
    n_jobs=-1,
)
evolved_estimator = GASearchCV(estimator=pipe, param_grid=param_grid, **ga_options)

# Callbacks: save the logbook to disk and show a progress bar while fitting.
callbacks = [
    LogbookSaver(checkpoint_path="./logbook.pkl"),
    ProgressBar(),
]

# Run the search on the training split, then score on the held-out split.
evolved_estimator.fit(X_train, y_train, callbacks=callbacks)
y_predict_ga = evolved_estimator.predict(X_test)
r_squared = r2_score(y_test, y_predict_ga)

print(evolved_estimator.best_params_)
print("r-squared: ", f"{r_squared:.2f}")
print("Best k solutions: ", evolved_estimator.hof)

In [None]:
# Plot how the "fitness" metric evolved during the MLP search fitted above.
plot = plot_fitness_evolution(
    evolved_estimator,
    metric="fitness",
)

In [None]:
# Disable external TeX rendering, then draw the sampled hyperparameter space.
plt.rcParams["text.usetex"] = False
plot_search_space(evolved_estimator)

In [None]:
# Quick parity check for the MLP: held-out targets against predictions,
# zoomed to the hard-coded axis window below.
parity_ax = plt.figure(figsize=[6,5]).add_subplot(111)
parity_ax.scatter(y_test, y_predict_ga, s=90)
parity_ax.set_xlim(left=0.86)
parity_ax.set_ylim(bottom=0.88, top=0.96)

In [None]:
# Two-panel summary of the tuned MLP on the held-out split:
# left = parity scatter titled with R^2, right = residual histogram with a
# fitted normal curve, titled with the RMSE.
plt.rc("text", usetex=False)
import scipy
from scipy.stats import norm  # `norm` was used below without being imported

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[8,5], dpi=80)

fig.suptitle(r'Model Performance of Hydrogen')
ax1.scatter(y_test, y_predict_ga, s=120, alpha=0.5)
ax1.set_xlim(left=0.86)
ax1.set_ylim(bottom=0.88, top=0.96)
ax1.set_xlabel(r'Scaled Actual Output')
ax1.set_ylabel(r'Scaled Predicted Output')
ax1.set_title(r'$R^2 = {:.4g}$'.format(r2_score(y_test, y_predict_ga)))

# Residuals as one flat array. y_test comes out of the split of a DataFrame
# while the predictions are a NumPy array; subtracting them directly makes
# pandas try to align the array with the frame's columns, which fails or
# mis-broadcasts. Flattening both sides gives an element-wise difference.
data_res = np.asarray(y_test).ravel() - np.asarray(y_predict_ga).ravel()

# best fit of data (only residuals inside the plotted window, > -0.08)
(mu, sigma) = norm.fit(data_res[data_res > -0.08])

# the histogram of the data
n, bins, patches = ax2.hist(data_res, 30, density=True, alpha=0.5)

# add a 'best fit' line, drawn explicitly on the histogram axes
best_fit_line = norm.pdf(bins, mu, sigma)
l = ax2.plot(bins, best_fit_line, 'r--', linewidth=2)


ax2.set_xlim(left=-0.08)
ax2.set_xlabel(r'Residual')
ax2.set_ylabel(r'Frequency')
# RMSE as sqrt(MSE): the `squared=False` keyword is absent from recent
# scikit-learn releases, and this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_predict_ga))
ax2.set_title(r'$\sigma = {:.4f}$'.format(rmse))

fig.tight_layout()

In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regressor tuned with the same GASearchCV set-up.
pipe = BayesianRidge()

# BayesianRidge's parameter names changed between scikit-learn releases:
# `n_iter` was renamed to `max_iter`, and `normalize` was removed. Ask the
# installed estimator which names it accepts so the search space is valid on
# both old and new versions instead of failing on an unknown parameter.
supported_params = pipe.get_params()
iteration_param = 'max_iter' if 'max_iter' in supported_params else 'n_iter'

param_grid = {iteration_param: Integer(300, 500),
              'tol': Continuous(0.0001, 0.1),
              'alpha_1': Continuous(1.0e-7, 1e-1),
              'alpha_2': Continuous(1.0e-7, 1e-1),
              'lambda_1': Continuous(1.0e-7, 1e-1),
              'lambda_2': Continuous(1.0e-7, 1e-1)}
if 'normalize' in supported_params:
    # Only searched where the installed scikit-learn still offers the option.
    param_grid['normalize'] = Categorical([False, True])

# Create the CV strategy.
# random_state is pinned: with shuffle=True and no seed, every call to
# cv.split() reshuffles, so each GA candidate would be scored on different
# folds and their fitness values would not be comparable.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Define the GASearchCV options
evolved_estimator = GASearchCV(
    estimator=pipe,
    cv=cv,
    scoring="r2",
    population_size=15,
    generations=20,
    tournament_size=3,
    elitism=True,
    keep_top_k=4,
    crossover_probability=0.9,
    mutation_probability=0.05,
    param_grid=param_grid,
    criteria="max",
    algorithm="eaMuCommaLambda",
    n_jobs=-1)

# Callbacks: save the logbook to disk and show a progress bar while fitting.
callbacks = [LogbookSaver(checkpoint_path="./logbook.pkl"), ProgressBar()]

# Fit the model and see some results
evolved_estimator.fit(X_train, y_train, callbacks=callbacks)
y_predict_ga = evolved_estimator.predict(X_test)
r_squared = r2_score(y_test, y_predict_ga)

print(evolved_estimator.best_params_)
print("r-squared: ", "{:.2f}".format(r_squared))
print("Best k solutions: ", evolved_estimator.hof)

In [None]:
# Plot how the "fitness" metric evolved during the BayesianRidge search above.
plot = plot_fitness_evolution(
    evolved_estimator,
    metric="fitness",
)

In [None]:
# Disable external TeX rendering, then draw the sampled hyperparameter space.
plt.rcParams["text.usetex"] = False
plot_search_space(evolved_estimator)

In [None]:
# Quick parity check for BayesianRidge: held-out targets against predictions,
# zoomed to the hard-coded axis window below.
parity_ax = plt.figure(figsize=[6,5]).add_subplot(111)
parity_ax.scatter(y_test, y_predict_ga, s=90)
parity_ax.set_xlim(left=0.86)
parity_ax.set_ylim(bottom=0.88, top=0.96)

In [None]:
# Two-panel summary of the tuned BayesianRidge model on the held-out split:
# left = parity scatter titled with R^2, right = residual histogram with a
# fitted normal curve, titled with the RMSE.
plt.rc("text", usetex=False)
import scipy
from scipy.stats import norm  # `norm` was used below without being imported

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[8,5], dpi=80)

fig.suptitle(r'Model Performance of Hydrogen')
ax1.scatter(y_test, y_predict_ga, s=120, alpha=0.5)
ax1.set_xlim(left=0.86)
ax1.set_ylim(bottom=0.88, top=0.96)
ax1.set_xlabel(r'Scaled Actual Output')
ax1.set_ylabel(r'Scaled Predicted Output')
ax1.set_title(r'$R^2 = {:.4g}$'.format(r2_score(y_test, y_predict_ga)))

# Residuals as one flat array. y_test comes out of the split of a DataFrame
# while the predictions are a NumPy array; subtracting them directly makes
# pandas try to align the array with the frame's columns, which fails or
# mis-broadcasts. Flattening both sides gives an element-wise difference.
data_res = np.asarray(y_test).ravel() - np.asarray(y_predict_ga).ravel()

# best fit of data (only residuals inside the plotted window, > -0.08)
(mu, sigma) = norm.fit(data_res[data_res > -0.08])

# the histogram of the data
n, bins, patches = ax2.hist(data_res, 30, density=True, alpha=0.5)

# add a 'best fit' line, drawn explicitly on the histogram axes
best_fit_line = norm.pdf(bins, mu, sigma)
l = ax2.plot(bins, best_fit_line, 'r--', linewidth=2)


ax2.set_xlim(left=-0.08)
ax2.set_xlabel(r'Residual')
ax2.set_ylabel(r'Frequency')
# RMSE as sqrt(MSE): the `squared=False` keyword is absent from recent
# scikit-learn releases, and this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_predict_ga))
ax2.set_title(r'$\sigma = {:.4f}$'.format(rmse))

fig.tight_layout()