# Notebook B: Model Training
This notebook trains 6 algoritms to predict the production rates of 5 outputs of syngas fermentation based on the extracellular metabolite concentration, and gas composition.

### Set up imports

In [14]:
import pandas as pd
import numpy as np
import sklearn.preprocessing, sklearn.neural_network, sklearn.svm, sklearn.ensemble, sklearn
import pickle

### Load data that was generated in notebook A

In [2]:
rates_df = pd.read_csv(f'../data/rates_data.csv')
print(f'Shape of the rates data: {rates_df.shape[0]} rows by {rates_df.shape[1]} columns')

Shape of the rates data: 836 rows by 18 columns


### Create train and test sets 

In [3]:
train_data = rates_df[rates_df.composition.isin([1,2,3,4,5,6,7])]
test_data = rates_df[rates_df.composition.isin([8,9,10])]
print(f'Shape of the training data: {train_data.shape[0]} rows by {train_data.shape[1]} columns')
print(f'Shape of the testing data: {test_data.shape[0]} rows by {test_data.shape[1]} columns')

Shape of the training data: 657 rows by 18 columns
Shape of the testing data: 179 rows by 18 columns


### Define a function that generates the input and output arrays for scikit learn's API

In [4]:
def get_X_y_arrays(data):

    # prevent set with copy error
    data_copy = data.copy()
    
    # ML input
    X = data_copy [[
        'biomass (g/L)', 'ethanol (mM)', 'acetate (mM)', 'butanol (mM)', 
         'butyrate (mM)', 'N2', 'CO', 'CO2', 'H2', 'flow rate (mL/min)'
    ]]
    
    # ML output
    y = data_copy [[
        'biomass rate', 'ethanol rate', 'acetate rate', 'butanol rate', 'butyrate rate'
    ]]
 
    return np.array(X), np.array(y)

In [5]:
X_train, y_train = get_X_y_arrays(train_data)
X_test, y_test = get_X_y_arrays(test_data)

print(f'Shape of the train X array: {X_train.shape[0]} rows by {X_train.shape[1]} columns')
print(f'Shape of the trainn y array: {y_train.shape[0]} rows by {y_train.shape[1]} columns')
print(f'Shape of the test X array: {X_test.shape[0]} rows by {X_test.shape[1]} columns')
print(f'Shape of the test y array: {y_test.shape[0]} rows by {y_test.shape[1]} columns')

Shape of the train X array: 657 rows by 10 columns
Shape of the trainn y array: 657 rows by 5 columns
Shape of the test X array: 179 rows by 10 columns
Shape of the test y array: 179 rows by 5 columns


## Train 30 different models (5 outputs each modeled with 6 algorithms)
algorithms = neural network, support vector machine, random forest, support vector, neural net, lasso <br>
outputs = acetate, biomass, butanol, butyrate, ethanol

### Define a functions to generate neural network architectures

In [6]:
def gen_NN_fixed_n_layers(n_layers, n_neurons, neuron_step):
    """Generate NN hidden_layer_sizes of n_layers and up to n_neurons per layer 
    """
    # print (n_layers)
    if n_layers == 1: 
        return [[i] for i in range(neuron_step, n_neurons+1, neuron_step)]
    else:
        pairs =  [  (i,  tail) for tail in gen_NN_fixed_n_layers(n_layers-1, n_neurons+1, neuron_step) for i in range(neuron_step, n_neurons+1, neuron_step) ]
        return [[i]+ t for (i, t) in pairs]

# print (gen_NN_fixed_n_layers(4, 10, 5))

def gen_NN_uni(n_layers, n_neurons, layer_step, neuron_step):
    """Generate hidden layers of various number of layers and number of neurons 
    """ 
    various_NNs = [ gen_NN_fixed_n_layers(i , n_neurons, neuron_step) for i in range(2, n_layers+1, layer_step)]
    return  functools.reduce(operator.add, various_NNs)

### Define a model configuration dictionary to guide ML training

Test grid is used for debugging, should be replaced with full grid

In [7]:
model_cfgs = {
        "nn":{
            'estimator': sklearn.neural_network.MLPRegressor(shuffle=True),
            # Test grid
            'param_grid':   {
                'activation': ['tanh', 'logistic', 'relu'], 
                'max_iter':   [400*i for i in range(1, 2)]
            }
            # Full grid
            # 'param_grid':   {
                # 'hidden_layer_sizes': gen_NN_uni(5, 100, 1, 10),  
                # 'activation':         ['tanh', 'logistic', 'relu'], 
                # 'max_iter':           [400*i for i in range(1, 10, 2)]
            # }                
        },
        "svm_rbf":{
            'estimator': sklearn.svm.SVR(kernel='rbf'),
            # Test grid
            'param_grid':   {
                'C':       [10**i for i in range(-1, 1)], 
                'epsilon': [10**i for i in range(-1, 1)],
            }
                # Full grid
                # 'param_grid':   {
                    # 'C':       [10**i for i in range(-5, 5)], 
                    # 'epsilon': [10**i for i in range(-5, 5)],
                    # 'gamma':   [10**i for i in range(-5, 5)] # gamma gave me an error
            # }
        },
        "rf":{
            'estimator': sklearn.ensemble.RandomForestRegressor(),
            # Test grid
            'param_grid':   {
                'n_estimators': [10*i for i in range(1, 2)],
                'max_depth':     [2*i for i in range(1, 1+1)],
            }
            # Full grid 
            # 'param_grid':   {
                # 'n_estimators': [10*i for i in range(1, 20)],
                # 'max_depth':     [2*i for i in range(20)], 
                # 'max_samples': [0.05*i for i in range(1, 10+1)] # max samples gave me an error
            # }
        },
        'en': {
            'estimator': sklearn.linear_model.ElasticNet(),
            # Test grid
            'param_grid':   {
                'alpha': [0.0001, 0.001, 0.01, 0.1],
                'l1_ratio': [0.1, 1],
            }
            # Full grid 
            # 'param_grid': {
                # 'alpha': [0.0001, 0.001, 0.01, 0.1],
                # 'l1_ratio': [0.1, 1],
            #}
        },
        'lasso': {
            'estimator': sklearn.linear_model.Lasso(),
            # Test grid
            'param_grid':   {
                'alpha': [0.0001, 0.001, 0.01, 0.1],
            }
            # Full grid 
            # 'param_grid':   {
                # 'alpha': [0.0001, 0.001, 0.01, 0.1],
            # }
        },
        'knn': {
            'estimator': sklearn.neighbors.KNeighborsRegressor(),
            # Test grid
            'param_grid':   {
                'algorithm': ['ball_tree', ],
                'leaf_size': [4,5,6],
                'n_neighbors': [2,3,4],
                'weights': ['distance'],
            }
            # Full grid 
            # 'param_grid':   {
                # 'algorithm': [0.0001, 0.001, 0.01, 0.1],
                # 'leaf_size': [4, 5, 6],
                # 'n_neighbors': [2, 3, 4],
                # 'weights': ['distance'],
            # }
        },
        "bayesian":{
            'estimator': sklearn.linear_model.BayesianRidge(),
            'param_grid':   {
                'n_iter':  [300, 500], 
                'alpha_1': [10**i for i in range(-1, 1)], 
                'alpha_2': [10**i for i in range(-1, 1)], 
                'lambda_1': [10**i for i in range(-1, 1)], 
                'lambda_2': [10**i for i in range(-1, 1)], 
            }
        },
    
    }

### Perform grid search for each output and algorithm

In [8]:
# define a dictionary to hold results for all outputs
trained_model_dictionary = {}

# define a scaler to standardize the input values of all features between 0 and 1
Scaler = sklearn.preprocessing.MinMaxScaler()
X = Scaler.fit_transform(X_train, y_train)

# loop over outputs
for index, output in enumerate(['biomass', 'ethanol', 'acetate', 'butanol', 'butyrate']):
    print(f'{output}\n')
    
    # define a dictionary to hold results for a single output
    trained_models = {} 
    
    # loop over models
    for model_name, model_conf in model_cfgs.items():
        print (model_name)
        
        # define grid search parameters
        search = sklearn.model_selection.GridSearchCV(
            estimator = model_conf["estimator"], 
            param_grid = model_conf["param_grid"], 
            scoring = "r2",
            refit = True,
            cv = sklearn.model_selection.ShuffleSplit(n_splits=10, test_size=0.1, random_state=0), 
            n_jobs=30, # This is a limitation of the server I am using. -gr
            verbose=3
        )

        # output array is a vector of a single output, not 2d array of all outputs
        y_output=y_train[:,index]

        # run grid search
        search.fit(X_train, y_output)
        
        # report results
        print("Best CV score: %0.3f:" % search.best_score_)
        print("Best parameters:",  search.best_params_, '\n')
        
        # save results of each model to a dictionary
        trained_models[model_name] = search 

    # save results from each output to a dictionary
    trained_model_dictionary[output] = trained_models

biomass

nn
Fitting 10 folds for each of 3 candidates, totalling 30 fits
[CV 1/10] END ...activation=tanh, max_iter=400;, score=-0.144 total time=   0.5s
[CV 3/10] END ....activation=tanh, max_iter=400;, score=0.213 total time=   0.5s
[CV 2/10] END ...activation=tanh, max_iter=400;, score=-0.938 total time=   0.5s
[CV 7/10] END ...activation=tanh, max_iter=400;, score=-0.283 total time=   0.5s
[CV 4/10] END ...activation=tanh, max_iter=400;, score=-1.635 total time=   0.6s
[CV 6/10] END ....activation=tanh, max_iter=400;, score=0.218 total time=   0.6s
[CV 10/10] END activation=logistic, max_iter=400;, score=0.330 total time=   0.3s
[CV 5/10] END ...activation=tanh, max_iter=400;, score=-0.770 total time=   0.7s
[CV 9/10] END ....activation=tanh, max_iter=400;, score=0.042 total time=   0.6s
[CV 8/10] END ...activation=tanh, max_iter=400;, score=-0.878 total time=   0.7s
[CV 10/10] END ..activation=tanh, max_iter=400;, score=-0.068 total time=   0.4s
[CV 1/10] END activation=logistic, 



[CV 3/10] END ....activation=relu, max_iter=400;, score=0.829 total time=   3.1s
[CV 5/10] END ....activation=relu, max_iter=400;, score=0.894 total time=   3.1s
[CV 1/10] END ....activation=relu, max_iter=400;, score=0.859 total time=   3.1s
[CV 4/10] END ....activation=relu, max_iter=400;, score=0.929 total time=   3.1s




[CV 1/10] END activation=logistic, max_iter=400;, score=0.798 total time=   3.6s
[CV 3/10] END activation=logistic, max_iter=400;, score=0.856 total time=   3.6s
[CV 2/10] END activation=logistic, max_iter=400;, score=0.853 total time=   3.6s
[CV 4/10] END activation=logistic, max_iter=400;, score=0.911 total time=   3.6s
[CV 9/10] END activation=logistic, max_iter=400;, score=0.813 total time=   3.6s
[CV 5/10] END activation=logistic, max_iter=400;, score=0.864 total time=   3.6s
[CV 6/10] END activation=logistic, max_iter=400;, score=0.879 total time=   3.6s
[CV 10/10] END activation=logistic, max_iter=400;, score=0.864 total time=   3.6s
[CV 7/10] END activation=logistic, max_iter=400;, score=0.804 total time=   3.6s
[CV 8/10] END activation=logistic, max_iter=400;, score=0.898 total time=   3.6s




[CV 2/10] END ....activation=tanh, max_iter=400;, score=0.894 total time=   4.2s
[CV 1/10] END ....activation=tanh, max_iter=400;, score=0.870 total time=   4.2s
[CV 4/10] END ....activation=tanh, max_iter=400;, score=0.944 total time=   4.2s
[CV 7/10] END ....activation=tanh, max_iter=400;, score=0.853 total time=   4.2s
[CV 5/10] END ....activation=tanh, max_iter=400;, score=0.909 total time=   4.2s
[CV 9/10] END ....activation=tanh, max_iter=400;, score=0.845 total time=   4.2s
[CV 8/10] END ....activation=tanh, max_iter=400;, score=0.919 total time=   4.2s
[CV 10/10] END ...activation=tanh, max_iter=400;, score=0.884 total time=   4.2s
[CV 3/10] END ....activation=tanh, max_iter=400;, score=0.882 total time=   4.2s
[CV 6/10] END ....activation=tanh, max_iter=400;, score=0.917 total time=   4.2s




[CV 6/10] END ....activation=relu, max_iter=400;, score=0.892 total time=   1.8s
[CV 9/10] END ....activation=relu, max_iter=400;, score=0.838 total time=   1.8s
[CV 8/10] END ....activation=relu, max_iter=400;, score=0.899 total time=   1.8s
[CV 10/10] END ...activation=relu, max_iter=400;, score=0.871 total time=   1.8s
[CV 7/10] END ....activation=relu, max_iter=400;, score=0.795 total time=   1.9s




Best CV score: 0.892:
Best parameters: {'activation': 'tanh', 'max_iter': 400} 

svm_rbf
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV 1/10] END ...............C=0.1, epsilon=0.1;, score=0.462 total time=   0.0s
[CV 2/10] END ...............C=0.1, epsilon=0.1;, score=0.495 total time=   0.0s
[CV 3/10] END ...............C=0.1, epsilon=0.1;, score=0.433 total time=   0.0s
[CV 4/10] END ...............C=0.1, epsilon=0.1;, score=0.410 total time=   0.0s
[CV 5/10] END ...............C=0.1, epsilon=0.1;, score=0.339 total time=   0.1s
[CV 6/10] END ...............C=0.1, epsilon=0.1;, score=0.443 total time=   0.1s
[CV 7/10] END ...............C=0.1, epsilon=0.1;, score=0.407 total time=   0.1s
[CV 1/10] END .................C=0.1, epsilon=1;, score=0.458 total time=   0.0s
[CV 8/10] END ...............C=0.1, epsilon=0.1;, score=0.414 total time=   0.1s
[CV 9/10] END ...............C=0.1, epsilon=0.1;, score=0.348 total time=   0.1s
[CV 2/10] END .................C=0.1, ep

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


[CV 10/10] END .........alpha=0.001, l1_ratio=1;, score=0.509 total time=   0.0s
[CV 7/10] END ..........alpha=0.001, l1_ratio=1;, score=0.523 total time=   0.0s
[CV 1/10] END .........alpha=0.01, l1_ratio=0.1;, score=0.530 total time=   0.0s
[CV 2/10] END .........alpha=0.01, l1_ratio=0.1;, score=0.620 total time=   0.0s
[CV 3/10] END .........alpha=0.01, l1_ratio=0.1;, score=0.538 total time=   0.0s
[CV 4/10] END .........alpha=0.01, l1_ratio=0.1;, score=0.646 total time=   0.0s
[CV 5/10] END .........alpha=0.01, l1_ratio=0.1;, score=0.568 total time=   0.0s
[CV 6/10] END .........alpha=0.01, l1_ratio=0.1;, score=0.518 total time=   0.0s
[CV 7/10] END .........alpha=0.01, l1_ratio=0.1;, score=0.495 total time=   0.0s
[CV 8/10] END .........alpha=0.01, l1_ratio=0.1;, score=0.575 total time=   0.0s
[CV 9/10] END .........alpha=0.01, l1_ratio=0.1;, score=0.483 total time=   0.0s
[CV 10/10] END ........alpha=0.01, l1_ratio=0.1;, score=0.520 total time=   0.0s
[CV 1/10] END ...........alp

  positive)
  positive)
  positive)
  positive)


[CV 3/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.929 total time=   0.0s
[CV 4/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.948 total time=   0.0s
[CV 5/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.936 total time=   0.0s
[CV 6/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.908 total time=   0.0s
[CV 7/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.955 total time=   0.0s
[CV 8/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.893 total time=   0.0s
[CV 9/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.894 total time=   0.0s
[CV 10/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.902 total time=   0.0s
[CV 1/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=3, weights=distance;, score=0.911 t



[CV 3/10] END ....activation=relu, max_iter=400;, score=0.742 total time=   3.4s
[CV 5/10] END ....activation=relu, max_iter=400;, score=0.910 total time=   3.4s
[CV 2/10] END ....activation=relu, max_iter=400;, score=0.762 total time=   3.4s
[CV 1/10] END ....activation=relu, max_iter=400;, score=0.730 total time=   3.5s
[CV 4/10] END ....activation=relu, max_iter=400;, score=0.881 total time=   3.5s




[CV 1/10] END activation=logistic, max_iter=400;, score=0.685 total time=   4.0s
[CV 8/10] END activation=logistic, max_iter=400;, score=0.739 total time=   4.0s
[CV 3/10] END activation=logistic, max_iter=400;, score=0.619 total time=   4.0s
[CV 10/10] END activation=logistic, max_iter=400;, score=0.733 total time=   4.0s
[CV 7/10] END activation=logistic, max_iter=400;, score=0.807 total time=   4.0s
[CV 5/10] END activation=logistic, max_iter=400;, score=0.813 total time=   4.0s
[CV 9/10] END activation=logistic, max_iter=400;, score=0.731 total time=   4.0s
[CV 2/10] END activation=logistic, max_iter=400;, score=0.744 total time=   4.1s
[CV 4/10] END activation=logistic, max_iter=400;, score=0.786 total time=   4.1s
[CV 6/10] END activation=logistic, max_iter=400;, score=0.745 total time=   4.1s




[CV 1/10] END ....activation=tanh, max_iter=400;, score=0.811 total time=   4.6s
[CV 5/10] END ....activation=tanh, max_iter=400;, score=0.946 total time=   4.6s
[CV 4/10] END ....activation=tanh, max_iter=400;, score=0.885 total time=   4.6s
[CV 3/10] END ....activation=tanh, max_iter=400;, score=0.754 total time=   4.6s
[CV 2/10] END ....activation=tanh, max_iter=400;, score=0.784 total time=   4.6s
[CV 6/10] END ....activation=tanh, max_iter=400;, score=0.853 total time=   4.6s
[CV 7/10] END ....activation=tanh, max_iter=400;, score=0.887 total time=   4.6s
[CV 8/10] END ....activation=tanh, max_iter=400;, score=0.885 total time=   4.6s
[CV 10/10] END ...activation=tanh, max_iter=400;, score=0.844 total time=   4.6s
[CV 9/10] END ....activation=tanh, max_iter=400;, score=0.877 total time=   4.6s




[CV 6/10] END ....activation=relu, max_iter=400;, score=0.856 total time=   1.6s
[CV 10/10] END ...activation=relu, max_iter=400;, score=0.775 total time=   1.6s
[CV 7/10] END ....activation=relu, max_iter=400;, score=0.873 total time=   1.6s
[CV 9/10] END ....activation=relu, max_iter=400;, score=0.883 total time=   1.6s
[CV 8/10] END ....activation=relu, max_iter=400;, score=0.897 total time=   1.7s




Best CV score: 0.853:
Best parameters: {'activation': 'tanh', 'max_iter': 400} 

svm_rbf
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV 2/10] END ...............C=0.1, epsilon=0.1;, score=0.062 total time=   0.0s
[CV 1/10] END ..............C=0.1, epsilon=0.1;, score=-0.051 total time=   0.0s
[CV 3/10] END ...............C=0.1, epsilon=0.1;, score=0.126 total time=   0.0s
[CV 4/10] END ..............C=0.1, epsilon=0.1;, score=-0.045 total time=   0.1s
[CV 5/10] END ..............C=0.1, epsilon=0.1;, score=-0.077 total time=   0.1s
[CV 7/10] END ..............C=0.1, epsilon=0.1;, score=-0.097 total time=   0.1s
[CV 8/10] END ..............C=0.1, epsilon=0.1;, score=-0.159 total time=   0.1s
[CV 6/10] END ..............C=0.1, epsilon=0.1;, score=-0.118 total time=   0.1s
[CV 9/10] END ..............C=0.1, epsilon=0.1;, score=-0.012 total time=   0.1s
[CV 1/10] END ................C=0.1, epsilon=1;, score=-0.049 total time=   0.1s
[CV 10/10] END .............C=0.1, epsil

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  po

[CV 6/10] END .........alpha=0.0001, l1_ratio=1;, score=0.528 total time=   0.0s
[CV 7/10] END .........alpha=0.0001, l1_ratio=1;, score=0.513 total time=   0.0s
[CV 8/10] END .........alpha=0.0001, l1_ratio=1;, score=0.507 total time=   0.0s
[CV 9/10] END .........alpha=0.0001, l1_ratio=1;, score=0.419 total time=   0.0s
[CV 10/10] END ........alpha=0.0001, l1_ratio=1;, score=0.483 total time=   0.0s
[CV 1/10] END ........alpha=0.001, l1_ratio=0.1;, score=0.386 total time=   0.0s
[CV 2/10] END ........alpha=0.001, l1_ratio=0.1;, score=0.508 total time=   0.0s
[CV 3/10] END ........alpha=0.001, l1_ratio=0.1;, score=0.372 total time=   0.0s
[CV 4/10] END ........alpha=0.001, l1_ratio=0.1;, score=0.501 total time=   0.0s
[CV 5/10] END ........alpha=0.001, l1_ratio=0.1;, score=0.558 total time=   0.0s
[CV 6/10] END ........alpha=0.001, l1_ratio=0.1;, score=0.528 total time=   0.0s
[CV 7/10] END ........alpha=0.001, l1_ratio=0.1;, score=0.513 total time=   0.0s
[CV 8/10] END ........alpha=

  positive)
  positive)
  positive)
  positive)
  positive)


[CV 1/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.557 total time=   0.0s
[CV 2/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.653 total time=   0.0s
[CV 4/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.936 total time=   0.0s
[CV 5/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.951 total time=   0.0s
[CV 6/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.799 total time=   0.0s
[CV 7/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.885 total time=   0.0s
[CV 8/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.926 total time=   0.0s
[CV 9/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.934 total time=   0.0s
[CV 3/10] END algorithm=ball_tree, leaf_size=4, n_neighbors=2, weights=distance;, score=0.753 to



[CV 2/10] END ....activation=relu, max_iter=400;, score=0.933 total time=   3.4s
[CV 9/10] END ....activation=relu, max_iter=400;, score=0.868 total time=   1.8s
[CV 4/10] END ....activation=relu, max_iter=400;, score=0.927 total time=   3.4s
[CV 1/10] END ....activation=relu, max_iter=400;, score=0.887 total time=   3.4s




[CV 5/10] END activation=logistic, max_iter=400;, score=0.952 total time=   3.9s
[CV 8/10] END activation=logistic, max_iter=400;, score=0.975 total time=   3.9s
[CV 7/10] END activation=logistic, max_iter=400;, score=0.951 total time=   3.9s
[CV 4/10] END activation=logistic, max_iter=400;, score=0.940 total time=   4.0s
[CV 9/10] END activation=logistic, max_iter=400;, score=0.920 total time=   3.9s
[CV 10/10] END activation=logistic, max_iter=400;, score=0.959 total time=   3.9s
[CV 6/10] END activation=logistic, max_iter=400;, score=0.965 total time=   4.0s
[CV 3/10] END activation=logistic, max_iter=400;, score=0.957 total time=   4.0s
[CV 2/10] END activation=logistic, max_iter=400;, score=0.962 total time=   4.0s
[CV 1/10] END activation=logistic, max_iter=400;, score=0.906 total time=   4.0s
[CV 4/10] END ....activation=tanh, max_iter=400;, score=0.966 total time=   4.0s
[CV 7/10] END ....activation=relu, max_iter=400;, score=0.915 total time=   2.5s
[CV 8/10] END ....activatio



Best CV score: 0.969:
Best parameters: {'activation': 'tanh', 'max_iter': 400} 

svm_rbf
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV 2/10] END ...............C=0.1, epsilon=0.1;, score=0.553 total time=   0.0s
[CV 1/10] END ...............C=0.1, epsilon=0.1;, score=0.634 total time=   0.0s
[CV 3/10] END ...............C=0.1, epsilon=0.1;, score=0.500 total time=   0.0s
[CV 4/10] END ...............C=0.1, epsilon=0.1;, score=0.544 total time=   0.0s
[CV 5/10] END ...............C=0.1, epsilon=0.1;, score=0.602 total time=   0.0s
[CV 1/10] END .................C=0.1, epsilon=1;, score=0.589 total time=   0.0s
[CV 7/10] END ...............C=0.1, epsilon=0.1;, score=0.583 total time=   0.0s
[CV 3/10] END .................C=0.1, epsilon=1;, score=0.496 total time=   0.0s
[CV 9/10] END ...............C=0.1, epsilon=0.1;, score=0.452 total time=   0.0s
[CV 6/10] END ...............C=0.1, epsilon=0.1;, score=0.628 total time=   0.1s
[CV 8/10] END ...............C=0.1, epsi

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


[CV 9/10] END ............alpha=0.1, l1_ratio=1;, score=0.762 total time=   0.0s
[CV 10/10] END ...........alpha=0.1, l1_ratio=1;, score=0.797 total time=   0.0s
Best CV score: 0.743:
Best parameters: {'alpha': 0.0001, 'l1_ratio': 0.1} 

lasso
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV 1/10] END .....................alpha=0.0001;, score=0.678 total time=   0.0s
[CV 2/10] END .....................alpha=0.0001;, score=0.718 total time=   0.0s
[CV 4/10] END .....................alpha=0.0001;, score=0.718 total time=   0.0s
[CV 3/10] END .....................alpha=0.0001;, score=0.821 total time=   0.0s
[CV 5/10] END .....................alpha=0.0001;, score=0.765 total time=   0.0s
[CV 6/10] END .....................alpha=0.0001;, score=0.719 total time=   0.0s
[CV 7/10] END .....................alpha=0.0001;, score=0.668 total time=   0.0s
[CV 8/10] END .....................alpha=0.0001;, score=0.796 total time=   0.0s
[CV 10/10] END ....................alpha=0.0001



[CV 5/10] END ....activation=relu, max_iter=400;, score=0.783 total time=   3.5s
[CV 3/10] END ....activation=relu, max_iter=400;, score=0.821 total time=   3.5s
[CV 1/10] END ....activation=relu, max_iter=400;, score=0.794 total time=   3.6s




[CV 1/10] END activation=logistic, max_iter=400;, score=0.828 total time=   4.1s
[CV 4/10] END activation=logistic, max_iter=400;, score=0.881 total time=   4.1s
[CV 10/10] END activation=logistic, max_iter=400;, score=0.817 total time=   4.0s
[CV 8/10] END activation=logistic, max_iter=400;, score=0.857 total time=   4.0s
[CV 3/10] END activation=logistic, max_iter=400;, score=0.862 total time=   4.1s
[CV 9/10] END activation=logistic, max_iter=400;, score=0.851 total time=   4.0s
[CV 5/10] END activation=logistic, max_iter=400;, score=0.824 total time=   4.1s
[CV 2/10] END activation=logistic, max_iter=400;, score=0.900 total time=   4.1s
[CV 5/10] END ....activation=tanh, max_iter=400;, score=0.914 total time=   4.1s
[CV 7/10] END activation=logistic, max_iter=400;, score=0.934 total time=   4.1s
[CV 6/10] END activation=logistic, max_iter=400;, score=0.927 total time=   4.1s
[CV 9/10] END ....activation=relu, max_iter=400;, score=0.817 total time=   2.4s




[CV 4/10] END ....activation=tanh, max_iter=400;, score=0.914 total time=   4.5s
[CV 3/10] END ....activation=tanh, max_iter=400;, score=0.893 total time=   4.5s
[CV 2/10] END ....activation=tanh, max_iter=400;, score=0.931 total time=   4.5s
[CV 1/10] END ....activation=tanh, max_iter=400;, score=0.870 total time=   4.5s
[CV 7/10] END ....activation=tanh, max_iter=400;, score=0.964 total time=   4.5s
[CV 6/10] END ....activation=tanh, max_iter=400;, score=0.937 total time=   4.5s
[CV 10/10] END ...activation=tanh, max_iter=400;, score=0.840 total time=   4.5s
[CV 8/10] END ....activation=tanh, max_iter=400;, score=0.892 total time=   4.5s
[CV 9/10] END ....activation=tanh, max_iter=400;, score=0.887 total time=   4.5s
[CV 6/10] END ....activation=relu, max_iter=400;, score=0.904 total time=   2.8s
[CV 7/10] END ....activation=relu, max_iter=400;, score=0.881 total time=   2.8s
[CV 8/10] END ....activation=relu, max_iter=400;, score=0.837 total time=   2.8s
[CV 10/10] END ...activation

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Best CV score: 0.406:
Best parameters: {'alpha': 0.0001, 'l1_ratio': 0.1} 

lasso
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV 1/10] END .....................alpha=0.0001;, score=0.471 total time=   0.0s
[CV 2/10] END .....................alpha=0.0001;, score=0.448 total time=   0.0s
[CV 3/10] END .....................alpha=0.0001;, score=0.522 total time=   0.0s
[CV 4/10] END .....................alpha=0.0001;, score=0.406 total time=   0.0s
[CV 5/10] END ....................alpha=0.0001;, score=-0.027 total time=   0.0s
[CV 6/10] END .....................alpha=0.0001;, score=0.512 total time=   0.0s
[CV 7/10] END .....................alpha=0.0001;, score=0.416 total time=   0.0s
[CV 8/10] END .....................alpha=0.0001;, score=0.374 total time=   0.0s
[CV 9/10] END .....................alpha=0.0001;, score=0.494 total time=   0.0s
[CV 10/10] END ....................alpha=0.0001;, score=0.445 total time=   0.0s
[CV 1/10] END ......................alpha=0.001

### Train and serialize models with best hyperparameters
These parameters come from running the parameter search with the full grid


In [11]:
optimized_parameters = {
    "acetate": {
        "nn_fine": sklearn.neural_network.MLPRegressor(
            shuffle=True, 
            activation = 'tanh', 
            hidden_layer_sizes = [50, 40],
            max_iter = 5000
        ),
        "nn_coarse": sklearn.neural_network.MLPRegressor(
            shuffle=True,
            activation = 'tanh', 
            hidden_layer_sizes = [40, 20],
            max_iter = 5000
        ),
        "svm_rbf": sklearn.svm.SVR(
            kernel = 'rbf', 
            C = 10000, 
            epsilon = 0.1, 
            gamma = 0.01
        ),
        'rf': sklearn.ensemble.RandomForestRegressor(
            max_depth = 32,
            # max_samples = 0.5,
            n_estimators = 130
        ),
        'en': sklearn.linear_model.ElasticNet(
            alpha =  0.1,
            l1_ratio = 0.4
        ),
        'lasso': sklearn.linear_model.Lasso(
            alpha = 0.1
        ),
        'knn': sklearn.neighbors.KNeighborsRegressor(
            algorithm = 'ball_tree',
            leaf_size = 5,
            n_neighbors = 4,
            weights = 'distance'
        ),
        'bayesian': sklearn.linear_model.BayesianRidge(
          alpha_1 = 1, 
          alpha_2 = 0.1, 
          lambda_1 = 0.1, 
          lambda_2 =  1, 
          n_iter = 300
        ),
    },
    "biomass": {
        "nn_fine": sklearn.neural_network.MLPRegressor(
            shuffle=True, 
            activation = 'tanh', 
            hidden_layer_sizes = [100, 80, 60, 70],
            max_iter = 5000
        ),
        "nn_coarse": sklearn.neural_network.MLPRegressor(
            shuffle=True, 
            activation = 'tanh', 
            hidden_layer_sizes = [100, 100, 60, 80],
            max_iter = 5000
        ),
        "svm_rbf": sklearn.svm.SVR(
            kernel = 'rbf', 
            C = 10000,
            epsilon = 0.0001, 
            gamma = 0.001
        ),
        'rf': sklearn.ensemble.RandomForestRegressor(
            max_depth = 32,
            # max_samples = 0.5,
            n_estimators = 80
        ),
        'en': sklearn.linear_model.ElasticNet(
            alpha = 1e-05, 
            l1_ratio = 0.1
        ),
        'lasso': sklearn.linear_model.Lasso(
            alpha = 1e-06
        ),
        'knn': sklearn.neighbors.KNeighborsRegressor(
            algorithm = 'ball_tree',
            leaf_size = 5,
            n_neighbors = 4,
            weights = 'distance'
        ),
        'bayesian': sklearn.linear_model.BayesianRidge(
          alpha_1 = 0.1, 
          alpha_2 = 1, 
          lambda_1 = 1, 
          lambda_2 =  1, 
          n_iter = 300
        ),
    },
    "butanol": {
        "nn_fine": sklearn.neural_network.MLPRegressor(
            shuffle=True, 
            activation = 'tanh', 
            hidden_layer_sizes = [90, 60, 10, 80],
            max_iter = 5000
        ),
        "nn_coarse": sklearn.neural_network.MLPRegressor(
            shuffle=True, 
            activation = 'tanh', 
            hidden_layer_sizes = [60, 20],
            max_iter = 5000
        ),
        "svm_rbf": sklearn.svm.SVR(
            kernel = 'rbf', 
            C = 1000, 
            epsilon = 0.01, 
            gamma = 0.01
        ),
        'rf': sklearn.ensemble.RandomForestRegressor(
            max_depth = 28,
            n_estimators = 120
        ),
        'en': sklearn.linear_model.ElasticNet(
            alpha =  1e-10,
            l1_ratio = 0.1
        ),
        'lasso': sklearn.linear_model.Lasso(
            alpha = 1e-10
        ),
        'knn': sklearn.neighbors.KNeighborsRegressor(
            algorithm = 'ball_tree',
            leaf_size = 5,
            n_neighbors = 2,
            weights = 'distance'
        ),
        'bayesian': sklearn.linear_model.BayesianRidge(
          alpha_1 = 1, 
          alpha_2 = 0.1, 
          lambda_1 = 0.1, 
          lambda_2 =  1, 
          n_iter = 300
        ),
    },
    "butyrate": {
        "nn_fine": sklearn.neural_network.MLPRegressor(
            shuffle=True, 
            activation = 'tanh', 
            hidden_layer_sizes = [90, 30, 20],
            max_iter = 5000
        ),
        "nn_coarse": sklearn.neural_network.MLPRegressor(
            shuffle=True, 
            activation = 'tanh', 
            hidden_layer_sizes = [60, 20],
            max_iter = 5000
        ),
        "svm_rbf": sklearn.svm.SVR(
            kernel = 'rbf', 
            C = 10000, 
            epsilon = 0.01, 
            gamma = 0.01
        ),
        'rf': sklearn.ensemble.RandomForestRegressor(
            max_depth = 22,
            # max_samples = 0.5,
            n_estimators = 130
        ),
        'en': sklearn.linear_model.ElasticNet(
            alpha = 0.0001,
            l1_ratio = 0.1
        ),
        'lasso': sklearn.linear_model.Lasso(
            alpha = 1e-10
        ),
        'knn': sklearn.neighbors.KNeighborsRegressor(
            algorithm = 'ball_tree',
            leaf_size = 5,
            n_neighbors = 4,
            weights = 'distance'
        ),
        'bayesian': sklearn.linear_model.BayesianRidge(
          alpha_1 = 1, 
          alpha_2 = 0.1, 
          lambda_1 = 0.1, 
          lambda_2 =  1, 
          n_iter = 300
        ),
    },
    "ethanol": {
        "nn_fine": sklearn.neural_network.MLPRegressor(
            shuffle=True, 
            activation = 'tanh', 
            hidden_layer_sizes = [80, 50],
            max_iter = 5000
        ),
        "nn_coarse": sklearn.neural_network.MLPRegressor(
            shuffle=True, 
            activation = 'tanh', 
            hidden_layer_sizes = [80, 60],
            max_iter = 5000
        ),
        "svm_rbf": sklearn.svm.SVR(
            kernel = 'rbf', 
            C = 10000, 
            epsilon = 0.0001, 
            gamma = 0.001
        ),
        'rf': sklearn.ensemble.RandomForestRegressor(
            max_depth = 22,
            # max_samples = 0.5,
            n_estimators = 100
        ),
        'en': sklearn.linear_model.ElasticNet(
            alpha =  0.001,
            l1_ratio = 0.1
        ),
        'lasso': sklearn.linear_model.Lasso(
            alpha = 0.01
        ),
        'knn': sklearn.neighbors.KNeighborsRegressor(
            algorithm = 'ball_tree',
            leaf_size = 5,
            n_neighbors = 4,
            weights = 'distance'
        ),
        'bayesian': sklearn.linear_model.BayesianRidge(
          alpha_1 = 1, 
          alpha_2 = 0.1, 
          lambda_1 = 0.1, 
          lambda_2 =  1, 
          n_iter = 300
        ),
    },
}

In [15]:
# Scale X training data
X_scaled_train = Scaler.fit_transform(X_train, y_train)

for index, output in enumerate(['biomass', 'ethanol', 'acetate', 'butanol', 'butyrate']):
    print(output)
    
    # separate out the output of interest
    y_train_output=y_train[:,index]

    
    for algorithm in ['nn_fine', 'nn_coarse', 'svm_rbf', 'rf', 'en', 'lasso', 'knn', 'bayesian']:
        print(algorithm)

        # train the model
        model = optimized_parameters[output][algorithm].fit(X_scaled_train, y_train_output)

        # serialize the model
        filename = f'../trained_models/{output}/{algorithm}.pkl'

        with open(filename, 'wb') as file:  
            pickle.dump(model, file)


biomass
nn_fine
nn_coarse
svm_rbf
rf
en
lasso
knn
bayesian
ethanol
nn_fine
nn_coarse
svm_rbf
rf
en
lasso
knn
bayesian
acetate
nn_fine
nn_coarse
svm_rbf
rf
en
lasso
knn
bayesian
butanol
nn_fine
nn_coarse
svm_rbf
rf
en
lasso
knn
bayesian
butyrate
nn_fine


  positive)


nn_coarse
svm_rbf
rf
en
lasso
knn
bayesian


  positive)
  positive)
