### Notebook D: Plotting time course curves

Set up imports

In [1]:
import pickle
from itertools import *

import numpy as np
import pandas as pd
import scipy
import scipy.stats
import sklearn
import sklearn.metrics
import sklearn.preprocessing

Load trained models to a dictionary

In [2]:
# dictionary to hold all trained models, keyed as models[algorithm][output]
models = {}

for algorithm in ['nn_fine', 'nn_coarse', 'svm_rbf', 'rf', 'en', 'lasso', 'knn', 'bayesian']:
    models[algorithm] = {}
    for output in ['biomass', 'ethanol', 'acetate', 'butanol', 'butyrate']:
        filename = f'../trained_models/{output}/{algorithm}.pkl'
        # NOTE: pickle.load can execute arbitrary code -- only load model files from a trusted source.
        # The context manager closes each file handle as soon as the model has been read.
        with open(filename, 'rb') as model_file:
            models[algorithm][output] = pickle.load(model_file)

Load smoothed data

In [3]:
# read the rates table and report its dimensions
rates_df = pd.read_csv('../data/rates_data.csv')
n_rows, n_cols = rates_df.shape
print(f'Shape of the rates data: {n_rows} rows by {n_cols} columns')

Shape of the rates data: 836 rows by 18 columns


Fit a MinMaxScaler on the training compositions (1-7) so that model inputs are scaled the same way as the training data

In [4]:
# compositions whose rows are used to fit the scaler
training_compositions = [1, 2, 3, 4, 5, 6, 7]

# input columns: the five concentrations followed by the five gas / flow columns.
# The simulation cell assembles its model input in this same order.
feature_columns = [
    'biomass (g/L)', 'ethanol (mM)', 'acetate (mM)', 'butanol (mM)',
    'butyrate (mM)', 'N2', 'CO', 'CO2', 'H2', 'flow rate (mL/min)',
]

train_data = rates_df[rates_df['composition'].isin(training_compositions)]
X = train_data[feature_columns]

# fit() returns the fitted scaler, so it is also shown as the cell output
Scaler = sklearn.preprocessing.MinMaxScaler()
Scaler.fit(X)

MinMaxScaler()

Get list of unique trials

In [5]:
# unique (composition, trial) pairs, kept in order of first appearance
trials = []

for _, record in rates_df.iterrows():
    pair = (record['composition'], record['trial'])

    # skip pairs that have already been collected
    if pair in trials:
        continue
    trials.append(pair)

# number of unique trials (displayed as the cell output)
len(trials)
  

17

Generate simulated data for each trial

In [7]:
# Step size of the forward-Euler integration; it is also the spacing of the simulated time points.
# Time points are rounded to 1 decimal below, which matches a step of 0.1.
time_step = 0.1

# model keys, and the matching concentration columns in the same order
outputs = ['biomass', 'ethanol', 'acetate', 'butanol', 'butyrate']
concentration_labels = ['biomass (g/L)', 'ethanol (mM)', 'acetate (mM)', 'butanol (mM)', 'butyrate (mM)']
gas_labels = ['N2', 'CO', 'CO2', 'H2', 'flow rate (mL/min)']

for algorithm in ['nn_fine', 'nn_coarse', 'svm_rbf', 'rf', 'en', 'lasso', 'knn', 'bayesian']:
    print(algorithm)
    # save output data as a list of dictionaries. Will be converted to data frame below
    output_data = []

    for trial in trials:
        comp_num, trial_num = trial

        # get trial data
        trial_data = rates_df[(rates_df['composition'] == comp_num) & (rates_df['trial'] == trial_num)]

        # get max time
        max_time = round(max(trial_data['time']), 1)

        # get initial concentrations and gas composition for trial (the row at time 0)
        initial_values = trial_data[trial_data['time'] == 0]

        # get the row label of the initial values
        index = initial_values.index[0]

        # make list of gas values -- these stay fixed for the whole simulated trial
        gas_values = [initial_values.loc[index, label] for label in gas_labels]

        # make list of initial concentration values -- these are updated at every step
        concentration_values = [initial_values.loc[index, label] for label in concentration_labels]

        # Loop over all time points of the trial. Each time point is derived from an
        # integer step counter instead of repeatedly adding time_step, so the stored
        # times are clean multiples of the step (0.3 rather than 0.30000000000000004).
        step = 0
        time = 0.0
        while time <= max_time:
            # add current data to output_data list
            output_data.append({
              'composition': comp_num,
              'trial': trial_num,
              'time': time,
              'biomass (g/L)': concentration_values[0],
              'ethanol (mM)': concentration_values[1],
              'acetate (mM)': concentration_values[2],
              'butanol (mM)': concentration_values[3],
              'butyrate (mM)': concentration_values[4],
            })

            # make array for ml input: concentrations first, then gas values (single row)
            input_values = np.array(list(chain(concentration_values, gas_values))).reshape(1, -1)

            # Scale values to match how the model was trained
            input_values = Scaler.transform(input_values)

            # get rates for each output
            rates = [models[algorithm][output].predict(input_values) for output in outputs]

            # update concentration values -- not allowing for negative concentrations
            concentration_values = [
                max(concentration + time_step * rate[0], 0)
                for concentration, rate in zip(concentration_values, rates)
            ]

            # advance to the next time point
            step += 1
            time = round(step * time_step, 1)

    prediction_df = pd.DataFrame(output_data)

    # save data as csv
    filename = f'../data/rate_time_course_data/{algorithm}.csv'
    prediction_df.to_csv(filename, index=False)

    display(prediction_df)



nn_fine


Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM)
0,1.0,1.0,0.0,0.429529,15.871137,16.893540,0.029863,0.070178
1,1.0,1.0,0.1,0.432014,15.479542,16.906172,0.038598,0.056240
2,1.0,1.0,0.2,0.434586,15.085316,16.943084,0.049784,0.042031
3,1.0,1.0,0.3,0.437249,14.688834,17.014004,0.063562,0.027509
4,1.0,1.0,0.4,0.440008,14.290437,17.132495,0.080086,0.012624
...,...,...,...,...,...,...,...,...
831,10.0,2.0,3.5,0.426869,85.963025,0.000000,5.168106,0.414633
832,10.0,2.0,3.6,0.422695,87.125709,0.000000,5.263269,0.360459
833,10.0,2.0,3.7,0.418362,88.245367,0.000000,5.350965,0.304709
834,10.0,2.0,3.8,0.413871,89.325630,0.000000,5.430966,0.247447


nn_coarse


Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM)
0,1.0,1.0,0.0,0.429529,15.871137,16.893540,0.029863,0.070178
1,1.0,1.0,0.1,0.431311,15.569167,17.028067,0.032532,0.082760
2,1.0,1.0,0.2,0.433155,15.268922,17.166065,0.037896,0.097939
3,1.0,1.0,0.3,0.435060,14.970760,17.307119,0.046235,0.115877
4,1.0,1.0,0.4,0.437029,14.675046,17.450478,0.057844,0.136749
...,...,...,...,...,...,...,...,...
831,10.0,2.0,3.5,0.466077,67.614329,0.000000,10.159369,0.000000
832,10.0,2.0,3.6,0.464660,68.306439,0.000000,10.369557,0.000000
833,10.0,2.0,3.7,0.463139,68.933360,0.000000,10.580341,0.000000
834,10.0,2.0,3.8,0.461517,69.507298,0.000000,10.791943,0.000000


svm_rbf


Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM)
0,1.0,1.0,0.0,0.429529,15.871137,16.893540,0.029863,0.070178
1,1.0,1.0,0.1,0.427480,15.832392,18.391188,0.032225,0.056949
2,1.0,1.0,0.2,0.425907,15.811894,19.878014,0.037095,0.050772
3,1.0,1.0,0.3,0.424807,15.810819,21.350286,0.044662,0.051966
4,1.0,1.0,0.4,0.424173,15.830296,22.804078,0.055131,0.060872
...,...,...,...,...,...,...,...,...
831,10.0,2.0,3.5,1.074453,56.871514,27.457137,15.395463,0.000000
832,10.0,2.0,3.6,1.092864,59.483784,27.333337,16.193389,0.000000
833,10.0,2.0,3.7,1.110921,62.185418,27.274922,17.005200,0.000000
834,10.0,2.0,3.8,1.128627,64.978143,27.288947,17.830097,0.000000


rf


Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM)
0,1.0,1.0,0.0,0.429529,15.871137,16.893540,0.029863,0.070178
1,1.0,1.0,0.1,0.425759,15.658972,17.482583,0.031916,0.076854
2,1.0,1.0,0.2,0.421964,15.401434,18.630320,0.035440,0.081348
3,1.0,1.0,0.3,0.420243,15.113986,20.574013,0.038417,0.082554
4,1.0,1.0,0.4,0.420516,14.744349,23.250898,0.041873,0.087806
...,...,...,...,...,...,...,...,...
831,10.0,2.0,3.5,0.671244,52.331725,29.119306,10.786392,3.673986
832,10.0,2.0,3.6,0.673559,54.117135,27.891867,11.286113,3.557339
833,10.0,2.0,3.7,0.675849,55.970434,26.663549,11.784055,3.446796
834,10.0,2.0,3.8,0.678039,57.838796,25.435418,12.277473,3.330053


en


Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM)
0,1.0,1.0,0.0,0.429529,15.871137,16.893540,0.029863,0.070178
1,1.0,1.0,0.1,0.432037,16.013903,18.260049,0.107784,0.234883
2,1.0,1.0,0.2,0.434644,16.182674,19.591642,0.193432,0.405254
3,1.0,1.0,0.3,0.437344,16.377525,20.886915,0.287082,0.581064
4,1.0,1.0,0.4,0.440132,16.598493,22.144498,0.389010,0.762065
...,...,...,...,...,...,...,...,...
831,10.0,2.0,3.5,0.614279,50.754610,39.717259,7.176111,2.763627
832,10.0,2.0,3.6,0.620437,52.506765,39.330646,7.604363,2.747658
833,10.0,2.0,3.7,0.626539,54.296126,38.891499,8.048661,2.720214
834,10.0,2.0,3.8,0.632580,56.122346,38.400341,8.509246,2.680408


lasso


Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM)
0,1.0,1.0,0.0,0.429529,15.871137,16.893540,0.029863,0.070178
1,1.0,1.0,0.1,0.432036,16.016089,18.596465,0.107784,0.235610
2,1.0,1.0,0.2,0.434674,16.192511,20.240359,0.194157,0.407948
3,1.0,1.0,0.3,0.437434,16.400294,21.822592,0.289261,0.586894
4,1.0,1.0,0.4,0.440308,16.639262,23.340638,0.393371,0.772123
...,...,...,...,...,...,...,...,...
831,10.0,2.0,3.5,0.612423,50.776583,33.701417,7.248235,2.741759
832,10.0,2.0,3.6,0.617981,52.438738,32.695054,7.664781,2.701489
833,10.0,2.0,3.7,0.623429,54.126142,31.621313,8.095096,2.647123
834,10.0,2.0,3.8,0.628766,55.837890,30.482708,8.539261,2.577708


knn


Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM)
0,1.0,1.0,0.0,0.429529,15.871137,16.893540,0.029863,0.070178
1,1.0,1.0,0.1,0.427040,15.740754,17.044157,0.028288,0.057515
2,1.0,1.0,0.2,0.422349,15.524351,17.650487,0.026712,0.045338
3,1.0,1.0,0.3,0.414849,15.232301,18.707065,0.025136,0.032566
4,1.0,1.0,0.4,0.406761,14.861683,20.092511,0.024390,0.021919
...,...,...,...,...,...,...,...,...
831,10.0,2.0,3.5,0.525739,63.136891,22.700234,6.889430,2.307406
832,10.0,2.0,3.6,0.525739,63.788829,21.920976,7.064213,2.377510
833,10.0,2.0,3.7,0.525739,64.440693,21.141714,7.238995,2.447619
834,10.0,2.0,3.8,0.525739,65.092487,20.362448,7.413776,2.517733


bayesian


Unnamed: 0,composition,trial,time,biomass (g/L),ethanol (mM),acetate (mM),butanol (mM),butyrate (mM)
0,1.0,1.0,0.0,0.429529,15.871137,16.893540,0.029863,0.070178
1,1.0,1.0,0.1,0.432038,16.016624,18.511826,0.110446,0.232064
2,1.0,1.0,0.2,0.434673,16.192371,20.062051,0.199193,0.400361
3,1.0,1.0,0.3,0.437422,16.398086,21.541751,0.296357,0.574726
4,1.0,1.0,0.4,0.440278,16.633409,22.948622,0.402182,0.754789
...,...,...,...,...,...,...,...,...
831,10.0,2.0,3.5,0.600663,47.392041,25.945417,6.488046,1.974172
832,10.0,2.0,3.6,0.605871,48.858047,25.009363,6.849406,1.906514
833,10.0,2.0,3.7,0.610998,50.346140,24.033811,7.222358,1.826244
834,10.0,2.0,3.8,0.616041,51.855843,23.022166,7.606991,1.732648


Define a helper function that averages replicate trials at each time point; it is used below when calculating r squared and RMSE of the predicted curves

In [8]:
def average_of_trials(two_trial_df):
  """Average the rows of a data frame that share the same time point.

  Parameters
  ----------
  two_trial_df : pandas.DataFrame
      Rows from one or more trials. Must contain a 'time' column; every
      column is averaged, so the columns are expected to be numeric.

  Returns
  -------
  pandas.DataFrame
      One row per unique time, in order of first appearance, holding the
      column-wise mean of all input rows with that time. An input without
      rows yields an empty data frame.
  """
  # nothing to average
  if two_trial_df.empty:
    return pd.DataFrame([])

  # unique() keeps the times in order of first appearance
  times = two_trial_df['time'].unique()

  # for each time, get the average values of the rows
  average_values = [
    two_trial_df[two_trial_df['time'] == time].mean()
    for time in times
  ]

  return pd.DataFrame(average_values)

Get tables of r squared and RMSE values for each composition and algorithm

In [9]:
algorithms = ['nn_fine', 'nn_coarse', 'svm_rbf', 'rf', 'en', 'lasso', 'knn', 'bayesian']

# concentration columns that are scored
scored_outputs = ['ethanol (mM)', 'acetate (mM)', 'butanol (mM)', 'butyrate (mM)']

# read each algorithm's simulated time courses once, instead of once per composition
predictions_by_algorithm = {
  name: pd.read_csv(f'../data/rate_time_course_data/{name}.csv')
  for name in algorithms
}

r_squared_data = {}
rmse_data = {}
for comp_num in range(1,11):
  r_squared_comp_data = {}
  rmse_comp_data = {}

  # get the average of experimental concentrations for compositions with multiple trials
  # (this does not depend on the algorithm, so it is computed once per composition)
  experimental_df = rates_df[rates_df['composition'] == comp_num]
  experimental_df = average_of_trials(experimental_df)

  # concatenate the scored columns into one flat list
  experimental_concentrations = []
  for output in scored_outputs:
    experimental_concentrations.extend(list(experimental_df[output]))

  for algorithm in algorithms:

    # get the average of predictions for compositions with multiple trials
    prediction_df = predictions_by_algorithm[algorithm]
    prediction_df = prediction_df[prediction_df['composition'] == comp_num]
    prediction_df = average_of_trials(prediction_df)

    # same column order as the experimental list, so the values pair up by position
    predicted_concentrations = []
    for output in scored_outputs:
      predicted_concentrations.extend(list(prediction_df[output]))

    # r squared here is the squared correlation coefficient of the linear regression
    r_value = scipy.stats.linregress(experimental_concentrations, predicted_concentrations).rvalue
    r_squared_comp_data[algorithm] = round(r_value**2, 3)

    # take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and missing from recent scikit-learn releases
    rmse_value = np.sqrt(sklearn.metrics.mean_squared_error(experimental_concentrations, predicted_concentrations))
    rmse_comp_data[algorithm] = round(rmse_value, 2)

  r_squared_data[comp_num] = r_squared_comp_data
  rmse_data[comp_num] = rmse_comp_data

# rows are compositions, columns are algorithms
r_squared_df = pd.DataFrame(r_squared_data).transpose()
rmse_df = pd.DataFrame(rmse_data).transpose()

print('r squared data')
display(r_squared_df)

print('rmse data')
display(rmse_df)


r squared data


Unnamed: 0,nn_fine,nn_coarse,svm_rbf,rf,en,lasso,knn,bayesian
1,0.851,0.91,0.82,0.942,0.668,0.653,0.839,0.642
2,0.753,0.909,0.922,0.632,0.873,0.925,0.941,0.929
3,0.896,0.617,0.942,0.969,0.847,0.925,0.971,0.946
4,0.935,0.95,0.867,0.453,0.841,0.819,0.897,0.835
5,0.946,0.992,0.98,0.977,0.956,0.974,0.997,0.981
6,0.945,0.875,0.868,0.818,0.587,0.378,0.944,0.343
7,0.864,0.82,0.811,0.894,0.763,0.786,0.897,0.809
8,0.861,0.789,0.838,0.923,0.892,0.912,0.449,0.904
9,0.208,0.085,0.638,0.891,0.716,0.469,0.016,0.134
10,0.684,0.556,0.873,0.809,0.869,0.896,0.958,0.927


rmse data


Unnamed: 0,nn_fine,nn_coarse,svm_rbf,rf,en,lasso,knn,bayesian
1,6.75,9.26,6.31,4.83,9.08,8.87,8.32,8.76
2,15.19,8.44,8.11,16.67,10.72,7.66,6.82,7.04
3,10.99,20.87,7.94,6.17,13.62,8.33,5.22,7.08
4,4.88,4.3,6.49,17.16,7.22,7.45,5.66,7.09
5,23.23,11.36,2.61,6.57,7.38,12.72,1.23,14.55
6,4.98,14.96,7.39,20.54,13.81,17.05,7.28,17.56
7,7.26,9.3,8.91,6.2,9.7,9.03,6.14,8.49
8,32.65,14.72,8.57,7.0,7.31,10.82,18.75,12.77
9,26.94,25.58,16.15,11.18,15.56,19.78,24.59,23.5
10,14.17,14.47,7.0,8.17,6.74,6.02,4.42,6.07
