# Data Processsing
This notebook takes experimental data and converts it to a form that is usable for machine learning algorithm training.
This involves smoothing the data using the Savitzky–Golay filter, and adding slope values by comparing consecutive concentration values, and dividing by time elapsed. 


### Setup
Import neccessary packages and set current directory to the directory that contains this notebook

In [4]:
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter 
import os
home_dir = os.getcwd().split('/notebooks')[0]
home_dir

'/scratch/garrettroell/machine_learning_clostridium/slope_prediction_lib'

### Import experimental data

In [5]:
exp_data = pd.read_csv(f'{home_dir}/experimental_data/experimental_data.csv')
exp_data.set_index(['composition','trial','time'],drop=True,inplace=True)

### Define and apply function to remove the first 24 hrs data
This is done to remove the effects of glucose feeding that take place in the first 24 hrs. All data points are shifted down by one day, and data points from the first day are removed

In [6]:
def subtract_one_day(df):
    df_copy = df.copy()
    df_copy.reset_index(inplace=True)
    df_copy = df_copy[df_copy['time'] > 1]
    df_copy['time'] = df_copy['time'] - 1
    df_copy.set_index(['composition', 'trial', 'time'], inplace=True, drop = True)
    return df_copy

In [7]:
raw_data = subtract_one_day(exp_data)
raw_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,acetate,biomass,butanol,butyrate,ethanol,flow rate,H2,CO,CO2
composition,trial,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,0.580,21.610,0.41,0.040,0.060,10.940,20,0.125,0.500,0.375
1,1,0.650,44.310,0.39,0.050,0.080,15.890,20,0.125,0.500,0.375
1,1,1.020,46.190,0.46,0.210,0.640,8.140,20,0.125,0.500,0.375
1,1,1.670,46.160,0.49,1.180,3.640,10.810,20,0.125,0.500,0.375
1,1,3.700,34.390,0.64,8.440,9.760,20.340,20,0.125,0.500,0.375
1,1,5.060,27.080,0.67,14.090,8.560,28.920,20,0.125,0.500,0.375
1,1,5.510,19.960,0.67,16.060,4.900,32.730,20,0.125,0.500,0.375
1,2,0.580,22.980,0.38,0.390,0.060,11.180,20,0.125,0.500,0.375
1,2,0.650,44.310,0.32,0.450,0.130,13.750,20,0.125,0.500,0.375
1,2,1.020,70.500,0.37,0.720,0.600,9.420,20,0.125,0.500,0.375


In [8]:
def smooth_interpolate(df,delta):
    """delta = step size"""
    savgol0 = lambda x: savgol_filter(x, 15, 2, deriv=0)
    smoothed_data = pd.DataFrame()
    
    composition = range (1,11)
    trial = [1,2]
    
    for comp in composition:
        for tri in trial:
            if (comp == 5 and tri == 2) or (comp == 6 and tri == 2) or (comp == 7 and tri == 2):
                pass
            else:
                section = df.loc[comp, tri]

                times = section.index
                max_time = times[-1] + delta
                new_times = np.arange(0,max_time, delta)  
                section = section.reindex(section.index.union(new_times))
                section = section.interpolate()
                section = section.round(3)
                times_to_remove = set(times) - (set(times) & set(new_times))
                section = section.loc[~section.index.isin(times_to_remove)]
            
                smooth=section.apply(savgol0)

                smooth.insert(0, "time", smooth.index)
                smooth.insert(0, "trial", tri)
                smooth.insert(0, "composition", comp)
                smoothed_data = pd.concat([smoothed_data , smooth],sort=True)


    smoothed_data.set_index(['composition','trial','time'],drop=True,inplace=True)
    smoothed_data.clip(lower=0,inplace=True)
        
    times_to_drop = np.arange(0,1,delta)
    smoothed_data = smoothed_data.drop(index =times_to_drop, level ='time')

    # work around to change index values
    smoothed_data.reset_index(drop=False,inplace=True)
    smoothed_data['time'] = round (smoothed_data['time']-1,1)
    smoothed_data.set_index(['composition','trial','time'],drop=True,inplace=True)
    
    return smoothed_data

In [9]:
smooth_data = smooth_interpolate(exp_data, 0.1)
smooth_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CO,CO2,H2,acetate,biomass,butanol,butyrate,ethanol,flow rate
composition,trial,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,0.0,0.5,0.375,0.125,16.905029,0.429546,0.029818,0.075529,15.895240,20.0
1,1,0.1,0.5,0.375,0.125,18.001195,0.421076,0.028260,0.061033,15.607023,20.0
1,1,0.2,0.5,0.375,0.125,19.831073,0.410803,0.028316,0.051013,15.144135,20.0
1,1,0.3,0.5,0.375,0.125,22.196249,0.403602,0.031357,0.050367,14.522308,20.0
1,1,0.4,0.5,0.375,0.125,25.293925,0.403709,0.028700,0.035514,13.956015,20.0
1,1,0.5,0.5,0.375,0.125,29.005157,0.406214,0.031038,0.041123,13.322048,20.0
1,1,0.6,0.5,0.375,0.125,32.899333,0.410848,0.043533,0.081221,12.648214,20.0
1,1,0.7,0.5,0.375,0.125,36.745004,0.416725,0.070296,0.167219,12.061396,20.0
1,1,0.8,0.5,0.375,0.125,40.329649,0.424215,0.114119,0.308302,11.496556,20.0
1,1,0.9,0.5,0.375,0.125,43.438281,0.432878,0.176821,0.508967,10.975655,20.0


In [10]:
def add_slope_values(df):

  acetate_Δ = []
  biomass_Δ = []
  butanol_Δ = []
  butyrate_Δ = []
  ethanol_Δ = []

  old_comp = 0
  old_tri = 0

  for index, _ in df.iterrows():
    new_comp = index[0]
    new_tri = index[1]

    if (new_comp == old_comp and new_tri == old_tri):
      Δ_time = index[2] - old_index[2]

      acetate_Δ.append((df.loc[index]['acetate']-df.loc[old_index]['acetate'])/Δ_time)
      biomass_Δ.append((df.loc[index]['biomass']-df.loc[old_index]['biomass'])/Δ_time)
      butanol_Δ.append((df.loc[index]['butanol']-df.loc[old_index]['butanol'])/Δ_time)
      butyrate_Δ.append((df.loc[index]['butyrate']-df.loc[old_index]['butyrate'])/Δ_time)
      ethanol_Δ.append((df.loc[index]['ethanol']-df.loc[old_index]['ethanol'])/Δ_time)
    else: 
      acetate_Δ.append(0)
      biomass_Δ.append(0)
      butanol_Δ.append(0)
      butyrate_Δ.append(0)
      ethanol_Δ.append(0)

    old_comp = index[0]
    old_tri = index[1]
    old_index = index

  df['acetate_Δ'] = acetate_Δ
  df['biomass_Δ'] = biomass_Δ
  df['butanol_Δ'] = butanol_Δ
  df['butyrate_Δ'] = butyrate_Δ
  df['ethanol_Δ'] = ethanol_Δ

  return df

In [11]:
add_slope_values(smooth_data)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CO,CO2,H2,acetate,biomass,butanol,butyrate,ethanol,flow rate,acetate_Δ,biomass_Δ,butanol_Δ,butyrate_Δ,ethanol_Δ
composition,trial,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1,0.0,0.5,0.375,0.125,16.905029,0.429546,0.029818,0.075529,15.895240,20.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,1,0.1,0.5,0.375,0.125,18.001195,0.421076,0.028260,0.061033,15.607023,20.0,10.961665,-0.084697,-0.015584,-0.144950,-2.882172
1,1,0.2,0.5,0.375,0.125,19.831073,0.410803,0.028316,0.051013,15.144135,20.0,18.298778,-0.102733,0.000561,-0.100208,-4.628878
1,1,0.3,0.5,0.375,0.125,22.196249,0.403602,0.031357,0.050367,14.522308,20.0,23.651756,-0.072009,0.030416,-0.006462,-6.218271
1,1,0.4,0.5,0.375,0.125,25.293925,0.403709,0.028700,0.035514,13.956015,20.0,30.976760,0.001068,-0.026579,-0.148525,-5.662923
1,1,0.5,0.5,0.375,0.125,29.005157,0.406214,0.031038,0.041123,13.322048,20.0,37.112317,0.025050,0.023385,0.056090,-6.339674
1,1,0.6,0.5,0.375,0.125,32.899333,0.410848,0.043533,0.081221,12.648214,20.0,38.941765,0.046344,0.124950,0.400977,-6.738344
1,1,0.7,0.5,0.375,0.125,36.745004,0.416725,0.070296,0.167219,12.061396,20.0,38.456706,0.058769,0.267629,0.859982,-5.868172
1,1,0.8,0.5,0.375,0.125,40.329649,0.424215,0.114119,0.308302,11.496556,20.0,35.846452,0.074905,0.438226,1.410833,-5.648407
1,1,0.9,0.5,0.375,0.125,43.438281,0.432878,0.176821,0.508967,10.975655,20.0,31.086317,0.086624,0.627023,2.006643,-5.209005


In [12]:
def add_previous_values(df):
    
    prev_acetate = []
    prev_biomass = []
    prev_butanol = []
    prev_butyrate = []
    prev_ethanol = []

    prev_acetate_Δ = []
    prev_biomass_Δ = []
    prev_butanol_Δ = []
    prev_butyrate_Δ = []
    prev_ethanol_Δ = []

    for count, (index, data) in enumerate(df.iterrows()):
        prev_data = df.iloc[count-1]
        time = index[2]
        
        if time == 0 or time == 0.1:
            prev_acetate.append(0)
            prev_biomass.append(0)
            prev_butanol.append(0)
            prev_butyrate.append(0)
            prev_ethanol.append(0)
    
            prev_acetate_Δ.append(0)
            prev_biomass_Δ.append(0)
            prev_butanol_Δ.append(0)
            prev_butyrate_Δ.append(0)
            prev_ethanol_Δ.append(0)
        else:
            prev_acetate.append(prev_data['acetate'])
            prev_biomass.append(prev_data['biomass'])
            prev_butanol.append(prev_data['butanol'])
            prev_butyrate.append(prev_data['butyrate'])
            prev_ethanol.append(prev_data['butanol'])
            
            prev_acetate_Δ.append(prev_data['acetate_Δ'])
            prev_biomass_Δ.append(prev_data['biomass_Δ'])
            prev_butanol_Δ.append(prev_data['butanol_Δ'])
            prev_butyrate_Δ.append(prev_data['butyrate_Δ'])
            prev_ethanol_Δ.append(prev_data['butyrate_Δ'])
            
    df['prev_acetate']  = prev_acetate
    df['prev_biomass']  = prev_biomass
    df['prev_butanol']  = prev_butanol
    df['prev_butyrate'] = prev_butyrate
    df['prev_ethanol']  = prev_ethanol

    df['prev_acetate_Δ']  = prev_acetate_Δ
    df['prev_biomass_Δ']  = prev_biomass_Δ
    df['prev_butanol_Δ']  = prev_butanol_Δ
    df['prev_butyrate_Δ'] = prev_butyrate_Δ
    df['prev_ethanol_Δ']  = prev_ethanol_Δ

    return df

In [13]:
smooth_data = add_previous_values(smooth_data)

In [14]:
# display(raw_data.head())
display(smooth_data.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CO,CO2,H2,acetate,biomass,butanol,butyrate,ethanol,flow rate,acetate_Δ,...,prev_acetate,prev_biomass,prev_butanol,prev_butyrate,prev_ethanol,prev_acetate_Δ,prev_biomass_Δ,prev_butanol_Δ,prev_butyrate_Δ,prev_ethanol_Δ
composition,trial,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,1,0.0,0.5,0.375,0.125,16.905029,0.429546,0.029818,0.075529,15.89524,20.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.1,0.5,0.375,0.125,18.001195,0.421076,0.02826,0.061033,15.607023,20.0,10.961665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.2,0.5,0.375,0.125,19.831073,0.410803,0.028316,0.051013,15.144135,20.0,18.298778,...,18.001195,0.421076,0.02826,0.061033,0.02826,10.961665,-0.084697,-0.015584,-0.14495,-0.14495
1,1,0.3,0.5,0.375,0.125,22.196249,0.403602,0.031357,0.050367,14.522308,20.0,23.651756,...,19.831073,0.410803,0.028316,0.051013,0.028316,18.298778,-0.102733,0.000561,-0.100208,-0.100208
1,1,0.4,0.5,0.375,0.125,25.293925,0.403709,0.0287,0.035514,13.956015,20.0,30.97676,...,22.196249,0.403602,0.031357,0.050367,0.031357,23.651756,-0.072009,0.030416,-0.006462,-0.006462


In [15]:
smooth_data.reset_index(inplace=True, drop = False)
smooth_data = smooth_data.loc[smooth_data['time'] != 0]
smooth_data = smooth_data.loc[smooth_data['time'] != 0.1]

smooth_data.set_index(['composition','trial','time'],drop=True,inplace=True)

In [16]:
smooth_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CO,CO2,H2,acetate,biomass,butanol,butyrate,ethanol,flow rate,acetate_Δ,...,prev_acetate,prev_biomass,prev_butanol,prev_butyrate,prev_ethanol,prev_acetate_Δ,prev_biomass_Δ,prev_butanol_Δ,prev_butyrate_Δ,prev_ethanol_Δ
composition,trial,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,1,0.2,0.5,0.375,0.125,19.831073,0.410803,0.028316,0.051013,15.144135,20.0,18.298778,...,18.001195,0.421076,0.028260,0.061033,0.028260,10.961665,-0.084697,-0.015584,-0.144950,-0.144950
1,1,0.3,0.5,0.375,0.125,22.196249,0.403602,0.031357,0.050367,14.522308,20.0,23.651756,...,19.831073,0.410803,0.028316,0.051013,0.028316,18.298778,-0.102733,0.000561,-0.100208,-0.100208
1,1,0.4,0.5,0.375,0.125,25.293925,0.403709,0.028700,0.035514,13.956015,20.0,30.976760,...,22.196249,0.403602,0.031357,0.050367,0.031357,23.651756,-0.072009,0.030416,-0.006462,-0.006462
1,1,0.5,0.5,0.375,0.125,29.005157,0.406214,0.031038,0.041123,13.322048,20.0,37.112317,...,25.293925,0.403709,0.028700,0.035514,0.028700,30.976760,0.001068,-0.026579,-0.148525,-0.148525
1,1,0.6,0.5,0.375,0.125,32.899333,0.410848,0.043533,0.081221,12.648214,20.0,38.941765,...,29.005157,0.406214,0.031038,0.041123,0.031038,37.112317,0.025050,0.023385,0.056090,0.056090
1,1,0.7,0.5,0.375,0.125,36.745004,0.416725,0.070296,0.167219,12.061396,20.0,38.456706,...,32.899333,0.410848,0.043533,0.081221,0.043533,38.941765,0.046344,0.124950,0.400977,0.400977
1,1,0.8,0.5,0.375,0.125,40.329649,0.424215,0.114119,0.308302,11.496556,20.0,35.846452,...,36.745004,0.416725,0.070296,0.167219,0.070296,38.456706,0.058769,0.267629,0.859982,0.859982
1,1,0.9,0.5,0.375,0.125,43.438281,0.432878,0.176821,0.508967,10.975655,20.0,31.086317,...,40.329649,0.424215,0.114119,0.308302,0.114119,35.846452,0.074905,0.438226,1.410833,1.410833
1,1,1.0,0.5,0.375,0.125,45.900253,0.441940,0.234150,0.749057,10.477146,20.0,24.619729,...,43.438281,0.432878,0.176821,0.508967,0.176821,31.086317,0.086624,0.627023,2.006643,2.006643
1,1,1.1,0.5,0.375,0.125,47.477392,0.451644,0.315612,1.072657,10.053591,20.0,15.771385,...,45.900253,0.441940,0.234150,0.749057,0.234150,24.619729,0.090624,0.573294,2.400905,2.400905


In [17]:
smooth_data.to_csv(f'{cur_dir}/processed_data/smooth_data.csv')