# Pre-processing

In [1]:
# The basics
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt

# Scaler
from sklearn.preprocessing import StandardScaler

In [2]:
def normalize_data(X, Y):
    """
    Return a new dataframe holding the dependent variables Y followed by the
    centered and scaled predictors X.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors. Every column is standardized with ``StandardScaler``.
    Y : pandas.DataFrame
        Dependent variables. Passed through unscaled; row ``i`` of ``Y`` is
        paired with row ``i`` of ``X``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``Y`` followed by the scaled columns of ``X``, indexed
        like ``Y``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same number of rows.
    """

    if len(X) != len(Y):
        raise ValueError(
            f'X and Y must have the same number of rows, got {len(X)} and {len(Y)}'
        )

    # Instantiating scaler and fitting it to X
    scaler = StandardScaler()
    scaled_values = np.asarray(scaler.fit_transform(X))

    # Saving to df. The scaled values carry no row labels of their own, so
    # Y's index is attached explicitly: with a default 0..n-1 index the concat
    # below would outer-join and pad with NaN whenever Y is labelled otherwise.
    scaled_X = pd.DataFrame(scaled_values, index=Y.index, columns=X.columns)

    # Columns come out as Y's names followed by X's names
    return pd.concat((Y, scaled_X), axis=1)

In [3]:
# Raw data: 3 runs of the same 125 gas mixtures, i.e. 375 exposures in total
path_to_raw_data = '../../data/real_NO_NO2_NH3_frequency_measurement_Ir_1_25_V_275_C_2021-03-30.xlsx'
raw = pd.read_excel(io=path_to_raw_data)

In [4]:
# Quick look at the first rows of the freshly loaded sheet
raw.head(n=5)

Unnamed: 0,Exposure nr,Cycle nr,Sample nr,NO [ppm],NO2 [ppm],NH3 [ppm],Freq [Hz],Slope sensor 1 [uA/s],Slope sensor 2 [uA/s],Average sensor 1 [uA/s],Average sensor 2 [uA/s],Sensor temperature [C]
0,1,1,1,10,5,20,0.05,-18.855169,-22.588416,32.926184,27.961554,274.994683
1,1,1,2,10,5,20,0.05,-28.289268,-28.185027,25.853867,20.915297,274.980487
2,1,1,3,10,5,20,0.05,-0.390916,-0.482129,25.756138,20.794765,274.985895
3,1,1,4,10,5,20,0.05,-0.234549,-0.156366,25.697501,20.755673,275.020372
4,1,1,5,10,5,20,0.05,-0.143336,-0.24758,25.661667,20.693778,275.014964


In [6]:
# Renaming columns: short names without units or spaces
column_renames = {
    'Exposure nr': 'exposure',
    'Cycle nr': 'cycle',
    'Sample nr': 'sample',
    'NO [ppm]': 'NO',
    'NO2 [ppm]': 'NO2',
    'NH3 [ppm]': 'NH3',
    'Freq [Hz]': 'freq',
    'Slope sensor 1 [uA/s]': '1-slope',
    'Slope sensor 2 [uA/s]': '2-slope',
    'Average sensor 1 [uA/s]': '1-avg',
    'Average sensor 2 [uA/s]': '2-avg',
    'Sensor temperature [C]': 'temp',
}
raw.rename(columns=column_renames, inplace=True)
raw.head()

Unnamed: 0,exposure,cycle,sample,NO,NO2,NH3,freq,1-slope,2-slope,1-avg,2-avg,temp
0,1,1,1,10,5,20,0.05,-18.855169,-22.588416,32.926184,27.961554,274.994683
1,1,1,2,10,5,20,0.05,-28.289268,-28.185027,25.853867,20.915297,274.980487
2,1,1,3,10,5,20,0.05,-0.390916,-0.482129,25.756138,20.794765,274.985895
3,1,1,4,10,5,20,0.05,-0.234549,-0.156366,25.697501,20.755673,275.020372
4,1,1,5,10,5,20,0.05,-0.143336,-0.24758,25.661667,20.693778,275.014964


In [7]:
# Creating the column names of the new df
# Format: <FREQUENCY>-<SENSOR_NO>-<FEATURE_NAME>-<INDEX>
def _feature_names(freqs, sensor, feature):
    """Return one '<freq>-<sensor>-<feature>-<position>' label per entry of freqs."""
    return [f'{freq!s}-{sensor}-{feature}-{pos}' for pos, freq in enumerate(freqs)]


# The first cycle of the first exposure serves as the template for the layout
foo = raw[(raw['exposure'] == 1) & (raw['cycle'] == 1)].transpose()
gasses = ['NO', 'NO2', 'NH3']

# Frequencies are read by position: the transposed frame's columns are raw's
# row labels, so a label lookup with 0..n-1 only works while those rows happen
# to be labelled 0..n-1.
sample_freqs = foo.loc['freq'].to_numpy()

slope_names1 = _feature_names(sample_freqs, 1, 'slope')
avg_names1 = _feature_names(sample_freqs, 1, 'avg')
colnames1 = ['exposure'] + gasses + slope_names1 + avg_names1

slope_names2 = _feature_names(sample_freqs, 2, 'slope')
avg_names2 = _feature_names(sample_freqs, 2, 'avg')
colnames2 = ['exposure'] + gasses + slope_names2 + avg_names2

In [9]:
# Separating sensors
def _cycle_to_row(cycle_rows, sensor):
    """
    Flatten the samples of one (exposure, cycle) pair into a single wide row.

    Layout of the returned Series (positional integer labels):
    exposure, NO, NO2, NH3, every '<sensor>-slope' value, every '<sensor>-avg' value.
    """
    # Values shared by all samples of the cycle, reduced to their unique entries
    parts = [pd.Series(cycle_rows[col].unique()) for col in ('exposure', 'NO', 'NO2', 'NH3')]
    # Per-sample measurements of the requested sensor, in sample order
    parts += [cycle_rows[f'{sensor}-slope'], cycle_rows[f'{sensor}-avg']]
    return pd.concat(parts, axis=0, ignore_index=True)


# Rows are collected in lists and turned into frames once at the end:
# DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
# frame on every call.
sensor1_rows = []
sensor2_rows = []

cycle_ids = raw['cycle'].unique()
for exp in raw['exposure'].unique():
    # Filter on the exposure once, then slice out each of its cycles
    exposure_rows = raw[raw['exposure'] == exp]
    for cyc in cycle_ids:
        cycle_rows = exposure_rows[exposure_rows['cycle'] == cyc]
        if cycle_rows.empty:
            # This exposure has no such cycle; nothing to flatten
            continue

        sensor1_rows.append(_cycle_to_row(cycle_rows, 1))
        sensor2_rows.append(_cycle_to_row(cycle_rows, 2))

# Columns keep their positional integer labels; they are named in a later cell
preprocessed_sensor1 = pd.DataFrame(sensor1_rows).reset_index(drop=True)
preprocessed_sensor2 = pd.DataFrame(sensor2_rows).reset_index(drop=True)
        


In [10]:
# Renaming columns
# Format: <FREQUENCY>-<SENSOR_NO>-<FEATURE_NAME>-<INDEX>
# Each mapping goes from integer position to the name at that position
cols1 = dict(enumerate(colnames1))
cols2 = dict(enumerate(colnames2))
for frame, mapping in ((preprocessed_sensor1, cols1), (preprocessed_sensor2, cols2)):
    frame.rename(columns=mapping, inplace=True)

In [14]:
# First rows of the wide, per-cycle frame for sensor 2
preprocessed_sensor2.head(n=5)

Unnamed: 0,exposure,NO,NO2,NH3,0.05-2-slope-0,0.05-2-slope-1,0.05-2-slope-2,0.05-2-slope-3,0.05-2-slope-4,0.05-2-slope-5,...,2500.0-2-avg-230,2500.0-2-avg-231,5000.0-2-avg-232,5000.0-2-avg-233,5000.0-2-avg-234,5000.0-2-avg-235,5000.0-2-avg-236,5000.0-2-avg-237,5000.0-2-avg-238,5000.0-2-avg-239
0,1.0,10.0,5.0,20.0,-22.588416,-28.185027,-0.482129,-0.156366,-0.24758,-0.039092,...,33.580968,33.587483,33.815517,33.877412,33.861124,33.861124,33.838321,33.854609,33.844836,33.867639
1,1.0,10.0,5.0,20.0,-39.019909,-9.961837,-0.49516,-0.130305,-0.114017,-0.094471,...,33.815517,33.835063,34.020748,34.059839,34.105446,34.07287,34.059839,34.066355,34.07287,34.048438
2,1.0,10.0,5.0,20.0,-33.573149,-15.193593,-0.273641,-0.234549,-0.078183,-0.117275,...,33.904559,33.929534,34.131507,34.199918,34.206433,34.193402,34.180372,34.193402,34.180372,34.209691
3,1.0,10.0,5.0,20.0,-40.199171,-8.691361,-0.325763,-0.039092,-0.299702,-0.091214,...,33.981656,34.024006,34.199918,34.268328,34.235752,34.245524,34.25204,34.25204,34.258555,34.287874
4,2.0,20.0,40.0,40.0,-23.546159,-26.825509,-0.447381,-0.24758,-0.377885,-0.034748,...,33.538619,33.554907,33.711273,33.802487,33.799229,33.763395,33.763395,33.760137,33.786198,33.75688


In [15]:
# Positional split: columns 1-3 are passed as Y, everything from column 4 on
# as X; column 0 is left out of both.

# Sensor1
Y1, X1 = preprocessed_sensor1.iloc[:, 1:4], preprocessed_sensor1.iloc[:, 4:]
scaled_sensor1 = normalize_data(X1, Y1)

# Sensor 2
Y2, X2 = preprocessed_sensor2.iloc[:, 1:4], preprocessed_sensor2.iloc[:, 4:]
scaled_sensor2 = normalize_data(X2, Y2)

In [17]:
# First rows of the scaled frame for sensor 2
scaled_sensor2.head(n=5)

Unnamed: 0,NO,NO2,NH3,0.05-2-slope-0,0.05-2-slope-1,0.05-2-slope-2,0.05-2-slope-3,0.05-2-slope-4,0.05-2-slope-5,0.05-2-slope-6,...,2500.0-2-avg-230,2500.0-2-avg-231,5000.0-2-avg-232,5000.0-2-avg-233,5000.0-2-avg-234,5000.0-2-avg-235,5000.0-2-avg-236,5000.0-2-avg-237,5000.0-2-avg-238,5000.0-2-avg-239
0,10.0,5.0,20.0,2.033884,-2.024181,-1.491787,0.008906,-1.663958,0.523062,-0.949601,...,2.237116,2.250681,2.286884,2.286337,2.272227,2.278346,2.260119,2.282564,2.276652,2.30189
1,10.0,5.0,20.0,-0.661288,0.869532,-1.613034,0.294908,-0.053896,-0.157638,-1.265197,...,2.491954,2.519748,2.509506,2.483717,2.536242,2.506909,2.499037,2.51081,2.522155,2.496103
2,10.0,5.0,20.0,0.232116,0.038766,0.448171,-0.849101,0.378072,-0.437926,-0.634006,...,2.588698,2.622419,2.629652,2.635277,2.645369,2.637014,2.629036,2.647757,2.637892,2.669321
3,10.0,5.0,20.0,-0.854716,1.071274,-0.036819,1.295916,-2.292275,-0.117596,-1.107399,...,2.672464,2.725089,2.703859,2.709294,2.67705,2.693276,2.706333,2.710963,2.722064,2.753305
4,20.0,40.0,40.0,1.87679,-1.808299,-1.16846,-0.992102,-3.234751,0.57645,-1.002201,...,2.191104,2.215277,2.173806,2.20527,2.205343,2.172855,2.179309,2.180732,2.213523,2.182912


In [None]:
# Saving to csv (row index is not written)
for csv_path, scaled_frame in (
    ('../../data/scaled_sensor1.csv', scaled_sensor1),
    ('../../data/scaled_sensor2.csv', scaled_sensor2),
):
    scaled_frame.to_csv(csv_path, index=False)

In [None]:
# Reading the sensor 2 file back from disk
pd.read_csv(filepath_or_buffer='../../data/scaled_sensor2.csv')