# Building dummy dataframe

Due to technical problems, the measurements have not been run yet. 

To avoid wasting more of my precious thesis time, I decided to build this dummy dataset from the raw sensor outputs of previous experiments. 

This is far from ideal, but at least the values extracted here are expected to be of the same order of magnitude as the real data.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [2]:
# Load the raw sensor output recorded during previous experiments.
# NOTE(review): an integer sheet_name is a zero-based sheet position in
# pandas, so this should be the second sheet of the workbook - confirm.
data = pd.read_excel(
    '../data/NO2_NO_NH3_freq_meas_data_2021-02-16.xlsx',
    sheet_name=1,
)

In [6]:
# Dump every value of the first row (gas concentrations, then the response)
print(data.iloc[0].tolist())

[50.0, 100.0, 25.0, 0.000263169, 0.000262954, 0.000262744, 0.000262544, 0.000262353, 0.000262168, 0.000261988, 0.000261818, 0.000261644, 0.000261479, 0.00026132, 0.000261167, 0.000261017, 0.000260869, 0.000260726, 0.000260588, 0.000260447, 0.00026031, 0.000260181, 0.000260054, 0.000259932, 0.000259814, 0.000259692, 0.000259578, 0.000259462, 0.000259355, 0.000259246, 0.000259138, 0.000259032, 0.000258928, 0.000372478, 0.000374585, 0.000375556, 0.000376122, 0.000376503, 0.000376777, 0.000376979, 0.000377144, 0.000377271, 0.000377368, 0.000377445, 0.000377504, 0.000377545, 0.000377589, 0.000377621, 0.000377644, 0.000377657, 0.00037767, 0.000377677, 0.000377678, 0.000377673, 0.00037767, 0.00037766, 0.000377649, 0.000377636, 0.000377622, 0.0003776, 0.000377585, 0.000377566, 0.000377545, 0.000377527, 0.0003775, 0.00037748, 0.000377463, 0.000377433, 0.000377412, 0.000377384, 0.000377358, 0.000377329, 0.000377304, 0.00037727, 0.000377247, 0.000377213, 0.000377185, 0.000377159, 0.000377133, 0.0

In [3]:
# Preview the first five rows of the raw table
data.head(n=5)

Unnamed: 0,NO2,NO,NH3,1,2,3,4,5,6,7,...,1285,1286,1287,1288,1289,1290,1291,1292,1293,1294
0,50,100,25,0.000263,0.000263,0.000263,0.000263,0.000262,0.000262,0.000262,...,0.000245,0.000245,0.000245,0.000245,0.000245,0.000245,0.000245,0.000245,0.000245,0.000245
1,100,25,100,0.000244,0.000244,0.000244,0.000244,0.000244,0.000244,0.000244,...,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026,0.00026
2,25,100,50,0.000259,0.000259,0.000259,0.000259,0.000259,0.000258,0.000258,...,0.000248,0.000248,0.000248,0.000248,0.000248,0.000248,0.000248,0.000248,0.000248,0.000247
3,50,25,100,0.000246,0.000246,0.000247,0.000247,0.000247,0.000247,0.000247,...,0.000262,0.000262,0.000262,0.000262,0.000261,0.000261,0.000261,0.000261,0.00026,0.00026
4,100,100,25,0.000257,0.000257,0.000257,0.000257,0.000256,0.000256,0.000256,...,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243


In [4]:
# Target matrix: the NO2, NO and NH3 columns, one row per row of `data`
Y = np.asarray(data.loc[:, ['NO2', 'NO', 'NH3']])

In [5]:
# Parameters of the dummy dataset
n_time, n_splits = 1200, 50  # time steps kept, number of windows they are cut into
N = len(Y)  # number of gas mixtures (one row of Y each)

In [6]:
# Raw response: the columns labelled 1..n_time, selected by label
response = np.asarray(data[list(range(1, n_time + 1))])

In [7]:
# Time steps: the n_time column labels starting at position 3
# (presumably right after the three gas columns - verify against `data`)
time = np.asarray(data.columns[3:3 + n_time])
# Cut the time axis into n_splits consecutive windows
time_splits = np.array_split(time, indices_or_sections=n_splits)

In [8]:
# Cut the response column-wise (axis 1) into n_splits consecutive windows
response_splits = np.array_split(response, indices_or_sections=n_splits, axis=1)

In [9]:
# Number of time steps in the first window
t_len = len(time_splits[0])

In [10]:
# Average and slope per split
# Both feature tables have one row per gas mixture and one column per window.
# Each entry of response_splits is a column-wise slice of the response, so it
# is expected to hold one window for all N mixtures.
avg = np.zeros((N, n_splits))
slope = np.zeros((N, n_splits))

# Index by split: pairing the splits with range(N) would stop after
# min(N, n_splits) windows and leave the remaining columns at zero.
for j, split in enumerate(response_splits):
    # Mean response of every mixture inside this window
    avg[:, j] = np.average(split, axis=1)

    # Local time axis of the window (0, 1, ..., length - 1). It is derived
    # from the window itself so that unequal windows, which np.array_split
    # produces when the split is not exact, are handled as well.
    steps = np.arange(split.shape[1])
    for i, row in enumerate(split):
        # Slope of the least-squares line fitted through this window
        slope[i, j] = linregress(steps, row).slope

In [11]:
# Shape features: the avg columns first, followed by the slope columns
X = np.hstack((avg, slope))

In [12]:
# Renaming columns and rows to keep it organized
gas_name = ['NO2', 'NO', 'NH3']
avg_name = [f'avg{i}' for i in range(n_splits)]
slope_name = [f'slope{i}' for i in range(n_splits)]
# Labels are applied by position, so they must follow the layout of the data:
# the gas concentrations first, then X, which is built as (avg, slope).
# Listing the slope names before the avg names would swap the two feature
# groups in the saved table.
column_name = gas_name + avg_name + slope_name
row_name = [f'mix{i}' for i in range(N)]

In [13]:
# Building df
# Targets (Y) go first, followed by the shape features (X); the two frames are
# concatenated column-wise so that each keeps its own dtype.
dummy = pd.concat([pd.DataFrame(Y), pd.DataFrame(X)], axis=1)
# Label by position: 3 gas columns plus 2 * n_splits feature columns
dummy.columns = column_name[:(n_splits * 2) + 3]
# One label per gas mixture
dummy.index = row_name[:N]

In [14]:
# Write the dummy dataset (row labels included) to disk
dummy.to_csv(path_or_buf='../data/dummy.csv')