<a href="https://colab.research.google.com/github/elsaessern/Sensor_projectSu24/blob/main/Model1bench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
import random

In [None]:
# load the semicolon-delimited air-quality CSV from the project repository into a pandas dataframe
# (values written with a decimal comma, e.g. "2,6", are not parsed here and stay as strings)
AIR_QUALITY_CSV_URL = 'https://raw.githubusercontent.com/elsaessern/Sensor_projectSu24/main/AirQualityUCI.csv'
sensordf = pd.read_csv(AIR_QUALITY_CSV_URL, sep=';')

In [None]:
# (rows, columns) of the frame exactly as read from the CSV, before any cleaning
sensordf.shape

(9471, 17)

In [None]:
# list the column labels pandas parsed from the CSV header
sensordf.columns

Index(['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)',
       'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)',
       'PT08.S5(O3)', 'T', 'RH', 'AH', 'Unnamed: 15', 'Unnamed: 16'],
      dtype='object')

In [None]:
# flag missing values in the first sensor column
# (a bare column lookup placed before this line was never displayed, so it has been removed)
pd.isna(sensordf['PT08.S1(CO)'])


0       False
1       False
2       False
3       False
4       False
        ...  
9466     True
9467     True
9468     True
9469     True
9470     True
Name: PT08.S1(CO), Length: 9471, dtype: bool

In [None]:
# the csv carries rows and columns that are entirely empty; remove both kinds,
# then show the cleaned frame and count the NaN values left in each column
sensordf = (
  sensordf
  .dropna(axis='index', how='all')
  .dropna(axis='columns', how='all')
)
print(sensordf)
sensordf.isna().sum()

            Date      Time CO(GT)  PT08.S1(CO)  NMHC(GT) C6H6(GT)  \
0     10/03/2004  18.00.00    2,6       1360.0     150.0     11,9   
1     10/03/2004  19.00.00      2       1292.0     112.0      9,4   
2     10/03/2004  20.00.00    2,2       1402.0      88.0      9,0   
3     10/03/2004  21.00.00    2,2       1376.0      80.0      9,2   
4     10/03/2004  22.00.00    1,6       1272.0      51.0      6,5   
...          ...       ...    ...          ...       ...      ...   
9352  04/04/2005  10.00.00    3,1       1314.0    -200.0     13,5   
9353  04/04/2005  11.00.00    2,4       1163.0    -200.0     11,4   
9354  04/04/2005  12.00.00    2,4       1142.0    -200.0     12,4   
9355  04/04/2005  13.00.00    2,1       1003.0    -200.0      9,5   
9356  04/04/2005  14.00.00    2,2       1071.0    -200.0     11,9   

      PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  \
0            1046.0    166.0        1056.0    113.0        1692.0   
1             955.0    103.0     

Date             0
Time             0
CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64

In [None]:
# define a function that will split data into training and test data (70% / 30% by default)
def split_data(data, n_sections=10, n_train_sections=7, seed=None):
  """Split a data frame into train and test sets built from shuffled contiguous sections.

  The rows are cut, by position, into ``n_sections`` consecutive sections of
  (almost) equal size. The sections are shuffled and the first
  ``n_train_sections`` of them become the training set; the rest become the
  test set. Every row ends up in exactly one of the two sets.

  Parameters
  ----------
  data : pandas.DataFrame
    Frame to split. Its index may be anything; slicing is positional.
  n_sections : int, default 10
    Number of contiguous sections the rows are cut into.
  n_train_sections : int, default 7
    Number of shuffled sections assigned to the training set.
  seed : int or None, default None
    If given, the shuffle uses its own ``random.Random(seed)`` so the split is
    reproducible. If None, the module-level ``random`` generator is used, so a
    prior ``random.seed(...)`` call still controls the result.

  Returns
  -------
  list
    ``[train, test]`` as two data frames.

  Raises
  ------
  ValueError
    If ``n_train_sections`` is not strictly between 0 and ``n_sections``.
  """
  if not 0 < n_train_sections < n_sections:
    raise ValueError('n_train_sections must be between 1 and n_sections - 1')

  n_rows = data.shape[0]

  # integer boundaries that cover every row exactly once; the last boundary is
  # n_rows itself, so no trailing rows are lost to rounding
  bounds = [(k * n_rows) // n_sections for k in range(n_sections + 1)]

  # iloc slices are positional and half-open, so neighbouring sections never
  # share a row (a label-based .loc slice would include both end points)
  sections = [data.iloc[bounds[k]:bounds[k + 1]] for k in range(n_sections)]

  # randomize the order of the sections to minimize date/time bias
  shuffler = random if seed is None else random.Random(seed)
  shuffler.shuffle(sections)

  train = pd.concat(sections[:n_train_sections])
  test = pd.concat(sections[n_train_sections:])

  return [train, test]


In [None]:
# check if split_data works
# split_data shuffles its sections with the random module, so fix the seed first;
# otherwise the train/test membership (and every score below) changes on each run
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

split = split_data(sensordf)
print(split)

# split is [train, test]
train, test = split

[            Date      Time CO(GT)  PT08.S1(CO)  NMHC(GT) C6H6(GT)  \
0     10/03/2004  18.00.00    2,6       1360.0     150.0     11,9   
1     10/03/2004  19.00.00      2       1292.0     112.0      9,4   
2     10/03/2004  20.00.00    2,2       1402.0      88.0      9,0   
3     10/03/2004  21.00.00    2,2       1376.0      80.0      9,2   
4     10/03/2004  22.00.00    1,6       1272.0      51.0      6,5   
...          ...       ...    ...          ...       ...      ...   
3736  13/08/2004  10.00.00    1,5       1016.0    -200.0      7,9   
3737  13/08/2004  11.00.00    1,2        953.0    -200.0      5,8   
3738  13/08/2004  12.00.00    1,1        918.0    -200.0      5,3   
3739  13/08/2004  13.00.00    0,8        892.0    -200.0      4,2   
3740  13/08/2004  14.00.00    0,7        841.0    -200.0      3,3   

      PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  \
0            1046.0    166.0        1056.0    113.0        1692.0   
1             955.0    103.0    

In [None]:
# sanity check: the train/test row-count ratio should be 7/3 = 2.33
len(split[0]) / len(split[1])


2.3333333333333335

In [None]:
# choose sensor data as input samples and ground truth benzene concentration as target
SENSOR_COLUMNS = ['PT08.S1(CO)','PT08.S2(NMHC)','PT08.S3(NOx)','PT08.S4(NO2)','PT08.S5(O3)']
TARGET_COLUMN = 'C6H6(GT)'


def decimal_comma_to_float(values):
  """Turn a column of decimal-comma strings (e.g. '11,9') into floats.

  Returns a float Series with a fresh 0..n-1 index. The values are cast to str
  first, so a column that was already parsed as numeric is handled as well.
  """
  # replace commas with dots, then parse; regex=False treats ',' as a literal
  as_text = values.astype(str).str.strip().str.replace(',', '.', regex=False)
  return as_text.astype(float).reset_index(drop=True)


X_train = train.loc[:, SENSOR_COLUMNS]
Y_train = decimal_comma_to_float(train[TARGET_COLUMN])

X_test = test.loc[:, SENSOR_COLUMNS]
Y_test = decimal_comma_to_float(test[TARGET_COLUMN])

# NOTE(review): the printed frame shows -200.0 in some columns (e.g. NMHC(GT)); that looks
# like a missing-value placeholder rather than a measurement. Such rows are not filtered
# here - confirm against the dataset description and decide whether to drop/mask them.

print(Y_train)

0       11.9
1        9.4
2        9.0
3        9.2
4        6.5
        ... 
6547     7.9
6548     5.8
6549     5.3
6550     4.2
6551     3.3
Length: 6552, dtype: object


In [None]:
# build a histogram gradient boosting regressor (up to 1000 boosting iterations) and fit it
# on the training sensor readings; fit() returns the fitted estimator itself
regressor = HistGradientBoostingRegressor(max_iter=1000)
predictbenz = regressor.fit(X_train, Y_train)

In [None]:
# evaluate on the held-out test split; for scikit-learn regressors score() reports the
# R^2 coefficient of determination (1.0 would be a perfect fit)
predictbenz.score(X_test,Y_test)

0.9998956454918783

In [None]:
# predicted target values (C6H6(GT)) for the test sensor readings
out = predictbenz.predict(X_test)
print(out)

[19.3021416  12.67734618 14.58034226 ...  5.48070935 12.91222325
 14.45432373]


In [None]:
# element-wise difference between the true test targets and the model's predictions
residuals = Y_test.to_numpy().astype(float) - out
print(residuals)

[-0.0021416   0.02265382  0.01965774 ...  0.01929065 -0.01222325
  0.04567627]
