# Air quality data preprocessing

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

## ThermoADR

In [145]:
# Load the raw ThermoADR export. The first 24 lines of the file are skipped
# (presumably an instrument header block -- confirm against the raw file).
thermo_raw_path = '../data/raw/ThermoADR/SM-29-06-21.csv'

df_tm = (
    pd.read_csv(thermo_raw_path, skiprows=24)
    .drop(columns=['Unnamed: 0'])   # unnamed leading column carries no data we use
    .dropna()                       # discard rows with any missing field
    # Combine the separate date and time strings into one text column that
    # can be parsed into a timestamp later on.
    .assign(date_time=lambda frame: frame['date'] + ' ' + frame['time'])
)

print(df_tm.shape)
df_tm.head(2)

(34490, 9)


Unnamed: 0,record,PM10,Temp,RHumidity,AtmoPressure,Flags,time,date,date_time
0,1,6.81,26.9,47.0,634.0,2.0,14:09:57,29-jun-2021,29-jun-2021 14:09:57
1,2,3.52,27.0,47.0,634.0,3.0,14:10:57,29-jun-2021,29-jun-2021 14:10:57


In [146]:
# Build hourly means of the ThermoADR record.
#
# NOTE: this cell consumes the minute-level `df_tm` and rebinds the name to
# the hourly frame, so it is not re-runnable on its own; re-run the loading
# cell first.

# Parse the combined "date time" strings into real timestamps.
df_tm['datetime'] = pd.to_datetime(df_tm['date_time'])

# Make sure the measurement columns are numeric before averaging.
df_tm['PM10'] = pd.to_numeric(df_tm['PM10'])
df_tm['Temp'] = pd.to_numeric(df_tm['Temp'])

# Non-positive PM10 readings are replaced with 1, which the original author
# treats as the detection limit.
# NOTE(review): readings in (0, 1) are left untouched -- confirm that only
# values <= 0 are meant to be raised to the limit.
df_tm.loc[df_tm['PM10'] <= 0, 'PM10'] = 1

# Hourly average of every numeric column. The frame still carries the string
# columns `time`, `date` and `date_time`; numeric_only=True excludes them
# explicitly (pandas >= 2.0 raises TypeError otherwise, older versions dropped
# them silently). 'h' replaces the deprecated 'H' alias.
df_tm = (
    df_tm
    .set_index('datetime')
    .resample('h')
    .mean(numeric_only=True)
    .reset_index()
)

print(df_tm.shape)
df_tm.head(5)

(576, 7)


Unnamed: 0,datetime,record,PM10,Temp,RHumidity,AtmoPressure,Flags
0,2021-06-29 14:00:00,26.0,6.831176,28.501961,41.980392,634.0,2.372549
1,2021-06-29 15:00:00,81.5,2.541333,29.301667,40.666667,634.0,2.166667
2,2021-06-29 16:00:00,141.5,2.689333,28.671667,42.183333,633.8,2.116667
3,2021-06-29 17:00:00,201.5,1.261667,31.095,36.65,633.833333,2.116667
4,2021-06-29 18:00:00,261.5,0.990167,31.481667,35.35,632.966667,2.2


In [147]:
# Persist the hourly ThermoADR frame. The default integer index is written as
# the first (unnamed) column of the CSV.
thermo_out_path = '../data/processed/lowcost_analysis/ThermoADR_290621_230721.csv'
df_tm.to_csv(thermo_out_path)

## Simaj Santa Fe

In [141]:
# Load the raw SIMAJ Santa Fe station report. The first column is parsed as a
# timestamp, reading ambiguous dates as day-first.
simaj_raw_path = '../data/raw/simaj/SantaFeReport1_29jun-23jul2021.csv'

df_simaj = pd.read_csv(
    simaj_raw_path,
    parse_dates=[0],
    dayfirst=True,
)

print(df_simaj.shape)
df_simaj.head(2)

(41713, 25)


Unnamed: 0,Date_Time,CO,CO Estado,PM 10,PM 10 Estado,Temp Int,Temp Int Estado,Temp Ext,Temp Ext Estado,RH,...,Precipitacion,Precipitacion Estado,Radiacion Solar,Radiacion Solar Estado,Presion barometrica,Presion barometrica Estado,PM 2.5,PM 2.5 Estado,CO.1,CO Estado.1
0,2021-06-26 00:00:00,0.671,144.0,19.46,144,25.54,144,18.8,144,82.7,...,0.0,144,1.4,144,628.2,144,8.85,144,0.713,144
1,2021-06-26 00:01:00,0.664,144.0,19.38,144,25.54,144,18.9,144,82.8,...,0.0,144,1.6,144,628.3,144,8.73,144,0.711,3


In [142]:
# Build hourly means of the SIMAJ Santa Fe record.
#
# NOTE: this cell rebinds `df_simaj` to the hourly frame, so re-run the
# loading cell before re-running this one.

# `Date_Time` is normally already a datetime column (read_csv was asked to
# parse it day-first). dayfirst=True keeps this conversion consistent with
# that call if the column ever arrives as plain strings; it has no effect on
# values that are already timestamps.
df_simaj['datetime'] = pd.to_datetime(df_simaj['Date_Time'], dayfirst=True)

# Hourly average of every numeric column. numeric_only=True keeps the
# leftover `Date_Time` column (and any non-numeric column) out of the mean,
# matching what older pandas did implicitly; pandas >= 2.0 would otherwise
# average the timestamps or raise on text. 'h' replaces the deprecated 'H'.
# NOTE(review): the "... Estado" columns look like status codes; their hourly
# mean is kept for compatibility but is probably not meaningful -- confirm.
df_simaj = (
    df_simaj
    .set_index('datetime')
    .resample('h')
    .mean(numeric_only=True)
    .reset_index()
)

print(df_simaj.shape)
df_simaj.head(5)

(697, 25)


Unnamed: 0,datetime,CO,CO Estado,PM 10,PM 10 Estado,Temp Int,Temp Int Estado,Temp Ext,Temp Ext Estado,RH,...,Precipitacion,Precipitacion Estado,Radiacion Solar,Radiacion Solar Estado,Presion barometrica,Presion barometrica Estado,PM 2.5,PM 2.5 Estado,CO.1,CO Estado.1
0,2021-06-26 00:00:00,0.741733,144.0,19.3365,144.0,25.416333,144.0,18.338333,144.0,82.501667,...,0.0,144.0,1.721667,144.0,628.301667,144.0,9.070167,144.0,0.781583,134.6
1,2021-06-26 01:00:00,0.844833,144.0,18.5465,144.0,25.4735,144.0,18.758333,144.0,80.793333,...,0.0,144.0,1.541667,144.0,628.661667,144.0,14.6595,144.0,0.88455,144.0
2,2021-06-26 02:00:00,0.560067,139.2,27.018833,139.2,25.430667,139.2,17.973333,139.2,82.528333,...,0.0,139.2,1.56,139.2,628.105,139.2,10.958983,139.316667,0.6017,139.2
3,2021-06-26 03:00:00,0.422567,144.0,23.568333,144.0,25.379667,144.0,17.888333,144.0,80.833333,...,0.0,144.0,1.388333,144.0,627.026667,144.0,15.741833,144.0,0.466217,144.0
4,2021-06-26 04:00:00,0.47525,144.0,28.873,144.0,25.291,144.0,17.691667,144.0,81.425,...,0.0,144.0,1.378333,144.0,626.495,144.0,13.458167,144.0,0.51725,144.0


In [144]:
# Persist the hourly SIMAJ frame. The default integer index is written as the
# first (unnamed) column of the CSV.
simaj_out_path = '../data/processed/lowcost_analysis/simaj_sfe_260621_25072021.csv'
df_simaj.to_csv(simaj_out_path)