In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [188]:
# Limpiar la columna parameter (nombre contaminante)
def clean_parameter_col(df):
    df['parameter'] = df['parameter'].apply(lambda s: s.strip())
    return df

def preprocess(df):
    df = clean_parameter_col(df)
    df = df[['date', 'parameter', 'SO2', 'SO2_b']]
    df.columns = ['date', 'parameter', 'Value', 'Flag']
    return df

def pivot(df):
    piv = df.pivot(
        columns='parameter',
        index=['date', 'Flag'],
        values='Value'
    )
    return piv


invalid_flags = set(['P', 'p', 'c', 'd', 'b', 'm', 'l', 'z',
                 'o', 's', 'f', 'e', 'a', 's', 'f', 'h'])
def rm_invalid_flags(df):

    df.loc[df['Flag'].isin(invalid_flags), 'Value'] = np.nan
    return df
    

In [189]:
DATA_DIR = "../data"
DATA_RAW =  DATA_DIR + "/raw"
DATA_PROCESSED =  DATA_DIR + "/processed"

RAW_CONT_PATH = f"{DATA_RAW}/SD_TecMTY_contaminantes_2021_2022.csv"
RAW_METEO_PATH = f"{DATA_RAW}/SD_TecMTY_meteorologia_2021_2022.csv"

PROCESSED_CONT_PATH = f"{DATA_PROCESSED}/cont.csv"
PROCESSED_MET_PATH = f"{DATA_PROCESSED}/meteo.csv"

PROCESSED_CONT_PATH_W_FLAG = f"{DATA_PROCESSED}/cont_flags.csv"
PROCESSED_MET_PATH_W_FLAG = f"{DATA_PROCESSED}/meteo_flags.csv"

PROCESSED_PIV_CONT_PATH = f"{DATA_PROCESSED}/piv_cont.csv"
PROCESSED_PIV_MET_PATH = f"{DATA_PROCESSED}/piv_meteo.csv"

# Contaminantes

In [190]:
cont = pd.read_csv(RAW_CONT_PATH, index_col=0)
cont.head()

Unnamed: 0,date,parameter,SE,SE_b,NE,NE_b,CE,CE_b,NO,NO_b,...,SO2,SO2_b,SE3,SE3_b,SUR,SUR_b,Norte2,NTE2_b,NE3,NE3_b
205739,2021-07-01 00:00:00,PM10,,x,,x,,x,,x,...,,x,,l,,l,,l,,x
205740,2021-07-01 00:00:00,PM2.5,,x,,x,,x,,x,...,,x,,l,,l,,l,,x
205741,2021-07-01 00:00:00,O3,,x,,x,,x,,x,...,,x,,l,,l,,l,,x
205742,2021-07-01 00:00:00,SO2,,x,,x,,x,,x,...,,x,,l,,l,,l,,x
205743,2021-07-01 00:00:00,NO2,,e,,x,,x,,x,...,,e,,e,,e,,e,,e


Guardar df incluyendo filas de banderas invalidas

In [191]:
cont = preprocess(cont)
cont['invalid'] = cont['Flag'].apply(lambda x: 1 if x in invalid_flags else 0)
cont.to_csv(PROCESSED_CONT_PATH_W_FLAG, index=False)
cont.drop('invalid', axis=1, inplace=True)
print(len(cont))
cont.head()

65754


Unnamed: 0,date,parameter,Value,Flag
205739,2021-07-01 00:00:00,PM10,,x
205740,2021-07-01 00:00:00,PM2.5,,x
205741,2021-07-01 00:00:00,O3,,x
205742,2021-07-01 00:00:00,SO2,,x
205743,2021-07-01 00:00:00,NO2,,e


Guardar df con NaN para banderas invalidas

In [192]:
cont = rm_invalid_flags(cont)
cont.to_csv(PROCESSED_CONT_PATH, index=False)
print(len(cont))

65754


Guardar df pivoteada sin incluir filas de banderas invalidas

In [194]:
piv_cont = pivot(cont)
piv_cont = piv_cont.reset_index()
piv_cont.to_csv(PROCESSED_PIV_CONT_PATH, index=False)
print(len(piv_cont))
piv_cont.head()

12085


parameter,date,Flag,CO,NO2,O3,PM10,PM2.5,SO2
0,2021-07-01 00:00:00,a,,,,,,
1,2021-07-01 00:00:00,e,,,,,,
2,2021-07-01 00:00:00,x,,,,,,
3,2021-07-01 01:00:00,,0.49,3.9,22.0,21.0,,2.8
4,2021-07-01 01:00:00,l,,,,,,


# Meteorologica

In [106]:
met = pd.read_csv(RAW_METEO_PATH, index_col=0)
met.head()

Unnamed: 0,date,parameter,SE,SE_b,NE,NE_b,CE,CE_b,NO,NO_b,...,SO2,SO2_b,SE3,SE3_b,SUR,SUR_b,Norte2,NTE2_b,NE3,NE3_b
239459,2021-07-01 00:00:00,TOUT,,x,,x,,x,,x,...,,x,,x,,x,,x,,x
239460,2021-07-01 00:00:00,RH,,x,,x,,x,,x,...,,x,,l,,l,,l,,x
239461,2021-07-01 00:00:00,SR,,x,0.0,l,0.0,l,0.0,l,...,0.0,l,0.0,l,0.0,l,0.0,l,0.0,x
239462,2021-07-01 00:00:00,PRS,,x,,x,,x,,x,...,,x,,l,,l,,l,,x
239463,2021-07-01 00:00:00,RAINF,,x,,x,,x,,x,...,,x,,l,,l,,l,,x


In [71]:
met = preprocess(met)
piv_met = pivot(met)
piv_met = piv_met.reset_index()
piv_met.head()

parameter,date,PRS,RAINF,RH,SR,TOUT,WDR,WSR
0,2021-07-01 00:00:00,,,,0.0,,,
1,2021-07-01 01:00:00,714.5,0.0,77.0,0.0,22.32,79.0,109.7
2,2021-07-01 02:00:00,714.20001,0.0,77.0,0.0,22.5,74.0,109.7
3,2021-07-01 03:00:00,714.20001,0.0,76.0,0.0,22.58,69.0,106.9
4,2021-07-01 04:00:00,714.20001,0.0,79.0,0.0,22.24,81.0,118.4


In [72]:
met.to_csv(PROCESSED_MET_PATH, index=False)
piv_met.to_csv(PROCESSED_PIV_MET_PATH, index=False)