In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_log_error
import joblib
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit
import sklearn
import forecasting_metrics as metrics

# importamos voting regressor
from sklearn.ensemble import VotingRegressor
# modelos de regresion a utilizar para las predicciones
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

# Full datasets for La Florida, UCM and UTAL (per the original note, PM2.5 was imputed with KNN).
lf = pd.read_csv('./dataset/lf_pm25_ia.csv')
ucm = pd.read_csv('./dataset/ucm_pm25_ia.csv')
utal = pd.read_csv('./dataset/utal_pm25_ia.csv')

# Restricted ("acotado") datasets for the same three stations.
lf_acot = pd.read_csv('./dataset/lf_pm25a.csv')
ucm_acot = pd.read_csv('./dataset/ucm_pm25a.csv')
utal_acot = pd.read_csv('./dataset/utal_pm25a.csv')

# Training partitions, selected by an upper bound on FECHA.
# NOTE(review): the original comments read these bounds as "before 2018" (La Florida)
# and "before 2017" (UCM, UTAL), which suggests a YYMMDD-style integer - confirm.
training1 = lf[lf["FECHA"] < 180000]
training2 = ucm[ucm["FECHA"] < 170000]
training3 = utal[utal["FECHA"] < 170000]

# Base regressors combined by the voting ensemble.
r1 = LinearRegression()
r2 = RandomForestRegressor(n_estimators=10, random_state=1)
r3 = KNeighborsRegressor()
r4 = SVR(kernel='rbf', C=50, epsilon=0.0075, gamma=1e-05)

voting_rgr = VotingRegressor(
    [
        ('lr', r1),
        ('rf', r2),
        ('knn', r3),
        ('svr', r4),
    ]
)

In [2]:
##################### Basic feature set (7 input variables) #####################
def metricas(voting_rgr, train):
    """Print PM2.5 error metrics of a fitted regressor on the basic feature set.

    Predicts ``PM2_5`` from the seven basic columns of ``train`` and prints a
    list of ``["PM2.5", <metric name>, <value>]`` rows. Nothing is returned.

    Parameters
    ----------
    voting_rgr : fitted estimator exposing ``predict``.
    train : DataFrame holding the seven feature columns and ``PM2_5``.
    """
    columnas = ["D_viento", "H_relativa", "P_atmosferica", "Temperatura",
                "V_viento", "Pluviometria", "PM10"]
    estimado = voting_rgr.predict(train[columnas].values)
    observado = train["PM2_5"].values

    # NOTE(review): the "MAE" row is computed with median_absolute_error, not a
    # mean absolute error - confirm which one the label is meant to denote.
    resultados = [
        ["PM2.5", "R2", r2_score(observado, estimado)],
        ["PM2.5", "MSE", mean_squared_error(observado, estimado)],
        ["PM2.5", "MAE", median_absolute_error(observado, estimado)],
        ["PM2.5", "MaxE", max_error(observado, estimado)],
        ["PM2.5", "RMSE", metrics.rmse(observado, estimado)],
        ["PM2.5", "MASE", metrics.mase(observado, estimado)],
    ]

    print(resultados)
    
def entrenamiento(voting_rgr, train):
    """Fit the regressor on the basic feature set and print its metrics.

    The original code looped over all 15 ``TimeSeriesSplit`` folds and called
    ``fit`` on each one, never touching the test indices. Each ``fit`` on a
    scikit-learn ``VotingRegressor`` refits fresh clones of the base
    estimators, so every fit but the last was thrown away. The model is now
    fitted once, on the training indices of the last split, which yields the
    same final model without the 14 redundant fits.

    NOTE(review): this equivalence assumes ``fit`` starts from scratch; an
    estimator configured with ``warm_start=True`` would behave differently.
    NOTE(review): the last split excludes the final chunk of ``train`` from
    fitting, yet ``metricas`` is evaluated on all of ``train`` - confirm that
    this is the intended protocol.

    Parameters
    ----------
    voting_rgr : estimator exposing ``fit`` and ``predict``.
    train : DataFrame holding the seven basic feature columns and ``PM2_5``.

    Returns
    -------
    The same ``voting_rgr`` object, fitted in place.
    """
    columnas = ["D_viento", "H_relativa", "P_atmosferica", "Temperatura",
                "V_viento", "Pluviometria", "PM10"]

    fold = TimeSeriesSplit(max_train_size=None, n_splits=15)
    # Only the last (largest) training window determines the final model.
    train_index, _ = list(fold.split(train))[-1]

    X_train = train[columnas].iloc[train_index].values
    y_train = train["PM2_5"].iloc[train_index].values
    voting_rgr.fit(X_train, y_train)

    metricas(voting_rgr, train)
    return voting_rgr

##################### Extended feature set (19 input variables) #####################
def metricas_full(voting_rgr, train):
    """Print PM2.5 error metrics of a fitted regressor on the extended feature set.

    Predicts ``PM2_5`` from the 19 extended columns of ``train`` (the basic
    variables plus mean/min/max/range aggregates) and prints a list of
    ``["PM2.5", <metric name>, <value>]`` rows. Nothing is returned.

    Parameters
    ----------
    voting_rgr : fitted estimator exposing ``predict``.
    train : DataFrame holding the 19 feature columns and ``PM2_5``.
    """
    columnas = [
        "D_viento", "H_relativa", "P_atmosferica",
        "Temperatura", "V_viento", "Pluviometria", "Media_D_viento",
        "Media_V_viento", "Media_P_atmosferica", "Media_Temperatura",
        "Media_H_relativa", "Minimo_Temperatura", "Maximo_Temperatura",
        "Rango_Temperatura", "Media_PM2_5", "Minimo_PM2_5", "Maximo_PM2_5",
        "Rango_PM2_5", "PM10",
    ]
    estimado = voting_rgr.predict(train[columnas].values)
    observado = train["PM2_5"].values

    # NOTE(review): the "MAE" row is computed with median_absolute_error, not a
    # mean absolute error - confirm which one the label is meant to denote.
    resultados = [
        ["PM2.5", "R2", r2_score(observado, estimado)],
        ["PM2.5", "MSE", mean_squared_error(observado, estimado)],
        ["PM2.5", "MAE", median_absolute_error(observado, estimado)],
        ["PM2.5", "MaxE", max_error(observado, estimado)],
        ["PM2.5", "RMSE", metrics.rmse(observado, estimado)],
        ["PM2.5", "MASE", metrics.mase(observado, estimado)],
    ]

    print(resultados)
    
def entrenamiento_full(voting_rgr, train):
    """Fit the regressor on the extended feature set and print its metrics.

    The original code looped over all 15 ``TimeSeriesSplit`` folds and called
    ``fit`` on each one, never touching the test indices. Each ``fit`` on a
    scikit-learn ``VotingRegressor`` refits fresh clones of the base
    estimators, so every fit but the last was thrown away. The model is now
    fitted once, on the training indices of the last split, which yields the
    same final model without the 14 redundant fits.

    NOTE(review): this equivalence assumes ``fit`` starts from scratch; an
    estimator configured with ``warm_start=True`` would behave differently.
    NOTE(review): the last split excludes the final chunk of ``train`` from
    fitting, yet ``metricas_full`` is evaluated on all of ``train`` - confirm
    that this is the intended protocol.
    NOTE(review): the feature set includes aggregates of PM2_5 itself
    (Media/Minimo/Maximo/Rango_PM2_5) - verify they do not leak the target.

    Parameters
    ----------
    voting_rgr : estimator exposing ``fit`` and ``predict``.
    train : DataFrame holding the 19 extended feature columns and ``PM2_5``.

    Returns
    -------
    The same ``voting_rgr`` object, fitted in place.
    """
    # Extended feature set: basic variables plus mean/min/max/range aggregates.
    columnas = [
        "D_viento", "H_relativa", "P_atmosferica",
        "Temperatura", "V_viento", "Pluviometria", "Media_D_viento",
        "Media_V_viento", "Media_P_atmosferica", "Media_Temperatura",
        "Media_H_relativa", "Minimo_Temperatura", "Maximo_Temperatura",
        "Rango_Temperatura", "Media_PM2_5", "Minimo_PM2_5", "Maximo_PM2_5",
        "Rango_PM2_5", "PM10",
    ]

    fold = TimeSeriesSplit(max_train_size=None, n_splits=15)
    # Only the last (largest) training window determines the final model.
    train_index, _ = list(fold.split(train))[-1]

    X_train = train[columnas].iloc[train_index].values
    y_train = train["PM2_5"].iloc[train_index].values
    voting_rgr.fit(X_train, y_train)

    metricas_full(voting_rgr, train)
    return voting_rgr

# DATASET 24HRS

In [3]:
print("\n***********************************************************")
print("********** Analisis de dataset PM25 24:00 hrs **********")
print("***********************************************************\n")

# One row per run: (banner, training routine, training frame, output pickle).
# The first three runs use the basic feature set, the last three the extended one.
corridas_24h = [
    ("\n********** Metricas de entrenamiento La florida completo basico**********\n",
     entrenamiento, training1, 'modelo_entrenadoPM25_dbLF.pkl'),
    ("\n********** Metricas de entrenamiento UCM completo basico**********\n",
     entrenamiento, training2, 'modelo_entrenadoPM25_dbUCM.pkl'),
    ("\n********** Metricas de entrenamiento Utal completo basico**********\n",
     entrenamiento, training3, 'modelo_entrenadoPM25_dbUTAL.pkl'),
    ("\n********** Metricas de entrenamiento La florida completo ampliado**********\n",
     entrenamiento_full, training1, 'modelo_entrenadoPM25_dfLF.pkl'),
    ("\n********** Metricas de entrenamiento UCM completo ampliado**********\n",
     entrenamiento_full, training2, 'modelo_entrenadoPM25_dfUCM.pkl'),
    ("\n********** Metricas de entrenamiento Utal completo ampliado**********\n",
     entrenamiento_full, training3, 'modelo_entrenadoPM25_dfUTAL.pkl'),
]

for encabezado, entrenar, datos, archivo_modelo in corridas_24h:
    print(encabezado)
    # The same ensemble object is refitted on every run, so it is persisted
    # right after fitting, before the next run overwrites its state.
    voting_rgr_PM25 = entrenar(voting_rgr, datos)
    ultimo_guardado = joblib.dump(voting_rgr_PM25, archivo_modelo)

# Keep the last dump result as the cell's displayed value.
ultimo_guardado



***********************************************************
********** Analisis de dataset PM25 24:00 hrs **********
***********************************************************


********** Metricas de entrenamiento La florida completo basico**********

[['PM2.5', 'R2', 0.9119558021791737], ['PM2.5', 'MSE', 438.2044761206571], ['PM2.5', 'MAE', 5.546695978227486], ['PM2.5', 'MaxE', 334.49554195763125], ['PM2.5', 'RMSE', 20.933334089930757], ['PM2.5', 'MASE', 0.5849336961174123]]

********** Metricas de entrenamiento UCM completo basico**********

[['PM2.5', 'R2', 0.8086726886913971], ['PM2.5', 'MSE', 242.1202897631001], ['PM2.5', 'MAE', 4.299078349071796], ['PM2.5', 'MaxE', 302.81053694060375], ['PM2.5', 'RMSE', 15.560214965195696], ['PM2.5', 'MASE', 0.7640575925309024]]

********** Metricas de entrenamiento Utal completo basico**********

[['PM2.5', 'R2', 0.9020317306149132], ['PM2.5', 'MSE', 156.18278091037885], ['PM2.5', 'MAE', 3.6379761095810275], ['PM2.5', 'MaxE', 228.055053613170

['modelo_entrenadoPM25_dfUTAL.pkl']

# DATASET 6HRS 

In [4]:
# Train/test partitions for the restricted ("acotado") datasets.
# NOTE: training1..training3 are rebound here and replace the partitions
# built from the full datasets earlier in the notebook.
# NOTE(review): the original comments read these FECHA bounds as "before 2018"
# (La Florida) and "before 2017" (UCM, UTAL) - confirm the FECHA encoding.
training1 = lf_acot.loc[lf_acot.FECHA < np.int64(180000)]
training2 = ucm_acot.loc[ucm_acot.FECHA < np.int64(170000)]
training3 = utal_acot.loc[utal_acot.FECHA < np.int64(170000)]

# Fix: each test partition now comes from its own station's restricted frame
# and uses a mask built from that same frame. The original indexed lf_acot for
# all three stations, with masks taken from the full lf / ucm / utal frames,
# which selects the wrong station and misaligns rows.
test_lf = lf_acot.loc[lf_acot.FECHA >= 180000]
test_ucm = ucm_acot.loc[ucm_acot.FECHA >= 170000]
test_utal = utal_acot.loc[utal_acot.FECHA >= 170000]

# UCM and UTAL test windows are capped at 180000 (inclusive, as in the original).
test_ucm = test_ucm.loc[test_ucm.FECHA <= 180000]
test_utal = test_utal.loc[test_utal.FECHA <= 180000]

print("\n***********************************************************")
print("********** Analisis de dataset PM25 17:00 a 23:00 hrs **********")
print("***********************************************************\n")

# One row per run: (banner, training routine, training frame, output pickle).
# The first three runs use the basic feature set, the last three the extended one.
corridas_acotadas = [
    ("\n********** Metricas de entrenamiento La florida acotado basico**********\n",
     entrenamiento, training1, 'modelo_entrenadoPM25_dbaLF.pkl'),
    ("\n********** Metricas de entrenamiento UCM acotado basico**********\n",
     entrenamiento, training2, 'modelo_entrenadoPM25_dbaUCM.pkl'),
    ("\n********** Metricas de entrenamiento Utal acotado basico**********\n",
     entrenamiento, training3, 'modelo_entrenadoPM25_dbaUTAL.pkl'),
    ("\n********** Metricas de entrenamiento La florida acotado ampliado**********\n",
     entrenamiento_full, training1, 'modelo_entrenadoPM25_dfaLF.pkl'),
    ("\n********** Metricas de entrenamiento UCM acotado ampliado**********\n",
     entrenamiento_full, training2, 'modelo_entrenadoPM25_dfaUCM.pkl'),
    ("\n********** Metricas de entrenamiento Utal acotado ampliado**********\n",
     entrenamiento_full, training3, 'modelo_entrenadoPM25_dfaUTAL.pkl'),
]

for encabezado, entrenar, datos, archivo_modelo in corridas_acotadas:
    print(encabezado)
    # The same ensemble object is refitted on every run, so it is persisted
    # right after fitting, before the next run overwrites its state.
    voting_rgr_PM25 = entrenar(voting_rgr, datos)
    ultimo_guardado = joblib.dump(voting_rgr_PM25, archivo_modelo)

# Keep the last dump result as the cell's displayed value.
ultimo_guardado


***********************************************************
********** Analisis de dataset PM25 17:00 a 23:00 hrs **********
***********************************************************


********** Metricas de entrenamiento La florida acotado basico**********

[['PM2.5', 'R2', 0.9167918773038856], ['PM2.5', 'MSE', 847.0776369028874], ['PM2.5', 'MAE', 8.8142480296359], ['PM2.5', 'MaxE', 338.3866660817902], ['PM2.5', 'RMSE', 29.104598208923747], ['PM2.5', 'MASE', 0.4073675449448182]]

********** Metricas de entrenamiento UCM acotado basico**********

[['PM2.5', 'R2', 0.8295867704970233], ['PM2.5', 'MSE', 433.43999749331607], ['PM2.5', 'MAE', 4.943984593489269], ['PM2.5', 'MaxE', 275.86631654944296], ['PM2.5', 'RMSE', 20.819221827275776], ['PM2.5', 'MASE', 0.5051888568243842]]

********** Metricas de entrenamiento Utal acotado basico**********

[['PM2.5', 'R2', 0.8983487417093857], ['PM2.5', 'MSE', 297.602396992822], ['PM2.5', 'MAE', 5.266492986705316], ['PM2.5', 'MaxE', 225.193496522572

['modelo_entrenadoPM25_dfaUTAL.pkl']