In [6]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_log_error
import joblib
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit
import sklearn
import forecasting_metrics as metrics

# importamos voting regressor
from sklearn.ensemble import VotingRegressor
# modelos de regresion a utilizar para las predicciones
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression


# Full-day datasets for La Florida, UCM and UTAL. Per the original note, PM10
# in these files was imputed with KNN.
lf, ucm, utal = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/lf_pm10_ia.csv',
        './dataset/ucm_pm10_ia.csv',
        './dataset/utal_pm10_ia.csv',
    )
)

# Bounded ("acotado") datasets for the same three stations.
lf_acot, ucm_acot, utal_acot = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/lf_pm10a.csv',
        './dataset/ucm_pm10a.csv',
        './dataset/utal_pm10a.csv',
    )
)

# Training subsets, selected by an upper bound on FECHA.
# NOTE(review): the original comments say these cut-offs mean "before 2018"
# (180000) and "before 2017" (170000); the FECHA encoding itself is not
# visible here, so confirm against the CSVs.
training1 = lf.loc[lf["FECHA"] < np.int64(180000)]      # La Florida
training2 = ucm.loc[ucm["FECHA"] < np.int64(170000)]    # UCM
training3 = utal.loc[utal["FECHA"] < np.int64(170000)]  # UTAL

# The string below is disabled code (hold-out test splits). It is a bare
# string expression, so it has no effect when the cell runs.
"""
#Se asignan datos superiores a 2018 para test
test_lf =  lf.loc[lf.FECHA >=180000]
test_ucm =  ucm.loc[ucm.FECHA >=170000]
test_ucm =  test_ucm.loc[test_ucm.FECHA <=180000]
test_utal =  utal.loc[utal.FECHA >=170000]
test_utal =  test_utal.loc[test_utal.FECHA <=180000]
"""

# Base regressors combined by the voting ensemble.
r1 = LinearRegression()
r2 = RandomForestRegressor(n_estimators=10, random_state=1)
r3 = KNeighborsRegressor()
r4 = SVR(kernel='rbf', C=50, epsilon=0.0075, gamma=1e-05)

# A single ensemble instance; every training call in the cells below receives
# this same object.
voting_rgr = VotingRegressor(
    estimators=[
        ('lr', r1),
        ('rf', r2),
        ('knn', r3),
        ('svr', r4),
    ]
)

In [7]:
##################### Basic feature set (no aggregate columns) #####################
def metricas(voting_rgr,train):
    """Print PM10 error metrics for a fitted model using the basic feature set.

    Parameters
    ----------
    voting_rgr : fitted estimator exposing ``predict``.
    train : DataFrame holding the seven basic feature columns and ``PM10``.

    Returns
    -------
    None. The metrics are printed as a list of ``["PM10", label, value]`` rows.
    """
    columnas_base = ["D_viento","H_relativa","P_atmosferica","Temperatura","V_viento","Pluviometria","PM2_5"]
    pm10_estimado = voting_rgr.predict(train[columnas_base].values)
    pm10_real = train["PM10"].values

    # (label, scorer) pairs, evaluated in order as scorer(actual, predicted).
    # NOTE(review): the "MAE" row is computed with median_absolute_error, not
    # a mean absolute error; confirm the label is intended.
    evaluadores = [
        ("R2", r2_score),
        ("MSE", mean_squared_error),
        ("MAE", median_absolute_error),
        ("MaxE", max_error),
        ("RMSE", metrics.rmse),
        ("MASE", metrics.mase),
    ]
    metrica = [
        ["PM10", etiqueta, calcular(pm10_real, pm10_estimado)]
        for etiqueta, calcular in evaluadores
    ]

    print(metrica)
    
def entrenamiento(voting_rgr, train):
    """Fit the ensemble on the basic feature set and print metrics on ``train``.

    The frame is partitioned with a 15-split ``TimeSeriesSplit`` and the model
    is fitted once per split, using only that split's training indices. The
    split's test indices are never used.

    NOTE(review): each ``fit`` call appears to refit from scratch, so only the
    last split's fit is retained, and the metrics printed afterwards are
    computed on the whole of ``train`` - confirm this is intended.

    Parameters
    ----------
    voting_rgr : estimator exposing ``fit`` and ``predict``; fitted in place.
    train : DataFrame holding the seven basic feature columns and ``PM10``.

    Returns
    -------
    The same ``voting_rgr`` object, after fitting.
    """
    columnas_base = ["D_viento","H_relativa","P_atmosferica","Temperatura","V_viento","Pluviometria","PM2_5"]
    particiones = TimeSeriesSplit(max_train_size=None, n_splits=15)

    for idx_entrenamiento, _ in particiones.split(train):
        ventana = train.iloc[idx_entrenamiento]
        voting_rgr.fit(ventana[columnas_base].values, ventana["PM10"].values)

    metricas(voting_rgr, train)
    return voting_rgr

##################### Extended feature set (adds mean / min / max / range columns) #####################
def metricas_full(voting_rgr, train):
    """Print PM10 error metrics for a fitted model using the extended feature set.

    Parameters
    ----------
    voting_rgr : fitted estimator exposing ``predict``.
    train : DataFrame holding the 19 extended feature columns and ``PM10``.

    Returns
    -------
    None. The metrics are printed as a list of ``["PM10", label, value]`` rows.
    """
    # NOTE(review): this list includes PM10-derived aggregates (Media_PM10,
    # Minimo_PM10, Maximo_PM10, Rango_PM10) as predictors of PM10; confirm
    # they do not leak the target.
    columnas_ampliadas = [
        "D_viento","H_relativa","P_atmosferica",
        "Temperatura","V_viento","Pluviometria","Media_D_viento",
        "Media_V_viento","Media_P_atmosferica","Media_Temperatura",
        "Media_H_relativa","Minimo_Temperatura","Maximo_Temperatura",
        "Rango_Temperatura","Media_PM10","Minimo_PM10","Maximo_PM10",
        "Rango_PM10","PM2_5",
    ]
    pm10_estimado = voting_rgr.predict(train[columnas_ampliadas].values)
    pm10_real = train["PM10"].values

    # (label, scorer) pairs, evaluated in order as scorer(actual, predicted).
    # NOTE(review): the "MAE" row is computed with median_absolute_error, not
    # a mean absolute error; confirm the label is intended.
    evaluadores = [
        ("R2", r2_score),
        ("MSE", mean_squared_error),
        ("MAE", median_absolute_error),
        ("MaxE", max_error),
        ("RMSE", metrics.rmse),
        ("MASE", metrics.mase),
    ]
    metrica = [
        ["PM10", etiqueta, calcular(pm10_real, pm10_estimado)]
        for etiqueta, calcular in evaluadores
    ]

    print(metrica)
    
def entrenamiento_full(voting_rgr, train):
    """Fit the ensemble on the extended feature set and print metrics on ``train``.

    The frame is partitioned with a 5-split ``TimeSeriesSplit`` and the model
    is fitted once per split, using only that split's training indices. The
    split's test indices are never used.

    NOTE(review): each ``fit`` call appears to refit from scratch, so only the
    last split's fit is retained, and the metrics printed afterwards are
    computed on the whole of ``train`` - confirm this is intended.

    Parameters
    ----------
    voting_rgr : estimator exposing ``fit`` and ``predict``; fitted in place.
    train : DataFrame holding the 19 extended feature columns and ``PM10``.

    Returns
    -------
    The same ``voting_rgr`` object, after fitting.
    """
    # Base variables plus the mean / min / max / range aggregate columns.
    columnas_ampliadas = [
        "D_viento","H_relativa","P_atmosferica",
        "Temperatura","V_viento","Pluviometria","Media_D_viento",
        "Media_V_viento","Media_P_atmosferica","Media_Temperatura",
        "Media_H_relativa","Minimo_Temperatura","Maximo_Temperatura",
        "Rango_Temperatura","Media_PM10","Minimo_PM10","Maximo_PM10",
        "Rango_PM10","PM2_5",
    ]
    particiones = TimeSeriesSplit(max_train_size=None, n_splits=5)

    for idx_entrenamiento, _ in particiones.split(train):
        ventana = train.iloc[idx_entrenamiento]
        voting_rgr.fit(ventana[columnas_ampliadas].values, ventana["PM10"].values)

    metricas_full(voting_rgr, train)
    return voting_rgr

# DATASET 24HRS

In [8]:

print("\n***********************************************************")
print("********** Analisis de dataset PM10 24:00 hrs **********")
print("***********************************************************\n")

# One row per run, in execution order:
# (banner, training routine, training frame, output pickle).
# The first three rows use the basic feature set, the last three the extended one.
corridas_24h = [
    ("\n********** Metricas de entrenamiento La florida completo basico**********\n",
     entrenamiento, training1, 'modelo_entrenadoPM10_dbLF.pkl'),
    ("\n********** Metricas de entrenamiento UCM completo basico**********\n",
     entrenamiento, training2, 'modelo_entrenadoPM10_dbUCM.pkl'),
    ("\n********** Metricas de entrenamiento Utal completo basico**********\n",
     entrenamiento, training3, 'modelo_entrenadoPM10_dbUTAL.pkl'),
    ("\n********** Metricas de entrenamiento La florida completo ampliado**********\n",
     entrenamiento_full, training1, 'modelo_entrenadoPM10_dfLF.pkl'),
    ("\n********** Metricas de entrenamiento UCM completo ampliado**********\n",
     entrenamiento_full, training2, 'modelo_entrenadoPM10_dfUCM.pkl'),
    ("\n********** Metricas de entrenamiento Utal completo ampliado**********\n",
     entrenamiento_full, training3, 'modelo_entrenadoPM10_dfUTAL.pkl'),
]

for encabezado, rutina, datos, archivo in corridas_24h:
    print(encabezado)
    # Every run receives the same shared `voting_rgr` instance, so each pickle
    # is a snapshot of that object taken right after its own fit.
    voting_rgr_PM10 = rutina(voting_rgr, datos)
    # Persist the fitted model.
    archivos_guardados = joblib.dump(voting_rgr_PM10, archivo)

# Ending the cell on the last dump's return value keeps the displayed cell output.
archivos_guardados




***********************************************************
********** Analisis de dataset PM10 24:00 hrs **********
***********************************************************


********** Metricas de entrenamiento La florida completo basico**********

[['PM10', 'R2', 0.9120678371655511], ['PM10', 'MSE', 720.6793311555825], ['PM10', 'MAE', 8.685132584621666], ['PM10', 'MaxE', 376.3311517091481], ['PM10', 'RMSE', 26.84547133420426], ['PM10', 'MASE', 0.6166093346713206]]

********** Metricas de entrenamiento UCM completo basico**********

[['PM10', 'R2', 0.8082048214982708], ['PM10', 'MSE', 454.96800962503016], ['PM10', 'MAE', 6.923907375499048], ['PM10', 'MaxE', 407.4247066519237], ['PM10', 'RMSE', 21.329979128565274], ['PM10', 'MASE', 0.7169451638035894]]

********** Metricas de entrenamiento Utal completo basico**********

[['PM10', 'R2', 0.8914952886195107], ['PM10', 'MSE', 292.9392526392618], ['PM10', 'MAE', 5.013258675542399], ['PM10', 'MaxE', 289.4941376326881], ['PM10', 'RMSE',

['modelo_entrenadoPM10_dfUTAL.pkl']

# DATASET 6HRS

In [9]:
print("\n***********************************************************")
print("********** Analisis de dataset PM10 17:00 a 23:00 hrs **********")
print("***********************************************************\n")

# Training subsets of the bounded datasets, selected by an upper bound on FECHA.
# NOTE(review): these rebind training1..3, which earlier held the full-day
# subsets; re-running the 24 h cell after this one would train on bounded data.
# NOTE(review): the original comments say the cut-offs mean "before 2018"
# (180000) and "before 2017" (170000); confirm the FECHA encoding.
training1 = lf_acot.loc[lf_acot.FECHA < np.int64(180000)]      # La Florida
training2 = ucm_acot.loc[ucm_acot.FECHA < np.int64(170000)]    # UCM
training3 = utal_acot.loc[utal_acot.FECHA < np.int64(170000)]  # UTAL

# Hold-out subsets. Each bounded frame is filtered with a mask built from that
# same frame. The previous code sliced `lf_acot` for all three stations using
# masks taken from the full-day frames (lf / ucm / utal), which label-aligns a
# mask from one frame against another and returns the wrong rows.
# The test frames are not used further in this cell.
test_lf = lf_acot.loc[lf_acot.FECHA >= 180000]
test_ucm = ucm_acot.loc[(ucm_acot.FECHA >= 170000) & (ucm_acot.FECHA <= 180000)]
test_utal = utal_acot.loc[(utal_acot.FECHA >= 170000) & (utal_acot.FECHA <= 180000)]

#********************************** BASIC FEATURE SET *****************************************************#

print("\n********** Metricas de entrenamiento La florida acotado basico**********\n")
voting_rgr_PM10 = entrenamiento(voting_rgr, training1)
# Persist the fitted model.
joblib.dump(voting_rgr_PM10, 'modelo_entrenadoPM10_dbaLF.pkl')

print("\n********** Metricas de entrenamiento UCM acotado basico**********\n")
voting_rgr_PM10 = entrenamiento(voting_rgr, training2)
# Persist the fitted model.
joblib.dump(voting_rgr_PM10, 'modelo_entrenadoPM10_dbaUCM.pkl')

print("\n********** Metricas de entrenamiento Utal acotado basico**********\n")
voting_rgr_PM10 = entrenamiento(voting_rgr, training3)
# Persist the fitted model.
joblib.dump(voting_rgr_PM10, 'modelo_entrenadoPM10_dbaUTAL.pkl')

#********************************** EXTENDED FEATURE SET *****************************************************#

print("\n********** Metricas de entrenamiento La florida acotado ampliado**********\n")
voting_rgr_PM10 = entrenamiento_full(voting_rgr, training1)
# Persist the fitted model.
joblib.dump(voting_rgr_PM10, 'modelo_entrenadoPM10_dfaLF.pkl')

print("\n********** Metricas de entrenamiento UCM acotado ampliado**********\n")
voting_rgr_PM10 = entrenamiento_full(voting_rgr, training2)
# Persist the fitted model.
joblib.dump(voting_rgr_PM10, 'modelo_entrenadoPM10_dfaUCM.pkl')

print("\n********** Metricas de entrenamiento Utal acotado ampliado**********\n")
voting_rgr_PM10 = entrenamiento_full(voting_rgr, training3)
# Persist the fitted model. This stays the last expression of the cell so its
# return value is what the notebook displays.
joblib.dump(voting_rgr_PM10, 'modelo_entrenadoPM10_dfaUTAL.pkl')


***********************************************************
********** Analisis de dataset PM10 17:00 a 23:00 hrs **********
***********************************************************


********** Metricas de entrenamiento La florida acotado basico**********

[['PM10', 'R2', 0.9203029272256225], ['PM10', 'MSE', 1247.0009508388175], ['PM10', 'MAE', 13.16943839598195], ['PM10', 'MaxE', 384.75638842928606], ['PM10', 'RMSE', 35.31290062907347], ['PM10', 'MASE', 0.41657118769807905]]

********** Metricas de entrenamiento UCM acotado basico**********

[['PM10', 'R2', 0.8211338026309736], ['PM10', 'MSE', 788.0348469250345], ['PM10', 'MAE', 8.753094966941575], ['PM10', 'MaxE', 426.76555329587075], ['PM10', 'RMSE', 28.071958373527032], ['PM10', 'MASE', 0.5089010761464735]]

********** Metricas de entrenamiento Utal acotado basico**********

[['PM10', 'R2', 0.8853279646675352], ['PM10', 'MSE', 572.3195263309509], ['PM10', 'MAE', 7.872491785562303], ['PM10', 'MaxE', 285.97548142916025], ['PM10'

['modelo_entrenadoPM10_dfaUTAL.pkl']