In [None]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn import metrics
from tqdm import tqdm
from EvoMSA import base
# NixtamalAI's packages
from covid_xprize.nixtamalai.helpers import ID_COLS, DEATHS_COL
from covid_xprize.nixtamalai import helpers
from covid_xprize.nixtamalai import models
from microtc.utils import save_model

# Load the dataset produced by helpers.preprocess_full(). `data` is used as a
# module-level global by the cells below: `population` is derived from it, and
# compute_y_hy / performance merge predictions against it.
data = helpers.preprocess_full()  

In [None]:
# Map each GeoID to the last Population value recorded for it in `data`.
population = dict(data.groupby("GeoID").Population.last().items())

def predict(data, trans, model, start_date="2020-11-13", end_date="2020-12-05"):
    """Predict daily new cases per region over an inclusive date window.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``GeoID`` and ``Date`` columns and every column
        listed in ``ID_COLS``. ``Date`` is compared against timestamps.
    trans : object
        Feature transformer. ``trans.transform(data, start=..., end=...)``
        is iterated for feature frames, and ``trans.update_prediction`` is
        applied to every raw model output.
    model : object
        Fitted model exposing ``predict(X)``.
    start_date, end_date : str
        Window limits in ``%Y-%m-%d`` format (both inclusive).

    Returns
    -------
    pandas.DataFrame
        The ``ID_COLS`` columns of the rows of ``data`` that fall inside the
        window, plus a ``PredictedDailyNewCases`` column. Regions without
        predictions, or without rows inside the window, are absent. If no
        region qualifies, an empty frame with the same columns is returned.

    Raises
    ------
    ValueError
        If a region has fewer predictions than rows inside the window.
    """
    # Each yielded feature frame is attributed to the GeoID of its first row;
    # predictions are kept in the order the transformer produces them.
    predictions = defaultdict(list)
    for X in trans.transform(data, start=start_date, end=end_date):
        hy = trans.update_prediction(model.predict(X))
        predictions[X.iloc[0]["GeoID"]].append(hy)

    first_day = pd.to_datetime(start_date, format='%Y-%m-%d')
    last_day = pd.to_datetime(end_date, format='%Y-%m-%d')
    window = data[(data.Date >= first_day) & (data.Date <= last_day)]

    geo_pred_dfs = []
    for geo, values in predictions.items():
        geo_pred_df = window.loc[window.GeoID == geo, ID_COLS].copy()
        n_rows = geo_pred_df.shape[0]
        if n_rows == 0:
            # ``values[-0:]`` is the *whole* list, not an empty one, so a
            # region with no rows in the window must be skipped explicitly
            # instead of being assigned predictions it has no rows for.
            continue
        # Keep only the trailing predictions, one per row inside the window.
        geo_pred_df['PredictedDailyNewCases'] = values[-n_rows:]
        geo_pred_dfs.append(geo_pred_df)

    if not geo_pred_dfs:
        # pd.concat refuses an empty list; return an empty, correctly-shaped frame.
        return window.iloc[:0].loc[:, ID_COLS].assign(
            PredictedDailyNewCases=pd.Series(dtype="float64"))
    return pd.concat(geo_pred_dfs)

def compute_y_hy(output):
    """Return smoothed observed and predicted cases per 100,000 inhabitants.

    ``output`` is inner-merged with the module-level ``data`` frame on their
    shared columns. For every ``GeoID`` group of the merged frame,
    ``NewCasesHampel`` and ``PredictedDailyNewCases`` are scaled by
    ``100000 / population[GeoID]`` and smoothed with a 7-row rolling mean
    (``min_periods=1``).

    Returns
    -------
    tuple of numpy.ndarray
        ``(y, hy)``: the per-region arrays concatenated in group order,
        observed values first, predicted values second.
    """
    merged = pd.merge(data, output, how="inner")
    observed, predicted = [], []
    for geo, rows in merged.groupby("GeoID"):
        observed_rate = 100000 * rows.NewCasesHampel / population[geo]
        predicted_rate = 100000 * rows.PredictedDailyNewCases / population[geo]
        observed.append(observed_rate.rolling(7, min_periods=1).mean().to_numpy())
        predicted.append(predicted_rate.rolling(7, min_periods=1).mean().to_numpy())
    return np.concatenate(observed), np.concatenate(predicted)

def performance(output):
    """Score predictions with two mean absolute errors.

    ``output`` is inner-merged with the module-level ``data`` frame on their
    shared columns. Within every ``GeoID`` group of the merged frame,
    ``NewCasesHampel`` and ``PredictedDailyNewCases`` are smoothed with a
    7-row rolling mean (``min_periods=1``), once on the raw counts and once
    on the counts scaled by ``100000 / population[GeoID]``.

    The rolling mean is computed per region for *both* metrics, so the first
    rows of one region are never averaged with the last rows of the previous
    region in the merged frame.

    Returns
    -------
    list
        ``[mae_raw, mae_per_100k]`` as returned by
        ``metrics.mean_absolute_error``.
    """
    window = 7            # rows in the rolling mean
    per_capita = 100000   # report rates per 100,000 inhabitants
    res = pd.merge(data, output, how="inner")

    def _smooth(series):
        # Rolling mean that tolerates short histories at the start of a group.
        return series.rolling(window, min_periods=1).mean().to_numpy()

    y_raw, hy_raw, y_rate, hy_rate = [], [], [], []
    for key, value in res.groupby("GeoID"):
        y_raw.append(_smooth(value.NewCasesHampel))
        hy_raw.append(_smooth(value.PredictedDailyNewCases))
        y_rate.append(_smooth(per_capita * value.NewCasesHampel / population[key]))
        hy_rate.append(_smooth(per_capita * value.PredictedDailyNewCases / population[key]))

    mae = metrics.mean_absolute_error(np.concatenate(y_raw), np.concatenate(hy_raw))
    mae_rate = metrics.mean_absolute_error(np.concatenate(y_rate), np.concatenate(hy_rate))
    return [mae, mae_rate]

In [None]:
# Fit the FeaturesN transformer (lags=16) on the full dataset and take the
# training matrix / target it produces.
transN = models.FeaturesN(lags=16).fit(data)
X, y = transN.training_set()
# Train an EvoMSA ensemble over three [Oscar, <model>] pairs (ARG, LarsG,
# LassoG), using models.AR as the stacking method.
# NOTE(review): classifier=False presumably selects regression mode and
# TR=False presumably disables EvoMSA's default text model -- confirm against
# the EvoMSA documentation.
evo = base.EvoMSA(TR=False, stacked_method=models.AR,
                  classifier=False, n_jobs=1, tm_n_jobs=1,
                  models=[[models.Oscar, models.ARG],
                          [models.Oscar, models.LarsG],
                          [models.Oscar, models.LassoG]]).fit(X, y)
# Disabled: persist the fitted transformer and ensemble to disk.
# save_model([transN, evo], "evomsaN.model")                          
# Predict over predict()'s default date window with the freshly trained model.
# NOTE: `_` is used here as an ordinary variable; in IPython `_` normally
# holds the previous cell output, so this assignment shadows it.
_ = predict(data, transN, evo)
# Disabled: per-sample absolute errors of the smoothed per-capita series.
# y1, hy = compute_y_hy(_)
# evomsa = np.fabs(y1 - hy)
# Cell output: the two MAE values returned by performance() for these predictions.
performance(_)   