In [245]:
import json
import numpy as np

# Read the per-day coefficients; JSON object keys arrive as strings, so cast them to int.
with open("../data/coeficientes.json") as fp:
    factors = {int(key): value for key, value in json.load(fp).items()}

# Running total of the coefficients over days 1..18; a day with no entry contributes 0.
cumul = {1: factors[1]}
for i in range(2, 19):
    cumul[i] = cumul[i - 1] + factors.get(i, 0)

cumul

{1: 35,
 2: 35,
 3: 76,
 4: 76,
 5: 76,
 6: 170,
 7: 203,
 8: 252,
 9: 299,
 10: 355,
 11: 448,
 12: 510,
 13: 617,
 14: 730,
 15: 851,
 16: 997,
 17: 1326,
 18: 1619}

In [281]:
# Confirmed counts for Cuba, keeping only entries with at least one confirmed case
# and numbering them from 1 in file order.
with open("../data/timeseries.json") as fp:
    cases = dict(
        enumerate(
            (
                entry['confirmed']
                for entry in json.load(fp)['Cuba']
                if entry['confirmed'] > 0
            ),
            start=1,
        )
    )
    
# alpha that best fits the data (mean squared error)
# NOTE(review): the disabled estimate below is the arithmetic mean of the
# day-over-day ratios cases[i] / cases[i-1] for days 2..18.
# alpha = []

# for i in range(2, 19):
#     alpha.append(cases[i] / cases[i-1])
    
# alpha = sum(alpha) / len(alpha)

alpha = 1.12072231 # alpha from the model trained in the `predict_cuba.ipynb` notebook, which is supposedly better (generalizes better 🙏)
alpha

1.12072231

In [282]:
# Show the confirmed counts keyed by 1-based index (only entries with confirmed > 0 were kept).
cases

{1: 3,
 2: 4,
 3: 4,
 4: 4,
 5: 4,
 6: 5,
 7: 7,
 8: 11,
 9: 16,
 10: 21,
 11: 35,
 12: 40,
 13: 48,
 14: 57,
 15: 67,
 16: 80,
 17: 119,
 18: 139}

In [283]:
# One-step-ahead forecast: each day's prediction is the previous day's observed count
# multiplied by the growth factor alpha and rounded. Day 1 has no predecessor, so it is
# seeded with the observed value.
predicted = {1: cases[1]}
predicted.update((i, round(cases[i - 1] * alpha)) for i in range(2, 19))

predicted

{1: 3,
 2: 3,
 3: 4,
 4: 4,
 5: 4,
 6: 4,
 7: 6,
 8: 8,
 9: 12,
 10: 18,
 11: 24,
 12: 39,
 13: 45,
 14: 54,
 15: 64,
 16: 75,
 17: 90,
 18: 133}

In [284]:
def error(t1, t2, exponent=1, normalize=True):
    """Return the mean per-element deviation between two equally long sequences.

    For every pair ``(vi, vj)`` taken position-wise from ``t1`` and ``t2`` the
    deviation is ``(|vi - vj| / b) ** exponent`` where ``b`` is ``|vi|`` when
    ``normalize`` is true and ``1`` otherwise. With the defaults this is the
    mean absolute relative error measured against ``t1``; with ``exponent=2``
    and ``normalize=False`` it is the mean squared error.

    Parameters:
        t1: iterable of reference values (used as the denominator when
            ``normalize`` is true).
        t2: iterable of values compared against ``t1``.
        exponent: power applied to each (optionally normalized) deviation.
        normalize: divide each absolute deviation by ``|vi|``.

    Returns:
        The arithmetic mean of the per-element deviations.

    Raises:
        ValueError: if the inputs are empty or differ in length.

    Note:
        A zero in ``t1`` with ``normalize=True`` still divides by zero; that
        case is deliberately not special-cased here.
    """
    # Materialize both inputs so their lengths can be compared; a bare zip()
    # would silently drop the tail of the longer sequence.
    reference = list(t1)
    estimate = list(t2)

    if len(reference) != len(estimate):
        raise ValueError(
            "error() needs sequences of equal length, got %d and %d"
            % (len(reference), len(estimate))
        )
    if not reference:
        raise ValueError("error() needs at least one pair of values")

    def metric(vi, vj):
        deviation = abs(vi - vj)
        scale = abs(vi) if normalize else 1
        return (deviation / scale) ** exponent

    residuals = [metric(vi, vj) for vi, vj in zip(reference, estimate)]
    return sum(residuals) / len(residuals)

In [285]:
import pandas as pd

# One row per day: observed count, cumulative coefficient, its daily increment,
# the ratio of cases to the cumulative coefficient, and the one-step forecast.
paired = pd.DataFrame(
    [
        {
            'day': i,
            'cases': cases[i],
            'factor': cumul[i],
            # Increment of the cumulative factor; day 1 has no predecessor, so it yields 0.
            'new': cumul[i] - cumul.get(i - 1, cumul[i]),
            'percent': cases[i] / cumul[i],
            'predicted': predicted[i],
        }
        for i in cases
    ]
)
# Signed residual: positive when the forecast undershoots the observed count.
paired['errors'] = paired['cases'] - paired['predicted']
paired

Unnamed: 0,day,cases,factor,new,percent,predicted,errors
0,1,3,35,0,0.085714,3,0
1,2,4,35,0,0.114286,3,1
2,3,4,76,41,0.052632,4,0
3,4,4,76,0,0.052632,4,0
4,5,4,76,0,0.052632,4,0
5,6,5,170,94,0.029412,4,1
6,7,7,203,33,0.034483,6,1
7,8,11,252,49,0.043651,8,3
8,9,16,299,47,0.053512,12,4
9,10,21,355,56,0.059155,18,3


In [286]:
# Baseline: mean absolute relative error (default exponent=1, normalize=True) of the forecast against the observed counts.
error(paired['cases'], paired['predicted'])

0.11705543987195122

In [287]:
# `alt` is otherwise first imported in a later cell of this notebook, so import it here;
# without this the cell raises NameError when the notebook is run top to bottom on a
# fresh kernel.
import altair as alt

# Observed counts (green), one-step forecast (orange), and one red bar per day spanning
# the gap between the two.
base = alt.Chart(paired)
(
    base.mark_line(color='green').encode(x='day', y='cases')
    + base.mark_line(color='orange').encode(x='day', y='predicted')
    + base.mark_bar(color='red', width=2).encode(x='day', y='cases', y2='predicted')
)

In [288]:
import altair as alt

# Daily increment of the cumulative factor (default color) layered with the
# forecast residual (red), to eyeball whether the two move together.
(
    alt.Chart(paired).mark_line().encode(x='day', y='new')
    + alt.Chart(paired).mark_line(color="red").encode(x='day', y='errors')
)

In [289]:
# Regression inputs: factor increment as the single feature, forecast residual as the target.
X, y = paired['new'].values, paired['errors'].values

In [297]:
from sklearn.linear_model import LinearRegression

# scikit-learn expects a 2-D feature matrix, so turn the 1-D vector into a single column.
Xt = X.reshape((-1, 1))

# Least-squares fit of the residual on the factor increment; fit() returns the estimator.
model = LinearRegression().fit(Xt, y)
# In-sample R^2: scored on the same data the model was fitted on.
model.score(Xt, y)

0.5925947656470092

In [298]:
# Correction suggested by the regression; astype(int) truncates toward zero rather than rounding.
paired['fix'] = model.predict(Xt).astype(int)
# Forecast after adding the regression-based correction.
paired['predicted+fixed'] = paired[['predicted', 'fix']].sum(axis=1)
paired

Unnamed: 0,day,cases,factor,new,percent,predicted,errors,fix,predicted+fixed
0,1,3,35,0,0.085714,3,0,0,3
1,2,4,35,0,0.114286,3,1,0,3
2,3,4,76,41,0.052632,4,0,1,5
3,4,4,76,0,0.052632,4,0,0,4
4,5,4,76,0,0.052632,4,0,0,4
5,6,5,170,94,0.029412,4,1,4,8
6,7,7,203,33,0.034483,6,1,1,7
7,8,11,252,49,0.043651,8,3,1,9
8,9,16,299,47,0.053512,12,4,1,13
9,10,21,355,56,0.059155,18,3,2,20


In [300]:
# Same metric for the raw forecast and for the regression-corrected forecast.
error_before, error_after = (
    error(paired['cases'], paired[column])
    for column in ('predicted', 'predicted+fixed')
)

error_before, error_after

(0.11705543987195122, 0.11328504995800565)