In [1]:
import scipy

import numpy as np
import pandas as pd

In [2]:
import datetime

In [3]:
from curvefit.core.model import CurveModel
from curvefit.core.functions import log_erf, erf

In [4]:
# Model active cases
# Locations of the raw case time series. The files are not yet in the layout
# the IHME model expects; they are reshaped further down in the notebook.
active_cases_india, active_cases_bengaluru = (
    # India-wide file
    '../data/time_series_covid19_confirmed_India_20200410.csv',
    # Bengaluru: as of 10 April from https://github.com/dishamakhija/covid19-india/
    '../data/time_series_bengaluru.csv',
)

In [5]:
# Read both raw tables, then keep only the rows of the region of interest.
df_india, df_bengaluru = (
    pd.read_csv(csv_path)
    for csv_path in (active_cases_india, active_cases_bengaluru)
)
karnataka_cases = df_india[df_india['Province/State'] == 'Karnataka']
bengaluru_cases = df_bengaluru[df_bengaluru['Province/State'] == 'Bengaluru']

In [6]:
# Fit on 23 March - 2 April 2020, evaluate on 3 - 9 April 2020.
# Both end points of each window are included.
train_date_start = datetime.datetime(year=2020, month=3, day=23)
train_date_end   = datetime.datetime(year=2020, month=4, day=2)
test_date_start  = datetime.datetime(year=2020, month=4, day=3)
test_date_end    = datetime.datetime(year=2020, month=4, day=9)

## Prepare data in the IHME format

In [7]:
def fetch_columns(df, start_date, end_date):
    """
    Helper function to fetch the per-day values from the active cases df.

    Every day from start_date to end_date (both inclusive) is looked up as a
    column labelled 'YYYY-MM-DD' and the value in the first row is taken.

    Assumption: df contains only 1 row corresponding to the Province/Region
    of interest

    df : frame with one 'YYYY-MM-DD' column per day (pd.DataFrame)
    start_date, end_date : inclusive date range (datetime.datetime)

    Returns a list with one value per day in chronological order; the list is
    empty when start_date is after end_date.
    Raises KeyError when a day in the range has no column in df, and
    IndexError when df has no rows.
    """
    # '%Y-%m-%d' is the portable spelling of '%F'. The latter is a C library
    # extension and is not accepted by strftime on every platform.
    column_format = '%Y-%m-%d'
    one_day = datetime.timedelta(days=1)

    active_cases = []
    date = start_date
    while date <= end_date:
        active_cases.append(df[date.strftime(column_format)].values[0])
        date += one_day
    return active_cases

def create_ihme_input(region, active_case_df, train_date_start, train_date_end,
                      test_date_start, test_date_end, col_covariate_vars,
                      social_distance):
    """
    Build the training frame for the IHME curve-fitting model.

    region : name written into the 'region' column of every row
    active_case_df : Active cases for the Province/State under consideration (pd.DataFrame)
    train_date_start, train_date_end : Date range for train time frame (inclusive)
    test_date_start, test_date_end : Date range for test time frame (inclusive)
    col_covariate_vars : one covariate value per training day
    social_distance : one social-distancing value per training day

    Returns (train_df, train_active_cases, test_active_cases). train_df has
    one row per training day with the columns 'independent_var' (day index
    starting at 0), 'measurement_value', 'covariate_var', 'social_distance'
    and 'region'.

    Raises AssertionError when col_covariate_vars or social_distance does not
    have exactly one entry per training day.
    """
    train_active_cases = fetch_columns(active_case_df, train_date_start, train_date_end)
    test_active_cases = fetch_columns(active_case_df, test_date_start, test_date_end)

    num_points = len(train_active_cases)

    assert len(social_distance) == num_points, \
        'social_distance needs one value per training day'
    # As of now we assume all variates have same variance
    assert len(col_covariate_vars) == num_points, \
        'col_covariate_vars needs one value per training day'

    train_df = pd.DataFrame({
        'independent_var'   : np.arange(num_points),
        'measurement_value' : train_active_cases,
        'covariate_var'     : col_covariate_vars,
        # Use the caller-supplied values rather than a hard-coded constant,
        # so the argument validated above actually reaches the frame.
        'social_distance'   : list(social_distance),
        'region'            : num_points * [region],
    })
    return train_df, train_active_cases, test_active_cases

# Model assumptions
- Independent variable - day index (0 is the first training day)
- Measurement value - number of active cases
- Social distancing - 1.0 for every day  # TODO: Need to understand scale

In [8]:
# Build the Karnataka training input with a constant covariate and a constant
# social-distancing value of 1.0 for every training day.
region = 'Karnataka'
num_train_days = (train_date_end - train_date_start).days + 1  # both end points are included
col_covariate_vars = [1.0] * num_train_days
social_distance = [1.0] * num_train_days
train_df, train_active_cases, test_active_cases = create_ihme_input(
    region=region,
    active_case_df=karnataka_cases,
    train_date_start=train_date_start,
    train_date_end=train_date_end,
    test_date_start=test_date_start,
    test_date_end=test_date_end,
    col_covariate_vars=col_covariate_vars,
    social_distance=social_distance,
)

In [9]:
# Functions
# identity function
def identity_fun(x):
    """Return x unchanged."""
    return x
# link function used for alpha, p
def exp_fun(x):
    """Return np.exp(x); the result is positive for any real x."""
    return np.exp(x)

In [10]:
# curve_model
# Column names of the training frame used by the model.
col_t, col_obs, col_group = 'independent_var', 'measurement_value', 'region'

# Model parameters: alpha, beta and p, each with the same single covariate.
param_names  = ['alpha', 'beta', 'p']
num_params   = len(param_names)
num_fe       = 3  # fixed effect parameters
col_covs     = [['covariate_var']] * num_params

# Link functions in param_names order: exp for alpha and p, identity for beta.
link_fun     = [exp_fun, identity_fun, exp_fun]
var_link_fun = [identity_fun] * num_fe
fun          = erf

In [11]:
# Build the model from the training frame and the settings defined earlier:
# column names, parameter names, link functions and the erf curve function.
curve_model = CurveModel(
    train_df,
    col_t,
    col_obs,
    col_covs,
    col_group,
    param_names,
    link_fun,
    var_link_fun,
    fun
)

In [12]:
# Initialize params
# Both starting vectors are all zeros. The fe bounds are infinite in both
# directions, while the re bounds have lower == upper == 0.
unbounded, fixed_at_zero = [-np.inf, np.inf], [0.0, 0.0]
fe_init, re_init = np.zeros(num_fe), np.zeros(num_fe)
fe_bounds = num_fe * [unbounded]
re_bounds = num_fe * [fixed_at_zero]

In [13]:
# Fit the model from the starting values and bounds, then keep the fitted
# parameters and the first num_fe entries of the optimizer solution
# (presumably the fixed effects - confirm against the curvefit docs).
curve_model.fit_params(fe_init, re_init, fe_bounds, re_bounds)
params_estimate = curve_model.params
fe_estimate     = curve_model.result.x[: num_fe]

In [14]:
# Fitted parameters (presumably in param_names order: alpha, beta, p - confirm)
params_estimate

array([[4.49626550e-02],
       [2.77965577e+01],
       [8.53426374e+02]])

In [15]:
# RMSE reported by the fitted model (presumably on the training data - confirm)
curve_model.compute_rmse()

4.0497917150332965

In [16]:
# Forecast the test window. Day indices continue where training stopped, so
# the range is derived from the window sizes instead of being hard-coded.
test_days = np.arange(num_train_days, num_train_days + len(test_active_cases))
out = curve_model.predict(
    t=test_days,
    group_name=region
)

In [17]:
# Observed values for the test window, to compare with the forecast in `out`
test_active_cases

[124.0, 128.0, 144.0, 151.0, 175.0, 175.0, 181.0]

In [18]:
# Root mean squared error between forecast and observations on the log scale
log_residuals = np.log(out) - np.log(test_active_cases)
np.sqrt(np.mean(log_residuals ** 2))

0.07662795444184213

In [19]:
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Mean absolute percentage error between observations and predictions.

    y_true : observed values (array-like: list, tuple or np.ndarray)
    y_pred : predicted values (array-like broadcastable against y_true)

    Returns mean(|y_true - y_pred| / |y_true|) * 100 as a float.
    Entries of y_true equal to 0 divide by zero, so the result is then
    inf or nan.
    """
    # Coerce to arrays first so that plain Python sequences work for both
    # arguments; list - list is not defined.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [20]:
# MAPE (in percent) of the forecast on the test window
mean_absolute_percentage_error(test_active_cases, out)

6.433349488405324

In [21]:
# Predictions for the training window: day indices 0 .. num_train_days - 1,
# derived from the window size instead of being hard-coded.
train_days = np.arange(num_train_days)
out_train = curve_model.predict(
    t=train_days,
    group_name=region
)

In [22]:
# MAPE (in percent) of the fitted curve on the training window
mean_absolute_percentage_error(train_active_cases, out_train)

4.875822198011663