In [1]:
import scipy

import numpy as np
import pandas as pd

In [2]:
import datetime

In [3]:
from curvefit.core.model import CurveModel
from curvefit.core.functions import log_erf, erf

In [4]:
# Input for the death model: India's COVID-19 deaths time series (CSV).
# The file is not in the layout the IHME model expects; it is reshaped further down.
covid_deaths_india = "../data/time_series_covid19_deaths_India_20200411.csv"

In [5]:
# CSV with the population of each state
population = "../data/state_population.csv"

In [6]:
# State to model, and the row(s) of the deaths table that belong to it
region = 'Karnataka'
df_india = pd.read_csv(covid_deaths_india)
is_region_row = df_india['Province/State'] == region
region_cases = df_india.loc[is_region_row]

In [7]:
# Look up the population of the selected state.
df_population = pd.read_csv(population)
region_population_rows = df_population.loc[df_population['state'] == region, 'population']
# Fail loudly if the state is missing or listed more than once.
assert len(region_population_rows) == 1, (
    f"expected exactly one population row for {region!r}, found {len(region_population_rows)}"
)
# Take the single value with .iloc[0]: calling int() directly on a
# one-element Series is deprecated in recent pandas releases.
region_population = int(region_population_rows.iloc[0])

In [8]:
# Training window (both end dates are included when the data is fetched)
train_date_start = datetime.datetime(year=2020, month=3, day=23)
train_date_end = datetime.datetime(year=2020, month=4, day=2)
# Test window, starting the day after training ends
test_date_start = datetime.datetime(year=2020, month=4, day=3)
test_date_end = datetime.datetime(year=2020, month=4, day=9)

# Prepare data in IHME format

In [9]:
def fetch_columns(df, region_population, start_date, end_date):
    """
    Collect the per-capita death value for every day in a date range.

    Assumption: df contains only 1 row corresponding to the Province/Region
    of interest; only the first row of each date column is read.

    df                   : deaths table whose date columns are labelled 'YYYY-MM-DD'
    region_population    : population each daily value is divided by
    start_date, end_date : first and last day to fetch (both inclusive)

    Returns a list with one entry per day: that day's value divided by
    region_population. The list is empty when start_date is after end_date.
    A day with no matching column raises KeyError.
    """
    one_day = datetime.timedelta(days=1)
    death_rates = []
    date = start_date
    while date <= end_date:
        # Spell the ISO format out explicitly: '%F' is a platform-specific
        # strftime extension and is not available on every platform.
        column = date.strftime('%Y-%m-%d')
        num_deaths = df[column].values[0]
        death_rates.append(num_deaths / region_population)
        date += one_day
    return death_rates

def create_ihme_input(region, region_population, death_counts_df,
                      train_date_start, train_date_end,
                      test_date_start, test_date_end, col_covariate_vars,
                      social_distance):
    """
    Build the training frame for the IHME curve-fit model.

    region : name of the Province/State, written to the 'region' column
    region_population : population the death counts are divided by
    death_counts_df : deaths for the Province/State under consideration (pd.DataFrame)
    train_date_start, train_date_end : Date range for train time frame
    test_date_start, test_date_end : Date range for test time frame
    col_covariate_vars : one covariate value per training day
    social_distance : one social-distancing value per training day

    Returns (train_df, train_death_counts, test_death_counts):
    train_df has one row per training day with the columns
    'independent_var' (day index starting at 0), 'measurement_value',
    'covariate_var', 'social_distance' and 'region'; the two lists hold the
    per-capita death values for the train and test date ranges.

    Raises AssertionError when col_covariate_vars or social_distance does
    not have exactly one entry per training day.
    """
    train_death_counts = fetch_columns(death_counts_df, region_population,
                                       train_date_start, train_date_end)
    test_death_counts = fetch_columns(death_counts_df, region_population,
                                      test_date_start, test_date_end)

    # One fetched value per day, so this is the number of training days.
    num_train_days = len(train_death_counts)

    assert(len(social_distance) == num_train_days)
    # As of now we assume all variates have same variance
    assert(len(col_covariate_vars) == num_train_days)

    # social_distance is taken from the argument as passed; it used to be
    # replaced by a constant list of 1s, silently discarding the caller's values.
    data_dict = {
        'independent_var'   : np.arange(num_train_days),
        'measurement_value' : train_death_counts,
        'covariate_var'     : col_covariate_vars,
        'social_distance'   : social_distance,
        'region'            : num_train_days * [region],
    }
    train_df = pd.DataFrame(data_dict)
    return train_df, train_death_counts, test_death_counts

# Model assumptions
- Independent variable - day index, counted from the first training date (day 0)
- Measurement val - reported deaths divided by the state population (not active cases)
- Social distancing - 1.0  # TODO: Need to understand scale

In [10]:
# Build the model inputs; every list carries one value per training day.
num_train_days = (train_date_end - train_date_start).days + 1 # Account for the boundary case
col_covariate_vars = [1.0] * num_train_days
# Social distancing is fixed at 1.0, as listed under "Model assumptions"
# (0.0 was passed here before, contradicting that assumption).
social_distance = [1.0] * num_train_days
train_df, train_death_counts, test_death_counts = create_ihme_input(
    region, region_population, region_cases,
    train_date_start, train_date_end,
    test_date_start, test_date_end,
    col_covariate_vars, social_distance,
)

In [11]:
# Functions
# identity link: hands its argument back untouched
def identity_fun(x):
    """Return x unchanged."""
    return x
# exponential link, applied to alpha and p
def exp_fun(x):
    """Return np.exp(x), the exponential of x (element-wise for arrays)."""
    exponential = np.exp(x)
    return exponential

In [12]:
# Settings handed to CurveModel

# Names of the training-frame columns the model reads
col_t        = 'independent_var'
col_obs      = 'measurement_value'
col_group    = 'region'

# Parameters of the curve
param_names  = ['alpha', 'beta', 'p']
num_params   = 3 # alpha, beta and p
num_fe       = 3 # fixed effect parameters

# Every parameter uses the same single covariate column
col_covs     = [['covariate_var']] * num_params

# Link functions, listed in the same order as param_names
link_fun     = [exp_fun, identity_fun, exp_fun]
var_link_fun = [identity_fun] * num_fe

# Curve to fit
fun          = erf

In [13]:
# Preview the first rows of the training frame
train_df.head()

Unnamed: 0,independent_var,measurement_value,covariate_var,social_distance,region
0,0,1.636787e-08,1.0,1,Karnataka
1,1,1.636787e-08,1.0,1,Karnataka
2,2,1.636787e-08,1.0,1,Karnataka
3,3,3.273574e-08,1.0,1,Karnataka
4,4,3.273574e-08,1.0,1,Karnataka


In [14]:
# Build the curve-fit model from the training frame and the settings defined
# in the configuration cell. Arguments are passed positionally, so their
# order must match the CurveModel signature (not visible in this notebook).
curve_model = CurveModel(
    train_df,
    col_t,
    col_obs,
    col_covs,
    col_group,
    param_names,
    link_fun,
    var_link_fun,
    fun
)

In [15]:
# Starting point for the fit: all fixed and random effects begin at zero
fe_init   = np.zeros(num_fe)
re_init   = np.zeros(num_fe)

# Bounds: fixed effects are unbounded, while each random effect has
# lower and upper bound both set to 0.0
fe_bounds = num_fe * [[-np.inf, np.inf]]
re_bounds = num_fe * [[0.0, 0.0]]

In [16]:
# Fit the model from the starting values and bounds set up in the previous cell
curve_model.fit_params(fe_init, re_init, fe_bounds, re_bounds)
# Parameters exposed by the model after fitting
params_estimate = curve_model.params
# First num_fe entries of the optimizer's solution vector
# (presumably the fixed effects -- TODO confirm against curvefit)
fe_estimate     = curve_model.result.x[: num_fe]

In [17]:
# Predict over the test window. Day indices are offsets from train_date_start
# (day 0); they are derived from the configured dates instead of being
# hard-coded, so they stay correct if the train/test windows are changed.
test_day_first = (test_date_start - train_date_start).days
test_day_last = (test_date_end - train_date_start).days
out = curve_model.predict(
    t=np.arange(test_day_first, test_day_last + 1),
    group_name=region
)

In [18]:
# Observed per-capita death values for the test window
test_death_counts

[4.9103615946085014e-08,
 4.9103615946085014e-08,
 6.547148792811335e-08,
 6.547148792811335e-08,
 6.547148792811335e-08,
 6.547148792811335e-08,
 8.18393599101417e-08]

In [19]:
# Model predictions for the test window
out

array([5.15652383e-08, 5.18245976e-08, 5.19483107e-08, 5.20028443e-08,
       5.20250593e-08, 5.20334222e-08, 5.20363316e-08])

In [20]:
# Root mean squared error between the logs of predicted and observed test values
log_residuals = np.log(out) - np.log(test_death_counts)
np.sqrt(np.mean(log_residuals ** 2))

0.24568961432774808

In [21]:
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Mean absolute percentage error of y_pred against y_true, in percent.

    y_true : observed values (any array-like, plain lists included)
    y_pred : predicted values, one per entry of y_true

    Computed as mean(|(y_true - y_pred) / y_true|) * 100. Zeros in y_true
    are not handled specially, so the division yields inf/nan for them.
    """
    # Coerce up front: subtracting two plain Python lists raises TypeError.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [22]:
# Percentage error of the predictions on the test window
mean_absolute_percentage_error(test_death_counts, out)

18.465808384169176

In [23]:
# Predict over the training window: day indices 0 .. num_train_days - 1.
# num_train_days replaces a hard-coded 11 so the range follows the
# configured training dates.
out_train = curve_model.predict(
    t=np.arange(num_train_days),
    group_name=region
)

In [24]:
# Percentage error of the fitted curve on the training window
mean_absolute_percentage_error(train_death_counts, out_train)

11.621132813276526

In [None]:
# Show the parameters stored from the fitted model
params_estimate