### 1. Package setup

In [1]:
import scipy
import math
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
from curvefit.core.model import CurveModel
from curvefit import pv
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from curvefit.core.functions import log_erf, erf, expit, log_expit

### 2. Input data setup

In [2]:
# Model active cases
# Need to transform this data format to an amenable format for the IHME model
active_cases_italy = 'timeseries_covid19_italy_nhu_confirmed_cases_04102020.csv'
# Active cases as of 10 April from https://github.com/dishamakhija/covid19-india/blob/master/data/

In [3]:
def generalized_logistic(t, params) :
    alpha = params[0]
    beta  = params[1]
    p     = params[2]
    return p / ( 1.0 + np.exp( - alpha * ( t - beta ) ) )

### 3. Load input data

In [4]:
df_italy = pd.read_csv(active_cases_italy)

lombardia_cases = df_italy.loc[df_italy['Province/State'] == 'Lombardia']


In [9]:
train_date_start = datetime.datetime(2020, 3, 12)
train_date_end = datetime.datetime(2020, 3, 26)
test_date_start = datetime.datetime(2020, 3, 27)
test_date_end = datetime.datetime(2020, 4, 9)

In [6]:
lombardia_cases

Unnamed: 0,Province/State,Country/Region,Lat,Long,02/24/2020,02/25/2020,02/26/2020,02/27/2020,02/28/2020,02/29/2020,...,03/31/2020,04/01/2020,04/02/2020,04/03/2020,04/04/2020,04/05/2020,04/06/2020,04/07/2020,04/08/2020,04/09/2020
8,Lombardia,Italy,45.466794,9.190347,172,240,258,403,531,615,...,43208,44773,46065,47520,49118,50455,51534,52325,53414,54802


### 4. Prepare data in the IHME format

In [7]:
def fetch_columns(df, start_date, end_date, under_reporting_factor):
    """
    Helper function to fetch the columns from the active cases df
    Assumption: df contains only 1 row corresponding to the Province/Region 
    of interest
    """
    # The code for fetching the dates from the df is not clean
    # will clean it up later
    active_cases = []
    date = start_date
    delta = datetime.timedelta(days=1)
    count_df = pd.DataFrame(columns=["date", "active_count"])
    while date <= end_date:
        ##### NOTE - CHANGED DATE FORMAT FOR BLORE DATA
        num_active = df[date.strftime('%m/%d/%Y')].values[0]
        #num_active = df[date.strftime('%F')].values[0]
        count_df = count_df.append({
         "date": date,
         "active_count": num_active 
          }, ignore_index=True)
        date += delta
    
    count_df = count_df.sort_values(by='date', ascending=True)
    count_df.active_count = count_df.active_count*under_reporting_factor
    count_df['cumulative_count'] = count_df.active_count
    print(count_df)
    return count_df

def create_ihme_input(region, active_case_df, train_date_start, train_date_end, 
                      test_date_start, test_date_end,under_reporting_factor, col_covariate_vars,
                      social_distance):
    """
    active_case_df : Active caess for the Province/State under consideration (pd.Dataframe)
    train_date_start, train_date_end : Date range for train time frame
    test_date_start, test_date_end : Date range for test time frame
    """
    train_active_cases_df = fetch_columns(active_case_df, train_date_start, train_date_end, under_reporting_factor)
    test_active_cases_df = fetch_columns(active_case_df, test_date_start, test_date_end, under_reporting_factor)
    
    num_points = (train_active_cases_df.shape[0])
    
    assert(len(social_distance) == num_points)
    # As of now we assume all variates have same variance
    assert(len(col_covariate_vars) == num_points)
    
    num_train_days = (train_date_end - train_date_start).days + 1
    print(num_train_days)
    independent_var   = train_active_cases_df.index
    measurement_value = train_active_cases_df['cumulative_count']
    covariate_var     = col_covariate_vars
    print(covariate_var)
    social_distance   = [1 for i in range(num_train_days)]
    data_group        = num_train_days * [region]
    data_dict         = {
        'independent_var'   : independent_var   ,
        'measurement_value' : measurement_value ,
        'covariate_var'     : covariate_var     ,
        'social_distance'   : social_distance   ,
        'region'        : region        ,
    }
    train_df = pd.DataFrame(data_dict)
    print(train_df)
    return train_df, train_active_cases_df, test_active_cases_df

# Functions
# identity function
def identity_fun(x):
    return x
# link function used for alpha, p
def exp_fun(x):
    return np.exp(x)

def compute_mape(y_true, y_pred):
    mape = 0
    for i in range(len(y_pred)):
        if(not y_true[i] == 0):
            mape+= np.abs((y_true[i] - y_pred[i] + 0.) / y_true[i])
    mape = (100*mape)/len(y_pred)
    return mape

def rmse_error(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred)
            
    rmse = math.sqrt(rmse)
    return rmse

def rmsle_error(y_true, y_pred):
    rmsle = mean_squared_log_error(y_true, y_pred)
            
    rmsle = math.sqrt(rmsle)
    return rmsle



### 4. Model
### Model assumptions
- Independent variable - Day
- Measurement val - number of active cases
- Social distancing - 1.0  # TODO: Need to understand scale

In [10]:
region = 'Lombardia'
num_train_days = (train_date_end - train_date_start).days + 1 # Account for the boundary case
print(num_train_days)
col_covariate_vars = num_train_days * [1.11]
social_distance = num_train_days * [10.0]
under_reporting_factor = 2
train_df, train_active_cases, test_active_cases = create_ihme_input(region, lombardia_cases,
                                                                   train_date_start, train_date_end,
                                                                   test_date_start, test_date_end, 
                                                                    under_reporting_factor,
                                                                   col_covariate_vars, social_distance)

15
         date active_count cumulative_count
0  2020-03-12        17450            17450
1  2020-03-13        19640            19640
2  2020-03-14        23370            23370
3  2020-03-15        26544            26544
4  2020-03-16        29298            29298
5  2020-03-17        32440            32440
6  2020-03-18        35426            35426
7  2020-03-19        39768            39768
8  2020-03-20        44528            44528
9  2020-03-21        51030            51030
10 2020-03-22        54412            54412
11 2020-03-23        57522            57522
12 2020-03-24        61406            61406
13 2020-03-25        64692            64692
14 2020-03-26        69778            69778
         date active_count cumulative_count
0  2020-03-27        74596            74596
1  2020-03-28        78830            78830
2  2020-03-29        82014            82014
3  2020-03-30        84322            84322
4  2020-03-31        86416            86416
5  2020-04-01        89546   

In [11]:
# curve_model
num_params   = 3 # alpha, beta and p
num_fe       = 3 # fixed effect parameters
col_t        = 'independent_var'
col_obs      = 'measurement_value'
col_covs     = num_params *[['covariate_var']]
col_group    = 'region'
param_names  = ['alpha', 'beta', 'p']
link_fun     = [exp_fun, identity_fun, exp_fun ]
var_link_fun = num_fe * [identity_fun ]
fun          = erf

In [12]:
col_covs

[['covariate_var'], ['covariate_var'], ['covariate_var']]

In [14]:
# Initialize params
fe_init   = np.ones(num_fe)
re_init   = np.ones(num_fe)
fe_bounds = [[-np.inf, np.inf], [0, np.inf], [0, np.inf]] 
print(fe_bounds)
re_bounds = [[0.0, 0.0]] * num_fe

[[-inf, inf], [0, inf], [0, inf]]


In [22]:
var_a = [1]
var_b = [1]
var_c = [1]

train_df['var_a'] = 0
train_df['var_b'] = 0
train_df['var_c'] = 0
all_metrics = []
for i in var_a:
    for j in var_b:
        for k in var_c:
            flat = {}
            train_df['var_a'] = i
            train_df['var_b'] = j
            train_df['var_c'] = k
            col_covs = [['var_a'],['var_b'],['var_c']]
            curve_model = CurveModel(
            train_df,
            col_t,
            col_obs,
            col_covs,
            col_group,
            param_names,
            link_fun,
            var_link_fun,
            fun
            )
            curve_model.fit_params(fe_init, re_init, fe_bounds, re_bounds)
            params_estimate = curve_model.params
            fe_estimate     = curve_model.result.x[: num_fe]
            flat['alpha'] = params_estimate[0][0]
            flat['beta'] = params_estimate[1][0]
            flat['p'] = params_estimate[2][0]
            
            out_train = curve_model.predict(
            t=np.array(np.arange(0, 15)),
            group_name=region
            )
            
            if(( (np.max(out_train))!=np.inf) & ((np.isnan(out_train).any()==False))):
                flat['train_mape'] = compute_mape(train_active_cases['cumulative_count'], out_train)
                flat['train_rmsle'] = rmsle_error(train_active_cases['cumulative_count'], out_train)
            else:
                flat['train_mape'] = np.inf
                flat['train_rmsle'] = np.inf
            
            test_out = curve_model.predict(
            t=np.array(np.arange(16, 30)),
            group_name=region
            )
            
            
            if( ((np.max(test_out))!=np.inf) & ((np.isnan(test_out).any())==False)):
                flat['test_mape'] = compute_mape(test_active_cases['cumulative_count'], test_out)
                flat['test_rmsle'] = rmsle_error(test_active_cases['cumulative_count'], test_out)
            else:
                flat['test_mape'] = np.inf
                flat['test_rmsle'] = np.inf
            
            print(flat)
            
            all_metrics.append(flat)

{'alpha': 0.07060273778959013, 'beta': 9.873512940358843, 'p': 105446.59025783473, 'train_mape': 1.698679373395907, 'train_rmsle': 0.020900729254549312, 'test_mape': 2.696361116255715, 'test_rmsle': 0.031292433605656186}


In [21]:
print(test_out)
print(test_active_cases['cumulative_count'])

[ 76937.66351307  80311.45726072  83454.05382435  86352.28238877
  88998.65219205  91391.09921925  93532.54965365  95430.33732306
  97095.5170372   98542.11716508  99786.37327775 100845.98062858
 101739.39726863 102485.22238756]
0      74596
1      78830
2      82014
3      84322
4      86416
5      89546
6      92130
7      95040
8      98236
9     100910
10    103068
11    104650
12    106828
13    109604
Name: cumulative_count, dtype: object


In [24]:
print(out_train)
print(train_active_cases['cumulative_count'])

[17093.39966613 19803.88618334 22764.86267753 25967.41745605
 29396.92745295 33033.07484096 36850.09790054 40817.27865359
 44899.65550523 49058.93492423 53254.56317898 57444.90841961
 61588.49587989 65645.23532034 69577.58036893]
0     17450
1     19640
2     23370
3     26544
4     29298
5     32440
6     35426
7     39768
8     44528
9     51030
10    54412
11    57522
12    61406
13    64692
14    69778
Name: cumulative_count, dtype: object
