In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from lmfit import minimize, Parameters, Parameter, report_fit
from scipy.integrate import odeint
import sys
import time
import datetime
from datetime import timedelta  
from mpi4py import MPI # use the Discovery cluster @ Dartmouth to perform parallel processing
import import_ipynb
from utils_common import *

In [6]:
# Output locations. Both are used as string prefixes: the fitting function
# appends "<province>_conf_SEIR.pdf" / "<province>_parameters.csv" /
# "<province>_estimation.csv" directly, so the trailing slash is required.
_Figure_PATH_XX_ = './figures/parameters_xx/'
_Data_PATH_XX_ = './data/parameters_xx/'

# Input CSV locations. Not referenced in the visible cells -- presumably
# consumed by the loaders in utils_common (TODO confirm).
_Network_P2P_PATH_ = './data/data_network_P2P.csv'
_City_PATH_ = './data/data_DXY_city_all.csv'
_Province_PATH_ = './data/data_DXY_province_all.csv'

In [7]:
# Load the three raw DXY tables. load_DXY_raw is not defined in this notebook;
# it presumably comes from `from utils_common import *` (TODO confirm).
# data_province_domestic is the default input of simulation_province_xx.
data_city, data_province, data_province_domestic = load_DXY_raw()

In [13]:
# SEIR simulation WITHOUT MIGRATION 
# same timeline for all provinces
# beta, gamma and sigma are time-dependent (piecewise constant)
# beta: first changes every TI = 3 days for the first T = 21 days, then changes every ti days
# gamma and sigma: first change every TT = 6 days for 2*TT = 12 days, then stay the same

# possible algorithms (lmfit method names):
# 'leastsq': Levenberg-Marquardt (default)
# 'emcee': Maximum likelihood via Monte-Carlo Markov Chain
# 'dual_annealing': Dual Annealing optimization
# 'differential_evolution': Differential Evolution optimization

def simulation_province_xx(df = data_province_domestic, update_date_tr = False, rank = 0, # the index of the province
                           T = 21, TT = 6, TI = 3, ti = 7, mmethod = 'dual_annealing', kws = False):
    """Fit a migration-free SEIR model to one province's cumulative confirmed cases.

    The transmission rate beta and the rates gamma/sigma are piecewise constant
    in time. All provinces share the national timeline (first to last
    ``update_date`` in ``df``); the residual is only evaluated on the days
    covered by the selected province. The model's R compartment is the quantity
    compared with ``cum_confirmed``.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``update_date``, ``province_name_en`` and
        ``cum_confirmed``. NOTE(review): the code assumes one row per day per
        province and that the province's series ends on the national last
        date (the model output is aligned with ``est[-timespan_local:]``) --
        confirm against the loader. The default is the value the global
        ``data_province_domestic`` had when this function was defined.
    update_date_tr : date-like or False
        Last date (inclusive) used for fitting. ``False``/``None`` keeps all rows.
    rank : int
        Index into ``names_province`` selecting the province to fit.
    T, TI, ti : int
        beta gets a new value every ``TI`` days during the first ``T`` days and
        every ``ti`` days afterwards.
    TT : int
        gamma and sigma use one value for days ``[0, TT)``, a second one for
        ``[TT, 2*TT)`` and a third one afterwards.
    mmethod : str
        lmfit minimisation method name.
    kws : dict or False
        Extra keyword arguments for the optimiser. For 'emcee' only ``steps``
        is forwarded, for 'dual_annealing' only ``maxiter`` and
        ``initial_temp``, for 'differential_evolution' only ``maxiter``
        (entries that are missing or None are left to the optimiser's
        defaults). For any other method the whole dict is forwarded.

    Returns
    -------
    (result, df_parameters, df_final)
        The lmfit result, a two-column table of fitted parameter values, and a
        table with the columns ``estimation``, ``real`` and ``residual`` over
        the national timeline.

    Raises
    ------
    ValueError
        If the province has no rows after filtering, or its number of rows
        differs from the number of days it spans.

    Side effects
    ------------
    Saves ``<province>_conf_SEIR.pdf`` under ``_Figure_PATH_XX_`` and
    ``<province>_parameters.csv`` / ``<province>_estimation.csv`` under
    ``_Data_PATH_XX_`` (directories are created if missing), and prints the
    rank and the elapsed time.
    """
    import os  # local import: only needed to create the output directories

    start_time = time.time()
    province = names_province[rank]

    # The default `False` means "no cutoff"; comparing dates with False is meaningless.
    if update_date_tr is not False and update_date_tr is not None:
        df = df[df.update_date <= update_date_tr]
    df_single = df[df.province_name_en == province]
    if len(df_single) == 0:
        raise ValueError("no records for province %r (update_date_tr = %r)" % (province, update_date_tr))

    timespan_local = (max(df_single.update_date) - min(df_single.update_date)).days + 1 # provincial timespan
    timespan = (max(df.update_date) - min(df.update_date)).days + 1 # national timespan

    data = np.array(df_single.cum_confirmed)
    if len(data) != timespan_local:
        # the residual subtracts `data` from `timespan_local` model values, so the lengths must agree
        raise ValueError("province %r has %d records but spans %d days; one record per day is required"
                         % (province, len(data), timespan_local))

    # index list for beta: day -> index of the beta parameter active on that day
    list_front = np.repeat(range(int(np.ceil(T/TI))), TI).tolist()
    list_end = np.repeat(range(int(np.ceil((timespan - T)/ti))), ti).tolist()
    offset = max(list_front) + 1 if list_front else 0
    list_end = [item + offset for item in list_end]
    index_list = (list_front + list_end)[:timespan]

    # index list for gamma and sigma (may be longer than timespan when timespan < 2*TT)
    index_list_gs = [0]*TT + [1]*TT + [2]*(timespan - TT - TT)

    def SEIR_equations(states, t, ps):
        """Right-hand side of the SEIR system at time t (in days since the first date)."""
        # odeint may evaluate t between or slightly outside the requested
        # time points, so round to the nearest day and clamp to the data range
        day = min(max(int(round(t)), 0), timespan - 1)
        beta_index = index_list[day]
        gs_index = index_list_gs[day]

        beta = ps['beta_' + str(beta_index)].value
        gamma = ps['gamma_' + str(gs_index)].value
        sigma = ps['sigma_' + str(gs_index)].value
        N = ps['N'].value
        S, E, I, R = states

        return [-beta*S*I/N, beta*S*I/N - sigma*E, sigma*E - gamma*I, gamma*I]

    def SEIR(states_0, ts, ps):
        """Integrate the SEIR system from states_0 over the time points ts."""
        return odeint(SEIR_equations, states_0, ts, args=(ps,))

    def estimation(ps, ts):
        """Modelled R compartment over the whole (national) timeline."""
        states_0 = ps['S_0'].value, ps['E_0'].value, ps['I_0'].value, ps['R_0'].value
        return SEIR(states_0, ts, ps)[:,3]

    def residual(ps, ts, data):
        """Model minus data on the last `timespan_local` days of the timeline."""
        est = estimation(ps, ts)[-timespan_local:]
        return (est - data).ravel()

    ts = range(0, timespan)

    # set parameters (including their bounds)
    ps_bound = Parameters()
    ps_bound.add(name = 'N', value = provincial_population_dict.get(province), vary = False)
    ps_bound.add(name = 'E_0', value = 50, min = 0, max = 500)
    ps_bound.add(name = 'I_0', value = 50, min = 0, max = 200)
    ps_bound.add(name = 'R_0', value = 0, min = 0, max = 100)
    ps_bound.add(name = 'S_0', expr = 'N - E_0 - I_0 - R_0') # S_0 is tied to the others, not fitted
    for i in range(max(index_list) + 1):
        ps_bound.add(name = 'beta_' + str(int(i)) , value = 0.5, min = 0.01, max = 1)
    for i in range(max(index_list_gs) + 1):
        ps_bound.add(name = 'gamma_' + str(int(i)), value = 0.5, min = 0.05, max = 0.5)
    for i in range(max(index_list_gs) + 1):
        ps_bound.add(name = 'sigma_' + str(int(i)), value = 0.5, min = 0.05, max = 0.5)

    # Select the optimiser keyword arguments. The per-method whitelist keeps the
    # historical behaviour (other keys are ignored for these methods); any other
    # method receives the whole dict, so `result` is always defined.
    forwarded_keys = {'emcee': ('steps',),
                      'dual_annealing': ('maxiter', 'initial_temp'),
                      'differential_evolution': ('maxiter',)}
    if not kws:
        fit_kws = {}
    elif mmethod in forwarded_keys:
        fit_kws = {key: kws[key] for key in forwarded_keys[mmethod] if kws.get(key) is not None}
    else:
        fit_kws = dict(kws)

    # fit model and find predicted values
    result = minimize(residual, ps_bound, args = (ts, data), method = mmethod, **fit_kws)

    resid = result.residual.reshape(data.shape)
    final = estimation(result.params, ts)
    # left-pad with None so the provincial series line up with the national timeline
    padding = [None]*(timespan - timespan_local)
    data_padded = np.concatenate((padding, data), axis=0)
    resid_padded = np.concatenate((padding, resid), axis=0)

    # make sure the output directories exist (exist_ok avoids races between MPI ranks)
    for prefix in (_Figure_PATH_XX_, _Data_PATH_XX_):
        out_dir = os.path.dirname(prefix)
        if out_dir:
            os.makedirs(out_dir, exist_ok = True)

    # plot real data and fitted curves
    palette = plt.get_cmap('magma')
    fig = plt.figure(figsize = (10,5))
    plt.plot(ts, data_padded, '-o', color = palette(0.6))
    plt.plot(ts, final, linewidth = 2, color = palette(0.5))
    plt.title(province)
    plt.xlabel('Date')
    plt.ylabel('Number of people')
    fig.savefig(_Figure_PATH_XX_ + province + '_conf_SEIR.pdf', dpi = 400) # save the figure

    # `size` is only defined once the MPI cell has run; fall back to the number
    # of provinces so that serial (non-MPI) calls do not fail with a NameError
    n_proc = globals().get('size', len(names_province))
    print("processor %d of %d" % (rank, n_proc))
    print("--- %s seconds ---" % (time.time() - start_time))

    df_parameters = pd.DataFrame(dict(result.params.valuesdict()).items(), columns=['parameter', 'value'])
    df_parameters.to_csv(_Data_PATH_XX_ + province + '_parameters.csv', index = False) # save the parameters
    df_final = pd.DataFrame({'estimation': final, 'real': data_padded, 'residual': resid_padded}, columns=['estimation', 'real', 'residual'])
    df_final.to_csv(_Data_PATH_XX_ + province + '_estimation.csv', index = False) # save the estimated values
    return result, df_parameters, df_final

In [14]:
# Optimiser keyword presets; only the dual-annealing one is active.
#emcee_kws = dict(steps = 10)
#differential_evolution_kws = dict(maxiter = 1)
annealing_kws = {'maxiter': 1, 'initial_temp': 1e4}
# Last date (inclusive) of the data used for fitting.
date_tr = datetime.date(2020, 3, 10)

In [16]:
#for rank, name in enumerate(names_province):
    #result, df_parameters, df_final = simulation_province_xx(df = data_province_domestic, update_date_tr = date_tr, rank = rank, # the index of the province
                           #T = 21, TT = 6, TI = 3, ti = 7, mmethod = 'dual_annealing', kws = annealing_kws)

In [None]:
# parallel computing using MPI
# settings are given in the pbs file
comm = MPI.COMM_WORLD
rank = comm.Get_rank() # index of this process; used below as the province index
size = comm.Get_size() # total number of MPI processes
# `rank` is passed to simulation_province_xx as the index into names_province,
# so exactly one process per province is required.
if size != len(names_province):
    # sys.exit with a string writes it to stderr and exits with status 1, so the
    # scheduler records the job as failed (a bare sys.exit() reports success)
    sys.exit("Error! The number of MPI processes (%d) must equal the number of provinces (%d)."
             % (size, len(names_province)))

In [None]:
# pbs file
#!/bin/bash -l
#PBS -A Fulab
#PBS -q default
#PBS -N worker29
#PBS -l nodes=2:ppn=16
#PBS -l walltime=480:00:00
#module load python/3-Anaconda
#source activate axel
#cd $PBS_O_WORKDIR
#mpirun -np 30 ./utils_parameters.py

In [None]:
# Fit the province assigned to this MPI rank, using data up to date_tr.
start = time.time()
result, df_parameters, df_final = simulation_province_xx(
    df = data_province_domestic,
    update_date_tr = date_tr,
    rank = rank,
    T = 21, TT = 6, TI = 3, ti = 7,
    mmethod = 'dual_annealing',
    kws = annealing_kws,
)
#print("--- %s seconds ---" % (time.time() - start))