Code for Hawks Process function tranlsated from matlab from chiangwe's HawkPR  repository

## Python version

*Input variables*
*(CSV Files)*

Each has 2824 counties identified by an FIPS code
1.   **Report** ;
rows: 2824 x 6 (6 locations for cases);
cols: 4 + 297 (dates) [15/02/20 - 07/12/20];
records number of cases

2. **Mobility**;
rows: 2824 (counties);
cols: 3 + 297 (dates);
records amount of mobility

3.   **Demography**;
rows: 2824;
cols: 9;
records demographic identifiers of each county




In [1]:
import numpy as np
import jax
import jax.numpy as jnp
import pandas as pd
import warnings
import time
import random
import scipy.stats as stats
import scipy.sparse as sparse
from scipy.stats import weibull_min, poisson
from scipy.optimize import curve_fit
from scipy.sparse import csc_matrix, eye
import statsmodels.api as sm
from statsmodels.genmod.families import Poisson
from sklearn.linear_model import PoissonRegressor
from scipy import sparse
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


code uses a weibull distribution to model inter infection times. the parameters are updated within the code according to the expectation maximization algorithm.

In [22]:
def HawkPR(InputPath_report, InputPath_mobility, InputPath_demography, Delta, Alpha, Beta, EMitr, DaysPred, SimTimes, OutputPath_mdl, OutputPath_pred):
    warnings.filterwarnings('ignore')

    # Read in parameters
    if Alpha == '' and Beta == '':
        print('No shape and scale parameter for Weibull distribution provided. Use MLE to infer alpha and beta ... ')
        alphaScale_in = 0
        betaShape_in = 0
    else:
        alphaScale_in = float(Alpha)
        betaShape_in = float(Beta)

    if Delta == '':
        print('No shift parameter for mobility provided. It will set to zero ... ')
        mobiShift_in = 0
    else:
        mobiShift_in = int(Delta)

    # Read-in COVID data
    NYT = pd.read_csv(InputPath_report)
    NYT = NYT.iloc[:10,:-250]

    # Read-in mobility
    Mobi = pd.read_csv(InputPath_mobility)
    Mobi = Mobi.iloc[:60,:-250]

    # Read-in demographic
    Demo = pd.read_csv(InputPath_demography)
    Demo = Demo.iloc[:10,:]
    Demo_val = Demo.iloc[:, 3:].values

    # Data pre-processing
    covid = NYT.iloc[:, 3:].values
    covid = NYT.iloc[:, 3:].apply(pd.to_numeric, errors='coerce').values  # Convert to numeric, coerce errors to NaN
    covid[np.isnan(covid)] = 0
    covid = np.hstack([np.zeros((covid.shape[0], 1)), np.diff(covid, axis=1)])
    covid[covid <= 0] = 0

    # Pad to shift
    mob_head = Mobi.iloc[:, :4]
    mob_val = Mobi.iloc[:, 4:].values

    for _ in range(mobiShift_in):
        mob_val = np.hstack([np.mean(mob_val[:, :7], axis=1, keepdims=True), mob_val])

    # Get Key and Date
    NYT_Date_list = NYT.columns[3:]
    NYT_Key_list = NYT.iloc[:, :3].values

    Mobi_Type_list = Mobi.iloc[:6, 3].values
    Mobi_Date_list = Mobi.columns[4:]
    Mobi_Key_list = Mobi.iloc[::6, :3].values

    Demo_Type_list = Demo.columns[3:]
    Demo_Key_list = Demo.iloc[:, 0].values


    n_cty, n_day = covid.shape
    n_mobitype = mob_val.shape[0] // n_cty

    print(f'There are {n_cty} counties, {n_mobitype} types of Mobility indices, and {n_day} days in the COVID reports.')

    # Train & Test Split
    n_tr = covid.shape[1] - DaysPred
    mob_tr = mob_val[:, :n_tr]
    mob_te = mob_val[:, n_tr:n_tr+DaysPred]

    # Normalization
    mob_tr_reshape = mob_tr.reshape(n_mobitype,-1).T
    mob_te_reshape = mob_te.reshape(n_mobitype, -1).T

    Demo_val_in = Demo_val
    Demo_val_tr = np.tile(Demo_val_in, (n_tr, 1))
    Demo_val_te = np.tile(Demo_val_in, (DaysPred, 1))

    covid_tr = covid

    Covar_tr = np.hstack([mob_tr_reshape, Demo_val_tr])
    Covar_te = np.hstack([mob_te_reshape, Demo_val_te])

    Covar_tr_mean = np.mean(Covar_tr, axis=0)
    Covar_tr_std = np.std(Covar_tr, axis=0)

    Covar_tr = (Covar_tr - Covar_tr_mean) / Covar_tr_std
    Covar_te = (Covar_te - Covar_tr_mean) / Covar_tr_std

    # Get Variable names
    #clean up variable names
    VarNamesOld = np.concatenate([Mobi_Type_list, Demo_Type_list.T, ['Qprob']])
    VarNames = [name.replace(' & ', '_').replace(' ', '_').lstrip('_') for name in VarNamesOld]

    # Define Parameters
    n_day_tr = n_day
    T = n_day_tr
    dry_correct = 14

    emiter = EMitr
    break_diff = 1e-3
    day_for_tr = min(T - dry_correct, mob_tr.shape[1])

    # Initialize Inferred Parameters
    if (alphaScale_in == 0) and (betaShape_in == 0):
        alpha = 2
        beta = 2
    else:
        alpha = alphaScale_in
        beta = betaShape_in

    # Initial Weibull values
    wbl_val = np.tile(np.tril(weibull_min.pdf(np.arange(1, n_day_tr+1)[:,None] - np.arange(1, n_day_tr+1), c=beta, loc=0, scale=alpha)), (n_cty, 1))
    print(f'shape of wbl_val: {wbl_val.shape})')

    # K0 reproduction number, a function of time and mobility.
    # K0 is a n_county * n_day by n_day matrix.
    K0 = np.ones((n_cty, n_day_tr))
    print(f'shape of K0(reproductive number): {K0.shape})')
    K0_ext_j = np.repeat(K0, n_day_tr, axis=0)
    print(f'shape of K0_ext_j(adjusted): {K0_ext_j.shape})')

    # q is a n_county * n_day by n_day matrix.
    q = sparse.lil_matrix((n_cty * n_day_tr, n_day_tr))

    # Mu is the background rate
    mus = 0.5 * np.ones(n_cty)
    mus = mus.reshape(n_cty , 1)
    print(f'mus shape is: {mus.shape}')

    # lam is the event intensity
    lam = np.zeros((n_cty, T))

    # EM iteration
    alpha_delta = []
    alpha_prev = []
    beta_delta = []
    beta_prev = []
    mus_delta = []
    mus_prev = []
    K0_delta = []
    K0_prev = []
    theta_delta = []
    theta_prev = []

    for itr in range(emiter):
        start_time = time.time()

        # E-step
        q = K0_ext_j * wbl_val * (covid_tr_ext_j(covid_tr, n_day_tr) > 0)
        print(f'shape of q: {q.shape}')
        print(f'shape of covid_tr: {covid_tr_ext_j(covid_tr, n_day_tr).shape}')

        eye_mu_1 = sparse.eye(n_day_tr)
        dense_eye_1 = eye_mu_1.todense()
        eye_mu_1 = np.tile(dense_eye_1, (n_cty, 1))
        eye_mu_2 = np.tile(mus,(n_day_tr,1))
        eye_mu = eye_mu_1 * eye_mu_2
        eye_mu = np.kron(sparse.eye(n_day_tr), np.ones((n_cty, 1))) * np.tile(mus,(n_day_tr,1))
        #eye_mu = eye_mu.toarray()
        print(f'shape of eye_mu (should be (470,47)): {eye_mu.shape}')

        print(f'shape of q * covid_tr_ext_j(covid_tr, n_day_tr): {q * covid_tr_ext_j(covid_tr, n_day_tr).shape} ')
        lam = np.sum(q * covid_tr_ext_j(covid_tr, n_day_tr) + eye_mu, axis=1)
        lam_eq_zero = lam == 0

        q /= lam
        q[lam_eq_zero, :] = 0

        lam = lam.reshape(n_day_tr, n_cty).T

        # Prepare data for Poisson regression
        X = Covar_tr[:n_cty*day_for_tr, :]
        y = Q[:, :day_for_tr].flatten()
        weights = covid_tr[:, :day_for_tr].flatten()
        # Fit Poisson regression model
        model = LinearRegression().fit(X, y, sample_weight=weights)
        # Predict and reshape
        ypred = model.predict(Covar_tr)
        K0 = ypred.reshape(n_cty, n_day_tr)

        # M-step
        Q = np.reshape(q * covid_tr_ext_i[:, :n_day_tr, :], (n_day_tr, n_day_tr * n_cty))
        Q = np.reshape(np.sum(Q, axis=1), (n_cty, n_day_tr))

        # Parameters for Poisson regression
        glm_tr = Covar_tr[:n_cty*day_for_tr, :]
        glm_y = Q[:, :day_for_tr].flatten()

        design_matrix = sm.add_constant(glm_tr)

        family = Poisson()
        model = sm.GLM(glm_y, design_matrix, family=family, extra_kwds={'weights': freqs.flatten()})
        result = model.fit()
        print(model)

        ypred, yci = result.predict(sm.add_constant(Covar_tr))

        # Reshape ypred to match the original dimensions
        K0 = np.reshape(ypred, (n_cty, n_day_tr))

        # Bound K0
        K0 = savgol_filter(K0, window_length=5, polyorder=2)  # Adjust window_length and polyorder as needed

        # Repeat K0_smoothed to match the required shape
        K0_ext_j = np.repeat(K0, n_day_tr, axis=0)

        lam_eq_zero = np.where(lam[:, :day_for_tr] == 0)[0]
        mus[lam != 0] /= lam[lam != 0, :day_for_tr]
        mus[lam_eq_zero] = 0
        # Calculate the average of mus weighted by covid_tr
        mus_avg = np.sum(mus * covid_tr[:, :day_for_tr], axis=1) / day_for_tr

         # Weibull fitting
        if alphaScale_in == 0 and betaShape_in == 0:
            obs = np.tril(np.arange(day_for_tr)**2)
            freq = covid_tr_ext_j[covid_tr, n_day_tr] * covid_tr_ext_i[covid_tr, n_day_tr, n_cty] * q
            freq = np.sum(freq, axis=-1)
            obs = obs[freq > 0]
            freq = freq[obs > 0]

            popt, _ = curve_fit(stats.weibull_min.pdf, obs, freq, maxfev=500)
            alpha, beta = popt

            if beta > 100:
                beta = 100
            if alpha > 100:
                alpha = 100
        else:
            alpha = alphaScale_in
            beta = betaShape_in

        # Convergence check
        if itr == 1:
            #save the first value
            alpha_prev = alpha
            beta_prev = beta
            mus_prev = mus
            K0_prev = K0.flatten()
            theta_prev = model.coef_
        else:
            #calculate the RMSR
            alpha_delta = np.sqrt((alpha - alpha_prev)**2)
            beta_delta = np.sqrt((beta - beta_prev)**2)
            mus_delta = np.sqrt(np.mean((mus_prev - mus)**2))
            K0_delta = np.sqrt(np.mean((K0_prev - K0)**2))
            theta_delta = np.sqrt(np.mean((theta_prev - model.coef_)**2))

            #save the current
            alpha_prev = alpha
            beta_prev = beta
            mus_prev = mus
            K0_prev = K0.flatten()
            theta_prev = model.coef_

        # Check for convergence
        if itr > 5:
            if all(delta < break_diff for delta in [alpha_delta[-4:], beta_delta[-4:], mus_delta, K0_delta]):
                print("Convergence Criterion Met. Breaking out of EM iteration...")
                break

        elapsed_time = time.time() - start_time
        print(f"Iteration {itr+1}, Elapsed time: {elapsed_time:.2f} seconds")

        if itr == emiter - 1:
            print('Reached maximum EM iteration.')


    # Start Simulation
    np.savez(OutputPath_mdl, mus=mus, alpha=alpha, beta=beta, K0=K0, VarNames=VarNames, alpha_delta=alpha_delta, beta_delta=beta_delta, mus_delta=mus_delta, K0_delta=K0_delta, theta_delta=theta_delta)
    loaded_data = np.load(OutputPath_mdl + '.npz')

    # Get K0
    Covar_all = np.vstack((Covar_tr, Covar_te))
    n_day = n_day_tr + DaysPred
    T_sim = n_day
    Tlow = T_sim - DaysPred

    # Predict
    ypred = model.predict(Covar_all)
    fK0 = ypred.reshape(n_cty, n_day)

    # Make fK0 stable
    fK0[fK0 > 4] = 4

    # Simulation results
    sim = np.zeros((n_cty, T_sim, SimTimes))

    # Simulate offsprings
    n_per_batch = 10**2
    K0_sim = fK0[:, Tlow:]

    for itr in range(SimTimes):
        np.random.seed(itr)

        # Calculate base rate
        base = np.zeros((n_cty, DaysPred))
        n_exh = np.zeros((n_cty, DaysPred))

        t_stamps = np.arange(Tlow + 1, T_sim + 1)[:, None] - np.arange(1, Tlow + 1)
        intense = (weibull_min.pdf(t_stamps, alpha, scale=beta)[:, :, None] *
                  fK0[:, :Tlow][:, None, :] *
                  covid_tr[:, :Tlow][:, None, :])
        base = np.sum(intense, axis=2) + mus
        n_exh = np.random.poisson(base)

        for itr_cty in range(int(np.ceil(n_cty * 0.5))):
            for itr_d in range(DaysPred):
                max_d = DaysPred - itr_d

                # Sample first
                if n_exh[itr_cty, itr_d] > n_per_batch:
                    n_batch = n_exh[itr_cty, itr_d] // n_per_batch
                    cand = np.random.poisson(K0_sim[itr_cty, itr_d], size=n_per_batch)
                    n_mod = n_exh[itr_cty, itr_d] % n_per_batch
                    n_offs = np.sum(cand) * n_batch + np.sum(np.random.poisson(K0_sim[itr_cty, itr_d], size=n_mod))
                else:
                    n_offs = np.sum(np.random.poisson(K0_sim[itr_cty, itr_d], size=n_exh[itr_cty, itr_d]))

                if n_offs > n_per_batch:
                    n_batch = n_offs // n_per_batch
                    n_mod = n_offs % n_per_batch

                    sim_cand_wbl = np.ceil(weibull_min.rvs(alpha, scale=beta, size=n_per_batch))
                    sim_cand_wbl = sim_cand_wbl[sim_cand_wbl <= max_d]
                    sim_cand_wbl = np.histogram(sim_cand_wbl, bins=np.arange(1, max_d + 2))[0]

                    t_delta = np.ceil(weibull_min.rvs(alpha, scale=beta, size=n_mod))
                    t_delta = t_delta[t_delta <= max_d]
                    nt = np.histogram(t_delta, bins=np.arange(1, max_d + 2))[0] + sim_cand_wbl * n_batch
                else:
                    t_delta = np.ceil(weibull_min.rvs(alpha, scale=beta, size=n_offs))
                    t_delta = t_delta[t_delta <= max_d]
                    nt = np.histogram(t_delta, bins=np.arange(1, max_d + 2))[0]

                n_exh[itr_cty, itr_d + 1:] += nt

        sim[:, :, itr] = np.concatenate((covid_tr, n_exh[:, None]), axis=1)

    sim_out = sim[:, -DaysPred:, :]

    # Format the output
    sim_mean = np.mean(sim_out, axis=2)
    Date_pred = pd.date_range(start=NYT_Date_list[-1], periods=DaysPred, freq='D').strftime('%Y_%m_%d').to_list()
    table_out = pd.DataFrame(sim_mean, columns=Date_pred)
    table_out = pd.concat([NYT.iloc[:, :3], table_out], axis=1)

    table_out.to_csv(OutputPath_pred, index=False)

def covid_tr_ext_j(covid_tr, n_day_tr):
    return np.repeat(covid_tr, n_day_tr, axis=0)

def covid_tr_ext_i(covid_tr, n_day_tr, n_cty):
    return np.tile(covid_tr.T, (1, n_day_tr)).T

In [3]:
# Path to your CSV file in Google Drive
InputPath_demography = '/content/drive/My Drive/HawkPR_data_sim/Demo_Dconfirmed.csv'
InputPath_report = '/content/drive/My Drive/HawkPR_data_sim/NYT_Dconfirmed.csv'
InputPath_mobility = '/content/drive/My Drive/HawkPR_data_sim/GoogleMobi_Dconfirmed.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(InputPath_demography)

print(df.head())
num_rows, num_columns = df.shape
print(f'Number of rows: {num_rows}')
print(f'Number of columns: {num_columns}')

   countyFIPS  PopulationDensityperSqMile2010  PopulationEstimate2018  \
0        6037                          2419.6              10105518.0   
1       17031                          5495.1               5180493.0   
2       12086                          1315.5               2761581.0   
3        4013                           414.9               4410824.0   
4       48201                          2402.4               4698619.0   

   #ICU_beds  MedianAge2010  Smokers_Percentage  DiabetesPercentage  \
0     2126.0           34.8           10.847678                 8.1   
1     1606.0           35.3           13.776183                 9.0   
2      593.0           38.2           16.479410                 6.7   
3     1004.0           34.6           13.686398                 8.2   
4      918.0           32.2           13.852122                10.3   

   HeartDiseaseMortality  #Hospitals  
0                  150.8        76.0  
1                  175.1        46.0  
2                

In [4]:
#scale of weibull
Alpha = 4
#shape of weibull
Beta = 2

# num of maximum iterations for EM algortihm in case convergence not reached
EMitr = 20

#additional days to be predicted by trained hawks process model
DaysPred = 6

#mobility shift parameter: ???
Delta = 3

SimTimes = 6

#to_csv function will automatically create a csv file with this path
OutputPath_pred = '/content/drive/My Drive/HawkPR_data_sim/Output.csv'
OutputPath_mdl = '/content/drive/My Drive/HawkPR_data_sim/Output_mdl'

In [23]:
HawkPR(InputPath_report, InputPath_mobility, InputPath_demography, Delta, Alpha, Beta, EMitr, DaysPred, SimTimes, OutputPath_mdl, OutputPath_pred)

There are 10 counties, 6 types of Mobility indices, and 47 days in the COVID reports.
shape of wbl_val: (470, 47))
shape of K0(reproductive number): (10, 47))
shape of K0_ext_j(adjusted): (470, 47))
mus shape is: (10, 1)
shape of q: (470, 47)
shape of covid_tr: (470, 47)


ValueError: shapes (470,47) and (470,1) not aligned: 47 (dim 1) != 470 (dim 0)