In [152]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns

from sklearn import (linear_model, metrics, neural_network, pipeline, preprocessing, model_selection)

In [153]:
"""
Load data.
Change path to your location of CSV files locally.
"""

path = "./data/"

df_zones = pd.read_csv(path + "zona_censal_comuna.csv")
df_demos = pd.read_csv(path + "demo_comunas.csv")
df_inf = pd.read_csv(path + "dp1_contagiados_por_comuna.csv")
df_mob1 = pd.read_csv(path + "od_zones_weeks_20200301.enc.csv")
df_mob2 = pd.read_csv(path + "od_zones_weeks_20200401.enc.csv")

df_mob = pd.concat([df_mob1, df_mob2])

### Zones & Communes

In [154]:
df_zones

Unnamed: 0,OBJECTID,REGION,PROVINCIA,NOM_COMUNA,COMUNA,GEOCODIGO
0,1,13,134,PAINE,13404,13404011003
1,2,13,134,PAINE,13404,13404011002
2,3,13,134,PAINE,13404,13404061004
3,4,13,134,PAINE,13404,13404061001
4,5,13,134,PAINE,13404,13404061002
...,...,...,...,...,...,...
1860,1861,13,136,EL MONTE,13602,13602031001
1861,1862,13,136,TALAGANTE,13601,13601021001
1862,1863,13,136,TALAGANTE,13601,13601021002
1863,1864,13,135,ALHUÉ,13502,13502011001


In [155]:
"""
Zone information available only for Santiago.

Create dictionaries for (standardized) ID lookups for zones and communes.
"""
# dictionary mapping commune id to standardized id (i.e., from 0 to 50)
dict_commune_sid = {commune:sid for sid, commune in enumerate(df_zones.COMUNA.unique())}

# dictionary mapping census zone id to its commune standardized id
dict_zone_commune_sid = {zone:dict_commune_sid[commune] for zone, commune in zip(df_zones.GEOCODIGO, df_zones.COMUNA)}    

In [156]:
# commune_zone_list = lambda x: list(df_zones[df_zones.COMUNA==x].GEOCODIGO.values)
# dict_commune_zones = {commune:commune_zone_list(commune) for commune in df_zones.COMUNA.unique()}
# print(dict_commune_id)
# print(dict_zone_commune_id)
# df_zones

### Mobility data

In [213]:
df_mob.week.max()

17

In [161]:
"""
Create dataframe of edge information: (home_commune, external_commune, week, mean_population_commune)
Aggregate information in same communes.
"""
t = df_mob.week.min()
get_zone_to_com_sid = lambda i: dict_zone_commune_sid[i]

df_edge_list = df_mob[["home_zone","code_zone","week","mean_population_zone"]]
df_edge_list.columns = ['home_com', 'ext_com', 'week', 'mean_pop_com']
df_edge_list.loc[:,"home_com"] = df_edge_list.loc[:,"home_com"].apply(get_zone_to_com_sid)
df_edge_list.loc[:,"ext_com"] = df_edge_list.loc[:,"ext_com"].apply(get_zone_to_com_sid)
df_edge_list.loc[:,"week"] = df_edge_list.loc[:,"week"] - t

# Aggregate data by summing population of equivalent edges in same week.
df_edge_list = df_edge_list.groupby(['home_com','ext_com','week']).agg({'mean_pop_com':'sum'}).reset_index()

In [162]:
"""
Create mobility matrix: A[i,j,t] 
"""
T = df_mob.week.max() - df_mob.week.min() + 1
N = len(dict_commune_sid)
A = np.zeros((N, N, T))

edge_list = zip(df_edge_list.home_com, df_edge_list.ext_com, df_edge_list.week, df_edge_list.mean_pop_com)
for edge in edge_list:
    i, j, t, a_ijt = edge
    A[i,j,t] += a_ijt

In [163]:
# Matrix is not sparse
print(np.count_nonzero(A), N*N*T)

21949 23409


### Infections data

In [164]:
df_inf

Unnamed: 0.1,Unnamed: 0,Region,Codigo region,Comuna,Codigo comuna,Poblacion,2020-03-30,2020-04-01,2020-04-03,2020-04-06,...,2020-04-13,2020-04-15,2020-04-17,2020-04-20,2020-04-24,2020-04-27,2020-05-01,2020-05-04,2020-05-08,Tasa
0,0,Arica y Parinacota,15,Arica,15101,247552.0,6.0,6.0,12.0,41.0,...,115.0,124.0,134.0,166.0,224.0,270.0,297.0,310.0,328.0,132.5
1,1,Arica y Parinacota,15,Camarones,15102,1233.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Arica y Parinacota,15,General Lagos,15202,810.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Arica y Parinacota,15,Putre,15201,2515.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Tarapacá,1,Alto Hospicio,1107,129999.0,0.0,0.0,0.0,5.0,...,14.0,15.0,16.0,27.0,39.0,55.0,77.0,128.0,161.0,123.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,341,Magallanes,12,Punta Arenas,12101,141984.0,29.0,87.0,143.0,203.0,...,387.0,416.0,470.0,516.0,581.0,623.0,685.0,744.0,825.0,581.1
342,342,Magallanes,12,Rio Verde,12103,211.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,343,Magallanes,12,San Gregorio,12104,681.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,146.8
344,344,Magallanes,12,Timaukel,12303,282.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [165]:
"""
Aggregate infections data in same weeks.

Extract infections data of communes in Santiago.
"""
get_com_sid = lambda i: dict_commune_sid[i] if i in dict_commune_sid else pd.NaT
df_y = pd.DataFrame()
df_y["com"] = df_inf["Codigo comuna"]
df_y.loc[:,"com"] = df_y.loc[:,"com"].apply(get_com_sid)
df_y["week1"] = df_inf["2020-03-30"] + df_inf["2020-04-01"] + df_inf["2020-04-03"]
df_y["week2"] = df_inf["2020-04-06"] + df_inf["2020-04-08"] + df_inf["2020-04-10"]
df_y["week3"] = df_inf["2020-04-13"] + df_inf["2020-04-15"] + df_inf["2020-04-17"]
df_y["week4"] = df_inf["2020-04-20"] + df_inf["2020-04-24"]
df_y["week5"] = df_inf["2020-04-27"] + df_inf["2020-05-01"]
df_y["week6"] = df_inf["2020-05-04"] + df_inf["2020-05-08"]
df_y = df_y.dropna().sort_values("com")
y = df_y.drop("com",1).to_numpy()

In [166]:
df_y
y.shape

(51, 6)

### Demographics

In [167]:
df_demos = pd.read_csv(path + "demo_comunas.csv")
df_demos.head()

Unnamed: 0,COMUNA,NOM_COMUNA,Densidad_,SUPERFICIE__KM2_,T_POB,T_HOM,T_MUJ,T_VIV,por_muj,por_hom,pers_viv,viv_km2_total
0,13101,SANTIAGO,17483.935547,23.135237,404495,206678,197817,193628,0.489047,0.510953,2.089032,8369.397823
1,13120,ÑUÑOA,12353.292969,16.856802,208237,95409,112828,92248,0.541825,0.458175,2.257361,5472.449641
2,13108,INDEPENDENCIA,13633.545898,7.35546,100281,49186,51095,36666,0.509518,0.490482,2.734986,4984.868268
3,13123,PROVIDENCIA,9870.609375,14.394146,142079,65710,76369,70965,0.537511,0.462489,2.0021,4930.129245
4,13117,LO PRADO,14671.995117,6.560049,96249,46799,49450,29526,0.513772,0.486228,3.259805,4500.881303


In [168]:
df_C = df_demos
df_C.insert(0, "com", df_C.loc[:,"COMUNA"].apply(get_com_sid))
df_C = df_C.dropna().sort_values("com")
df_C.drop(["NOM_COMUNA", "COMUNA"], axis=1, inplace=True)
C = df_C.drop("com",1).to_numpy()

In [169]:
df_C.head()
print(C.shape)

(51, 10)


In [170]:
# Visualize data
def plot_vars(data, levels, color, leveltype):
    """
    Displays historical trends
    """
    
    fig, ax = plt.subplots(1, 6, figsize=(16,2.5), sharex=True)
    
    palettes = ["blue", "green", "red", "orange", "purple", "black"]
    
    for col, i in dict(zip(levels, list(range(6)))).items():
        data[col].plot(ax=ax[i], legend=True, linewidth=1.0, color=color, sharex=True)     
    
    fig.set_facecolor("floralwhite")
    fig.suptitle(f"Historical trends of VAR {leveltype} variables", 
                 fontsize=14, fontweight="bold", fontname="Verdana")

### Feature Extraction

In [209]:
# Feature extraction
def extract_features(A, y, C):
    """
    This function extracts feature vectors for each sequential input.
    Returns a matrix of all features X = [X_1 X_2 ... X_T] where X_t is matrix of features for data at time t.
    """
    
    N = 51
    L = 15
    T = 6
    X = np.zeros((N, T*L))
    print(A.shape)
    for t in range(T):
        At = A[:,:,t+5]
        yt = y[:,t]
        
        x1 = np.ones(N)
        x2 = np.diag(At)
        x3 = np.dot(At.T, np.ones(N))
        x4 = yt
        x5 = np.dot(At.T, yt)
        
        Xt = np.stack((x1,x2,x3,x4,x5), axis=-1)
        Xt = np.column_stack((Xt, C))
        
        X[:,t*L:(t+1)*L] = Xt
    
    return X

def collect_data(X, y):
    L = 15
    T = 6
    K = 2
    Z = X[:,:K*L]
    for t in range(K,T-K+1):
        Zt = X[:,t*L:(t+K)*L]
        Z = np.vstack((Z,Zt))
    
    Y = y[:,K]
    for t in range(K+1, T):
        Y = np.vstack((Y,y[:,t]))
        
    return Z, Y
        

In [210]:
### Train-validation-test dataset split

def timeseries_train_test_split(Z, Y):
    """
    This function splits the sample into a train and test data.
    """
    T = 6
    K = 2
    N = 51
    L = 15
    X_train = X[:(T-K-1)*N,:]
    X_test = X[(T-K-1)*N:,:]
    y_train = y[:N*(T-K-1)]
    y_test = y[(T-K-1)*N:]
    
    return X_train, y_train, X_test, y_test

In [211]:
### Plot forecast
def train_test_plot(model, X_train, X_test):
    """
    This will plot the actual values of data against the one fitted by the model.
    """
    return

### LASSO

In [212]:
# LASSO

Z = extract_features(A, y, C)
X, Y = collect_data(Z, y)
X_train, Y_train, X_test, Y_test = timeseries_train_test_split(X=X, y=Y)

(51, 51, 9)


IndexError: index 9 is out of bounds for axis 2 with size 9

In [None]:
lasso = linear_model.LassoCV(cv=model_selection.TimeSeriesSplit(), 
                             alphas=None, tol = 10000, normalize=True) 

cv_lasso = lasso.fit(X_train, y_train)
optimal_alpha = cv_lasso.alpha_

lasso2 = linear_model.Lasso(alpha=optimal_alpha, normalize=True)
lasso2.fit(X_train, y_train)

# train_test_plot(lasso2, X_train, X_test) 

In [None]:
# MSE
metrics.mean_squared_error(y_test, lasso2.predict(X_test))

In [None]:
# Feature selection
lasso_coefs = pd.DataFrame({"features":list(X_train), "coef": lasso2.coef_})
lasso_coefs = lasso_coefs[lasso_coefs.coef != 0.0]
lasso_coefs.sort_values("coef", ascending=False)

### XGBoost

In [None]:
X_train, y_train, X_test, y_test = timeseries_train_test_split(X=X, y=y)


scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

xgb = XGBRegressor()
xgb.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

train_test_plot(model=xgb, X_train=X_train_scaled, X_test=X_test_scaled)


### Neural Network

In [None]:
X_train, y_train, X_test, y_test = timeseries_train_test_split(X=X, y=y)

reg = neural_network.MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
reg.fit(X, y)

train_test_plot(model=reg, X_train=X_train_scaled, X_test=X_test_scaled)