In [30]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns

from sklearn import (linear_model, metrics, neural_network, pipeline, preprocessing, model_selection)

In [31]:
# Load data
#
# All inputs are CSV files read from the relative `path` directory.

path = "./data/"

# Census-zone table (columns include COMUNA and GEOCODIGO).
df_zones = pd.read_csv(path + "zona_censal_comuna.csv")
# NOTE(review): df_demos is loaded but not used by any visible cell — confirm it is needed.
df_demos = pd.read_csv(path + "demo_comunas.csv")
# Per-commune infection counts, one column per report date.
df_inf = pd.read_csv(path + "dp1_contagiados_por_comuna.csv")
# Zone-to-zone mobility, delivered as two separate files.
df_mob1 = pd.read_csv(path + "od_zones_weeks_20200301.enc.csv")
df_mob2 = pd.read_csv(path + "od_zones_weeks_20200401.enc.csv")

# Both files are read with their own 0-based index; ignore_index gives the
# combined frame a single unique index instead of duplicated row labels.
df_mob = pd.concat([df_mob1, df_mob2], ignore_index=True)

In [83]:
"""
Lookup tables that translate raw identifiers into standardized commune ids.
"""
# Commune code -> standardized id (0 .. number_of_communes - 1), numbered in
# order of first appearance in df_zones.
unique_communes = df_zones.COMUNA.unique()
dict_commune_sid = dict(zip(unique_communes, range(len(unique_communes))))

# Census zone code -> standardized id of the commune that contains the zone.
dict_zone_commune_sid = {}
for zone, commune in zip(df_zones.GEOCODIGO, df_zones.COMUNA):
    dict_zone_commune_sid[zone] = dict_commune_sid[commune]

In [80]:
# Sanity check: show the commune-code -> standardized-id lookup, then the
# census-zone table it was built from.
print(dict_commune_sid)
df_zones

{13404: 0, 13402: 1, 13124: 2, 13301: 3, 13303: 4, 13302: 5, 13504: 6, 13503: 7, 13112: 8, 13121: 9, 13603: 10, 13110: 11, 13122: 12, 13123: 13, 13127: 14, 13120: 15, 13107: 16, 13114: 17, 13115: 18, 13106: 19, 13109: 20, 13105: 21, 13116: 22, 13103: 23, 13104: 24, 13108: 25, 13113: 26, 13132: 27, 13117: 28, 13126: 29, 13119: 30, 13102: 31, 13501: 32, 13118: 33, 13605: 34, 13604: 35, 13201: 36, 13128: 37, 13125: 38, 13401: 39, 13403: 40, 13203: 41, 13202: 42, 13129: 43, 13130: 44, 13111: 45, 13131: 46, 13101: 47, 13601: 48, 13602: 49, 13502: 50}


Unnamed: 0,OBJECTID,REGION,PROVINCIA,NOM_COMUNA,COMUNA,GEOCODIGO
0,1,13,134,PAINE,13404,13404011003
1,2,13,134,PAINE,13404,13404011002
2,3,13,134,PAINE,13404,13404061004
3,4,13,134,PAINE,13404,13404061001
4,5,13,134,PAINE,13404,13404061002
...,...,...,...,...,...,...
1860,1861,13,136,EL MONTE,13602,13602031001
1861,1862,13,136,TALAGANTE,13601,13601021001
1862,1863,13,136,TALAGANTE,13601,13601021002
1863,1864,13,135,ALHUÉ,13502,13502011001


### Mobility data

In [50]:
"""
Create dataframe of edge information: (home_commune, external_commune, week, mean_population_commune)
IDs and weeks are standardized.

Aggregate information according to same communes.
"""
# Earliest week in the mobility data; subtracting it makes weeks start at 0.
t = df_mob.week.min()
# Census zone id -> standardized commune id (raises KeyError for unknown zones).
get_zone_to_com_sid = lambda i: dict_zone_commune_sid[i]

# Build the edge list as one chain so the result is an independent frame;
# assigning into a column slice of df_mob triggers SettingWithCopyWarning.
df_edge_list = (
    df_mob[["home_zone", "code_zone", "week", "mean_population_zone"]]
    .rename(columns={
        "home_zone": "home_com",
        "code_zone": "ext_com",
        "mean_population_zone": "mean_pop_com",
    })
    .assign(
        home_com=lambda d: d["home_com"].map(get_zone_to_com_sid),
        ext_com=lambda d: d["ext_com"].map(get_zone_to_com_sid),
        week=lambda d: d["week"] - t,
    )
    # Aggregate data by summing population of equivalent edges in same week.
    .groupby(['home_com', 'ext_com', 'week'])
    .agg({'mean_pop_com': 'sum'})
    .reset_index()
)

In [51]:
# Inspect the aggregated edge list: one row per (home_com, ext_com, week).
df_edge_list

Unnamed: 0,home_com,ext_com,week,mean_pop_com
0,0,0,0,6309.983333
1,0,0,1,6231.733333
2,0,0,2,7239.033333
3,0,0,3,7708.000000
4,0,0,4,16121.833333
...,...,...,...,...
21944,50,50,4,403.500000
21945,50,50,5,220.000000
21946,50,50,6,220.400000
21947,50,50,7,222.000000


In [52]:
"""
Build the mobility tensor A, where A[i, j, t] accumulates mean_pop_com for
home commune i, external commune j and (zero-based) week t.
"""
weeks = df_mob.week
T = weeks.max() - weeks.min() + 1
N = len(dict_commune_sid)
A = np.zeros((N, N, T))

# Unbuffered scatter-add: every edge row is added into its (i, j, t) cell,
# so repeated index triples accumulate exactly as a row-by-row loop would.
edge_index = (
    df_edge_list.home_com.values,
    df_edge_list.ext_com.values,
    df_edge_list.week.values,
)
np.add.at(A, edge_index, df_edge_list.mean_pop_com.values)

In [59]:
# Density check: compare populated cells with the total cell count
# (the matrix turns out not to be sparse).
nonzero_cells = np.count_nonzero(A)
total_cells = A.size
print(nonzero_cells, total_cells)

21949 23409


### Infections data

In [81]:
# Inspect the raw infection table (one row per commune, one column per report date).
df_inf
# print(df_inf.columns)

Unnamed: 0.1,Unnamed: 0,Region,Codigo region,Comuna,Codigo comuna,Poblacion,2020-03-30,2020-04-01,2020-04-03,2020-04-06,...,2020-05-01,2020-05-04,2020-05-08,Tasa,week1,week2,week3,week4,week5,week6
0,0,Arica y Parinacota,15,Arica,15101,247552.0,6.0,6.0,12.0,41.0,...,297.0,310.0,328.0,132.5,24.0,191.0,373.0,390.0,567.0,638.0
1,1,Arica y Parinacota,15,Camarones,15102,1233.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Arica y Parinacota,15,General Lagos,15202,810.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Arica y Parinacota,15,Putre,15201,2515.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Tarapacá,1,Alto Hospicio,1107,129999.0,0.0,0.0,0.0,5.0,...,77.0,128.0,161.0,123.8,0.0,21.0,45.0,66.0,132.0,289.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,341,Magallanes,12,Punta Arenas,12101,141984.0,29.0,87.0,143.0,203.0,...,685.0,744.0,825.0,581.1,259.0,709.0,1273.0,1097.0,1308.0,1569.0
342,342,Magallanes,12,Rio Verde,12103,211.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,343,Magallanes,12,San Gregorio,12104,681.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,146.8,0.0,0.0,0.0,0.0,0.0,1.0
344,344,Magallanes,12,Timaukel,12303,282.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
# Commune code -> standardized id. Codes that are not in dict_commune_sid map
# to NaN (a numeric missing value) so the rows can be dropped below without
# mixing integers and a datetime sentinel in the same column.
get_com_sid = lambda i: dict_commune_sid.get(i, np.nan)

# Report dates whose counts are added together to form each weekly target.
# NOTE(review): weeks 1-3 sum three reports while weeks 4-6 sum two, so the
# weekly totals are not on the same scale — confirm this is intended.
week_report_dates = {
    "week1": ["2020-03-30", "2020-04-01", "2020-04-03"],
    "week2": ["2020-04-06", "2020-04-08", "2020-04-10"],
    "week3": ["2020-04-13", "2020-04-15", "2020-04-17"],
    "week4": ["2020-04-20", "2020-04-24"],
    "week5": ["2020-04-27", "2020-05-01"],
    "week6": ["2020-05-04", "2020-05-08"],
}

df_y = pd.DataFrame({"com": df_inf["Codigo comuna"].map(get_com_sid)})
for week_name, report_dates in week_report_dates.items():
    # skipna=False keeps a missing report as NaN (like `+` does) instead of
    # silently counting it as zero.
    df_y[week_name] = df_inf[report_dates].sum(axis=1, skipna=False)

# Keep only communes with a standardized id and complete weekly data,
# store the id as an integer, and order rows by it.
df_y = (
    df_y
    .dropna()
    .astype({"com": int})
    .sort_values("com")
)

In [100]:
# Inspect the target table: standardized commune id plus week1..week6 totals, sorted by id.
df_y

Unnamed: 0,com,week1,week2,week3,week4,week5,week6
112,0,0.0,7.0,27.0,24.0,36.0,73.0
83,1,29.0,69.0,119.0,100.0,141.0,206.0
118,2,74.0,163.0,269.0,255.0,400.0,660.0
87,3,118.0,178.0,232.0,185.0,232.0,382.0
132,4,0.0,8.0,12.0,10.0,14.0,24.0
101,5,33.0,58.0,123.0,106.0,164.0,289.0
108,6,0.0,0.0,1.0,2.0,4.0,6.0
89,7,15.0,27.0,41.0,29.0,34.0,48.0
99,8,23.0,82.0,213.0,235.0,464.0,828.0
113,9,33.0,71.0,202.0,198.0,309.0,523.0


In [None]:
# Visualize data to detect stationarity
def plot_vars(data, levels, color, leveltype):
    """
    Displays historical trends, one panel per variable, in a single row.

    Parameters
    ----------
    data : mapping of column name -> plottable series (e.g. a DataFrame);
        ``data[col].plot(...)`` is called for every entry of ``levels``.
    levels : iterable of column names to plot, left to right.
    color : line color passed to every panel.
    leveltype : text inserted into the figure title.

    The figure is drawn as a side effect; nothing is returned.
    """
    levels = list(levels)
    # One panel per requested variable (at least one so an empty list still
    # yields a valid figure). squeeze=False keeps `axes` two-dimensional even
    # when there is a single panel.
    n_panels = max(len(levels), 1)
    fig, axes = plt.subplots(1, n_panels, figsize=(16, 2.5), sharex=True, squeeze=False)

    for ax, col in zip(axes[0], levels):
        data[col].plot(ax=ax, legend=True, linewidth=1.0, color=color, sharex=True)

    fig.set_facecolor("floralwhite")
    fig.suptitle(f"Historical trends of VAR {leveltype} variables",
                 fontsize=14, fontweight="bold", fontname="Verdana")

In [None]:
# Feature extraction
def extract_features(in_data):
    """
    This function extracts feature vectors for each sequential input.

    Intended contract: return a matrix of all features X = [X_1 X_2 ... X_T]
    where X_t is the matrix of features for data at time t.

    The extraction is not implemented yet, so the function fails loudly
    instead of handing callers a silent ``None`` in place of a matrix.

    Raises
    ------
    NotImplementedError
        Always, until the feature extraction is written.
    """
    raise NotImplementedError("extract_features is a placeholder and is not implemented yet")

In [None]:
# Train-validation-test dataset split

def timeseries_train_test_split(X, y, test_size=0.3):
    """
    This function splits the sample into a train and test data.

    The split is chronological: rows keep their order, the leading rows form
    the training set and the trailing ``test_size`` fraction forms the test
    set. Nothing is shuffled, so the test set never precedes the training set.

    Parameters
    ----------
    X, y : equally long sequences (list, NumPy array, pandas DataFrame/Series).
        pandas objects are sliced positionally with ``.iloc``.
    test_size : float strictly between 0 and 1, fraction of rows held out
        (rounded down to a whole number of rows). Defaults to 0.3.

    Returns
    -------
    X_train, y_train, X_test, y_test

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is out of range.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if not 0 < test_size < 1:
        raise ValueError("test_size must be strictly between 0 and 1")

    split_at = n_samples - int(n_samples * test_size)

    def _rows(obj, rows):
        # Positional slicing for pandas objects, plain slicing otherwise.
        return obj.iloc[rows] if hasattr(obj, "iloc") else obj[rows]

    head, tail = slice(None, split_at), slice(split_at, None)
    X_train, X_test = _rows(X, head), _rows(X, tail)
    y_train, y_test = _rows(y, head), _rows(y, tail)
    return X_train, y_train, X_test, y_test

In [None]:
# Plot forecast
def train_test_plot(model, X_train, X_test):
    """
    This will plot the actual values of data against the one fitted by the model.

    NOTE(review): the function currently has no body beyond this docstring,
    so calling it does nothing and returns None. The signature also receives
    no actual target values (only model, X_train, X_test), so the described
    comparison cannot be drawn from the arguments alone — TODO implement and
    decide how the actual values are supplied.
    """

### LASSO

In [None]:
# LASSO

# NOTE(review): X and y are not defined in any visible cell — presumably they
# are meant to come from extract_features(); confirm before running.
X_train, y_train, X_test, y_test = timeseries_train_test_split(X=X, y=y)

# Standardize explicitly: the `normalize=` argument was removed from the
# scikit-learn linear models. The scaler is fit on the training split only,
# and the splits are kept as DataFrames so column names (when X has them)
# remain available to later cells.
lasso_scaler = preprocessing.StandardScaler()
X_train = pd.DataFrame(lasso_scaler.fit_transform(X_train),
                       columns=getattr(X_train, "columns", None),
                       index=getattr(X_train, "index", None))
X_test = pd.DataFrame(lasso_scaler.transform(X_test),
                      columns=getattr(X_test, "columns", None),
                      index=getattr(X_test, "index", None))

# Choose alpha with forward-chaining time-series folds. The default tolerance
# is used (a tolerance of 10000 stops the solver before it has converged);
# max_iter is raised instead to give the solver room to converge.
lasso = linear_model.LassoCV(cv=model_selection.TimeSeriesSplit(), max_iter=10000)

cv_lasso = lasso.fit(X_train, y_train)
optimal_alpha = cv_lasso.alpha_

# Refit a plain Lasso at the selected alpha; later cells read lasso2.coef_
# and call lasso2.predict on the standardized X_test defined above.
lasso2 = linear_model.Lasso(alpha=optimal_alpha, max_iter=10000)
lasso2.fit(X_train, y_train)

train_test_plot(lasso2, X_train, X_test)

In [None]:
# Mean squared error of the refit LASSO model on the held-out test split
lasso_test_pred = lasso2.predict(X_test)
metrics.mean_squared_error(y_test, lasso_test_pred)

In [None]:
# Feature selection: keep only the features whose LASSO coefficient is
# non-zero, then show them from largest to smallest coefficient.
lasso_coefs = (
    pd.DataFrame({"features": list(X_train), "coef": lasso2.coef_})
    .loc[lambda d: d.coef != 0.0]
)
lasso_coefs.sort_values("coef", ascending=False)

### XGBoost

In [None]:
# NOTE(review): XGBRegressor is not imported in the visible import cell; this
# cell needs `from xgboost import XGBRegressor` there before it can run.
X_train, y_train, X_test, y_test = timeseries_train_test_split(X=X, y=y)

# Standardize both splits with statistics learned from the training split only.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient-boosted regressor with default hyper-parameters.
xgb = XGBRegressor()
xgb.fit(X_train_scaled, y_train)

train_test_plot(model=xgb, X_train=X_train_scaled, X_test=X_test_scaled)


### Neural Network

In [None]:
X_train, y_train, X_test, y_test = timeseries_train_test_split(X=X, y=y)

# Scale inside this cell so it does not depend on variables left over from
# the XGBoost cell; the scaler is fit on the training split only.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

reg = neural_network.MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
# Fit on the scaled training split only: fitting on the full X, y would leak
# the test rows into training and would not match the scaled inputs that the
# model is evaluated on below.
reg.fit(X_train_scaled, y_train)

train_test_plot(model=reg, X_train=X_train_scaled, X_test=X_test_scaled)