In [62]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns

from sklearn import (linear_model, metrics, neural_network, pipeline, preprocessing, model_selection)

In [63]:
"""
Load data.
Change path variable to your location of CSV files.
"""

path = "./data/"

df_zones = pd.read_csv(path + "zona_censal_comuna.csv")
df_demos = pd.read_csv(path + "age_demo_comunas.csv")
df_qua = pd.read_csv(path + "cuarentena_comunas_semanas.csv")
df_inf = pd.read_csv(path + "dp1_contagiados_por_comuna.csv")
df_mob1 = pd.read_csv(path + "od_zones_weeks_20200301.enc.csv")
df_mob2 = pd.read_csv(path + "od_zones_weeks_20200401.enc.csv")
df_mob = pd.concat([df_mob1, df_mob2])

### Census zones & Communes

In [64]:
"""
Census zone information (only for Santiago).

Relevant columns:
    GEOCODIGO = census zone id
    COMUNA    = commune id
"""
df_zones

Unnamed: 0,OBJECTID,REGION,PROVINCIA,NOM_COMUNA,COMUNA,GEOCODIGO
0,1,13,134,PAINE,13404,13404011003
1,2,13,134,PAINE,13404,13404011002
2,3,13,134,PAINE,13404,13404061004
3,4,13,134,PAINE,13404,13404061001
4,5,13,134,PAINE,13404,13404061002
...,...,...,...,...,...,...
1860,1861,13,136,EL MONTE,13602,13602031001
1861,1862,13,136,TALAGANTE,13601,13601021001
1862,1863,13,136,TALAGANTE,13601,13601021002
1863,1864,13,135,ALHUÉ,13502,13502011001


In [65]:
"""
Create dictionaries for (standardized) ID lookups of zones and communes.
"""
# Enumerate communes with standardized IDs (from 0 to 50)
df_coms = df_zones[["COMUNA","NOM_COMUNA"]].drop_duplicates()
dict_com_sid_com_info = {com_sid:com_info for com_sid, com_info in enumerate(zip(df_coms.COMUNA, df_coms.NOM_COMUNA))}
dict_com_id_com_sid = {com_id:com_sid for com_sid, com_id in enumerate(df_coms.COMUNA)}
dict_com_name_com_sid = {com_name:com_sid for com_sid, com_name in enumerate(df_coms.NOM_COMUNA)}

# Dictionary mapping census zone id to its commune standardized id
dict_zone_id_com_sid = {zone_id:dict_com_id_com_sid[com_id] for zone_id, com_id in zip(df_zones.GEOCODIGO, df_zones.COMUNA)}    

### Mobility data

In [66]:
"""
Logs of averaged amount of movement of individuals across census zones.

Relevant columns:
    mean_population_zone = weekly mean of daily number of individuals
                                that spend majority of their day in census zone [code_zone]
                                that live in census zone [home_zone]
    code_zone            = visiting census zone
    home_zone            = home census zone
    week                 = week of year
    time_block           = T1 - 10:00-13:00, T2 - 14:00-17:00
"""
df_mob

Unnamed: 0,mean_population_zone,code_zone,week,time_block,home_zone
0,154.800000,13124061022,9,T1,13124061005
1,14.200000,13101081001,9,T1,13126031003
2,12.800000,13123021005,9,T1,13120041002
3,26.800000,13114021002,9,T1,13114061001
4,44.200000,13114131005,9,T1,13114031002
...,...,...,...,...,...
4642958,1.666667,13130031001,17,T2,13112051010
4642959,1.666667,13119011005,17,T2,13106051001
4642960,1.666667,13301011001,17,T1,13104051001
4642961,1.666667,13119201001,17,T2,13124061005


In [67]:
"""
Create dataframe of edge information: (home zone, visiting zone, week, mean population)
Aggregate information into communes.
"""
t = df_mob.week.min()    # earliest week available
get_zone_com_sid = lambda i: dict_zone_id_com_sid[i]

df_edge_list = df_mob[["home_zone","code_zone","week","mean_population_zone"]]
df_edge_list = df_edge_list[df_edge_list["home_zone"] != df_edge_list["code_zone"]]    # remove intra-zone info
df_edge_list.columns = ['home_com', 'travel_com', 'week', 'mean_pop']
df_edge_list.loc[:,"home_com"] = df_edge_list.loc[:,"home_com"].apply(get_zone_com_sid)
df_edge_list.loc[:,"travel_com"] = df_edge_list.loc[:,"travel_com"].apply(get_zone_com_sid)
df_edge_list.loc[:,"week"] = df_edge_list.loc[:,"week"] - t    # zero index weeks

# Aggregate data by summing mean population of same edges in same week. Sum data from both time blocks.
df_edge_list = df_edge_list.groupby(['home_com','travel_com','week']).agg({'mean_pop':'sum'}).reset_index()

In [263]:
df_edge_list.head()

Unnamed: 0,home_com,travel_com,week,mean_pop
0,0,0,0,2263.183333
1,0,0,1,2283.533333
2,0,0,2,2210.433333
3,0,0,3,2283.0
4,0,0,4,4658.5


In [274]:
"""
Create mobility matrix: A[i,j,t] 
"""
T = df_mob.week.max() - df_mob.week.min() + 1
N = len(dict_com_sid_com_info)
A = np.zeros((N, N, T))

edge_list = zip(df_edge_list.home_com, df_edge_list.travel_com, df_edge_list.week, df_edge_list.mean_pop)
for edge in edge_list:
    i, j, t, a_ijt = edge
    A[i,j,t] += a_ijt
    
# normalize entries by visit commune population size
normalize = True
if normalize:
    get_com_sid = lambda i: dict_com_id_com_sid[i] if i in dict_com_id_com_sid else pd.NaT
    df_pop = df_demos[["COMUNA","T_POB"]]
    df_pop["COMUNA"] = df_pop.loc[:,"COMUNA"].apply(get_com_sid)
    df_pop = df_pop.dropna()
    dict_com_sid_pop = {sid:pop for sid, pop in zip(df_pop.COMUNA, df_pop.T_POB)}

    for j in range(N):
        A[:,j,:] = A[:,j,:] / dict_com_sid_pop[j]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [275]:
# Mobility matrix over 51 communes in Santiago over 9 time periods (weeks 9-17).
print(A.shape)

# Matrix is not sparse
print(np.count_nonzero(A), N*N*T)

(51, 51, 9)
21931 23409


### Infections data

In [276]:
"""
Logs of cumulative number of infections in each commune in Chile over differently spaced out days in weeks 14-19.

Relevant columns:
    Codigo comuna          = commune ID
    Date (e.g. 2020-03-30) = number of confirmed cases of COVID19 on that day.
"""
df_inf

Unnamed: 0.1,Unnamed: 0,Region,Codigo region,Comuna,Codigo comuna,Poblacion,2020-03-30,2020-04-01,2020-04-03,2020-04-06,...,2020-04-13,2020-04-15,2020-04-17,2020-04-20,2020-04-24,2020-04-27,2020-05-01,2020-05-04,2020-05-08,Tasa
0,0,Arica y Parinacota,15,Arica,15101,247552.0,6.0,6.0,12.0,41.0,...,115.0,124.0,134.0,166.0,224.0,270.0,297.0,310.0,328.0,132.5
1,1,Arica y Parinacota,15,Camarones,15102,1233.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Arica y Parinacota,15,General Lagos,15202,810.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Arica y Parinacota,15,Putre,15201,2515.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Tarapacá,1,Alto Hospicio,1107,129999.0,0.0,0.0,0.0,5.0,...,14.0,15.0,16.0,27.0,39.0,55.0,77.0,128.0,161.0,123.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,341,Magallanes,12,Punta Arenas,12101,141984.0,29.0,87.0,143.0,203.0,...,387.0,416.0,470.0,516.0,581.0,623.0,685.0,744.0,825.0,581.1
342,342,Magallanes,12,Rio Verde,12103,211.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,343,Magallanes,12,San Gregorio,12104,681.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,146.8
344,344,Magallanes,12,Timaukel,12303,282.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [292]:
"""
Aggregate infections data into weeks.
Only keep data for communes in Santiago.
"""
get_com_sid = lambda i: dict_com_id_com_sid[i] if i in dict_com_id_com_sid else pd.NaT
df_y = pd.DataFrame()
df_y["com"] = df_inf["Codigo comuna"]
df_y.loc[:,"com"] = df_y.loc[:,"com"].apply(get_com_sid)
# df_y["week14"] = df_inf["2020-03-30"] + df_inf["2020-04-01"] + df_inf["2020-04-03"]
# df_y["week15"] = df_inf["2020-04-06"] + df_inf["2020-04-08"] + df_inf["2020-04-10"]
# df_y["week16"] = df_inf["2020-04-13"] + df_inf["2020-04-15"] + df_inf["2020-04-17"]
# df_y["week17"] = df_inf["2020-04-20"] + df_inf["2020-04-24"]
# df_y["week18"] = df_inf["2020-04-27"] + df_inf["2020-05-01"]
# df_y["week19"] = df_inf["2020-05-04"] + df_inf["2020-05-08"]
df_y["week14"] = df_inf["2020-04-03"]
df_y["week15"] = df_inf["2020-04-10"]
df_y["week16"] = df_inf["2020-04-17"]
df_y["week17"] = df_inf["2020-04-24"]
df_y["week18"] = df_inf["2020-05-01"]
df_y["week19"] = df_inf["2020-05-08"]
df_y = df_y.dropna().sort_values("com")

# Get time-evolving cumulative infections vector
y = df_y.drop("com",1).to_numpy()

if normalize:
    for j in range(N):
        y[j,:] = y[j,:] / dict_com_sid_pop[j]

# Transform data to ensure positivity in regression
y = np.log(y + 1)

In [293]:
# Cumulative infections over 51 communes in Santiago over 6 time periods (weeks 14-19)
y.shape

(51, 6)

### Quarantine data

In [294]:
"""
Logs of percentage of quarantine measures in communes in Santiago.
"""
get_com_sid_from_name = lambda i: dict_com_name_com_sid[i.upper()]
df_qua["com_sid"] = df_qua.loc[:,"comuna"].apply(get_com_sid_from_name)
df_qua.dropna().sort_values("com_sid")

Unnamed: 0,comuna,year_week,por_cuarentena,com_sid
26,La Pintana,18,0.471066,8
27,La Pintana,19,0.471066,8
28,La Pintana,20,0.471066,8
42,Pedro Aguirre Cerda,17,1.000000,9
43,Pedro Aguirre Cerda,18,1.000000,9
...,...,...,...,...
80,Santiago,15,1.000000,47
79,Santiago,14,1.000000,47
78,Santiago,13,1.000000,47
85,Santiago,20,1.000000,47


### Demographics

In [295]:
"""
Demographics information for all communes in Chile.
"""
df_demos.head()

Unnamed: 0,COMUNA,NOM_COMUNA,Densidad_,SUPERFICIE__KM2_,T_POB,T_HOM,T_MUJ,T_VIV,por_muj,por_hom,...,55 a 59,60 a 64,65 a 69,70 a 74,75 a 79,80 a 84,85 a 89,90 a 94,95 a 99,100 o más
0,13101,SANTIAGO,17483.935547,23.135237,404495,206678,197817,193628,0.489047,0.510953,...,16473,12820,9465,7449,5328,3738,2602,1027,302,108
1,13102,CERRILLOS,4817.263672,16.77965,80832,39631,41201,24547,0.509712,0.490288,...,4510,3602,2963,2550,1862,1154,738,267,57,18
2,13103,CERRO NAVIA,11950.771484,11.097359,132622,65438,67184,38020,0.506583,0.493417,...,7466,6013,5340,4550,3160,1842,1028,340,103,34
3,13104,CONCHALÍ,11427.335938,11.109763,126955,61877,65078,37759,0.512607,0.487393,...,7941,5988,4801,4250,3533,2521,1625,552,127,43
4,13105,EL BOSQUE,11344.626953,14.324402,162505,79372,83133,47941,0.511572,0.488428,...,10091,8212,6348,5352,3737,2362,1473,452,104,42


In [296]:
"""
Get demographic information for communes in Santiago.
"""
df_C = df_demos.copy()
df_C.loc[:,"COMUNA"] = df_C.loc[:,"COMUNA"].apply(get_com_sid)
df_C = df_C.dropna().sort_values("COMUNA")
df_C.drop(["COMUNA", "NOM_COMUNA", "NOM_COMUNA.1"], axis=1, inplace=True)

# Get demographics feature matrix
C = df_C.to_numpy()

In [297]:
# Demographics features over 51 communes in Santiago
print(C.shape)

(51, 31)


### Feature Extraction

In [298]:
# Feature extraction
def extract_features(A, y, C, params):
    """
    Extracts feature vectors for each sequential input.
    Returns a matrix of all features X[N,L,T] where X[:,:,t] is matrix of features for data at time t.
    """
    
    # get overlap of mobility data and infections data - hardcoded
    A = A[:,:,5:]
    y = y[:,:4]
    
    
    N, T, L = params[:3]
    X = np.zeros((N, L, T))
    for t in range(T):
        At = A[:,:,t]
        yt = y[:,t]
        
        x1 = np.diag(At)
        x2 = np.dot(At.T, np.ones(N))
        x3 = np.dot(At, np.ones(N))
        x4 = yt
        x5 = np.dot(At.T, yt)
        x6 = np.dot(At, yt)
          
        Xt = np.stack((x1,x2,x3,x4,x5,x6), axis=-1)
        Xt = np.column_stack((Xt, C))
        X[:,:,t] = Xt
    
#         x1 = yt
# #         x2 = np.ones(N)*(t+1)
#         Xt = np.stack((x1), axis=-1)
# #         Xt = np.stack((x1,x2), axis=-1)
#         Xt = np.column_stack((Xt, C[:,2]))
# #         Xt = np.stack((x1), axis=-1)
#         X[:,:,t] = Xt
# #         X[:,:,t] = Xt.reshape((N,1))
    
        
    return X, y

def collect_data(X, y, params):
    """
    Collects data into a big 'ol matrix and output matrix to feed into a regression method.
    """
    N, T, L, K, k = params
                  
    Z = np.zeros((N*(T-K), L*(K-k)))
    for t in range(T-K):
        Z[N*t:N*(t+1),:] = X[:,:,t:t+K-k].transpose([0,2,1]).reshape((N, L*(K-k)))
                 
    Y = np.zeros(N*(T-K))
    for t in range(T-K):
        Y[N*t:N*(t+1)] = y[:,t+K].reshape((N))
        
    return Z, Y

def timeseries_train_test_split(X, y, params):
    """
    Splits data into a train and test data.
    """
    N, T, L, K, k = params
    n = 1                        # number of test points
    
    X_train = X[:N*(T-K-n),:]
    X_test = X[N*(T-K-n):,:]
    y_train = y[:N*(T-K-n)]
    y_test = y[N*(T-K-n):]
    
    return X_train, y_train, X_test, y_test

In [299]:
"""
Extract features
"""
N = 51                       # num communes
T = 4                        # num total data points
L = 6+C.shape[1]                        # num features
K = 2                        # num lags
k = 0                        # num skips
params = [N,T,L,K,k]


F, y = extract_features(A, y, C, params)
X_data, y_data = collect_data(F, y, params)
X_train, y_train, X_test, y_test = timeseries_train_test_split(X_data, y_data, params)
print(F.shape, X_data.shape, y_data.shape)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(51, 37, 4) (102, 74) (102,)
(51, 74) (51,) (51, 74) (51,)


In [300]:
# print(F, X_data, y_data)
# print(X_train, y_train, X_test, y_test)

### Linear Regression

In [301]:
reg = linear_model.LinearRegression(normalize=True)
reg.fit(X_train, y_train)
reg_rmse = np.sqrt(metrics.mean_squared_error(y_test, reg.predict(X_test)))

print(reg_rmse)
print(reg.intercept_, reg.coef_)
print(reg.score(X_test, y_test))
print(y_test.mean())

0.00044847913688260085
-114674514543.13264 [ 4.11945625e-03 -2.25043636e-03 -4.46023962e-04  1.04768183e+00
  1.16143557e+01 -2.50441709e+01  6.47981988e+03 -2.57236797e+04
 -4.86721473e+02  6.97246246e+02  5.13267406e+02 -3.30780230e+02
  5.47948950e+10  5.84622638e+10  9.28092886e+07 -8.25645552e+03
  7.81617927e+01  4.43922518e+01  3.94159610e+02  5.69109510e+01
  4.68741426e+01  2.31837116e+02  3.71651974e+02  1.70596722e+02
  2.23850233e+02  4.37132102e+02  3.31586994e+02 -2.13312041e+02
  6.64985537e+02 -4.86202618e+01 -5.74899692e+01  9.48825887e+02
 -5.56744746e+01  1.00774329e+02 -3.32761239e+02 -1.04908688e+03
 -4.09601119e+04 -3.63231528e-03  5.26650338e-03 -4.10867327e-03
  8.01720207e-01 -1.28872120e+01  2.11881551e+01 -6.47981988e+03
  2.57236797e+04 -4.99356578e+01 -6.15493300e+02 -4.31514460e+02
  3.30780230e+02  5.98796195e+10  5.62122507e+10 -9.28092886e+07
  8.25645552e+03  3.76742393e+02  4.10511934e+02  6.07445750e+01
  3.97993235e+02  4.08030043e+02  2.23067069e+0

In [302]:
# y_pred = reg.predict(X_test)
# for i in range(len(y_test)):
#     commune_id = dict_sid_commune[i]
#     yi_act = y_test[i]
#     yi_pred = y_pred[i]
#     info_i = (commune_id, yi_act, yi_pred)
#     print(info_i)

### LASSO

In [303]:
lasso = linear_model.LassoCV(cv=model_selection.TimeSeriesSplit(n_splits=5), alphas=None, tol = 10000, normalize=True) 
lasso.fit(X_train, y_train)
optimal_alpha = lasso.alpha_
lasso2 = linear_model.Lasso(alpha=optimal_alpha, normalize=True)
lasso2.fit(X_train, y_train)
lasso2_rmse = np.sqrt(metrics.mean_squared_error(y_test, lasso2.predict(X_test)))

print(lasso2_rmse)

0.00014569471650763684


In [304]:
# Feature selection
fnames = []
for i in range(F.shape[1]):
    if i < 7:
        fnames.append('x'+str(i+1)+'_2')
    else:
        fnames.append('c'+str(i-6)+'_2')
for i in range(F.shape[1]):
    if i < 7:
        fnames.append('x'+str(i+1)+'_1')
    else:
        fnames.append('c'+str(i-6)+'_1')
        
print(lasso2.intercept_)
for pair in zip(fnames, lasso2.coef_):
    print(pair)

5.509897690727855e-06
('x1_2', 0.0)
('x2_2', 0.00021240410195986118)
('x3_2', 0.0)
('x4_2', 0.0)
('x5_2', -0.0)
('x6_2', -0.0)
('x7_2', 3.4555255841479924e-09)
('c1_2', -7.038587477798091e-10)
('c2_2', 0.0)
('c3_2', 0.0)
('c4_2', 0.0)
('c5_2', 0.0)
('c6_2', 0.0)
('c7_2', -0.0)
('c8_2', 6.334616276822603e-06)
('c9_2', 0.0)
('c10_2', 0.0)
('c11_2', 2.3866266425105734e-09)
('c12_2', 0.0)
('c13_2', 0.0)
('c14_2', 0.0)
('c15_2', 0.0)
('c16_2', 0.0)
('c17_2', 0.0)
('c18_2', 0.0)
('c19_2', 0.0)
('c20_2', 0.0)
('c21_2', 0.0)
('c22_2', 0.0)
('c23_2', 0.0)
('c24_2', -0.0)
('c25_2', -0.0)
('c26_2', -0.0)
('c27_2', -0.0)
('c28_2', -0.0)
('c29_2', -0.0)
('c30_2', -0.0)
('x1_1', 0.0)
('x2_1', 0.0)
('x3_1', 0.0)
('x4_1', 1.0064225776377944)
('x5_1', 0.0)
('x6_1', 0.0)
('x7_1', 6.620870445442234e-09)
('c1_1', -7.597221772893222e-10)
('c2_1', 0.0)
('c3_1', 0.0)
('c4_1', 0.0)
('c5_1', 0.0)
('c6_1', 0.0)
('c7_1', -0.0)
('c8_1', 1.2845380233170961e-05)
('c9_1', 0.0)
('c10_1', 0.0)
('c11_1', 3.990497654896

###### The following implementations are not finished yet...

### XGBoost

In [None]:
X_train, y_train, X_test, y_test = timeseries_train_test_split(X=X, y=y)


scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

xgb = XGBRegressor()
xgb.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

train_test_plot(model=xgb, X_train=X_train_scaled, X_test=X_test_scaled)


### Neural Network

In [None]:
X_train, y_train, X_test, y_test = timeseries_train_test_split(X=X, y=y)

reg = neural_network.MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
reg.fit(X, y)

train_test_plot(model=reg, X_train=X_train_scaled, X_test=X_test_scaled)