In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import time
    
# funciones desarrolladas
from functions.agrupar_dfs_censo import *
from functions.cargar_data import *
from functions.impresion import *

In [2]:
# Load the origin-destination flow table between localities and report how many rows it has
flujos_loc = pd.read_csv('tablas/dd_localidades.csv')
print(len(flujos_loc))
flujos_loc.head(3)

377610


Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m
0,10202220,1020,2220,1304729,40657,635.0,583715
1,10202521,1020,2521,1304729,12200,90.0,612802
2,10202522,1020,2522,1304729,2659,17.0,617076


In [3]:
# Keep only the locality code and geometry columns from the third element returned by
# cargar_datos_geo() (project helper; presumably it returns several geo layers — TODO confirm
# what positions 0 and 1 hold).
locs = cargar_datos_geo()[2][['CODLOC', 'geometry']]
locs.head(3)

Unnamed: 0,CODLOC,geometry
0,2220,POINT (549273.219 6633915.063)
1,2521,POINT (442454.671 6652273.541)
2,2522,POINT (457921.001 6633361.660)


In [4]:
# Locality codes as a plain list; n * (n - 1) is the number of ordered
# origin-destination pairs in which origin and destination differ
locs_cod = locs['CODLOC'].tolist()
len(locs_cod) * (len(locs_cod) - 1)

377610

In [5]:
# Attach the origin and destination point geometries to every flow record
dd = (
    flujos_loc
    .merge(locs, left_on='cod_ori', right_on='CODLOC')
    .merge(locs, left_on='cod_des', right_on='CODLOC')
    .drop(columns=['CODLOC_x', 'CODLOC_y'])
    .rename(columns={'geometry_x': 'geom_ori', 'geometry_y': 'geom_des'})
)

dd.head()

# Optional: export a CSV with the point geometries for the flow map
#dd.to_csv('capas/loc_lines.csv', index=False)

Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m,geom_ori,geom_des
0,10202220,1020,2220,1304729,40657,635.0,583715,POINT (573082.248 6145008.434),POINT (549273.219 6633915.063)
1,25212220,2521,2220,12200,40657,,137927,POINT (442454.671 6652273.541),POINT (549273.219 6633915.063)
2,25222220,2522,2220,2659,40657,,111408,POINT (457921.001 6633361.660),POINT (549273.219 6633915.063)
3,26212220,2621,2220,2531,40657,,106867,POINT (468458.270 6601361.867),POINT (549273.219 6633915.063)
4,27212220,2721,2220,380,40657,,58392,POINT (502880.928 6648162.053),POINT (549273.219 6633915.063)


In [6]:
# Shape and first rows of the flow table before the replacements done in the next cell
print(flujos_loc.shape)
flujos_loc.head()

(377610, 7)


Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m
0,10202220,1020,2220,1304729,40657,635.0,583715
1,10202521,1020,2521,1304729,12200,90.0,612802
2,10202522,1020,2522,1304729,2659,17.0,617076
3,10202621,1020,2621,1304729,2531,9.0,595785
4,10202721,1020,2721,1304729,380,,628213


In [7]:
# Replace problematic values: missing flows become a small positive constant (0.001)
# and zero populations become 1 (the model formula takes np.log of poblacion_des)
flujos_loc['personas_mig'] = flujos_loc['personas_mig'].fillna(0.001)
flujos_loc['poblacion_des'] = flujos_loc['poblacion_des'].replace(0, 1)
flujos_loc['poblacion_ori'] = flujos_loc['poblacion_ori'].replace(0, 1)

In [8]:
# Cast the locality codes to strings so the model interprets them correctly
# (the fitted parameters are labelled per code, e.g. cod_ori[1020])
flujos_loc[['cod_ori', 'cod_des']] = flujos_loc[['cod_ori', 'cod_des']].astype(str)

flujos_loc.head()

Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m
0,10202220,1020,2220,1304729,40657,635.0,583715
1,10202521,1020,2521,1304729,12200,90.0,612802
2,10202522,1020,2522,1304729,2659,17.0,617076
3,10202621,1020,2621,1304729,2531,9.0,595785
4,10202721,1020,2721,1304729,380,0.001,628213


In [9]:
# Poisson regression of observed flows on: one term per origin code (the trailing -1 drops
# the global intercept), log destination population and log distance.
formula = "personas_mig ~ cod_ori + np.log(poblacion_des) + np.log(distancia_m) -1"

# Wall-clock start, used to report how long the fit takes
t0= time.time()

# Build and fit the model with the L-BFGS optimizer; max_start_irls=0 skips the IRLS
# iterations statsmodels would otherwise run to obtain starting values.
prodSim = smf.glm(formula=formula, data=flujos_loc, family=sm.families.Poisson()).fit(method='lbfgs', max_start_irls=0)

# Elapsed seconds
t1 = time.time() - t0
print(t1)

191.4250123500824


In [10]:
# Fitted coefficients: one per origin code plus the two log-term slopes
# (the full summary is left commented out)
#prodSim.summary()
prodSim.params

cod_ori[1020]            6.905786
cod_ori[10320]           3.896025
cod_ori[10321]           2.599674
cod_ori[10521]           0.229187
cod_ori[10522]           0.507811
                           ...   
cod_ori[9957]           -0.568802
cod_ori[9958]           -0.533280
cod_ori[9991]           -0.381858
np.log(poblacion_des)    0.921008
np.log(distancia_m)     -0.865461
Length: 617, dtype: float64

In [11]:
# Recover the fitted model parameters as a one-column frame named 'mu_i'
mu_i = prodSim.params.to_frame(name='mu_i')

# Origin terms are identified by their 'cod_ori[' label prefix instead of by position,
# so this keeps working if covariates are added to or removed from the formula.
etiquetas_origen = mu_i.index[mu_i.index.str.startswith('cod_ori[')]

# Strip the 'cod_ori[...]' wrapper so the index holds the bare code used as merge key
codigos_origen = (
    etiquetas_origen
    .str.replace('cod_ori[', '', regex=False)
    .str.replace(']', '', regex=False)
)
mu_i = mu_i.rename(index=dict(zip(etiquetas_origen, codigos_origen)))

# Attach each origin's effect to its flows. Any 'mu_i' column left by a previous run is
# dropped first, so re-executing the cell does not produce mu_i_x / mu_i_y duplicates.
flujos_loc = (
    flujos_loc
    .drop(columns='mu_i', errors='ignore')
    .merge(mu_i, left_on='cod_ori', right_index=True)
)

flujos_loc.head()

Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m,mu_i
0,10202220,1020,2220,1304729,40657,635.0,583715,6.905786
1,10202521,1020,2521,1304729,12200,90.0,612802,6.905786
2,10202522,1020,2522,1304729,2659,17.0,617076,6.905786
3,10202621,1020,2621,1304729,2531,9.0,595785,6.905786
4,10202721,1020,2721,1304729,380,0.001,628213,6.905786


In [12]:
# funciones para recuperar parámetros e imprimir
def get_gml_params(model, variables):
    """Look up fitted coefficients by name.

    Parameters
    ----------
    model : object
        Fitted results object whose ``params`` attribute can be indexed by parameter name.
    variables : iterable
        Parameter names to look up, in the order they should be returned.

    Returns
    -------
    tuple of (list, list)
        The coefficient values, and the same values rounded to 4 decimals and
        converted to strings.
    """
    params = []
    params_str = []
    for nombre in variables:
        valor = model.params[nombre]
        params.append(valor)
        params_str.append(str(round(valor, 4)))
    return params, params_str

def print_params(variables, params_list):
    """Print the alpha and beta values next to the variable each one belongs to.

    Parameters
    ----------
    variables : sequence
        Two variable names; the first is reported as alpha, the second as beta.
    params_list : sequence
        The two values to print, in the same order as ``variables``.

    Returns
    -------
    None
        The result of ``print``.
    """
    plantilla = "alpha ({}) = {}\nbeta ({}) = {}\n    "
    texto = plantilla.format(variables[0], params_list[0], variables[1], params_list[1])
    return print(texto)

In [13]:
# Print the fitted parameters; names are written as they appear in the model's params index
variables= ['np.log(poblacion_des)', 'np.log(distancia_m)']

# params holds the raw values, params_str the rounded string versions
params, params_str = get_gml_params(prodSim, variables)

print_params(variables, params_str)

alpha (np.log(poblacion_des)) = 0.921
beta (np.log(distancia_m)) = -0.8655
    


In [14]:
# Show the parameter-name list currently bound to `variables`
variables

['np.log(poblacion_des)', 'np.log(distancia_m)']

In [15]:
# genera estimación redondeada
def prod_sim_est(df, variables, alpha, beta):
    """Estimate flows from previously fitted parameters.

    Computes ``exp(mu_i + alpha * log(x0) + beta * log(x1))`` and rounds it, where
    ``x0`` and ``x1`` are the columns named by ``variables[0]`` and ``variables[1]``.

    Parameters
    ----------
    df : table-like
        Must provide a ``'mu_i'`` column and the two columns named in ``variables``.
    variables : sequence
        Names of the two columns whose logarithm enters the estimate.
    alpha, beta : numeric
        Coefficients applied to the log of the first and second column respectively.

    Returns
    -------
    The rounded estimate, one value per row of ``df``.
    """
    termino_alpha = alpha * np.log(df[variables[0]])
    termino_beta = beta * np.log(df[variables[1]])
    predictor_lineal = df['mu_i'] + termino_alpha + termino_beta
    return round(np.exp(predictor_lineal))

In [16]:
# Unpack alpha and beta from the parameter list obtained earlier
alpha, beta = params

# Column names (rather than formula terms) whose logarithm prod_sim_est takes
variables = ['poblacion_des', 'distancia_m']

# Compute the rounded estimate and store it in the "prodsimest" column
flujos_loc['prodsimest'] = prod_sim_est(flujos_loc, variables, alpha, beta)

In [18]:
# Preview the table with the new prodsimest column
flujos_loc.head()

Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m,mu_i,prodsimest
0,10202220,1020,2220,1304729,40657,635.0,583715,6.905786,179.0
1,10202521,1020,2521,1304729,12200,90.0,612802,6.905786,57.0
2,10202522,1020,2522,1304729,2659,17.0,617076,6.905786,14.0
3,10202621,1020,2621,1304729,2531,9.0,595785,6.905786,14.0
4,10202721,1020,2721,1304729,380,0.001,628213,6.905786,2.0


In [19]:
# Flow matrix estimated by the model: origins in rows, destinations in columns,
# with a 'Total' margin. Codes are cast back to integers before pivoting.
flujos_loc['cod_ori'] = flujos_loc['cod_ori'].astype(int)
flujos_loc['cod_des'] = flujos_loc['cod_des'].astype(int)

matrix_prodsim = pd.pivot_table(
    flujos_loc,
    values='prodsimest',
    index='cod_ori',
    columns='cod_des',
    fill_value=0,
    # String alias instead of the builtin sum: passing the builtin callable is
    # deprecated in recent pandas; 'sum' keeps the same aggregation without the warning.
    aggfunc='sum',
    margins=True,
    margins_name='Total',
)

# Bracket indexing makes it explicit that a column (not an attribute) is being replaced
matrix_prodsim['Total'] = matrix_prodsim['Total'].astype(int)

matrix_prodsim

cod_des,1020,2220,2521,2522,2621,2721,2724,2725,2822,2823,...,19963,19964,19965,19966,19967,19968,19969,19970,19971,Total
cod_ori,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1020,0,179,57,14,14,2,5,7,3,4,...,0,0,0,0,0,1,1,0,1,55617
2220,809,0,38,11,11,3,3,7,2,3,...,0,0,0,0,0,0,0,0,0,3295
2521,475,71,0,24,11,1,35,3,21,30,...,0,0,0,0,0,0,0,0,0,2114
2522,8,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,20
2621,11,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19968,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
19969,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
19970,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
19971,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6


In [None]:
# Zero-inflated Poisson (ZIP) experiment: fit on a random 80% of the flows,
# predict the remaining 20% and compare predicted against observed counts.
from patsy import dmatrices


df = flujos_loc

# The formula below needs a log-distance column; derive it from distancia_m when it has
# not been created yet (assign returns a copy, so flujos_loc itself is not modified).
if 'log_dist' not in df.columns:
    df = df.assign(log_dist=np.log(df['distancia_m']))

# NOTE(review): cod_ori is cast to int in the flow-matrix cell, so if that cell ran first
# the formula treats it as a numeric covariate rather than one term per origin — confirm intent.
expr = "personas_mig ~ cod_ori + log_dist -1"

# Seeded random split so the train/test partition is the same on every run
ZIP_SEED = 42
rng = np.random.default_rng(ZIP_SEED)
mask = rng.random(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))


# Design matrices for both partitions
y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')

# The same regressors drive the count part and the logit zero-inflation part
zip_training_results = sm.ZeroInflatedPoisson(endog=y_train, exog=X_train, exog_infl=X_train, inflation='logit').fit()

# A bare expression in the middle of a cell is not displayed, so print the summary explicitly
print(zip_training_results.summary())

zip_predictions = zip_training_results.predict(X_test,exog_infl=X_test)

predicted_counts=np.round(zip_predictions)

actual_counts = y_test['personas_mig']

# Root MEAN squared error: average the squared errors before taking the square root
# (summing them instead makes the figure grow with the size of the test set)
rmse = np.sqrt(np.mean(np.power(np.subtract(predicted_counts,actual_counts),2)))
print('ZIP RMSE='+str(rmse))



# NOTE(review): plt is not imported in the notebook's visible import cell; presumably it
# arrives through one of the star imports — verify, or import matplotlib.pyplot explicitly.
fig = plt.figure(figsize=(12, 8), dpi=150)

fig.suptitle('Predicted versus actual counts using the ZIP model')

predicted, = plt.plot(X_test.index, predicted_counts, 'g', label='Predicted')

actual, = plt.plot(X_test.index, actual_counts, 'r', label='Actual')

plt.legend(handles=[predicted, actual])

plt.show()

In [None]:
# Full regression summary of the fitted model stored in prodSim
prodSim.summary()

In [None]:
# funciones para recuperar parámetros e imprimir
def get_gml_params(model, variables):
    """Fetch the model coefficients named in ``variables``.

    NOTE(review): this repeats the definition given earlier in the notebook; the later
    definition is the one in effect once this cell runs.

    Parameters
    ----------
    model : object
        Fitted results object whose ``params`` attribute can be indexed by parameter name.
    variables : iterable
        Parameter names to fetch, in order.

    Returns
    -------
    tuple of (list, list)
        Raw coefficient values, and their 4-decimal rounded string versions.
    """
    coeficientes = model.params
    params = [coeficientes[nombre] for nombre in variables]
    params_str = [str(round(valor, 4)) for valor in params]
    return params, params_str

def print_params(variables, params_list):
    """Print alpha and beta, each labelled with its variable name.

    NOTE(review): this repeats the definition given earlier in the notebook.

    Parameters
    ----------
    variables : sequence
        Two variable names (alpha first, beta second).
    params_list : sequence
        The two values to print, in the same order.

    Returns
    -------
    None
        The result of ``print``.
    """
    nombre_alpha, nombre_beta = variables[0], variables[1]
    valor_alpha, valor_beta = params_list[0], params_list[1]
    mensaje = "alpha ({}) = {}\nbeta ({}) = {}\n    ".format(
        nombre_alpha, valor_alpha, nombre_beta, valor_beta
    )
    return print(mensaje)

In [None]:
# Print the parameters for the 'cod_ori' and 'log_dist' terms.
# NOTE(review): the params index shown for prodSim earlier in the notebook has labels such as
# 'cod_ori[1020]' and 'np.log(distancia_m)', not 'cod_ori' / 'log_dist'; this cell looks
# written for a differently specified model — confirm before running.
variables= ['cod_ori', 'log_dist']

params, params_str = get_gml_params(prodSim, variables)

print_params(variables, params_str)

In [None]:
# Total observed migrants leaving each origin (O_i)
O_i = flujos_loc.groupby('cod_ori').agg(O_i=('personas_mig', 'sum'))

# Total observed migrants arriving at each destination (D_j)
D_j = flujos_loc.groupby('cod_des').agg(D_j=('personas_mig', 'sum'))

# Attach both totals to every flow record
flujos_loc = (
    flujos_loc
    .merge(O_i, left_on='cod_ori', right_index=True)
    .merge(D_j, left_on='cod_des', right_index=True)
)

flujos_loc.head()

In [None]:
# Recover the model parameters as a one-column frame
mu_i = prodSim.params.to_frame()

# Relabel every parameter except the last two, keeping only uppercase letters and spaces
# from each label so it can be used as a merge key.
# NOTE(review): with the numeric labels shown earlier for prodSim (e.g. 'cod_ori[1020]') this
# pattern removes every character; it looks written for letter-coded origins — confirm.
mu_i.rename(index = dict(zip(mu_i.index[0:-2].values, mu_i.index[0:-2].str.replace(r'[^ ABCDEFGHIJKLMNÑOPQRSTUVWXYZ]','', regex=True).values)),
            inplace=True)

# Rename the value column
mu_i.rename(columns = {0:'mu_i'}, inplace=True)

# Attach the parameters to the flows by origin code.
# NOTE(review): an earlier cell already merges a 'mu_i' column into flujos_loc; merging again
# would yield suffixed duplicates — verify the intended execution order.
flujos_loc = flujos_loc.merge(mu_i, left_on='cod_ori', right_index=True)

In [None]:
# Display the parameter frame built in the previous cell
mu_i

In [None]:
# Display the flow table after the merges above
flujos_loc

In [None]:
# genera estimación redondeada
def prod_sim_est(df, variables, alpha, beta):
    """Estimate flows from previously fitted parameters, using the columns as given.

    Computes ``exp(mu_i + alpha * x0 + beta * x1)`` and rounds it, where ``x0`` and
    ``x1`` are the columns named by ``variables[0]`` and ``variables[1]``. No logarithm
    is applied here, so the columns are expected to already be on the scale the
    coefficients refer to.

    Parameters
    ----------
    df : table-like
        Must provide a ``'mu_i'`` column and the two columns named in ``variables``.
    variables : sequence
        Names of the two columns entering the estimate.
    alpha, beta : numeric
        Coefficients applied to the first and second column respectively.

    Returns
    -------
    The rounded estimate, one value per row of ``df``.
    """
    termino_alpha = alpha * df[variables[0]]
    termino_beta = beta * df[variables[1]]
    predictor_lineal = df['mu_i'] + termino_alpha + termino_beta
    return round(np.exp(predictor_lineal))