In [2]:
import pandas as pd
import geopandas as gpd
import os
import matplotlib.pyplot as plt
import pylab as pl
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import tabulate
from operator import add
import itertools

from shapely import wkt
from shapely.geometry import shape, LineString, Point
    
# funciones desarrolladas
from functions.agrupar_dfs_censo import *
from functions.cargar_data import *
from functions.impresion import *

In [3]:
flujos_loc = pd.read_csv('tablas/dd_localidades.csv')
print(flujos_loc.shape[0])
flujos_loc.head(3)

377610


Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m
0,10202220,1020,2220,1304729,40657,635.0,583715
1,10202521,1020,2521,1304729,12200,90.0,612802
2,10202522,1020,2522,1304729,2659,17.0,617076


In [4]:
locs = cargar_datos_geo()[2][['CODLOC', 'geometry']]
locs.head(3)

Unnamed: 0,CODLOC,geometry
0,2220,POINT (549273.219 6633915.063)
1,2521,POINT (442454.671 6652273.541)
2,2522,POINT (457921.001 6633361.660)


In [5]:
locs_cod = list(locs.CODLOC)
len(locs_cod) ** 2 - len(locs_cod)

377610

In [6]:
# pega geometrías
dd = flujos_loc.merge(locs, left_on='cod_ori', right_on='CODLOC')
dd = dd.merge(locs, left_on='cod_des', right_on='CODLOC')
dd.drop(['CODLOC_x', 'CODLOC_y'], axis=1, inplace=True)
dd.rename({'geometry_x':'geom_ori', 'geometry_y':'geom_des'}, axis=1, inplace=True)

dd.head()

# exporta CSV con geometrías puntuales para mapa de flujos
dd.to_csv('capas/loc_lines.csv', index=False)

In [7]:
print(flujos_loc.shape)
flujos_loc.head()

(377610, 7)


Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m
0,10202220,1020,2220,1304729,40657,635.0,583715
1,10202521,1020,2521,1304729,12200,90.0,612802
2,10202522,1020,2522,1304729,2659,17.0,617076
3,10202621,1020,2621,1304729,2531,9.0,595785
4,10202721,1020,2721,1304729,380,,628213


In [8]:
flujos_loc.loc[flujos_loc.personas_mig.isna(), 'personas_mig'] = 0.001
flujos_loc.head()

Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m
0,10202220,1020,2220,1304729,40657,635.0,583715
1,10202521,1020,2521,1304729,12200,90.0,612802
2,10202522,1020,2522,1304729,2659,17.0,617076
3,10202621,1020,2621,1304729,2531,9.0,595785
4,10202721,1020,2721,1304729,380,0.001,628213


In [9]:
flujos_loc['log_dist'] = np.log(flujos_loc.distancia_m)
flujos_loc.head()

Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m,log_dist
0,10202220,1020,2220,1304729,40657,635.0,583715,13.277168
1,10202521,1020,2521,1304729,12200,90.0,612802,13.325797
2,10202522,1020,2522,1304729,2659,17.0,617076,13.332747
3,10202621,1020,2621,1304729,2531,9.0,595785,13.297635
4,10202721,1020,2721,1304729,380,0.001,628213,13.350635


In [10]:
formula = "personas_mig ~ cod_ori + log_dist -1"

# entrena y ajusta el modelo
prodSim = smf.glm(formula=formula, data=flujos_loc, family=sm.families.Poisson()).fit()

In [11]:
prodSim.summary()

0,1,2,3
Dep. Variable:,personas_mig,No. Observations:,377610.0
Model:,GLM,Df Residuals:,377608.0
Model Family:,Poisson,Df Model:,1.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-810840.0
Date:,"Tue, 27 Apr 2021",Deviance:,1590100.0
Time:,17:25:46,Pearson chi2:,267000000.0
No. Iterations:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
cod_ori,-0.0001,6.76e-07,-221.082,0.000,-0.000,-0.000
log_dist,0.0107,0.000,25.042,0.000,0.010,0.012


In [12]:
# funciones para recuperar parámetros e imprimir
def get_gml_params(model, variables):
    "Accede a los parámetros alfa y beta dentro de los resutaldos del modelo"
    params = [model.params[i] for i in variables]
    params_str = [str(round(i, 4)) for i in params]
    return params, params_str

def print_params(variables, params_list):
    "Imprime los parámetros"
    return print("""alpha ({}) = {}\nbeta ({}) = {}
    """.format(variables[0], params_list[0], variables[1], params_list[1]))

In [13]:
# imprime parámetros
variables= ['cod_ori', 'log_dist']

params, params_str = get_gml_params(prodSim, variables)

print_params(variables, params_str)

alpha (cod_ori) = -0.0001
beta (log_dist) = 0.0107
    


In [14]:
# crea la sumatoria de migrantes en origen (Oi)
O_i = flujos_loc.groupby('cod_ori')['personas_mig'].sum().to_frame().rename(columns = {'personas_mig':'O_i'})

# crea la sumatoria de migrantes en destino (Di)
D_j = flujos_loc.groupby('cod_des')['personas_mig'].sum().to_frame().rename(columns = {'personas_mig':'D_j'})

# pega ambas variables con el df
flujos_loc = flujos_loc.merge(O_i, left_on='cod_ori', right_index=True)
flujos_loc = flujos_loc.merge(D_j, left_on='cod_des', right_index=True)

flujos_loc.head()

Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m,log_dist,O_i,D_j
0,10202220,1020,2220,1304729,40657,635.0,583715,13.277168,55484.172,1043.544
1229,25212220,2521,2220,12200,40657,0.001,137927,11.83448,1172.518,1043.544
1843,25222220,2522,2220,2659,40657,0.001,111408,11.620954,144.595,1043.544
2457,26212220,2621,2220,2531,40657,0.001,106867,11.57934,191.58,1043.544
3071,27212220,2721,2220,380,40657,0.001,58392,10.974934,12.609,1043.544


In [17]:
# recupera los parámetros del modelo
mu_i = prodSim.params.to_frame()

# elimina caractérres no numéricos para poder pegar
mu_i.rename(index = dict(zip(mu_i.index[0:-2].values, mu_i.index[0:-2].str.replace(r'[^ ABCDEFGHIJKLMNÑOPQRSTUVWXYZ]','', regex=True).values)),
            inplace=True)

# renombre columna
mu_i.rename(columns = {0:'mu_i'}, inplace=True)

# merge
flujos_loc = flujos_loc.merge(mu_i, left_on='cod_ori', right_index=True)

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

In [18]:
mu_i

Unnamed: 0,mu_i
cod_ori,-0.000149
log_dist,0.010713


In [16]:
flujos_loc

Unnamed: 0,cod,cod_ori,cod_des,poblacion_ori,poblacion_des,personas_mig,distancia_m,log_dist,O_i,D_j
0,10202220,1020,2220,1304729,40657,635.000,583715,13.277168,55484.172,1043.544
1229,25212220,2521,2220,12200,40657,0.001,137927,11.834480,1172.518,1043.544
1843,25222220,2522,2220,2659,40657,0.001,111408,11.620954,144.595,1043.544
2457,26212220,2621,2220,2531,40657,0.001,106867,11.579340,191.580,1043.544
3071,27212220,2721,2220,380,40657,0.001,58392,10.974934,12.609,1043.544
...,...,...,...,...,...,...,...,...,...,...
374540,199671020,19967,1020,41,1304729,0.001,373520,12.830727,0.614,40003.224
375154,199681020,19968,1020,55,1304729,0.001,327602,12.699555,0.614,40003.224
375768,199691020,19969,1020,72,1304729,0.001,386414,12.864665,0.614,40003.224
376382,199701020,19970,1020,25,1304729,0.001,316572,12.665306,0.614,40003.224


In [None]:
# genera estimación redondeada
def prod_sim_est(df, variables, alpha, beta):
    "Estimación del modelo imputando los parámetros alfa y beta previamente calculados"
    prodsimest = np.exp(df['mu_i'] + alpha * df[variables[0]] + beta * df[variables[1]])
    return round(prodsimest)