In [1]:
from pathlib import Path

import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# funciones desarrolladas
from functions.cargar_data import *
from functions.modelos import *

In [2]:
# Load the dyadic dataset (one row per origin/destination department pair)
# and preview its first two rows.
dd_deptos = cargar_dd_deptos()
dd_deptos.head(n=2)

Unnamed: 0,cod,depto_origen,depto_destino,personas_mig,personas_mig_100k,pob_origen,pob_destino,dist,pbi_origen,pbi_porcen_ori,...,dummy_limit,empresas_origen,empresas_destino,log_pbi_destino,log_dist,nom_depto_orig,nom_depto_des,dist_km,pbi_destino_millardos,largo_limite_km
0,102,1,2,914,891,1318755,73377,610726,465848031,50.3,...,False,3124,55,16.469807,13.322404,MONTEVIDEO,ARTIGAS,610.726,142.1498,1e-07
1,103,1,3,33127,0,1318755,520173,22544,465848031,50.3,...,True,3124,563,18.273415,10.023224,MONTEVIDEO,CANELONES,22.544,863.06492,60.107


## Modelo con Montevideo

In [3]:
# Model specification as a formula string: the response is personas_mig and
# the predictors are the origin-department name (categorical), dummy_limit,
# largo_limite_km, pbi_destino_millardos and dist_km. The trailing "-1"
# removes the intercept, so each origin department gets its own coefficient.
formula = """
personas_mig ~ nom_depto_orig + dummy_limit
+ largo_limite_km + pbi_destino_millardos + dist_km -1
"""

# Poisson family for the count response.
family = sm.families.Poisson()

# Build the GLM on the full dyadic dataset (Montevideo included) and fit it.
prodSim = smf.glm(
    formula=formula,
    data=dd_deptos,
    family=family,
).fit()

# NOTE(review): in the recorded summary the Pearson chi2 (75400) is far above
# Df Residuals (319), which suggests overdispersion relative to a Poisson
# model; the nonrobust standard errors may be too small. Consider checking
# with a robust covariance or a negative binomial family.

# Show the fitted-model summary as the cell output.
prodSim.summary()

0,1,2,3
Dep. Variable:,personas_mig,No. Observations:,342.0
Model:,GLM,Df Residuals:,319.0
Model Family:,Poisson,Df Model:,22.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-29027.0
Date:,"Wed, 25 Aug 2021",Deviance:,55827.0
Time:,20:15:11,Pearson chi2:,75400.0
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
nom_depto_orig[ARTIGAS],5.2834,0.023,231.352,0.000,5.239,5.328
nom_depto_orig[CANELONES],4.7691,0.016,306.139,0.000,4.739,4.800
nom_depto_orig[CERRO LARGO],4.6945,0.021,223.233,0.000,4.653,4.736
nom_depto_orig[COLONIA],4.6439,0.018,261.150,0.000,4.609,4.679
nom_depto_orig[DURAZNO],4.1996,0.020,208.043,0.000,4.160,4.239
nom_depto_orig[FLORES],3.3588,0.027,124.054,0.000,3.306,3.412
nom_depto_orig[FLORIDA],4.1804,0.019,223.938,0.000,4.144,4.217
nom_depto_orig[LAVALLEJA],4.0977,0.020,205.055,0.000,4.059,4.137
nom_depto_orig[MALDONADO],4.7656,0.016,289.160,0.000,4.733,4.798


In [4]:
# Compare observed migrant counts with the model's predictions; predict() is
# called without arguments, i.e. on the data the model was fitted with.
# The recorded output of this helper reports R² and RMSE.
print_scores_simple(
    dd_deptos["personas_mig"],
    prodSim.predict(),
)

$R²$ = 0.7915

RMSE = 889.053

In [5]:
# Persist the fitted model that includes Montevideo as a pickle file.
# The destination folder is created first so that a fresh checkout (or a
# Restart & Run All) does not fail when "modelos/" is missing.
ruta_modelo_mvo = Path("modelos/restringido_origen_mvo.pickle")
ruta_modelo_mvo.parent.mkdir(parents=True, exist_ok=True)
prodSim.save(str(ruta_modelo_mvo))

## Modelo sin Montevideo

In [6]:
# Keep only the pairs where neither the origin nor the destination is
# Montevideo (department code 1, as shown in the head() preview above).
dd_deptos_no_mvo = dd_deptos.loc[
    dd_deptos["depto_origen"].ne(1) & dd_deptos["depto_destino"].ne(1)
]

In [7]:
# Same specification as the model with Montevideo: personas_mig explained by
# the origin-department name (categorical), dummy_limit, largo_limite_km,
# pbi_destino_millardos and dist_km. The trailing "-1" removes the intercept,
# so each origin department gets its own coefficient.
formula = """
personas_mig ~ nom_depto_orig + dummy_limit
+ largo_limite_km + pbi_destino_millardos + dist_km -1
"""

# Poisson family for the count response.
family = sm.families.Poisson()

# Build the GLM on the subset that excludes Montevideo and fit it.
# Note: this rebinds prodSim, replacing the model fitted on the full dataset.
prodSim = smf.glm(
    formula=formula,
    data=dd_deptos_no_mvo,
    family=family,
).fit()

# NOTE(review): in the recorded summary the Pearson chi2 (13300) is far above
# Df Residuals (284), which suggests overdispersion relative to a Poisson
# model; the nonrobust standard errors may be too small. Consider checking
# with a robust covariance or a negative binomial family.

# Show the fitted-model summary as the cell output.
prodSim.summary()

0,1,2,3
Dep. Variable:,personas_mig,No. Observations:,306.0
Model:,GLM,Df Residuals:,284.0
Model Family:,Poisson,Df Model:,21.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-6428.8
Date:,"Wed, 25 Aug 2021",Deviance:,10964.0
Time:,20:15:14,Pearson chi2:,13300.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
nom_depto_orig[ARTIGAS],4.6405,0.035,131.893,0.000,4.571,4.709
nom_depto_orig[CANELONES],5.3474,0.023,234.981,0.000,5.303,5.392
nom_depto_orig[CERRO LARGO],4.3357,0.032,136.800,0.000,4.274,4.398
nom_depto_orig[COLONIA],4.0844,0.029,139.449,0.000,4.027,4.142
nom_depto_orig[DURAZNO],3.8025,0.030,125.649,0.000,3.743,3.862
nom_depto_orig[FLORES],3.0400,0.039,78.325,0.000,2.964,3.116
nom_depto_orig[FLORIDA],3.8090,0.029,129.226,0.000,3.751,3.867
nom_depto_orig[LAVALLEJA],3.6887,0.031,118.106,0.000,3.627,3.750
nom_depto_orig[MALDONADO],4.4988,0.028,162.535,0.000,4.445,4.553


In [8]:
# Compare observed migrant counts (Montevideo excluded) with the model's
# predictions; predict() is called without arguments, i.e. on the data the
# model was fitted with. The recorded output reports R² and RMSE.
print_scores_simple(
    dd_deptos_no_mvo["personas_mig"],
    prodSim.predict(),
)

$R²$ = 0.7562

RMSE = 101.3761

In [9]:
# Persist the fitted model that excludes Montevideo as a pickle file.
# The destination folder is created first so that a fresh checkout (or a
# Restart & Run All) does not fail when "modelos/" is missing.
ruta_modelo_no_mvo = Path("modelos/restringido_origen_noMVO.pickle")
ruta_modelo_no_mvo.parent.mkdir(parents=True, exist_ok=True)
prodSim.save(str(ruta_modelo_no_mvo))