In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import PoissonRegressor

from patsy import dmatrices

# funciones desarrolladas
from functions.agrupar_dfs_censo import *
from functions.cargar_data import *
from functions.impresion import *

In [2]:
dd_deptos_ps = pd.read_csv('tablas/dd_deptos.csv')
dd_deptos_ps.loc[dd_deptos_ps.largo_limite.isna(), 'largo_limite'] = 0.0001

dd_deptos_ps['log_largo_limite'] = np.log(dd_deptos_ps.largo_limite)

In [36]:
formula = "personas_mig ~ nom_depto_orig + dummy_limit + log_pbi_destino + log_dist -1"

y, X = dmatrices(formula, dd_deptos_ps, return_type = 'dataframe')

In [44]:
X.head()

Unnamed: 0,nom_depto_orig[ARTIGAS],nom_depto_orig[CANELONES],nom_depto_orig[CERRO LARGO],nom_depto_orig[COLONIA],nom_depto_orig[DURAZNO],nom_depto_orig[FLORES],nom_depto_orig[FLORIDA],nom_depto_orig[LAVALLEJA],nom_depto_orig[MALDONADO],nom_depto_orig[MONTEVIDEO],...,nom_depto_orig[RIVERA],nom_depto_orig[ROCHA],nom_depto_orig[SALTO],nom_depto_orig[SAN JOSE],nom_depto_orig[SORIANO],nom_depto_orig[TACUAREMBO],nom_depto_orig[TREINTA Y TRES],dummy_limit[T.True],log_pbi_destino,log_dist
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.469807,13.322404
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18.273415,10.023224
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.651595,12.883352
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.594887,12.059734
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.3823,12.200909


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.85, random_state=0)

In [39]:
X_test.shape[0]/X.shape[0]

0.8508771929824561

In [40]:
# alpha=0.0 determina el no uso de rgularización
model = PoissonRegressor(alpha=0, fit_intercept=False, max_iter=10000000000000)
mdl = model.fit(X_train, y_train.values.ravel())
np.around(model.coef_, 4)

array([-6.2424, -5.5568, -7.3108, -6.6952, -7.2907, -8.3857, -7.6316,
       -7.4059, -6.1361, -4.194 , -6.8052, -6.786 ,  0.    , -6.9059,
       -6.8049, -7.0382, -6.7766, -6.7742, -6.8008,  1.2491,  1.0554,
       -0.5367])

In [41]:
print(model.score(X, y.values.ravel()))

-11.154863033170258


In [42]:
y_pred = model.predict(X_test)

In [43]:
scores = cross_val_score(mdl, X, y.values.ravel(), cv=5, scoring='r2')

print(scores)
print('\n')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[-6.66535089e+03 -4.66788840e+06 -1.02993969e+07 -1.29103351e+06
 -1.28048843e+07]


-5813973.69 accuracy with a standard deviation of 4989733.10


https://stats.stackexchange.com/a/174846

Both allow you to compare models. But.

**Cross-validated $R^2$** is likely to give you the $R^2$ you would observe generalizing your model to unseen data (provided the distributions of the train data and the test data remain the same).

**Adjusted $R^2$** is a way to compare models, possibly helps you to pick up the best, penalizing them for the number of predictors they use. This does not give you any hint regarding the performance of the models on new data.

In [38]:
r2_score(y_test, y_pred)

-3.2121166660405356

#### Prueba con las matrices completas

Solo para comparar coeficientes y predicciones contra las que devuelves statsmodel

In [28]:
dd_deptos_ps.personas_mig

0        914
1      33127
2       1387
3       2100
4        982
       ...  
337      174
338       13
339       28
340       25
341       38
Name: personas_mig, Length: 342, dtype: int64

In [29]:
# Si corremos con la matrices completas vemos que los resultados tanto de los coeficientes como de las predicciones son similares a los que devuelve statsmodel

# alpha=0.0 determina el no uso de rgularización
model = PoissonRegressor(alpha=0, fit_intercept = False, max_iter=10000)
mdl = model.fit(X, y.values.ravel())

y_pred = model.predict(X)

print(np.around(model.coef_, 4))


r2 = round(r2_score(dd_deptos_ps['personas_mig'], y_pred), 4)
print('El R2 (coincide con el abordaje de statsmodel):', r2)

y_pred = model.predict(X)
list(np.round(y_pred))

[ 0.8907  0.3788  0.519   0.1649 -0.0836 -1.0428 -0.3388 -0.3429  0.2271
  2.5748  0.5033 -0.012   0.7054  0.0459  0.7633 -0.4886  0.247   0.6533
 -0.021   0.8527 -0.7834]
El R2 (coincide con el abordaje de statsmodel): 0.9726


[484.0,
 29872.0,
 797.0,
 3397.0,
 1081.0,
 705.0,
 2378.0,
 1714.0,
 5034.0,
 1237.0,
 1285.0,
 769.0,
 1159.0,
 953.0,
 4712.0,
 1438.0,
 900.0,
 739.0,
 1761.0,
 423.0,
 144.0,
 252.0,
 107.0,
 64.0,
 121.0,
 99.0,
 253.0,
 266.0,
 206.0,
 337.0,
 106.0,
 378.0,
 163.0,
 173.0,
 219.0,
 92.0,
 14004.0,
 54.0,
 93.0,
 351.0,
 125.0,
 81.0,
 297.0,
 216.0,
 602.0,
 140.0,
 144.0,
 87.0,
 137.0,
 107.0,
 445.0,
 161.0,
 102.0,
 88.0,
 1713.0,
 85.0,
 426.0,
 198.0,
 107.0,
 54.0,
 120.0,
 128.0,
 305.0,
 137.0,
 117.0,
 173.0,
 147.0,
 135.0,
 144.0,
 115.0,
 153.0,
 202.0,
 2291.0,
 47.0,
 505.0,
 62.0,
 97.0,
 85.0,
 129.0,
 86.0,
 229.0,
 150.0,
 184.0,
 70.0,
 73.0,
 106.0,
 263.0,
 242.0,
 83.0,
 50.0,
 1601.0,
 43.0,
 394.0,
 74.0,
 212.0,
 100.0,
 145.0,
 87.0,
 197.0,
 115.0,
 113.0,
 73.0,
 71.0,
 87.0,
 156.0,
 125.0,
 100.0,
 65.0,
 657.0,
 16.0,
 162.0,
 24.0,
 117.0,
 63.0,
 54.0,
 30.0,
 73.0,
 56.0,
 57.0,
 27.0,
 24.0,
 37.0,
 80.0,
 67.0,
 35.0,
 20.0,
 2092.0,
 29.0,

In [30]:
mdl.score()

TypeError: score() missing 2 required positional arguments: 'X' and 'y'