In [78]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from patsy import dmatrices

# funciones desarrolladas
from functions.agrupar_dfs_censo import *
from functions.cargar_data import *
from functions.impresion import *

In [79]:
dd_deptos = pd.read_csv('tablas/dd_deptos.csv')

In [80]:
y, X = dmatrices('personas_mig ~ nom_depto_orig + log_pbi_destino + log_dist -1', dd_deptos, return_type = 'dataframe')

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [82]:
X_train.shape, X_test.shape

((256, 21), (86, 21))

In [83]:
# alpha=0.0 determina el no uso de rgularización
model = PoissonRegressor(alpha=0, fit_intercept = False, max_iter=10000)
mdl = model.fit(X_train, y_train.values.ravel())
np.around(model.coef_, 4)

array([ 0.43  , -0.0253,  0.1162, -0.1806, -0.492 , -1.6187, -0.8578,
       -0.9925, -0.0654,  2.1845,  0.2019, -0.4774,  0.3539, -0.2829,
        0.3936, -0.9386, -0.1382,  0.2352, -0.45  ,  0.8642, -0.7661])

In [84]:
print(model.score(X, y.values.ravel()))

0.9418089691388523


In [85]:
y_pred = model.predict(X_test)
y_pred

array([ 105.24077847,  236.75686793,   44.0504327 ,  817.0906373 ,
        172.76142969,   95.60755913,   86.54816855,  100.68721815,
        239.35914559,   75.94274207,   52.05180594, 1700.43734291,
        289.90754221,  119.31149592,  145.67737115,   24.92768387,
        489.47237241,  286.97822993,  358.30840565,   66.87983921,
        208.20132017,  166.00041483,   70.65993454,  128.18842805,
        195.64051069,  498.56999415,  289.20175062,   54.80629014,
         80.93075326,  288.78152586,  355.90968829,  150.97242738,
       1801.29447314,   81.67033571,  147.7271954 ,   72.71876792,
         42.95931023,  245.10496725,  172.64666784,  313.91236762,
         83.92821718,  165.00632991,  142.0924345 ,  156.56927918,
         53.52596042,   66.85156242,   71.03860081,  102.48992402,
       3634.19242058,  117.06526358,  321.86640832,  182.01371292,
         73.57630684,   94.39441781,  117.7507994 ,   98.75342796,
         70.64910339,  544.82893726,   64.10639015,  137.18160

In [89]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(mdl, X, y.values.ravel(), scoring='r2')

print(scores)
print('\n')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

[-0.05307546  0.33188368 -0.35017115  0.26512283  0.70919782]


0.18 accuracy with a standard deviation of 0.36


https://stats.stackexchange.com/a/174846

Both allow you to compare models. But.

**Cross-validated $R^2$** is likely to give you the $R^2$ you would observe generalizing your model to unseen data (provided the distributions of the train data and the test data remain the same).

**Adjusted $R^2$** is a way to compare models, possibly helps you to pick up the best, penalizing them for the number of predictors they use. This does not give you any hint regarding the performance of the models on new data.

In [90]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

0.6959498766865473

#### Prueba con las matrices completas

Solo para comparar coeficientes y predicciones contra las que devuelves statsmodel

In [104]:
dd_deptos.personas_mig

0        914
1      33127
2       1387
3       2100
4        982
       ...  
337      174
338       13
339       28
340       25
341       38
Name: personas_mig, Length: 342, dtype: int64

In [108]:
# Si corremos con la matrices completas vemos que los resultados tanto de los coeficientes como de las predicciones son similares a los que devuelve statsmodel

# alpha=0.0 determina el no uso de rgularización
model = PoissonRegressor(alpha=0, fit_intercept = False, max_iter=10000)
mdl = model.fit(X, y.values.ravel())

y_pred = model.predict(X)

print(np.around(model.coef_, 4))


r2 = round(r2_score(dd_deptos['personas_mig'], y_pred), 4)
print('El R2 (coincide con el abordaje de statsmodel):', r2)

y_pred = model.predict(X)
list(np.round(y_pred))

[ 0.8907  0.3788  0.519   0.1649 -0.0836 -1.0428 -0.3388 -0.3429  0.2271
  2.5748  0.5033 -0.012   0.7054  0.0459  0.7633 -0.4886  0.247   0.6533
 -0.021   0.8527 -0.7834]
El R2 (coincide con el abordaje de statsmodel): 0.9726


[484.0,
 29872.0,
 797.0,
 3397.0,
 1081.0,
 705.0,
 2378.0,
 1714.0,
 5034.0,
 1237.0,
 1285.0,
 769.0,
 1159.0,
 953.0,
 4712.0,
 1438.0,
 900.0,
 739.0,
 1761.0,
 423.0,
 144.0,
 252.0,
 107.0,
 64.0,
 121.0,
 99.0,
 253.0,
 266.0,
 206.0,
 337.0,
 106.0,
 378.0,
 163.0,
 173.0,
 219.0,
 92.0,
 14004.0,
 54.0,
 93.0,
 351.0,
 125.0,
 81.0,
 297.0,
 216.0,
 602.0,
 140.0,
 144.0,
 87.0,
 137.0,
 107.0,
 445.0,
 161.0,
 102.0,
 88.0,
 1713.0,
 85.0,
 426.0,
 198.0,
 107.0,
 54.0,
 120.0,
 128.0,
 305.0,
 137.0,
 117.0,
 173.0,
 147.0,
 135.0,
 144.0,
 115.0,
 153.0,
 202.0,
 2291.0,
 47.0,
 505.0,
 62.0,
 97.0,
 85.0,
 129.0,
 86.0,
 229.0,
 150.0,
 184.0,
 70.0,
 73.0,
 106.0,
 263.0,
 242.0,
 83.0,
 50.0,
 1601.0,
 43.0,
 394.0,
 74.0,
 212.0,
 100.0,
 145.0,
 87.0,
 197.0,
 115.0,
 113.0,
 73.0,
 71.0,
 87.0,
 156.0,
 125.0,
 100.0,
 65.0,
 657.0,
 16.0,
 162.0,
 24.0,
 117.0,
 63.0,
 54.0,
 30.0,
 73.0,
 56.0,
 57.0,
 27.0,
 24.0,
 37.0,
 80.0,
 67.0,
 35.0,
 20.0,
 2092.0,
 29.0,

In [95]:
mdl.score()

TypeError: score() missing 2 required positional arguments: 'X' and 'y'