In [7]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error

In [8]:
# Cargar datos de viviendas en California
housing = fetch_california_housing()
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['target'] = housing.target

In [9]:
# Seleccionar variables relevantes para el modelo
df_model = df[['MedInc', 'AveRooms', 'AveOccup', 'Latitude', 'target']].copy()

In [10]:
print("Primeras observaciones del dataset:")
print(df_model.head())
print("\nEstadisticas descriptivas:")
print(df_model.describe())

Primeras observaciones del dataset:
   MedInc  AveRooms  AveOccup  Latitude  target
0  8.3252  6.984127  2.555556     37.88   4.526
1  8.3014  6.238137  2.109842     37.86   3.585
2  7.2574  8.288136  2.802260     37.85   3.521
3  5.6431  5.817352  2.547945     37.85   3.413
4  3.8462  6.281853  2.181467     37.85   3.422

Estadisticas descriptivas:
             MedInc      AveRooms      AveOccup      Latitude        target
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000
mean       3.870671      5.429000      3.070655     35.631861      2.068558
std        1.899822      2.474173     10.386050      2.135952      1.153956
min        0.499900      0.846154      0.692308     32.540000      0.149990
25%        2.563400      4.440716      2.429741     33.930000      1.196000
50%        3.534800      5.229129      2.818116     34.260000      1.797000
75%        4.743250      6.052381      3.282261     37.710000      2.647250
max       15.000100    141.909091   1243

In [11]:
# Especificar formula del GLM
formula = 'target ~ MedInc + AveRooms + AveOccup + Latitude'

In [12]:
# Ajustar GLM con familia Poisson
model = smf.glm(formula=formula, data=df_model, family=sm.families.Poisson())
results = model.fit()

In [13]:
print("\n" + "="*70)
print("RESUMEN DEL MODELO GLM - VALORACION DE VIVIENDAS")
print("="*70)
print(results.summary())


RESUMEN DEL MODELO GLM - VALORACION DE VIVIENDAS
                 Generalized Linear Model Regression Results                  
Dep. Variable:                 target   No. Observations:                20640
Model:                            GLM   Df Residuals:                    20635
Model Family:                 Poisson   Df Model:                            4
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -29525.
Date:                Sun, 12 Oct 2025   Deviance:                       7082.5
Time:                        11:20:33   Pearson chi2:                 1.77e+04
No. Iterations:                     8   Pseudo R-squ. (CS):             0.2305
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
In

In [14]:
# Predicciones sobre el conjunto completo
predictions = results.predict(df_model)

In [15]:
# Calculo del RMSE
rmse = np.sqrt(mean_squared_error(df_model['target'], predictions))

In [22]:
df['Estimación'] = results.predict(df)
df['Estimación'] = df['Estimación'].round(3)
df['Error'] = df['target'] - df['Estimación']
#.head()
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target,Estimación,Error
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526,3.662,0.864
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585,3.727,-0.142
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521,2.991,0.530
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,2.453,0.960
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,1.826,1.596
...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,1.249,-0.468
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,1.422,-0.651
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,1.276,-0.353
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,1.309,-0.462


In [16]:
print("\n" + "="*70)
print("INDICADORES DE AJUSTE")
print("="*70)
print(f"RMSE: {rmse:.4f}")
print(f"AIC: {results.aic:.2f}")
print(f"BIC: {results.bic:.2f}")


INDICADORES DE AJUSTE
RMSE: 0.9081
AIC: 59060.98
BIC: -197925.89




In [17]:
# Interpretacion de coeficientes para contexto actuarial
print("\n" + "="*70)
print("INTERPRETACION PARA ASEGURAMIENTO DE HOGARES")
print("="*70)
print("\nCoeficientes del modelo (escala logaritmica):")
for variable, coef in results.params.items():
    if variable != 'Intercept':
        efecto_pct = (np.exp(coef) - 1) * 100
        print(f"{variable:15s}: {coef:8.4f} (cambio: {efecto_pct:+6.2f}%)")


INTERPRETACION PARA ASEGURAMIENTO DE HOGARES

Coeficientes del modelo (escala logaritmica):
MedInc         :   0.1598 (cambio: +17.33%)
AveRooms       :  -0.0232 (cambio:  -2.30%)
AveOccup       :  -0.0082 (cambio:  -0.82%)
Latitude       :  -0.0246 (cambio:  -2.43%)


In [18]:
print("\n" + "="*70)
print("NOTA DIDACTICA:")
print("El GLM Poisson es util para modelar valores positivos.")
print("En aseguramiento, ayuda a entender factores de riesgo y pricing.")
print("="*70)


NOTA DIDACTICA:
El GLM Poisson es util para modelar valores positivos.
En aseguramiento, ayuda a entender factores de riesgo y pricing.
