In [None]:
# ==================================================================================
# Diplomado en Estadística Aplicada a la Toma de Decisiones con Lenguaje R y Python
# Universidad Privada Boliviana
# ----------------------------------------------------------------------------------
#   MODELOS PREDICTIVOS PARA LA TOMA DE DECISIONES ESTRATEGICAS
# ----------------------------------------------------------------------------------
#         Enrique Alejandro Laurel Cossio, Septiembre 2024
# ==================================================================================
#                      Métricas de Evaluación de Ajuste
# ==================================================================================

In [29]:
# Cargamos Librerias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,mean_squared_error,r2_score,confusion_matrix,recall_score

# 1. REGRESIÓN

In [2]:
# cargamos datos
url='https://raw.githubusercontent.com/ealaurel/MODELOS_PREDICTIVOS/main/data/insurance.csv'
seguros = pd.read_csv(url,sep=',', encoding='iso-8859-1')

print(seguros.shape) #
seguros.head(2)

(1338, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [3]:
# una copia de la fuente de datos
df = seguros.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# a dummies las variables categoricas
df = pd.get_dummies(df, columns=['region'], drop_first=True)
print(df.shape)
df.head(2)

(1338, 9)


Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,female,27.9,0,yes,16884.924,False,False,True
1,18,male,33.77,1,no,1725.5523,False,True,False


In [5]:
# a dummies
df = pd.get_dummies(df, columns=['sex','smoker'], drop_first=True)
print(df.shape)
df.head(2)

(1338, 9)


Unnamed: 0,age,bmi,children,charges,region_northwest,region_southeast,region_southwest,sex_male,smoker_yes
0,19,27.9,0,16884.924,False,False,True,False,True
1,18,33.77,1,1725.5523,False,True,False,True,False


In [6]:
# variables dummies  a entero
df = df.replace({True: 1, False: 0})

In [7]:
# Definir las variables independientes y dependientes
X = df[['age', 'sex_male', 'bmi', 'children', 'smoker_yes', 'region_northwest','region_southeast','region_southwest']]
y = df['charges'] # variables dependiente

# Añadir una constante a las variables independientes
X = sm.add_constant(X) # relacionado intercepto

In [8]:
# División de conjunto de datos en entrenamiento y testeo
# train - test
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2) #Separamos 30% para test

In [9]:
# Ajustar el modelo de regresión lineal
modelo_OLS = sm.OLS(y_train, x_train).fit()

# Resumen del modelo
print(modelo_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.743
Model:                            OLS   Adj. R-squared:                  0.741
Method:                 Least Squares   F-statistic:                     334.7
Date:                Mon, 23 Sep 2024   Prob (F-statistic):          3.10e-267
Time:                        23:07:02   Log-Likelihood:                -9471.2
No. Observations:                 936   AIC:                         1.896e+04
Df Residuals:                     927   BIC:                         1.900e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const            -1.158e+04   1189.204  

In [12]:
# calculamos predicción
y_pred = modelo_OLS.predict(x_test)
y_pred


Unnamed: 0,0
17,2081.406969
1091,12095.342723
273,10543.987019
270,2596.691265
874,8389.435479
...,...
468,4215.635778
1232,11329.155173
682,33177.522618
86,35799.797379


In [14]:
# R cuadrado del modelo
r2 = r2_score(y_test, y_pred)
print(f"R cuadrado (R^2): {r2:.4f}")

R cuadrado (R^2): 0.7642


In [13]:
# Error Cuadratico Medio
mse = mean_squared_error(y_test, y_pred)
print(f"Error cuadrático medio (MSE): {mse:.4f}")
# Mientras menos mejor

Error cuadrático medio (MSE): 38108732.4898


# 2. CLASIFICACIÓN

In [16]:
# cargamos datos
url='https://raw.githubusercontent.com/ealaurel/MODELOS_PREDICTIVOS/main/data/ifood_df.csv'
ifood_df = pd.read_csv(url,sep=',', encoding='iso-8859-1')

print(ifood_df.shape) #
ifood_df.head(2)

(2205, 39)


Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,marital_Together,marital_Widow,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,MntTotal,MntRegularProds,AcceptedCmpOverall
0,58138.0,0,0,58,635,88,546,172,88,88,...,0,0,0,0,1,0,0,1529,1441,0
1,46344.0,1,1,38,11,1,6,2,1,6,...,0,0,0,0,1,0,0,21,15,0


In [17]:
# una copia a los datos
df = ifood_df.copy()

In [18]:
df.columns

Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Age', 'Customer_Days', 'marital_Divorced', 'marital_Married',
       'marital_Single', 'marital_Together', 'marital_Widow',
       'education_2n Cycle', 'education_Basic', 'education_Graduation',
       'education_Master', 'education_PhD', 'MntTotal', 'MntRegularProds',
       'AcceptedCmpOverall'],
      dtype='object')

In [19]:
# seleccionamos variables explicativas y variables dependiente
X = df[['Kidhome','Recency','education_Basic','Complain']]
y = df['Response']

In [20]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=777)

In [21]:
# Ajusta el modelo de regresión logística
modelo_logit = sm.Logit(y_train, X_train).fit()

Optimization terminated successfully.
         Current function value: 0.397029
         Iterations 7


In [22]:
# Muestra un resumen del modelo
print(modelo_logit.summary())

                           Logit Regression Results                           
Dep. Variable:               Response   No. Observations:                 1764
Model:                          Logit   Df Residuals:                     1760
Method:                           MLE   Df Model:                            3
Date:                Mon, 23 Sep 2024   Pseudo R-squ.:                 0.03667
Time:                        23:21:11   Log-Likelihood:                -700.36
converged:                       True   LL-Null:                       -727.02
Covariance Type:            nonrobust   LLR p-value:                 1.565e-11
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Kidhome            -0.6746      0.126     -5.347      0.000      -0.922      -0.427
Recency            -0.0319      0.002    -18.034      0.000      -0.035      -0.028
education_Basic    -1.4569      

In [23]:
# prediccion
y_pred_train = modelo_logit.predict(X_train)
y_pred_train.head(2)

Unnamed: 0,0
415,0.043164
1609,0.159956


In [34]:
# Calcula la Exactitud del modelo en el entrenamiento
# y_train: valores verdadaderos
# y_pred: valores predichos
exactitud_train = accuracy_score(y_train, (y_pred_train>0.5).astype(int))
print("Exactitud o accuracy del modelo:", exactitud_train)

Exactitud o accuracy del modelo: 0.8560090702947846


In [35]:
# Realiza predicciones en el conjunto de datos de prueba
y_pred = modelo_logit.predict(X_test)
y_pred

Unnamed: 0,0
48,0.083407
193,0.029847
1824,0.152781
2038,0.218264
380,0.091019
...,...
549,0.072317
660,0.352970
1361,0.428724
480,0.066155


In [38]:
# Matriz de confusión
confusion = confusion_matrix(y_test, (y_pred>0.5).astype(int))
print("Matriz de confusión:")
print(confusion)

Matriz de confusión:
[[362   0]
 [ 79   0]]


In [39]:
# Exactitud o Accuracy del modelo
exactitud_test = accuracy_score(y_test, (y_pred>0.5).astype(int))
print("Precisión del modelo:", exactitud_test)

Precisión del modelo: 0.8208616780045351


In [40]:
# Medida de sensibilidad
recall = recall_score(y_test, (y_pred>0.5).astype(int))
print(f"Sensibilidad (Recall): {recall:.2f}")

Sensibilidad (Recall): 0.00
