# Testing the model

Using your solution so far, test the model on new data.

The new data is located in the ‘Bank_data_testing.csv’.

Good luck!

## Import the relevant libraries

In [5]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

## Load the data

Load the ‘Bank_data.csv’ dataset.

In [10]:
raw_data=pd.read_csv("Bank-data-testing.csv")
raw_data.head(-10)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.313,0.0,1.0,0.0,0.0,487.0,no
1,1,4.961,0.0,0.0,0.0,0.0,132.0,no
2,2,4.856,0.0,1.0,0.0,0.0,92.0,no
3,3,4.120,0.0,0.0,0.0,0.0,1468.0,yes
4,4,4.963,0.0,0.0,0.0,0.0,36.0,no
...,...,...,...,...,...,...,...,...
207,207,4.120,0.0,0.0,0.0,0.0,91.0,no
208,208,1.410,0.0,0.0,1.0,0.0,291.0,no
209,209,4.968,0.0,0.0,0.0,0.0,81.0,no
210,210,1.266,0.0,1.0,0.0,0.0,533.0,yes


In [21]:
#Ok vamos a limpiar la data y luego de limpiar vamos a asignar las variables independientes y el target
df=raw_data.drop(columns="Unnamed: 0")
df["y"]=df["y"].map({"no":0, "yes":1})
df=df.rename(columns={"y":"exito"})
df.head()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,exito
0,1.313,0.0,1.0,0.0,0.0,487.0,0
1,4.961,0.0,0.0,0.0,0.0,132.0,0
2,4.856,0.0,1.0,0.0,0.0,92.0,0
3,4.12,0.0,0.0,0.0,0.0,1468.0,1
4,4.963,0.0,0.0,0.0,0.0,36.0,0


In [None]:
#Listo procedemos a partir el dataframe en dos porciones una de entrenamiento otra de testing, antes de eso hay que definir X, y Y 
x=df.drop("exito", axis=1)
y=df["exito"]

#Comprobando asignacion de X variables Feature
x.head()
#Ojo no olvidar la constante de intercepto cuando x=0
x_const=sm.add_constant(x)
x_const.head()

Unnamed: 0,const,interest_rate,credit,march,may,previous,duration
0,1.0,1.313,0.0,1.0,0.0,0.0,487.0
1,1.0,4.961,0.0,0.0,0.0,0.0,132.0
2,1.0,4.856,0.0,1.0,0.0,0.0,92.0
3,1.0,4.12,0.0,0.0,0.0,0.0,1468.0
4,1.0,4.963,0.0,0.0,0.0,0.0,36.0


### Declare the dependent and independent variables

procedemos a partir el dataframe en dos porciones una porcion para entrenamiento y otra para test. Asi podemos evaluar si hay Overfitting y que tan crítico es 
por lo general el acc en nuestra matrix de confusion de test debería ser menor que el acc en nuestra matrix de confusion de entrenamiento.

In [None]:
#Procedemos a partir los datos con la funncion train_test_split de Sklearn 
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test=train_test_split( x_const, y, test_size=0.2, random_state=42)


In [32]:
#Creamos el modelo de regresion logistica con el data train 
modelo_train=sm.Logit(y_train, x_train)
Res_mod_train=modelo_train.fit()

         Current function value: 0.312612
         Iterations: 35




In [None]:
#Log-likelihood es mayor a LL-Null esto es una buena señal y por otro lado LLR P-VALUE tiende a cero podemos ver todas las constantes de cada variable independiente.
Res_mod_train.summary()

0,1,2,3
Dep. Variable:,exito,No. Observations:,177.0
Model:,Logit,Df Residuals:,170.0
Method:,MLE,Df Model:,6.0
Date:,"Fri, 10 Oct 2025",Pseudo R-squ.:,0.549
Time:,11:03:41,Log-Likelihood:,-55.332
converged:,False,LL-Null:,-122.68
Covariance Type:,nonrobust,LLR p-value:,1.312e-26

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0087,0.517,0.017,0.987,-1.005,1.023
interest_rate,-0.8354,0.169,-4.947,0.000,-1.166,-0.504
credit,13.1735,578.648,0.023,0.982,-1120.956,1147.302
march,-1.6857,0.577,-2.923,0.003,-2.816,-0.556
may,0.0043,0.336,0.013,0.990,-0.655,0.663
previous,27.2803,2.93e+05,9.32e-05,1.000,-5.74e+05,5.74e+05
duration,0.0071,0.001,5.825,0.000,0.005,0.009


In [42]:
#Inyecto la funcion de calculo de accuracy para calcular el acc de la matrix de confusion segun mi data set, target, modelo

def confusion_matrix(data_input,actual_target,model):
        
        # Confusion matrix 
        
        # Parameters
        # ----------
        # data: data frame or array
            # data is a data frame formatted in the same way as your input data (without the actual values)
            # e.g. const, var1, var2, etc. Order is very important!
        # actual_values: data frame or array
            # These are the actual values from the test_data
            # In the case of a logistic regression, it should be a single column with 0s and 1s
            
        # model: a LogitResults object
            # this is the variable where you have the fitted model 
            # e.g. results_log in this course
        # ----------
        
        #Predict the values using the Logit model
        pred_values = model.predict(data_input)
        # Specify the bins 
        bins=np.array([0,0.5,1])
        # Create a histogram, where if values are between 0 and 0.5 tell will be considered 0
        # if they are between 0.5 and 1, they will be considered 1
        cm = np.histogram2d(actual_target, pred_values, bins=bins)[0]
        # Calculate the accuracy
        accuracy = (cm[0,0]+cm[1,1])/cm.sum()
        # Return the confusion matrix and 
        return cm, accuracy

In [43]:
#Llamo la funcion de calculo de acc y calculo de matrix de confusion segun sus valores del dataframe fraccionado para el entrenamiento
res_train_model=confusion_matrix(x_train,y_train,Res_mod_train)
res_train_model

(array([[75., 13.],
        [ 9., 80.]]),
 np.float64(0.8757062146892656))

In [44]:
#Ahora hago el calculo de como se comporta el mismo modelo entrenado anteriormente con el dataframe fraccionado para el Test.De esta manera podemos comprobar el nivel de 
# Overfitting que hay presente en los datos anteriores es decir el nivel de ruido que hay en los datos de entrenamiento. Lo ideal es que disminuya lo menos posible

res_test_model=confusion_matrix(x_test,y_test, Res_mod_train)
res_test_model


(array([[18.,  5.],
        [ 3., 19.]]),
 np.float64(0.8222222222222222))

In [None]:
# En efecto podemos observar el acc de la matrix de confusion de los datos Test se reducido de 87.6% a 82.2% esto quiere decir que hay un leve overfitting 
# en el modelo de entrenamiento que es lo normal