In [21]:
import numpy as np
import pandas as pd
import warnings
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score, f1_score

ModuleNotFoundError: No module named 'imblearn'

In [None]:
class OurLogisticRegression (BaseEstimator, ClassifierMixin):
    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
       

    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)

        # weights initialization
        self.w = np.zeros(X.shape[1])

        for i in range(self.num_iter):
            z = np.dot(X, self.w)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h-y)) / y.size
            #print(gradient.shape, self.w.shape, z.shape,h.shape)
            #gradient = (h - y) / y.size
            self.w -= self.lr * gradient

    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)

        return self.__sigmoid(np.dot(X, self.w))

    def predict(self, X, threshold):
        return self.predict_prob(X) >= threshold
    
    def score(self, y_pred,y_test):
        return float(sum(y_pred == y_test)) / float(len(y_test))


In [None]:
df = pd.read_csv("Bank_Personal_Loan_Modelling.csv", encoding='latin-1')
df.head()

#X = np.array([[1,2],[1,3],[1,4],[1,5]])
#y = np.array([[0],[0],[1],[1]])
#ourRegression=LogisticRegression(alpha=0.01,iterations=10000)

#w, J_history = ourRegression.gradient_descent(X, y)

#print("W encontrado por gradiente descendente: ")
#print(w)
    


Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [None]:
features= df.columns.tolist()
print(features)
X=df[features]
y = df["Personal Loan"] 

# PUNTO 1 - MEJORAR EL ACCURACY 

# ESTANDARIZACION 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Calcula la matriz de covarianza
cov_matrix = np.cov(X_scaled, rowvar=False)

#PCA
pca = PCA()
pca.fit(cov_matrix)

componentes_principales = pca.components_
varianza_explicada = pca.explained_variance_ratio_

varianza_acumulativa = np.cumsum(varianza_explicada)
num_componentes_deseados = np.argmax(varianza_acumulativa >= 0.95) + 1  

X_reduced = pca.transform(X)[:, :num_componentes_deseados]

oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_scaled, y)

X_train,X_test,y_train,y_test=train_test_split(X_resampled,y_resampled,test_size=0.30,random_state=45)

['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']




In [None]:
model = OurLogisticRegression()

# fit the model to the training data
model.fit(X_train, y_train)

# Creamos un modelo de regresión logística
#model = LogisticRegression()

#model.fit(X_train, y_train)



In [None]:
warnings.filterwarnings("ignore", category=RuntimeWarning)

# PUNTO 2
# GRID SEARCH 
# Definimos la cuadrícula de hiperparámetros que queremos probar
param_grid = {
    'lr': [0.01, 0.1, 1],
    'num_iter': [1000, 5000],
    'fit_intercept': [True, False]
}

grid_search = GridSearchCV(model, param_grid, cv=StratifiedKFold(n_splits=5))

# búsqueda de hiperparámetros en el conjunto de entrenamiento
grid_search.fit(X_train, y_train)


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=OurLogisticRegression(),
             param_grid={'fit_intercept': [True, False], 'lr': [0.01, 0.1, 1],
                         'num_iter': [1000, 5000]})

In [None]:
warnings.filterwarnings("ignore", category=RuntimeWarning)
# predict probabilities for test set
##probs = model.predict_prob(X_test)

# predict classes for test set
#y_pred = model.predict(X_test, 0.5)


In [None]:
# mejores hiperparámetros encontrados
best_params = grid_search.best_params_
print("Mejores hiperparámetros encontrados:")
print(best_params)

# mejor modelo entrenado
best_model = grid_search.best_estimator_

probs = best_model.predict_prob(X_test)
y_pred = best_model.predict(X_test, threshold=0.5)

train_accuracy = accuracy_score(y_train, best_model.predict(X_train, threshold=0.5))
test_accuracy = accuracy_score(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)  # Calcula el Recall
f1 = f1_score(y_test, y_pred)  # Calcula el F1 Score

print("Precisión del mejor modelo entrenamiento:", train_accuracy)
print("Recall del mejor modelo:", recall)
print("F1 Score del mejor modelo:", f1)

print("Precisión del testing:", test_accuracy)

Mejores hiperparámetros encontrados:
{'fit_intercept': True, 'lr': 0.01, 'num_iter': 1000}
Precisión del mejor modelo entrenamiento: 0.9091428571428571
Recall del mejor modelo: 0.0
F1 Score del mejor modelo: 0.0
Precisión del testing: 0.892


In [None]:
#print(y_pred)
#print(y_test)

print("Precisión: ",best_model.score(y_pred,y_test))


Precisión:  0.892
