In [46]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import StratifiedKFold

In [47]:
class OurLogisticRegression (BaseEstimator, ClassifierMixin):
    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
       

    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)

        # weights initialization
        self.w = np.zeros(X.shape[1])

        for i in range(self.num_iter):
            z = np.dot(X, self.w)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h-y)) / y.size
            #print(gradient.shape, self.w.shape, z.shape,h.shape)
            #gradient = (h - y) / y.size
            self.w -= self.lr * gradient

    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)

        return self.__sigmoid(np.dot(X, self.w))

    def predict(self, X, threshold):
        return self.predict_prob(X) >= threshold
    
    def score(self, y_pred,y_test):
        return float(sum(y_pred == y_test)) / float(len(y_test))


In [48]:
df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv")
df.head()

#X = np.array([[1,2],[1,3],[1,4],[1,5]])
#y = np.array([[0],[0],[1],[1]])
#ourRegression=LogisticRegression(alpha=0.01,iterations=10000)

#w, J_history = ourRegression.gradient_descent(X, y)

#print("W encontrado por gradiente descendente: ")
#print(w)
    


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [49]:
features=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
X=df[features]
y=df.Outcome

# PUNTO 1 - MEJORAR EL ACCURACY 

# ESTANDARIZACION 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Calcula la matriz de covarianza
cov_matrix = np.cov(X_scaled, rowvar=False)

# DISMINUCION DE VECTOR POR MATRIZ DE COVARIANZA

mini = np.min(np.abs(cov_matrix))
pos = np.argwhere(np.abs(cov_matrix) == mini)

# SE ELIMINA DIABETESPEDIGREEFUNCTION 

X_deleted = np.delete(X_scaled, 6, axis=1)

X_train,X_test,y_train,y_test=train_test_split(X_deleted,y,test_size=0.150,random_state=45)

In [50]:
model = OurLogisticRegression(lr=0.15, num_iter=3500)

# fit the model to the training data
model.fit(X_train, y_train)

# Creamos un modelo de regresión logística
#model = LogisticRegression()

#model.fit(X_train, y_train)



In [51]:
# PUNTO 2
# GRID SEARCH 
# Definimos la cuadrícula de hiperparámetros que queremos probar
param_grid = {
    'lr': [0.01, 0.1, 0.5, 0.001],
    'num_iter': [35, 350, 3500, 35000],
    'fit_intercept': [True, False]
}

grid_search = GridSearchCV(model, param_grid, cv=StratifiedKFold(n_splits=5))

# búsqueda de hiperparámetros en el conjunto de entrenamiento
grid_search.fit(X_train, y_train)


In [52]:
# predict probabilities for test set
probs = model.predict_prob(X_test)

# predict classes for test set
y_pred = model.predict(X_test, 0.5)

In [53]:
# mejores hiperparámetros encontrados
best_params = grid_search.best_params_
print("Mejores hiperparámetros encontrados:")
print(best_params)

# mejor modelo entrenado
best_model = grid_search.best_estimator_

accuracy = best_model.score(y_pred, y_test)
print(f"Precisión: {accuracy}")

Mejores hiperparámetros encontrados:
{'fit_intercept': True, 'lr': 0.01, 'num_iter': 35}
Precisión: 0.7327586206896551


In [54]:
#print(y_pred)
#print(y_test)

print("Precisión: ",model.score(y_pred,y_test))


Precisión:  0.7327586206896551
