In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model, datasets
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [2]:
def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

def threshold(z):
    z = np.array([1 if z_ >= 0.5 else 0 for z_ in z])
    return z

class LogisticRegression():
    """
    Parameters:
    -----------
    n_iterations: The number of training iterations the algorithm will tune the weights for.
    learning_rate: The step length that will be used when updating the weights.
    """
    def __init__(self, n_iterations = 1000, learning_rate = 0.01):
        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.w = None
        self.log_loss = 0

    def init_weight(self, n_features):
        # Init weights all values 1
        self.w = np.full((n_features, ), 0.1).reshape(-1, 1)

    def fit(self, X, y):

        # Insert one more column value 1 for bias
        X = np.insert(X, 0, 1, axis=1)

        n_samples, n_features = X.shape

        self.init_weight(n_features=X.shape[1])

        # Do gradient descent for n_iterations
        for i in range(self.n_iterations+1):

            # Calculate y prediction
            y_pred = sigmoid(np.dot(X, self.w))

            # Calculate Gradient Descent for Log Loss Error
            self.log_loss = (1/n_samples) * (np.dot(-y.T, np.log(y_pred)) - np.dot((1-y).T, np.log(1-y_pred)))

            if i % 100 == 0:
             print("Cost: ", self.log_loss)
            
            grad = (1/n_samples) * np.dot(X.T, (y_pred-y))

            #Update weights
            self.w -= self.learning_rate * grad


    def predict(self, X):
        X = np.insert(X, 0, 1, axis=1)
        y_pred = X.dot(self.w)
        return threshold(sigmoid(y_pred))

In [3]:
np.random.seed(3)

In [4]:
# Import our diabetes dataset
df = pd.read_csv("diabetes2.csv")

In [5]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Choose Glucose and BloodPressure column to classify
df = df[["Glucose", "BloodPressure", "Outcome"]]

In [7]:
df.describe()

Unnamed: 0,Glucose,BloodPressure,Outcome
count,768.0,768.0,768.0
mean,120.894531,69.105469,0.348958
std,31.972618,19.355807,0.476951
min,0.0,0.0,0.0
25%,99.0,62.0,0.0
50%,117.0,72.0,0.0
75%,140.25,80.0,1.0
max,199.0,122.0,1.0


We can see that the minimum values are 0. That is not logically right, since BloodPressure cannot be 0 (I'm not really sure about Glucose but let's just assume that it must be different from zero). So we're gonna delete value 0 from the data

In [8]:
# Delete value 0
df = df[(df[['Glucose','BloodPressure']] != 0).all(axis=1)]
df.describe()

Unnamed: 0,Glucose,BloodPressure,Outcome
count,728.0,728.0,728.0
mean,121.873626,72.438187,0.343407
std,30.679207,12.386109,0.475172
min,44.0,24.0,0.0
25%,100.0,64.0,0.0
50%,117.0,72.0,0.0
75%,141.25,80.0,1.0
max,199.0,122.0,1.0


In [9]:
X = df[['Glucose','BloodPressure']].values
X

array([[148,  72],
       [ 85,  66],
       [183,  64],
       ...,
       [121,  72],
       [126,  60],
       [ 93,  70]], dtype=int64)

In [10]:
y = df['Outcome'].values.reshape(-1, 1)

In [11]:
# Normalize grades to values between 0 and 1 for more efficient computation
normalized_range = preprocessing.MinMaxScaler(feature_range=(0, 1))

X = normalized_range.fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [13]:
# Our model
model = LogisticRegression(n_iterations=1000, learning_rate = 0.25)
model.fit(X_train, y_train)

Cost:  [[0.72233207]]
Cost:  [[0.6181795]]
Cost:  [[0.59125294]]
Cost:  [[0.57232366]]
Cost:  [[0.55880346]]
Cost:  [[0.54897677]]
Cost:  [[0.54171068]]
Cost:  [[0.53625036]]
Cost:  [[0.53208545]]
Cost:  [[0.5288651]]
Cost:  [[0.52634398]]


In [14]:
# Predict
y_pred = model.predict(X_test)

In [15]:
print("Our model acurracy: %.2f %%" %(100*accuracy_score(y_test, y_pred))) 

Our model acurracy: 77.17 %


In [16]:
# Sklearn model
scikit_log_reg = linear_model.LogisticRegression()
scikit_log_reg.fit(X_train,y_train.flatten())



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
y_pred1 = scikit_log_reg.predict(X_test)

In [18]:
print("Sklearn model acurracy: %.2f %%" %(100*accuracy_score(y_test.flatten(), y_pred1))) 

Sklearn model acurracy: 77.63 %


Both models are not very accurate (maybe because noises, outliers or something else...).
So let's just compare our model and sklearn model results

In [19]:
# Compare our intercept and Sklearn intercept
print("Our model intercept: ", model.w[0])
print("Sklearn model intercept: ", scikit_log_reg.intercept_)

Our model intercept:  [-3.05706066]
Sklearn model intercept:  [-3.09853719]


In [20]:
# Compare our coefficient and Sklearn coefficient
print("Our model coefficient: ", model.w[1:].flatten())
print("Sklearn model coefficient: ", scikit_log_reg.coef_)

Our model coefficient:  [4.27193748 0.49104607]
Sklearn model coefficient:  [[4.08158726 0.7863205 ]]
