# Logistic Regression From Scratch

In [2]:
import numpy as np
from sklearn.datasets import make_blobs

In [12]:
class LogisticRegression:
    """
    1. i need LogisticRegression class
    2. I need __init__ function with learning rate and #iteration parameter
    3. I need train function with X,y parameter
    4. I need predict function
    5. I need sigmoid function for gradiant decent and hypothesis
    """
    def __init__(self, learning_rate = 0.1, iteration = 10000):
        """
        :param learning_rate: A samll value needed for gradient decent, default value id 0.1.
        :param iteration: Number of training iteration, default value is 10,000.
        """
        self.lr = learning_rate
        self.it = iteration
    
    def cost_function(self, y, y_pred):
        """
        :param y: Original target value.
        :param y_pred: predicted target value.
        """
        return -1 / self.m * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

    # hypothesis function.   
    def sigmoid(self, z):
        """
        :param z: Value to calculate sigmoid.
        """
        return 1 / (1 + np.exp(-z))

    def train(self, X, y):
        """
        :param X: training data feature values ---> N Dimentional vector.
        :param y: training data target value -----> 1 Dimentional array.
        """
        # Target value should be in the shape of (n, 1) not (n, ).
        # So, this will check that and change the shape to (n, 1), if not.
        try:
            y.shape[1]
        except IndexError as e:
            # we need to change it to the 1 D array, not a list.
            print("ERROR: Target array should be a one dimentional array not a list"
                  "----> here the target value not in the shape of (n,1). \nShape ({shape_y_0},1) and {shape_y} not match"
                  .format(shape_y_0 = y.shape[0] , shape_y = y.shape))
            return 
        # m is number of training samples.
        self.m  = X.shape[0]
        # n is number of features/columns/dependant variables.
        self.n = X.shape[1]

        # Set the initial weight.
        self.w = np.zeros((self.n , 1))
        # bias.
        self.b = 0

        for it in range(1, self.it+1):
            # 1. Find the predicted value.
            # 2. Find the Cost function.
            # 3. Find the derivation of weights and bias.
            # 4. Apply Gradient Decent.

            y_pred = self.sigmoid(np.dot(X, self.w) + self.b)

            cost = self.cost_function(y, y_pred)

            # Derivation of w and b.
            dw = 1 / self.m * np.dot(X.T , (y_pred - y))
            db = 1 / self.m * np.sum(y_pred - y)

            # Chnage the parameter value/ apply Gradient decent.
            self.w = self.w - (self.lr * dw)
            self.b = self.b - (self.lr * db)

            if it % 1000 == 0:
                print("The Cost function for the iteration {}----->{} :)".format(it, cost))
    
    def predict(self, test_X):
        """
        :param: test_X: Values need to be predicted.
        """
        y_pred = self.sigmoid(np.dot(test_X, self.w) + self.b)
        # output of the sigmoid function is between [0 - 1], then need to convert it to class values either 0 or 1.
        y_pred_class = y_pred >=0.5

        return y_pred_class

In [19]:
# Define the traning data.
X, y = make_blobs(n_samples=5000, centers=2)

# Chnage the shape of the target to 1 dimentional array.
y = y[:, np.newaxis]

print("="*100)
print("Number of training data samples-----> {}".format(X.shape[0]))
print("Number of training features --------> {}".format(X.shape[1]))
print("Shape of the target value ----------> {}".format(y.shape))

Number of training data samples-----> 5000
Number of training features --------> 2
Shape of the target value ----------> (5000, 1)


In [20]:
#define the parameters
param = {
    "learning_rate" : 0.1,
    "iteration" : 10000
}
print("="*100)
log_reg = LogisticRegression(**param)

# Train the model.
log_reg.train(X, y) 

# Predict the values.
y_pred = log_reg.predict(X)

#calculate accuracy.
acc = np.sum(y==y_pred)/X.shape[0]
print("="*100)
print("Accuracy of the prediction is {}".format(acc))

The Cost function for the iteration 1000----->0.03602371339407513 :)
The Cost function for the iteration 2000----->0.03083958133641108 :)
The Cost function for the iteration 3000----->0.02804445307164748 :)
The Cost function for the iteration 4000----->0.026035002983647044 :)
The Cost function for the iteration 5000----->0.02442819074384672 :)
The Cost function for the iteration 6000----->0.023077020411096397 :)
The Cost function for the iteration 7000----->0.021909144479974677 :)
The Cost function for the iteration 8000----->0.02088269812859516 :)
The Cost function for the iteration 9000----->0.019970511567438353 :)
The Cost function for the iteration 10000----->0.019153415288694367 :)
Accuracy of the prediction is 0.9942


# Logistic Regression Using sklearn

In [21]:
from sklearn.linear_model import LogisticRegression as LogisticRegression_sklearn
from sklearn.metrics import accuracy_score

In [22]:
# data is already defined, going to use the same data for comparision.
print("="*100)
print("Number of training data samples-----> {}".format(X.shape[0]))
print("Number of training features --------> {}".format(X.shape[1]))

Number of training data samples-----> 5000
Number of training features --------> 2


In [23]:
log_reg_sklearn = LogisticRegression_sklearn()
log_reg_sklearn.fit(X, y)

# predict the value
y_pred_sklearn = log_reg_sklearn.predict(X)
acc = accuracy_score(y, y_pred_sklearn)
print("="*100)
print("Accuracy of the prediction is {}".format(acc))

Accuracy of the prediction is 0.9978


  y = column_or_1d(y, warn=True)


In [24]:
# The main different between this scratch and sklearn model is speed.
# Sklearn model train and predict more faster than sractch code.
#  anyways this scracth code is maily for understanding the behind math and logics :) :)