## Logistic Regression from scratch

In [63]:
import numpy as np

In [64]:
#the class:
class Logistic_Regression():
    """Binary logistic regression trained with batch gradient descent.

    Parameters
    ----------
    no_iterations : int
        Number of gradient-descent steps performed by ``fit``.
    alpha : float
        Learning rate (step size) of each update.
    threshold : float, default 0.5
        Probability at or above which ``predict`` returns class 1.
    lambda_ : float, default 0.0
        L2 regularization strength applied to the weights (never to the
        bias). The default of 0 disables regularization.

    Attributes (set by ``fit``)
    ---------------------------
    w : ndarray of shape (n_features,)
        Learned weights.
    b : float
        Learned bias.
    m, n : int
        Number of training samples and features.
    X, y : ndarray
        The training data, converted to float arrays.
    dj_dw, dj_db : ndarray, float
        Gradients from the most recent iteration.
    """

    def __init__(self, no_iterations, alpha, threshold=0.5, lambda_=0.0):
        self.no_iterations = no_iterations
        self.alpha = alpha
        self.threshold = threshold
        self.lambda_ = lambda_  # 0.0 means "no regularization"

    @staticmethod
    def _sigmoid(z):
        """Numerically stable logistic function.

        ``exp`` is only ever evaluated on ``-|z|``, which lies in (0, 1], so
        large-magnitude inputs cannot overflow the way ``1/(1+exp(-z))`` does.
        """
        z = np.asarray(z, dtype=float)
        e = np.exp(-np.abs(z))
        # z >= 0: 1/(1+exp(-z));  z < 0: exp(z)/(1+exp(z)) -- the same function.
        return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    def fit(self, X, y):
        """Learn ``w`` and ``b`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of n_samples binary labels (0/1); a column vector of
            shape (n_samples, 1) is flattened.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so an (m, 1) target cannot broadcast the error term to (m, m).
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.m, self.n = X.shape
        self.w = np.zeros(self.n)
        self.b = 0.0
        self.X = X
        self.y = y
        for _ in range(self.no_iterations):
            f = self._sigmoid(self.X @ self.w + self.b)
            error = f - self.y
            # Gradient of the mean log-loss plus the L2 term (lambda_/m) * w.
            self.dj_dw = (self.X.T @ error) / self.m + (self.lambda_ / self.m) * self.w
            self.dj_db = np.sum(error) / self.m
            self.update_weights()
        return self

    def update_weights(self):
        """Take one gradient-descent step using the stored gradients."""
        self.w = self.w - (self.alpha * self.dj_dw)
        self.b = self.b - (self.alpha * self.dj_db)

    def pred_proba(self, x_test):
        """Return the predicted probability of class 1 for each row of ``x_test``."""
        z = np.dot(np.asarray(x_test, dtype=float), self.w) + self.b
        return self._sigmoid(z)

    def predict(self, x_test):
        """Return 0/1 labels: 1 where the probability is >= ``threshold``."""
        f = self.pred_proba(x_test)
        return (f >= self.threshold).astype(int)

    def score(self, x, y):
        """Return the accuracy (fraction of correct predictions) on ``x``, ``y``.

        Raises
        ------
        ValueError
            If ``y`` is empty or its length differs from the number of predictions.
        """
        # Flatten both sides so a column-shaped y is compared element by element.
        y = np.asarray(y).ravel()
        pred = np.asarray(self.predict(x)).ravel()
        if y.shape[0] == 0:
            raise ValueError("y must contain at least one label")
        if pred.shape[0] != y.shape[0]:
            raise ValueError("x and y must contain the same number of samples")
        return np.mean(pred == y)
        
        
    
    
    

In [65]:
import os

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [66]:
# Load the diabetes dataset used to test the model: the goal is to predict whether a
# patient has diabetes (the Outcome column).
# The CSV location can be overridden with the DIABETES_CSV environment variable so the
# notebook is not tied to one machine; the original local path remains the default.
DATA_PATH = os.environ.get("DIABETES_CSV", r"D:\Downloads\diabetes.csv")
df = pd.read_csv(DATA_PATH)


In [67]:
# Preview the first five rows to check the column names and the kind of values they hold.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [68]:
# Summary statistics per column. The features are on very different scales
# (e.g. Insulin goes up to 846 while DiabetesPedigreeFunction stays below 2.5),
# so they are standardized with StandardScaler before training.
# NOTE(review): the minimum is 0 for Glucose, BloodPressure, SkinThickness, Insulin and
# BMI -- these zeros presumably stand in for missing measurements; confirm before
# relying on the data for anything beyond testing the model.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [69]:
# Column dtypes and non-null counts: every column reports 768 non-null entries, so there
# are no NaNs to impute.
# NOTE(review): "no NaNs" is not the same as "no missing data" -- the describe() output
# shows a minimum of 0 for several measurements, which presumably are placeholders for
# missing values. They are left untouched here because the goal is only to test the model.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [70]:
# (rows, columns): 768 rows and 9 columns (8 features plus the Outcome target).
df.shape

(768, 9)

In [71]:
# Class balance of the target: 500 rows with Outcome 0 versus 268 with Outcome 1.
# Always predicting 0 would therefore already reach about 65% accuracy (500 / 768),
# which is the baseline any accuracy reported later should be compared against.
df['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [72]:
# Separate the feature matrix (every column except Outcome) from the target vector.
X = df.drop(columns=["Outcome"])
y = df["Outcome"]


In [73]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split
# is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply that same
# transformation to both splits so that no test-set information leaks into training.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)



In [74]:
# Train the from-scratch model on the standardized training split and report its
# accuracy on the held-out test split.
model = Logistic_Regression(
    no_iterations=10000,
    alpha=0.01,
    lambda_=0.01,
)
model.fit(x_train_scaled, y_train)
score = model.score(x_test_scaled, y_test)
print(f"Accuracy of the model is: {score}")

Accuracy of the model is: 0.7532467532467533


In [75]:
# Compare against scikit-learn's LogisticRegression as a reference implementation
# (imported with the other sklearn imports at the top of the notebook).
# sklearn's C is the *inverse* of the L2 regularization strength, so C = 1 / 0.01
# corresponds to the lambda_=0.01 penalty given to the from-scratch model.
lr = LogisticRegression(max_iter=10000, C=1 / 0.01)
lr.fit(x_train_scaled, y_train)
print("Sklearn Accuracy: ", lr.score(x_test_scaled, y_test))


Sklearn Accuracy:  0.7532467532467533


In [76]:
# Put both weight vectors side by side and measure their Euclidean (L2) distance.
# lr.coef_ has shape (1, n_features), so it is flattened to match model.w.
sklearn_w = lr.coef_.ravel()
print("Your model weights:", model.w)
print("Sklearn weights:", lr.coef_)
print("Difference:", np.linalg.norm(model.w - sklearn_w))


Your model weights: [ 0.2137474   1.09058711 -0.25644195  0.04793187 -0.21009286  0.79323858
  0.23370411  0.42604765]
Sklearn weights: [[ 0.21365093  1.09167244 -0.25679729  0.04856644 -0.21080438  0.79349282
   0.23373964  0.42647036]]
Difference: 0.001570674681303678


In [77]:
# The L2 distance between this model's weights and sklearn's weights is very small
# (about 0.0016), and both models reach the same test accuracy (0.7532).
# On this dataset the from-scratch implementation therefore behaves almost the same as sklearn's.
# This is good evidence that the logistic regression code is implemented correctly.
