In [1]:
# logistic regression from scratch

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00537/sobar-72.csv')
df

Unnamed: 0,behavior_sexualRisk,behavior_eating,behavior_personalHygine,intention_aggregation,intention_commitment,attitude_consistency,attitude_spontaneity,norm_significantPerson,norm_fulfillment,perception_vulnerability,perception_severity,motivation_strength,motivation_willingness,socialSupport_emotionality,socialSupport_appreciation,socialSupport_instrumental,empowerment_knowledge,empowerment_abilities,empowerment_desires,ca_cervix
0,10,13,12,4,7,9,10,1,8,7,3,14,8,5,7,12,12,11,8,1
1,10,11,11,10,14,7,7,5,5,4,2,15,13,7,6,5,5,4,4,1
2,10,15,3,2,14,8,10,1,4,7,2,7,3,3,6,11,3,3,15,1
3,10,11,10,10,15,7,7,1,5,4,2,15,13,7,4,4,4,4,4,1
4,8,11,7,8,10,7,8,1,5,3,2,15,5,3,6,12,5,4,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,10,14,14,10,15,6,7,5,15,14,10,15,13,9,8,12,12,11,9,0
68,10,12,15,10,15,8,8,5,15,14,8,12,14,11,7,13,15,11,14,0
69,10,8,11,6,10,6,4,3,13,9,8,14,12,9,7,11,12,10,10,0
70,9,12,13,10,13,6,6,5,14,13,10,13,12,11,8,12,11,13,15,0


In [3]:
X= df.iloc[:,0:19].values
Y= df.iloc[:,19].values

In [4]:
# train the model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 1/10, random_state = 42)

In [5]:
#Sigmoid Function
#def sigmoid(Z):

def sigmoid(z):
  return 1.0/(1 + np.exp(-z))

In [6]:
def loss(y, y_hat):
  loss = -np.mean(y*np.log(y_hat) - (1 - y)*np.log(1 - y_hat))
  return loss

In [7]:
def gradients(X, y, y_hat):
    m = X.shape[0]
    dw = (1/m)*np.dot(X.T, (y_hat - y))
    db = (1/m)*np.sum((y_hat - y)) 
    return dw, db

In [8]:
def normalize(X):
    m, n = X.shape
    for i in range(n):
        X = (X - X.mean(axis=0))/X.std(axis=0)       
    return X

In [9]:
def train(X, y, bs, epochs, lr):
    m, n = X.shape
    w = np.random.uniform(size=(n,1))
    b = 0
    y = y.reshape(m,1)
    x = normalize(X)
    losses = []
    for epoch in range(epochs):
        for i in range((m-1)//bs + 1):
            start_i = i*bs
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]
            y_hat = sigmoid(np.dot(xb, w) + b)
            dw, db = gradients(xb, yb, y_hat)
            w -= lr*dw
            b -= lr*db
        l = loss(y, sigmoid(np.dot(X, w) + b))
        losses.append(l)
    return w, b, losses

In [10]:
def predict(X, w, b):
    x = normalize(X)
    preds = sigmoid(np.dot(X, w) + b)
    pred_class = []
    pred_class = [1 if i > 0.5 else 0 for i in preds]
    return np.array(pred_class)

In [11]:
def accuracy(y, y_hat):
    accuracy = np.sum(y == y_hat) / len(y)
    return accuracy

In [12]:
w, b, l = train(X_train, y_train, bs=1, epochs=1000, lr=0.0001)
print("The accuracy of the model is :",accuracy(y_test, y_hat=predict(X_test,w,b))*100,"%")

  loss = -np.mean(y*np.log(y_hat) - (1 - y)*np.log(1 - y_hat))
  loss = -np.mean(y*np.log(y_hat) - (1 - y)*np.log(1 - y_hat))


The accuracy of the model is : 75.0 %


In [13]:
y_test

array([1, 0, 1, 1, 0, 0, 1, 0], dtype=int64)

In [14]:
predict(X_test,w,b)

array([1, 0, 0, 0, 0, 0, 1, 0])

In [15]:
# using standard library
from sklearn.linear_model import LogisticRegression

In [57]:
#standard library logistic regressin with SDG model

In [16]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

classifier = SGDClassifier(loss='log', max_iter=1000)
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)

actual_accuracy = accuracy_score(np.array(y_test),np.array(y_pred),normalize=True)
conf_mat = confusion_matrix(y_test,y_pred)

print("Confusion Matrix:\n", conf_mat)

Confusion Matrix:
 [[4 0]
 [1 3]]


In [17]:
print(y_pred)

[1 0 1 0 0 0 1 0]


In [18]:
print("The accuracy of the model is :",actual_accuracy*100,"%")

The accuracy of the model is : 87.5 %
