In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the Kaggle diabetes dataset from a local CSV copy.
# Source: https://www.kaggle.com/datasets/mathchi/diabetes-data-set
data_path = "kaggle-dataset/diabetes.csv"
df = pd.read_csv(data_path)

# Only the last expression of a cell is rendered automatically; a bare
# `df.head()` in the middle of the cell is evaluated and thrown away, so the
# preview has to be printed explicitly.
print(df.head())
df.info()
print(df.describe(include="all"))

<class 'pandas.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31

In [3]:
# separate the predictors from the label column
X = df.drop(columns="Outcome").to_numpy()
y = df["Outcome"].to_numpy().reshape(-1, 1)   # labels kept as a column vector

# hold out 20% of the rows; the fixed seed and stratification keep the
# split repeatable and class-balanced
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# standardise features: statistics are learned from the training rows only
# and then applied unchanged to the test rows
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print(X_train.shape, y_train.shape)

(614, 8) (614, 1)


In [4]:
# logistic regression

def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z)).

    The input is clamped to [-500, 500] before exponentiation so that
    np.exp cannot overflow for extreme values.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1.0 + np.exp(-bounded)
    return 1.0 / denominator

def compute_loss_and_gradients(X_batch, y_batch, w, b, l2_lambda=0.0):
    """
    Binary cross-entropy loss and gradients for one mini-batch.

    Parameters
    ----------
    X_batch : array of shape (m, n_features)
        Feature rows of the mini-batch.
    y_batch : array with m elements
        Binary labels. Any layout holding exactly m values is accepted
        (e.g. (m,) or (m, 1)); it is reshaped to match the logits.
    w : array of shape (n_features, 1)
        Weight vector.
    b : float
        Bias term.
    l2_lambda : float, default 0.0
        Strength of the L2 penalty applied to `w` (never to `b`).

    Returns
    -------
    loss : float
        Mean cross-entropy over the batch, plus the L2 penalty if enabled.
    dw : array with the same shape as `X_batch.T @ dz`
        Gradient of the loss with respect to `w`.
    db : float
        Gradient of the loss with respect to `b`.
    """
    m = X_batch.shape[0]

    # forward
    z = X_batch @ w + b          # shape (m, 1)
    y_hat = sigmoid(z)           # shape (m, 1)

    # Align the labels with the logits. Without this, a 1-D label vector
    # (m,) combined with (m, 1) predictions would broadcast to (m, m) and
    # silently produce a wrong loss and wrong gradients.
    y_batch = np.asarray(y_batch).reshape(z.shape)

    # numerical stability: keep probabilities away from 0 and 1 so the
    # logarithms below stay finite
    eps = 1e-15
    y_hat_clipped = np.clip(y_hat, eps, 1 - eps)

    # binary cross entropy
    loss = - (1.0 / m) * np.sum(
        y_batch * np.log(y_hat_clipped) + (1 - y_batch) * np.log(1 - y_hat_clipped)
    )

    # add L2 penalty on w (not b)
    if l2_lambda > 0:
        loss += (l2_lambda / (2 * m)) * np.sum(w ** 2)

    # gradients (computed from the unclipped predictions)
    dz = (y_hat - y_batch)       # shape (m, 1)
    dw = (1.0 / m) * (X_batch.T @ dz)   # shape (n_features, 1)
    db = (1.0 / m) * np.sum(dz)

    # gradient of L2 term
    if l2_lambda > 0:
        dw += (l2_lambda / m) * w

    return loss, dw, db


In [5]:
def logistic_regression_sgd(
    X_train,
    y_train,
    lr=0.01,
    n_epochs=100,
    batch_size=32,
    l2_lambda=0.0,
    verbose=True,
    random_state=None,
):
    """
    Fit a logistic regression model with mini-batch stochastic gradient descent.

    Parameters
    ----------
    X_train : array-like of shape (m, n_features)
        Training features.
    y_train : array-like with m elements
        Binary labels; reshaped internally to a column vector (m, 1).
    lr : float, default 0.01
        Learning rate used for every parameter update.
    n_epochs : int, default 100
        Number of full passes over the training data.
    batch_size : int, default 32
        Number of rows per mini-batch (the last batch of an epoch may be smaller).
    l2_lambda : float, default 0.0
        L2 penalty strength forwarded to `compute_loss_and_gradients`.
    verbose : bool, default True
        When true, print the average loss every 10 epochs.
    random_state : int or None, default None
        Seed for the per-epoch shuffling. With None the global `np.random`
        state is used (so `np.random.seed` still controls the run); with a
        value, a dedicated generator makes the run reproducible on its own.

    Returns
    -------
    w : ndarray of shape (n_features, 1)
        Learned weights.
    b : float
        Learned bias.
    loss_history : list of float
        Unweighted mean of the mini-batch losses for each epoch.

    Raises
    ------
    ValueError
        If the inputs are empty, have inconsistent lengths, or
        `batch_size` is smaller than 1.
    """
    X_train = np.asarray(X_train)
    m, n = X_train.shape

    # Force labels into a column so they line up with the (m, 1) logits;
    # a flat (m,) vector would otherwise broadcast to (m, m) downstream.
    y_train = np.asarray(y_train).reshape(-1, 1)

    if m == 0:
        raise ValueError("X_train must contain at least one sample")
    if y_train.shape[0] != m:
        raise ValueError(
            f"X_train has {m} samples but y_train has {y_train.shape[0]}"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Both the np.random module and a Generator expose .permutation, so the
    # loop below does not need to care which one it got.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # initialize parameters
    w = np.zeros((n, 1))
    b = 0.0

    loss_history = []
    n_batches = int(np.ceil(m / batch_size))  # same for every epoch

    for epoch in range(n_epochs):
        # reshuffle the rows so each epoch sees different mini-batches
        indices = rng.permutation(m)
        X_shuffled = X_train[indices]
        y_shuffled = y_train[indices]

        epoch_loss = 0.0

        for i in range(0, m, batch_size):
            X_batch = X_shuffled[i:i+batch_size]
            y_batch = y_shuffled[i:i+batch_size]

            loss, dw, db = compute_loss_and_gradients(X_batch, y_batch, w, b, l2_lambda)

            # SGD update
            w -= lr * dw
            b -= lr * db

            epoch_loss += loss

        avg_epoch_loss = epoch_loss / n_batches
        loss_history.append(avg_epoch_loss)

        if verbose and (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{n_epochs}, loss = {avg_epoch_loss:.4f}")

    return w, b, loss_history


In [6]:
# The trainer shuffles the rows with NumPy's global random state every epoch.
# Seeding it here makes the loss curve, the learned parameters and every
# metric computed from them repeatable across Restart & Run All.
np.random.seed(42)

w, b, loss_history = logistic_regression_sgd(
    X_train,
    y_train,
    lr=0.01,
    n_epochs=200,
    batch_size=32,
    l2_lambda=0.001,
    verbose=True
)
print("Final bias:", b)
print("Final weights shape:", w.shape)


Epoch 10/200, loss = 0.5565
Epoch 20/200, loss = 0.5158
Epoch 30/200, loss = 0.4980
Epoch 40/200, loss = 0.4819
Epoch 50/200, loss = 0.4968
Epoch 60/200, loss = 0.4709
Epoch 70/200, loss = 0.4743
Epoch 80/200, loss = 0.4653
Epoch 90/200, loss = 0.4703
Epoch 100/200, loss = 0.4709
Epoch 110/200, loss = 0.4607
Epoch 120/200, loss = 0.4597
Epoch 130/200, loss = 0.4836
Epoch 140/200, loss = 0.4643
Epoch 150/200, loss = 0.4681
Epoch 160/200, loss = 0.4679
Epoch 170/200, loss = 0.4726
Epoch 180/200, loss = 0.4643
Epoch 190/200, loss = 0.4691
Epoch 200/200, loss = 0.4606
Final bias: -0.866097782111814
Final weights shape: (8, 1)


In [7]:
def predict_proba(X, w, b):
    """Return the sigmoid of the linear score X @ w + b for every row of X."""
    return sigmoid(X @ w + b)

def predict_classes(X, w, b, threshold=0.5):
    """Turn predicted probabilities into 0/1 labels.

    A row is labelled 1 when its probability is greater than or equal to
    `threshold`, otherwise 0.
    """
    is_positive = predict_proba(X, w, b) >= threshold
    return is_positive.astype(int)


In [8]:
# hard 0/1 predictions for both splits, using the trained w and b
y_train_pred, y_test_pred = (
    predict_classes(features, w, b) for features in (X_train, X_test)
)

# accuracy
def accuracy(y_true, y_pred):
    """Fraction of predictions that equal the true labels.

    Both inputs are flattened before comparison so that a column vector
    (n, 1) and a flat vector (n,) are compared element by element. Without
    this, NumPy would broadcast the pair to an (n, n) matrix and the mean
    of that matrix would be returned as a (wrong) accuracy.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true has {y_true.size} labels but y_pred has {y_pred.size}"
        )
    return np.mean(y_true == y_pred)

# score each split against its own ground truth
train_acc, test_acc = (
    accuracy(truth, guess)
    for truth, guess in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f"Train accuracy: {train_acc:.4f}")
print(f"Test accuracy: {test_acc:.4f}")


Train accuracy: 0.7932
Test accuracy: 0.7143


In [9]:
# side-by-side table of true and predicted test labels (one row per sample)
results_df = pd.DataFrame(
    dict(Actual=y_test.flatten(), Predicted=y_test_pred.flatten())
)
results_df.head(20)


Unnamed: 0,Actual,Predicted
0,0,1
1,0,0
2,0,0
3,1,0
4,0,0
5,0,0
6,1,0
7,1,1
8,0,0
9,0,1


In [10]:
# confusion matrix: rows are the true classes, columns the predicted ones
confusion = pd.crosstab(
    results_df["Actual"],
    results_df["Predicted"],
    rownames=["Actual"],
    colnames=["Predicted"],
)
confusion


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,82,18
1,26,28
