In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [49]:
# Source of the data set: a tab-separated text file fetched over HTTP.
url = "https://online.stat.psu.edu/onlinecourses/sites/stat501/files/data/leukemia_remission.txt"
# `delimiter` is pandas' alias for `sep`; the file is parsed with tab as the field separator.
df = pd.read_csv(url, delimiter='\t')

In [50]:
# Summary statistics (count, mean, std, quartiles) for every numeric column,
# shown as the cell's output.
df.describe()

Unnamed: 0,REMISS,CELL,SMEAR,INFIL,LI,BLAST,TEMP
count,27.0,27.0,27.0,27.0,27.0,27.0,27.0
mean,0.333333,0.881481,0.635185,0.570741,1.003704,0.688519,0.997407
std,0.480384,0.186645,0.214052,0.237567,0.467795,0.53496,0.01483
min,0.0,0.2,0.32,0.08,0.4,0.0,0.98
25%,0.0,0.825,0.43,0.335,0.65,0.23,0.99
50%,0.0,0.95,0.65,0.63,0.9,0.52,0.99
75%,1.0,1.0,0.835,0.74,1.25,1.06,1.005
max,1.0,1.0,0.97,0.92,1.9,2.06,1.04


In [51]:
# Separate the REMISS target column from the remaining feature columns.
y = df['REMISS'].to_numpy()
X = df.drop(columns='REMISS').to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified and the data set has only 27 rows,
# so the class balance of the test set is left to chance — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Fit the scaler on the training rows only, then apply that same transform to
# both splits so no test-set statistics enter the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [52]:
# 1 Scikit-learn
# Reference model: L2-penalised logistic regression (C=1.0) fitted on the
# standardised training rows and scored on the standardised test rows.
sk_model = LogisticRegression(C=1.0, penalty='l2').fit(X_train_scaled, y_train)
y_pred_sk = sk_model.predict(X_test_scaled)
acc_sk = accuracy_score(y_true=y_test, y_pred=y_pred_sk)


In [53]:
# 2 Batch Gradient Descent
# Prepend a column of ones to each design matrix so the first parameter
# acts as the bias (intercept) term.
X_train_b = np.hstack([np.ones((X_train_scaled.shape[0], 1)), X_train_scaled])
X_test_b = np.hstack([np.ones((X_test_scaled.shape[0], 1)), X_test_scaled])

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The naive formula overflows in ``exp(-z)`` when ``z`` is a large negative
    number (emitting a RuntimeWarning). This version only ever evaluates
    ``exp(-|z|)``, which lies in (0, 1], so it cannot overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); converted to a float array.

    Returns
    -------
    numpy.ndarray
        Values in [0, 1] with the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z))           (same expression as the naive form)
    # z <  0: exp(z) / (1 + exp(z))       (algebraically equal, overflow-free)
    return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

m = X_train_b.shape[0]  # number of training rows
eta = 0.1  # learning rate
n_iterations = 1000
theta = np.zeros(X_train_b.shape[1])  # all parameters start at 0

for iteration in range(n_iterations):
    # 1. Predicted probabilities for every training row
    y_proba = sigmoid(np.dot(X_train_b, theta))
    # 2. Difference between predictions and the true labels
    error = y_proba - y_train
    # 3. Gradient averaged over the whole batch
    gradients = (1/m) * np.dot(X_train_b.T, error)
    # 4. Step the parameters against the gradient
    theta -= eta * gradients

In [54]:

# Score the gradient-descent parameters on the held-out rows: probabilities
# of at least 0.5 are labelled 1, everything else 0.
test_logits = np.dot(X_test_b, theta)
test_proba = sigmoid(test_logits)
is_positive = test_proba >= 0.5
y_pred_manual = is_positive.astype(int)
acc_manual = accuracy_score(y_true=y_test, y_pred=y_pred_manual)

# Report both models' test accuracy side by side.
print(f"Accuracy (Scikit-learn): {acc_sk:.4f}")
print(f"Accuracy (Batch GD): {acc_manual:.4f}")

Accuracy (Scikit-learn): 0.5000
Accuracy (Batch GD): 0.5000


## k-Fold Cross-Validation

In [55]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Scaling happens inside the pipeline so that, in every CV round, the scaler
# is fitted on that round's training folds only. Cross-validating on the
# pre-scaled X_train_scaled would let each validation fold influence the
# mean/std used to scale it (data leakage), so the unscaled X_train is used.
model = make_pipeline(StandardScaler(), LogisticRegression())
# cv=5 splits the training data into 5 folds
scores = cross_val_score(model, X_train, y_train, cv=5)

print(f"Accuracy แต่ละรอบ: {scores}")
print(f"Accuracy เฉลี่ย: {scores.mean():.2f}")

Accuracy แต่ละรอบ: [1.   0.75 0.75 0.75 0.5 ]
Accuracy เฉลี่ย: 0.75
