# In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes

# 1. Load a real dataset (scikit-learn's diabetes data)
data = load_diabetes()

# Only one feature is kept so the fit can be drawn on a 2-D plot: column 2,
# which this script treats as BMI. The 2:3 slice keeps the column axis, so X
# is a single-column 2-D array.
X = data.data[:, 2:3]

# Give the targets a trailing axis of length 1 as well, so X and y line up
# row by row in the matrix algebra below.
y = data.target[:, None]

# 2. Define the Model (ordinary least squares; theta solves X^T X theta = X^T y)
def train_model(X_train, y_train):
    """Fit a linear model with an intercept by ordinary least squares.

    Args:
        X_train: Feature array with one row per sample.
        y_train: Target array with one row per sample.

    Returns:
        The fitted weights. The first row is the intercept (it multiplies
        the prepended column of ones); the remaining rows are the
        coefficients of the columns of ``X_train``, in order.
    """
    # Add a column of ones to X for the intercept (bias term)
    X_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]

    # Solve min ||X_b @ w - y|| on the design matrix itself. Explicitly
    # forming X_b.T @ X_b squares the condition number and throws away
    # precision; lstsq yields the same least-squares (minimum-norm) solution
    # without that intermediate product.
    weights, *_ = np.linalg.lstsq(X_b, y_train, rcond=None)
    return weights

def predict(X, weights):
    """Evaluate the linear model described by ``weights`` on ``X``.

    A leading column of ones is prepended to ``X`` so that the first row of
    ``weights`` acts as the intercept; the result is the matrix product of
    that augmented array with ``weights``.
    """
    n_rows = X.shape[0]
    # The 1-D vector of ones becomes the first column of the design matrix.
    design = np.column_stack((np.ones(n_rows), X))
    return design @ weights

def mse(y_true, y_pred):
    """Return the mean of the squared element-wise differences."""
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))

# 3. K-Fold Cross-Validation
def k_fold_cv(X, y, k=5):
    """Score the linear model with shuffled k-fold cross-validation.

    The sample indices are shuffled (after seeding NumPy's global random
    generator with 42, so the split is the same on every call) and divided
    into ``k`` nearly equal folds. Each fold is held out once as the test
    set while the model is trained on the remaining folds. A header and one
    summary line per fold are printed.

    Args:
        X: Feature array, one row per sample; indexed with integer arrays.
        y: Target array aligned row by row with ``X``. The per-fold summary
            reads the weights as ``weights[0][0]`` / ``weights[1][0]``, so
            the fitted weights must be two-dimensional.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.

    Returns:
        A tuple ``(mse_scores, weights)``: the list of per-fold test MSE
        values, and the weights fitted in the last fold.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the number of
            samples.
    """
    n_samples = len(X)

    # With k < 2 there is no training data left after holding out a fold, and
    # with k > n_samples some folds are empty and their MSE would be NaN.
    if not 2 <= k <= n_samples:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n_samples}), "
            f"got {k}"
        )

    indices = np.arange(n_samples)
    # NOTE: this reseeds NumPy's global RNG, which also affects any later
    # np.random calls made by the caller.
    np.random.seed(42)
    np.random.shuffle(indices)

    folds = np.array_split(indices, k)
    mse_scores = []

    print(f"Evaluating BMI vs Disease Progression ({k}-Folds)")
    print("-" * 45)

    for i, test_idx in enumerate(folds):
        # Everything except the held-out fold is used for training.
        train_idx = np.concatenate(folds[:i] + folds[i + 1:])

        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        # Train and Predict
        weights = train_model(X_train, y_train)
        y_pred = predict(X_test, weights)

        # Score
        score = mse(y_test, y_pred)
        mse_scores.append(score)

        # Adjacent f-strings are joined at compile time; a single-quoted
        # f-string literal may not itself span two physical lines.
        print(
            f"Fold {i+1} | Intercept: {weights[0][0]:.2f} "
            f"| Slope: {weights[1][0]:.2f} | MSE: {score:.2f}"
        )

    return mse_scores, weights  # Returning last weights for visualization

# 4. Run and Visualize
scores, final_weights = k_fold_cv(X, y, k=5)

# Summarise the per-fold scores; the mean is computed once and reused.
mean_mse = np.mean(scores)
print("-" * 45)
print(f"Average MSE: {mean_mse:.2f}")
print(f"Root MSE: {np.sqrt(mean_mse):.2f} (Average error in disease units)")

# Plot the model fitted in the final fold against every sample
fitted = predict(X, final_weights)
plt.scatter(X, y, alpha=0.5, label="Actual Data")
plt.plot(X, fitted, color='red', label="Final Model Line")
plt.title("Linear Regression: BMI vs Diabetes Progression")
plt.xlabel("BMI (Normalized)")
plt.ylabel("Disease Progression")
plt.legend()
plt.show()