In [4]:
# fetch_openml → to download datasets from OpenML (example: MNIST digits)
# We'll often use this to get high-dimensional data where PCA is useful.
from sklearn.datasets import fetch_openml

# PCA (Principal Component Analysis) → dimensionality reduction technique
# It helps to reduce features while keeping most of the variance in the data.
from sklearn.decomposition import PCA

# LogisticRegression → a classifier (supervised ML model)
# We’ll later test how PCA-transformed features affect classification accuracy.
from sklearn.linear_model import LogisticRegression

# train_test_split → to split dataset into training and testing sets
# Important for evaluating how well the model generalizes.
from sklearn.model_selection import train_test_split

In [5]:
X, y = fetch_openml('mnist_784', return_X_y = True)
"""
# 'mnist_784' → standard name for MNIST dataset with 784 features.
# Why 784?
# - Each MNIST image is 28 x 28 pixels = 784 pixels total.
# - Each pixel is stored as one feature (value 0–255 for grayscale intensity).
# - So every row in X is a flattened image of length 784.
"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=67)

In [6]:
# Create a PCA object
# -------------------
# n_components=10 → we want to keep only 10 principal components (features).
# - The original data has 784 features (28x28 pixels per image).
# - PCA compresses these 784 features into 10 while retaining as much variance as possible.
# - This is useful to speed up training and reduce noise.
pca = PCA(n_components=10)

# Fit PCA on the training data and transform it
# ---------------------------------------------
# - pca.fit_transform(X_train) learns the principal components from the training set
#   and applies the dimensionality reduction.
# - Result: X_train_reduced → same number of rows (images) but fewer columns (features).
X_train_reduced = pca.fit_transform(X_train)

# Apply the same transformation to the test data
# ----------------------------------------------
# - Important: We only "fit" PCA on the training data.
# - Then we "transform" test data using the same transformatin,
#   so the model doesn’t "cheat" by learning from the test set
X_test_reduced = pca.transform(X_test)

# Print shapes to compare before vs after PCA
print("Original training data shape:", X_train.shape)
# → (n_samples, 784)  because each image is 28x28 pixels flattened

print("Reduced training data shape:", X_train_reduced.shape)
# → (n_samples, 10)   because we kept only 10 principal components

print("Original test data shape:", X_test.shape)
print("Reduced test data shape:", X_test_reduced.shape)

Original training data shape: (56000, 784)
Reduced training data shape: (56000, 10)
Original test data shape: (14000, 784)
Reduced test data shape: (14000, 10)


In [10]:
# Cell: Compare Logistic Regression with and without PCA
# ------------------------------------------------------

# --- Logistic Regression on Original Data (784 features) ---
clf_full = LogisticRegression(max_iter=100)   # model using all pixels
clf_full.fit(X_train, y_train)                # fit on raw features
acc_full = clf_full.score(X_test, y_test)     # evaluate on test data

# --- Logistic Regression on PCA-Reduced Data (10 features) ---
clf_reduced = LogisticRegression(max_iter=100)   # model using compressed features
clf_reduced.fit(X_train_reduced, y_train)        # fit on PCA-transformed data
acc_reduced = clf_reduced.score(X_test_reduced, y_test)   # evaluate on reduced test data

# --- Results ---
print("Logistic Regression (original 784 features):", acc_full)
print("Logistic Regression (PCA-reduced 10 features):", acc_reduced)

"""
Explanation:
------------
- acc_full    → accuracy when model sees ALL 784 pixel values.
- acc_reduced → accuracy when model only sees 10 compressed PCA features.

Key takeaway:
- Full features = higher accuracy (more info).
- PCA features = faster training, smaller data, but slight accuracy drop.
- PCA is a trade-off between efficiency and performance.
"""

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression (original 784 features): 0.9207142857142857
Logistic Regression (PCA-reduced 10 features): 0.7993571428571429


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'\nExplanation:\n------------\n- acc_full    → accuracy when model sees ALL 784 pixel values.\n- acc_reduced → accuracy when model only sees 10 compressed PCA features.\n\nKey takeaway:\n- Full features = higher accuracy (more info).\n- PCA features = faster training, smaller data, but slight accuracy drop.\n- PCA is a trade-off between efficiency and performance.\n'

In [14]:
# PCA Variance Explained
import numpy as np
print("Explained variance ratio per component:\n", pca.explained_variance_ratio_)

print("\nTotal variance retained (sum):", np.sum(pca.explained_variance_ratio_))

Explained variance ratio per component:
 [0.0974149  0.07158571 0.06143781 0.05414128 0.0488586  0.04299859
 0.03284287 0.02906657 0.02762179 0.0233081 ]

Total variance retained (sum): 0.48927621135911376
