In [10]:
from sklearn.datasets import load_iris  # -> Classic dataset of iris flowers (includes features like sepal length, sepal width, petal length, petal width, and their species)
from sklearn.model_selection import train_test_split  # -> Splits the dataset into training and testing sets
from sklearn.preprocessing import StandardScaler  
# StandardScaler:
# - A preprocessing tool that standardizes features by removing the mean and scaling to unit variance.
# - Formula: z = (x - mean) / standard_deviation
# - After scaling:
#     → Each feature has mean = 0
#     → Each feature has standard deviation = 1
# - Useful for algorithms that are sensitive to feature scale,
#   e.g., KNN, SVM, Logistic Regression, PCA, Neural Networks.
# - Prevents features with larger numerical ranges (e.g., cm vs mm) from dominating the model.

In [11]:
X, y = load_iris(return_X_y=True)  
# - load_iris(return_X_y=True) → directly returns:
#     X → features (150 samples × 4 features)
#     y → labels (species encoded as 0, 1, 2)

X  
# If you display X:
# - The first feature (sepal length) has values around 5.1, 4.9, 4.7, ... (range ~4.3 to 7.9)
# - The last feature (petal width) has smaller values like 0.2, 0.4, ... up to ~2.5
#
# Problem:
# - Features are on very different scales:
#     Sepal length → 4.3 to 7.9  (range ~3.6)
#     Petal width  → 0.1 to 2.5  (range ~2.4)
#
# - Algorithms like KNN (k-nearest neighbors) use distance calculations (e.g., Euclidean distance).
#   → If one feature has larger numerical values, it dominates the distance metric.
#   → Example: differences of 3 in sepal length outweigh differences of 0.2 in petal width,
#     even if petal width is more important biologically.

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [12]:
scaler = StandardScaler()  
# Create a StandardScaler object → used to compute mean & std and apply scaling.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  
# Split the dataset into training (80%) and testing (20%).

# ---- TRAINING DATA SCALING ----
X_train_scaled = scaler.fit_transform(X_train)  
# .fit()      → Calculates the mean and standard deviation for EACH feature in the training set.
# .transform()→ Uses those values (mean, std) to scale the training data.
# .fit_transform() = .fit() + .transform() in one step.

# ---- TEST DATA SCALING ----
X_test_scaled = scaler.transform(X_test)  
# Here we ONLY use .transform():
# - Because we must apply the SAME scaling (same mean and std) from the training set.
# - We do NOT call .fit() again on test data, otherwise:
#     → we’d compute new mean/std from test set
#     → this would cause "data leakage" (information from test set leaking into training).
# - Using transform only ensures that the test data is scaled consistently with training.

# ---- Why not just "scaler.X_train"? ----
# - StandardScaler does not automatically scale data just by passing X_train.
# - You must explicitly call .fit() and .transform().
# - Otherwise, X_train would stay unchanged (raw values, unscaled).

In [13]:
import numpy as np  # -> NumPy is a library for numerical operations (arrays, math, stats, etc.)

# Standardization (manual way without StandardScaler):
# Formula: (X - mean) / std
# This centers the data (mean=0) and scales it (std=1)

(X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)

# np.mean(X_train, axis=0) -> compute mean of each column (feature) across all rows (samples)
# np.std(X_train, axis=0)  -> compute standard deviation of each column (feature)

# X_train - mean -> subtract column mean from each value (centering step → mean becomes 0)
# / std           -> divide each value by column std (scaling step → std becomes 1)

# Result:
# Each feature column in X_train will now have mean = 0 and std = 1
# Exactly what StandardScaler.fit_transform(X_train) does under the hood

array([[-0.59743525, -0.19998266,  0.40732476,  0.37133235],
       [-1.09357525,  0.96496487, -1.24476168, -0.81167337],
       [-1.34164525, -0.19998266, -1.35869868, -1.20600861],
       [-0.22533025, -1.36493018,  0.69216725,  1.02855775],
       [-0.47340025,  2.5958914 , -1.35869868, -1.33745369],
       [-1.21761025,  0.03300685, -1.30173018, -1.46889877],
       [-0.96954025,  0.49898586, -1.18779318, -0.94311845],
       [ 1.63519475,  1.19795437,  1.31882073,  1.68578316],
       [-1.09357525,  0.96496487, -1.41566718, -1.20600861],
       [ 0.51887975, -1.36493018,  0.63519875,  0.37133235],
       [ 0.39484475, -2.0638987 ,  0.40732476,  0.37133235],
       [-0.47340025, -1.59791969,  0.00854527, -0.15444797],
       [ 0.51887975,  0.49898586,  0.52126176,  0.50277743],
       [-1.09357525, -0.19998266, -1.24476168, -1.33745369],
       [ 0.64291475,  0.26599635,  0.86307275,  1.422893  ],
       [-1.83778524, -0.19998266, -1.41566718, -1.33745369],
       [-1.21761025,  1.

In [14]:
from sklearn.preprocessing import MinMaxScaler  
# MinMaxScaler:
# - A preprocessing tool that rescales features to a specific range (default: [0, 1]).
# - Formula: X_scaled = (X - X_min) / (X_max - X_min)
# - Unlike StandardScaler (which centers data to mean=0, std=1),
#   MinMaxScaler keeps the original shape of the distribution but squeezes values into the chosen range.
# - Useful when:
#     * You want all features strictly within [0, 1] (or another range you set).
#     * Algorithms sensitive to absolute value ranges (e.g., Neural Networks, KNN, distance-based methods).
# - Example:
#     Original feature values: [2, 4, 6, 8, 10]
#     After MinMaxScaler([0,1]): [0, 0.25, 0.5, 0.75, 1]

In [20]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled= scaler.transform(X_test)

X_train_scaled

array([[0.28571429, 0.36363636, 0.59322034, 0.58333333],
       [0.17142857, 0.59090909, 0.10169492, 0.20833333],
       [0.11428571, 0.36363636, 0.06779661, 0.08333333],
       [0.37142857, 0.13636364, 0.6779661 , 0.79166667],
       [0.31428571, 0.90909091, 0.06779661, 0.04166667],
       [0.14285714, 0.40909091, 0.08474576, 0.        ],
       [0.2       , 0.5       , 0.11864407, 0.16666667],
       [0.8       , 0.63636364, 0.86440678, 1.        ],
       [0.17142857, 0.59090909, 0.05084746, 0.08333333],
       [0.54285714, 0.13636364, 0.66101695, 0.58333333],
       [0.51428571, 0.        , 0.59322034, 0.58333333],
       [0.31428571, 0.09090909, 0.47457627, 0.41666667],
       [0.54285714, 0.5       , 0.62711864, 0.625     ],
       [0.17142857, 0.36363636, 0.10169492, 0.04166667],
       [0.57142857, 0.45454545, 0.72881356, 0.91666667],
       [0.        , 0.36363636, 0.05084746, 0.04166667],
       [0.14285714, 0.63636364, 0.06779661, 0.        ],
       [0.4       , 0.22727273,

In [19]:
# The Manual Idea of Above Methods.

X_min = np.min(X_train, axis=0)
X_max = np.max(X_test, axis=0)

(X_train - X_min) / (X_max - X_min)

array([[0.3030303 , 0.44444444, 0.61403509, 0.60869565],
       [0.18181818, 0.72222222, 0.10526316, 0.2173913 ],
       [0.12121212, 0.44444444, 0.07017544, 0.08695652],
       [0.39393939, 0.16666667, 0.70175439, 0.82608696],
       [0.33333333, 1.11111111, 0.07017544, 0.04347826],
       [0.15151515, 0.5       , 0.0877193 , 0.        ],
       [0.21212121, 0.61111111, 0.12280702, 0.17391304],
       [0.84848485, 0.77777778, 0.89473684, 1.04347826],
       [0.18181818, 0.72222222, 0.05263158, 0.08695652],
       [0.57575758, 0.16666667, 0.68421053, 0.60869565],
       [0.54545455, 0.        , 0.61403509, 0.60869565],
       [0.33333333, 0.11111111, 0.49122807, 0.43478261],
       [0.57575758, 0.61111111, 0.64912281, 0.65217391],
       [0.18181818, 0.44444444, 0.10526316, 0.04347826],
       [0.60606061, 0.55555556, 0.75438596, 0.95652174],
       [0.        , 0.44444444, 0.05263158, 0.04347826],
       [0.15151515, 0.77777778, 0.07017544, 0.        ],
       [0.42424242, 0.27777778,