# Imports

In [43]:
import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.datasets import load_iris, load_digits
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- must run before IterativeImputer is imported
from sklearn.feature_selection import RFECV
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Intro

This notebook provides notes on common classes and methods used in scikit-learn. It also answers questions about concepts I had trouble understanding.

In [44]:
# Iris as a plain (feature matrix, target vector) pair.
X, y = load_iris(return_X_y=True)

# Seeded generator so the 80/20 split is the same on every fresh run.
rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rng,
)

# Full dataset shape first, then the training portion.
print(X.shape, X_train.shape, sep='\n')

(150, 4)
(120, 4)


## Estimators

**Definition:** An estimator is any object in scikit-learn that learns from data by calling `fit()`.

*   It can be a transformer (e.g., `StandardScaler`), a predictor (e.g., `LogisticRegression`), or both.
*   Essentially, if an object has a `fit()` method, it's an estimator.

In [45]:
# Both objects expose fit(), which is all it takes to be an estimator.
scaler, model = StandardScaler(), LogisticRegression()

# ✅ Works because StandardScaler is an estimator (fit() on the features alone)
scaler.fit(X_train)
# ✅ Works because LogisticRegression is an estimator (fit() on features + labels)
model.fit(X_train, y_train)

### Transformer

**Definition:** A transformer is a special type of estimator that modifies data by calling `transform()`.

*   Transformers are used for preprocessing (scaling, encoding, feature selection, etc.).
*   They learn something from `fit()` and apply it with `transform()`.
*   Many transformers also support `fit_transform()` to do both in one step.

In [46]:
# Three transformers; only the scaler and PCA are exercised below.
scaler, encoder, pca = StandardScaler(), OneHotEncoder(), PCA(n_components=2)

# --- StandardScaler ---
# ✅ Two-step form: fit() learns mean & std, transform() applies the scaling.
#    fit() hands back the fitted scaler, so the two calls can be chained.
X_scaled = scaler.fit(X_train).transform(X_train)
# ✅ Shortcut (fit + transform in one call)
X_scaled = scaler.fit_transform(X_train)

# --- PCA ---
# ✅ Two-step form: fit() learns the principal components,
#    transform() applies the dimensionality reduction.
X_pca = pca.fit(X_train).transform(X_train)
# ✅ Shortcut (fit + transform in one call)
X_pca = pca.fit_transform(X_train)

**Rule:** If an object has both `fit()` and `transform()`, it's a transformer.

### Predictor

**Definition:** A predictor is an estimator that makes predictions by calling `predict()`.

*   Predictors are models that take input data and output predictions.
*   Some predictors also support `predict_proba()` for probability estimates.

In [47]:
clf = LogisticRegression()  # Predictor
# Predictor. Seeded: a random forest draws random numbers while fitting, so
# without a fixed random_state the probabilities below change from run to run.
rf = RandomForestClassifier(random_state=0)

clf.fit(X_train, y_train)  # ✅ Learns from data
y_pred = clf.predict(X_test)  # ✅ Makes predictions (one class label per test row)

rf.fit(X_train, y_train)  # ✅ Learns from data
y_prob = rf.predict_proba(X_test)  # ✅ Outputs class probabilities (one column per class)


**Rule:** If an object has `predict()`, it's a predictor.

# Pipeline

In [48]:
# Digits dataset as arrays, plus a DataFrame copy (pixel columns + label) for inspection.
X, y = load_digits(return_X_y=True)
df = pd.DataFrame(X).assign(target_digit=y)

# Seeded 80/20 hold-out split.
rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=rng
)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# Sanity check: does any test row also appear verbatim in the training set?
duplicates = []
for row in X_test:
    matches_a_train_row = np.all(row == X_train, axis=1)
    duplicates.append(np.any(matches_a_train_row))
print(np.any(duplicates))  # Should be False

# Scale the pixels first, then hand them to the SGD classifier.
scaling_step = ('scaler', StandardScaler())
model_step = ('model', SGDClassifier(random_state=rng))
pipe = Pipeline([scaling_step, model_step])

# A single grid: every penalty is crossed with every alpha.
param_grid = [
    {
        'model__penalty': ['l2', 'l1', 'elasticnet'],
        'model__alpha': [0.0001, 0.00003, 0.00005, 0.00007],
    }
]

# Alternative without tuning: pipe.fit(X_train, y_train) followed by
# classification_report on the train and test predictions.
# Here the pipeline is tuned with a 30-fold cross-validated grid search instead.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=30, n_jobs=-1)
grid.fit(X_train, y_train)

print(f"\nBest Estimator:\n{grid.best_estimator_}\nBest Params:\n{grid.best_params_} \nBest score:\n{grid.best_score_}\n\nCV Results\n\n{pd.DataFrame(grid.cv_results_)}")
df


(1437, 64) (360, 64)
(1437,) (360,)
False

Best Estimator:
Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 SGDClassifier(alpha=3e-05, penalty='l1',
                               random_state=RandomState(MT19937) at 0x19CBAB0B040))])
Best Params:
{'model__alpha': 3e-05, 'model__penalty': 'l1'} 
Best score:
0.9547724586288416

CV Results

    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.053779      0.007294         0.001734        0.000629   
1        0.132930      0.017408         0.001267        0.000442   
2        0.133864      0.014382         0.001300        0.000458   
3        0.040809      0.005264         0.001267        0.000442   
4        0.102590      0.009971         0.001267        0.000442   
5        0.100923      0.013238         0.001434        0.000496   
6        0.043910      0.005017         0.001300        0.000458   
7        0.106657      0.011577         0.001300        0.000458   
8      

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,target_digit
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0,9
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0,0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0,8
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0,9


In [49]:

# Score the pipeline selected by the grid search on the held-out test split.
best_pipe = grid.best_estimator_
y_pred = best_pipe.predict(X_test)

# Per-class precision / recall / f1 as a printable text table.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.91      0.89      0.90        35
           2       0.94      0.94      0.94        36
           3       0.96      0.93      0.95        29
           4       0.91      0.97      0.94        30
           5       0.93      0.95      0.94        40
           6       1.00      0.95      0.98        44
           7       0.90      0.97      0.94        39
           8       0.87      0.85      0.86        39
           9       0.95      0.93      0.94        41

    accuracy                           0.94       360
   macro avg       0.94      0.94      0.94       360
weighted avg       0.94      0.94      0.94       360



In [50]:
# Re-score the selected pipeline with a plain 5-fold CV on the training split.
cv_score = cross_val_score(
    grid.best_estimator_,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)

print(cv_score)  # one accuracy per fold
print('\nAvg accuracy: \n', cv_score.mean())

[0.94791667 0.95138889 0.96167247 0.94425087 0.96167247]

Avg accuracy: 
 0.9533802748741772


### Pipeline Questions

#### Pipeline Order
**Question**: So when predictions = `pipe.predict(X_test)` is called, is it taking the scaler into account?
- Yes! ✅ When you call: predictions = pipe.predict(X_test), the entire pipeline is applied, meaning both the scaler and the classifier are used.

When you call `pipe.predict(X_test)`, the pipeline follows these steps in order:

1.  **`StandardScaler` transforms `X_test`:**
    The raw test data (`X_test`) is scaled using the `StandardScaler` that was already fitted on `X_train` (it is only transformed here, never re-fitted).

2.  **`SGDClassifier` makes predictions on the scaled data:**
    The classifier (`SGDClassifier`) receives the transformed data and makes predictions.
    


# GridSearchCV

### GridSearchCV Questions

#### Param Grid Syntax
The syntax is:

`<step name>__<parameter>` (the pipeline step's name, two underscores, then the parameter name — e.g. `model__penalty`)

**Question**: What is the difference (using above pipeline's example) between

```
param_grid = [
    {'model__penalty': ['l2', 'l1', 'elasticnet']},
    {'model__alpha': [0.0001, 0.001, 0.01, 0.1]}
]
```

and

```
param_grid = {
    'model__penalty': ['l2', 'l1', 'elasticnet'],
    'model__alpha': [0.0001, 0.001, 0.01, 0.1]
}

```



In [51]:

# Candidate values shared by both variants below.
penalties = ['l2', 'l1', 'elasticnet']
alphas = [0.0001, 0.001, 0.01, 0.1]

# Variant 1 -- a LIST of dictionaries: TWO different grids.
param_grid = [
    {'model__penalty': penalties},  # grid 1
    {'model__alpha': alphas},  # grid 2
]

# Variant 2 -- ONE dictionary: ONE grid.
# (This assignment replaces variant 1, so `param_grid` ends up as the dict.)
param_grid = {
    'model__penalty': penalties,
    'model__alpha': alphas,
}


The **first** `param_grid` will **NOT** consider all combinations of `penalty` and `alpha`; `GridSearchCV` treats the two dictionaries as separate, independent grids (3 + 4 = 7 candidates).

It will:

1. Try tuning only `model__penalty` while keeping `model__alpha` at the value set on the pipeline (here the default, 0.0001).
2. Then, it will try tuning only `model__alpha` while keeping `model__penalty` at the value set on the pipeline (here the default, 'l2').

The **second** `param_grid` **WILL** consider all combinations of `penalty` and `alpha`; `GridSearchCV` treats it as one grid (3 × 4 = 12 candidates).


# Column Transformer

# SimpleImputer

Replaces NaNs with a strategy like mean, median, mode, or a constant.

In [52]:
# SimpleImputer (sklearn.impute) and numpy are imported once in the top import cell.

# Toy matrix with one missing value in the 2nd column and one in the 3rd.
X_2 = np.array([[1, 2, np.nan], [4, np.nan, 6], [7, 8, 9]])

# strategy="mean": each NaN is replaced by the mean of the observed values in its column.
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X_2)
X_imputed

array([[1. , 2. , 7.5],
       [4. , 5. , 6. ],
       [7. , 8. , 9. ]])

# IterativeImputer

Estimates missing values by predicting them based on other features.

In [53]:
# IterativeImputer is experimental: it can only be imported after
# `from sklearn.experimental import enable_iterative_imputer` has run.
# Both imports live in the top import cell.

# Same toy matrix as the SimpleImputer example, but each NaN is now estimated
# from the other columns instead of being filled with a column statistic.
imputer = IterativeImputer()
X_imputed = imputer.fit_transform(X_2)
X_imputed

array([[1.        , 2.        , 3.00203274],
       [4.        , 4.99796925, 6.        ],
       [7.        , 8.        , 9.        ]])

# Handling Multicollinearity

## Maybe Best Way - Pandas Correlation Matrix

If two features have a correlation above a threshold (e.g., 0.9), drop one of them.

In [54]:
# Two features whose absolute correlation exceeds this are treated as redundant.
CORR_THRESHOLD = 0.9

# Correlate the FEATURE columns only. `target_digit` is the label: leaving it in
# the matrix would let it be picked for dropping if a feature tracked it closely.
feature_cols = df.columns.drop('target_digit')
corr_matrix = df[feature_cols].corr().abs()

# Keep only the strict upper triangle (k=1 skips the diagonal) so every pair is
# looked at once and a feature is never compared with itself.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)

# Find features with high correlation to an earlier column.
# NaN correlations (e.g. from constant columns) compare as False and are kept.
to_drop = [
    column
    for column in upper_tri.columns
    if (upper_tri[column] > CORR_THRESHOLD).any()
]

# Drop highly correlated features; the label column is left in place.
df_reduced = df.drop(columns=to_drop)
print(df.shape)
print(df_reduced.shape)


(1797, 65)
(1797, 64)


## Principal Component Analysis (PCA)

Principal Component Analysis (PCA) reduces redundancy by transforming correlated features into a set of uncorrelated components.


**1. To Reduce the Number of Features (Dimensionality Reduction)**

*   If your dataset has many features (high dimensionality), PCA can help reduce the number while keeping most of the information.
*   This is useful because high-dimensional data can lead to the curse of dimensionality, making models slower and more prone to overfitting.
*   **📌 Use case:** You have a dataset with hundreds of features, but many are redundant or correlated. PCA helps compress the data.

**2. To Handle Multicollinearity (Highly Correlated Features)**

*   If features are highly correlated, models like linear regression and logistic regression may struggle.
*   PCA creates new uncorrelated features (principal components), which can improve model performance.
*   **📌 Use case:** You have a dataset with many correlated variables (e.g., stock prices, sensor data). PCA helps remove redundancy.

**3. To Speed Up Training for Computational Efficiency**

*   Some machine learning models (e.g., SVMs, k-NN) slow down with too many features.
*   Reducing dimensions with PCA can make training and predictions faster.
*   **📌 Use case:** You're working with image data (e.g., 64x64 pixels = 4,096 features per image). Reducing dimensions with PCA speeds up training.

**4. To Visualize High-Dimensional Data**

*   PCA can reduce a dataset from many dimensions to 2D or 3D, making it easier to plot and visualize patterns.
*   This is often used in exploratory data analysis (EDA).
*   **📌 Use case:** You want to visualize customer clusters in a 100-feature dataset. PCA helps reduce it to 2D for plotting.

**5. To Denoise Data (Feature Compression)**

*   PCA helps filter out small variations or noise by focusing on the most important components.
*   It can be useful for image compression or removing noise from sensor data.
*   **📌 Use case:** You're working with speech recognition or ECG signals with lots of noise. PCA helps keep only useful information.

**When Should You NOT Use PCA?**

*   **❌ When feature interpretability is important** – PCA transforms features into abstract components that are harder to interpret.
*   **❌ When you have categorical features** – PCA only works with numerical data.
*   **❌ When your model already handles correlated features well** – Some algorithms (e.g., tree-based models like Random Forests) don't need PCA.

In [55]:
# PCA is already imported in the top import cell, so it is not re-imported here.
pca = PCA(n_components=2)  # Reduce to 2 components

print('X Train Shape')
print(X_train.shape)

# Learn the components from the training split and project it onto them in one call.
X_pca = pca.fit_transform(X_train)

print('\nPCA')
print(X_pca)

# Same number of rows as X_train, but only n_components columns.
print('\nPCA Shape')
print(X_pca.shape)

X Train Shape
(1437, 64)

PCA
[[ 23.37877851  -4.96617545]
 [-15.21542888 -15.48613197]
 [-21.04421107  -4.89824828]
 ...
 [  3.47676504  18.65116785]
 [  3.21484471  15.29047993]
 [ -4.29186733   2.59963173]]

PCA Shape
(1437, 2)


## Pandas vs PCA

| Method                                      | When to Use                                                                                                | Pros                                                                                                  | Cons                                                                                                               |
| :------------------------------------------ | :--------------------------------------------------------------------------------------------------------- | :---------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------------------------------------- |
| Pandas Correlation Matrix (Feature Dropping) | When you have redundant features that don't add much value.                                              | - Keeps interpretability (e.g., in regression models).<br> - Simple to implement.                       | - May remove useful features.<br> - Manual threshold selection (e.g., >0.9 correlation).                       |
| PCA (Principal Component Analysis)         | When you want to reduce dimensionality while keeping most of the information (variance).                   | - Uses every original feature (each component is a combination of them).<br> - Removes multicollinearity (components are uncorrelated). | - Harder to interpret (new features are combinations of old ones).<br> - Might lose small but important details. |

**When to Use What?**

*   **For regression models (like Linear Regression, Logistic Regression)** → Use the correlation matrix to drop features (because collinearity causes unstable coefficients).
*   **For machine learning models like SVM, Random Forest, or Neural Networks** → PCA can be useful (since models don't require interpretability, and PCA can improve efficiency).
*   **If you need high interpretability** → Feature dropping is better.
*   **If dimensionality is too high and you don't care about feature meaning** → Use PCA.

**Conclusion**

*   If you can afford to lose features, drop them using pandas correlation matrix.
*   If you want to keep most of the information while reducing the number of features, use PCA, but be aware that transformed features are no longer in the original space.

## Using Regularization

Use the estimator's built-in regularization parameter (e.g. `C`, `penalty`, `alpha`) to adjust the strength of the regularization.

- Lasso Regression (L1 penalty) can shrink some feature coefficients to zero, effectively removing redundant ones.
- Ridge Regression (L2 penalty) penalizes large coefficients, reducing their impact but not removing them completely.

# Pipeline Template

In [56]:
# Template for a Pipeline + GridSearchCV workflow.
# - Edit `column_transformer` to choose which data columns get which preprocessing.
# - Edit the 'model' step in the pipeline to be whichever estimator is needed.

rng = np.random.RandomState(0)

# make_column_transformer takes the (transformer, columns) tuples as SEPARATE
# positional arguments -- they must not be wrapped in a list (it is
# ColumnTransformer that takes a list, of (name, transformer, columns) triples).
# columns: use [] if the transformer expects a 2D array, no brackets if it expects a 1D array.
column_transformer = make_column_transformer(
    (StandardScaler(), ['column name']),
    (OneHotEncoder(), ['column name']),
    n_jobs=-1,
)

# Each step is (name of step, estimator); steps run in the order listed.
# NOTE(review): imputation runs AFTER the column transformer here -- if the raw
# columns can contain NaNs, confirm the transformers above tolerate them or
# move the imputer into the column transformer.
pipeline = Pipeline([
    ('column transformer', column_transformer),
    ('imputer', SimpleImputer()),
    ('feature reduction', PCA()),
    # Seeded so a fresh Restart & Run All is repeatable.
    ('model', SGDClassifier(random_state=rng)),
])

# Parameter grid for GridSearchCV; keys are '<step name>__<parameter>'.
# Left empty in the template: only the pipeline's current settings are evaluated.
param_grid = {

}

# Split dataset (80% train / 20% test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=rng)

# 5-fold grid search on all cores; return_train_score=True also records the
# training-fold scores in cv_results_. Call grid.fit(X_train, y_train) to run it.
grid = GridSearchCV(pipeline,
                    param_grid=param_grid,
                    n_jobs=-1,
                    cv=5,
                    return_train_score=True
                    )



---
---

# Quick Notes

- Never call fit() on test data
- sklearn.preprocessing