# Scikit-learn (sklearn) — Commonly Used Methods & Classes: Code Examples

This Colab-ready notebook demonstrates the most commonly used scikit-learn methods and classes: the core estimator API (`fit`, `predict`, `predict_proba`, `score`), popular estimators, preprocessing, model selection, metrics, pipelines, feature selection, clustering, and dimensionality reduction.


In [None]:
# If you're running in Google Colab, scikit-learn is usually preinstalled.
# Uncomment to upgrade if needed (%pip installs into the running kernel's environment):
# %pip install -q -U scikit-learn

import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report,
    mean_squared_error, mean_absolute_error, r2_score
)

# Models
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Unsupervised + DR
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE

# Feature selection
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE, VarianceThreshold

import warnings
warnings.filterwarnings("ignore")
print("Imports OK ✅")


## 1) Core estimator API: `fit`, `predict`, `predict_proba`, `score`
Most sklearn estimators share this interface (`predict_proba` is only available on classifiers that can output class probabilities).

In [None]:
# Classification demo data: Iris, loaded as pandas objects (as_frame=True).
iris = datasets.load_iris(as_frame=True)
X, y = iris.data, iris.target

# Hold out 25% of the rows for testing; stratify on y and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# The four core estimator calls, in the order they are typically used.
clf = LogisticRegression(max_iter=200).fit(X_train, y_train)  # fit (returns the fitted estimator)
y_pred = clf.predict(X_test)                                   # predict
y_proba = clf.predict_proba(X_test)                            # predict_proba (classification)
acc = clf.score(X_test, y_test)                                # score (default metric depends on estimator)

# Display the score plus a peek at the first predictions / probability rows.
(acc, y_pred[:5], y_proba[:2])


## 2) Popular classification models
Quick examples for: `LogisticRegression`, `RandomForestClassifier`, `DecisionTreeClassifier`, `SVC`, `KNeighborsClassifier`, `GaussianNB`.

In [None]:
# Classifiers to compare, keyed by the label shown in the results table.
models_cls = dict([
    ("LogisticRegression", LogisticRegression(max_iter=300)),
    ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
    ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=42)),
    ("SVC(probability=True)", SVC(probability=True, random_state=42)),
    ("KNeighborsClassifier", KNeighborsClassifier()),
    ("GaussianNB", GaussianNB()),
])

# Fit every classifier on the training split and record its score() on the test split.
results = {}
for name, m in models_cls.items():
    results[name] = m.fit(X_train, y_train).score(X_test, y_test)

# Highest score first.
pd.Series(results).sort_values(ascending=False)


## 3) Popular regression models
Examples for: `LinearRegression`, `Ridge`, `Lasso`, `ElasticNet`, `RandomForestRegressor`, `DecisionTreeRegressor`, `SVR`.

In [None]:
# California Housing for regression (downloaded on first use, then cached by scikit-learn).
cal = datasets.fetch_california_housing(as_frame=True)
Xr = cal.data
yr = cal.target

# 75/25 split with a fixed seed so the metrics below are repeatable.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(Xr, yr, test_size=0.25, random_state=42)

# Regressors to compare, keyed by the label shown in the results table.
models_reg = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.001),
    "ElasticNet": ElasticNet(alpha=0.001, l1_ratio=0.5),
    "RandomForestRegressor": RandomForestRegressor(random_state=42, n_estimators=200, n_jobs=-1),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
    # SVR is sensitive to feature scale and these columns have very different ranges,
    # so standardize inside a pipeline (fit on the training split only) before the SVR.
    "SVR": make_pipeline(StandardScaler(), SVR(C=10.0, epsilon=0.1)),
}

# Fit each model, predict on the held-out split, and collect the metrics per model.
reg_metrics = {}
for name, m in models_reg.items():
    m.fit(Xr_train, yr_train)
    pred = m.predict(Xr_test)
    reg_metrics[name] = {
        # RMSE = sqrt(MSE). `mean_squared_error(..., squared=False)` was deprecated in
        # scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
        # this form works on every scikit-learn version.
        "RMSE": float(np.sqrt(mean_squared_error(yr_test, pred))),
        "MAE": mean_absolute_error(yr_test, pred),
        "R2": r2_score(yr_test, pred),
        # Shown next to R2 so the estimator's own score() can be compared with r2_score.
        "score()": m.score(Xr_test, yr_test)
    }

# One row per model, best R2 first.
pd.DataFrame(reg_metrics).T.sort_values("R2", ascending=False).head(10)


## 4) Data preprocessing
### 4.1 Scaling & normalization: `StandardScaler`, `MinMaxScaler`, `RobustScaler`, `Normalizer`

In [None]:
# Small slice of the training features keeps the demo output readable.
X_small = X_train.iloc[:10].copy()

# The four scalers/normalizers being demonstrated, keyed by display label.
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler(),
    "Normalizer": Normalizer()
}

# Fit each transformer on the slice and keep the first three transformed rows.
scaled_samples = {}
for name, s in scalers.items():
    scaled_samples[name] = pd.DataFrame(s.fit_transform(X_small), columns=X_small.columns).head(3)

# Show all four results in one table (outer index level = scaler name) rather than
# only the first two, so RobustScaler and Normalizer output is visible as well.
pd.concat(scaled_samples, names=["scaler", "row"])


### 4.2 Encoding categorical data: `OneHotEncoder`, `LabelEncoder`, `OrdinalEncoder`

In [None]:
# Toy categorical frame: "color" is one-hot encoded, "size" is ordinal-encoded below.
df_cat = pd.DataFrame(
    {
        "color": ["red", "blue", "green", "blue"],
        "size": ["S", "M", "L", "S"],
    }
)

# OneHotEncoder on the color column; sparse_output=False requests a dense array,
# handle_unknown="ignore" avoids errors on categories not seen during fit.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
color_frame = df_cat[["color"]]
ohe_out = ohe.fit_transform(color_frame)

# OrdinalEncoder with an explicit category order: S < M < L.
size_order = ["S", "M", "L"]
ord_enc = OrdinalEncoder(categories=[size_order])
ord_out = ord_enc.fit_transform(df_cat[["size"]])

# LabelEncoder is typically applied to target labels (y), not to feature columns.
le = LabelEncoder()
y_labels = le.fit_transform(["spam", "ham", "spam", "eggs"])

# Encoded outputs plus the classes learned by the label encoder.
(ohe_out, ord_out.ravel(), y_labels, le.classes_)


### 4.3 Missing values: `SimpleImputer`, `KNNImputer`

In [None]:
# Two numeric columns with gaps to fill.
df_miss = pd.DataFrame(
    {
        "a": [1.0, np.nan, 3.0, 4.0],
        "b": [np.nan, 2.0, 3.0, np.nan],
    }
)

# Mean-based imputer and a KNN-based imputer configured with 2 neighbours.
simp = SimpleImputer(strategy="mean")
knn = KNNImputer(n_neighbors=2)

# Run both imputers over the same frame, restoring the column labels on each result.
simp_out, knn_out = (
    pd.DataFrame(imputer.fit_transform(df_miss), columns=df_miss.columns)
    for imputer in (simp, knn)
)

# Original frame followed by the two imputed versions.
(df_miss, simp_out, knn_out)


## 5) Model selection & evaluation
### 5.1 `train_test_split`

In [None]:
# The split pattern in isolation: hold out 20% of the rows, stratify on y, fix the seed.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Shapes of the resulting train and test feature frames.
(X_train2.shape, X_test2.shape)


### 5.2 Cross-validation: `cross_val_score`, `cross_validate`

In [None]:
# Scaling lives inside the pipeline, so it is re-fit together with the model on each CV split.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=300),
)

# cross_val_score: a single metric over 5 folds.
scores = cross_val_score(pipe, X, y, scoring="accuracy", cv=5)

# cross_validate: several metrics at once, with training scores included.
cv = cross_validate(
    pipe,
    X,
    y,
    scoring=["accuracy", "f1_macro"],
    cv=5,
    return_train_score=True,
)

# Accuracy scores, plus the mean of every cross_validate entry whose key starts with "test_".
(scores, {key: np.mean(vals) for key, vals in cv.items() if key.startswith("test_")})


### 5.3 Hyperparameter search: `GridSearchCV`, `RandomizedSearchCV`

In [None]:
# Exhaustive grid search over SVC hyperparameters, with scaling inside the pipeline.
svc_pipe = make_pipeline(
    StandardScaler(),
    SVC(probability=True, random_state=42),
)

# Pipeline parameters are addressed as "<step>__<param>"; the SVC step is addressed as "svc".
param_grid = dict(
    svc__C=[0.1, 1, 10],
    svc__gamma=["scale", "auto"],
)

grid = GridSearchCV(estimator=svc_pipe, param_grid=param_grid, scoring="accuracy", cv=3)
grid.fit(X_train, y_train)

# Best parameter combination, its CV score, and the score on the held-out test split.
(grid.best_params_, grid.best_score_, grid.score(X_test, y_test))


In [None]:
# Randomized search over RandomForestClassifier hyperparameters:
# 12 candidate settings are drawn from the integer distributions below.
from scipy.stats import randint

rf = RandomForestClassifier(random_state=42)

param_dist = dict(
    n_estimators=randint(50, 400),
    max_depth=randint(2, 20),
    min_samples_split=randint(2, 10),
)

rand = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=12,
    scoring="accuracy",
    cv=3,
    random_state=42,  # fixed seed for the search
    n_jobs=-1,
)
rand.fit(X_train, y_train)

# Best sampled parameters, their CV score, and the score on the held-out test split.
(rand.best_params_, rand.best_score_, rand.score(X_test, y_test))


### 5.4 Metrics (classification)
`accuracy_score`, `precision_score`, `recall_score`, `f1_score`, `roc_auc_score`, `confusion_matrix`, `classification_report`

In [None]:
# Binary classification data: the breast cancer dataset, loaded as pandas objects.
bc = datasets.load_breast_cancer(as_frame=True)
Xb, yb = bc.data, bc.target

# Stratified 75/25 split with a fixed seed.
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    Xb,
    yb,
    test_size=0.25,
    random_state=42,
    stratify=yb,
)

# Scale + logistic regression, fitted on the training split.
clf_b = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500)).fit(Xb_train, yb_train)

pred_b = clf_b.predict(Xb_test)
# Second probability column, used as the score for ROC AUC below.
proba_b = clf_b.predict_proba(Xb_test)[:, 1]

# Label-based metrics are computed from the predicted labels...
metrics = {
    label: scorer(yb_test, pred_b)
    for label, scorer in (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
}
# ...while ROC AUC is computed from the predicted probabilities.
metrics["roc_auc"] = roc_auc_score(yb_test, proba_b)

(metrics, confusion_matrix(yb_test, pred_b))


In [None]:
# classification_report output is printed (not displayed as a bare expression);
# rows are labelled with the dataset's target names.
report_b = classification_report(yb_test, pred_b, target_names=bc.target_names)
print(report_b)


### 5.5 Metrics (regression)
`mean_squared_error`, `mean_absolute_error`, `r2_score`

In [None]:
# Scaled Ridge regression on the California Housing split created earlier in the notebook.
reg = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
reg.fit(Xr_train, yr_train)
pred_r = reg.predict(Xr_test)

# RMSE = sqrt(MSE). `mean_squared_error(..., squared=False)` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this form works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(yr_test, pred_r)))
mae = mean_absolute_error(yr_test, pred_r)
r2  = r2_score(yr_test, pred_r)

# score() is shown next to R2 so the two values can be compared directly.
{"RMSE": rmse, "MAE": mae, "R2": r2, "score()": reg.score(Xr_test, yr_test)}


## 6) Pipelines & composition
### `Pipeline`, `make_pipeline`, `ColumnTransformer`

In [None]:
# Toy mixed-type dataset: two numeric columns (each with one missing value)
# and two categorical columns, plus a binary target.
df = pd.DataFrame(
    {
        "age": [25, 32, 47, np.nan, 52, 23],
        "income": [50000, 64000, 120000, 58000, np.nan, 48000],
        "city": ["London", "Paris", "London", "Berlin", "Paris", "Berlin"],
        "segment": ["A", "B", "A", "C", "B", "A"],
    }
)
y_toy = np.array([0, 1, 0, 1, 1, 0])

num_cols = ["age", "income"]
cat_cols = ["city", "segment"]

# Numeric branch: median imputation, then standardization.
numeric_tf = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: most-frequent imputation, then one-hot encoding
# (categories unseen during fit are ignored at transform time).
categorical_tf = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch.
preprocess = ColumnTransformer(
    [
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ]
)

# Preprocessing and classifier chained into a single estimator.
full_model = Pipeline(
    [
        ("preprocess", preprocess),
        ("clf", LogisticRegression(max_iter=300)),
    ]
)

X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    df,
    y_toy,
    test_size=0.33,
    random_state=42,
    stratify=y_toy,
)
full_model.fit(X_train_t, y_train_t)
full_model.score(X_test_t, y_test_t)


## 7) Feature selection
`SelectKBest`, `chi2`, `f_classif`, `RFE`, `VarianceThreshold`

In [None]:
# chi2 needs non-negative features, so rescale Iris to [0, 1] first.
X_nonneg = MinMaxScaler().fit_transform(X)

# Keep the two best features according to the chi-squared score.
skb_chi2 = SelectKBest(chi2, k=2)
X_chi2 = skb_chi2.fit_transform(X_nonneg, y)
chi2_mask = skb_chi2.get_support()
chi2_selected = np.array(iris.feature_names)[chi2_mask]

# Keep the two best features according to f_classif (ANOVA), on the unscaled data.
skb_f = SelectKBest(f_classif, k=2)
X_f = skb_f.fit_transform(X, y)
f_mask = skb_f.get_support()
f_selected = np.array(iris.feature_names)[f_mask]

# Selected feature names and the shapes of the reduced matrices.
(chi2_selected, f_selected, X_chi2.shape, X_f.shape)


In [None]:
# Recursive feature elimination wrapped around a logistic regression, keeping two features.
rfe = RFE(LogisticRegression(max_iter=300), n_features_to_select=2).fit(X, y)
rfe_selected = np.array(iris.feature_names)[rfe.get_support()]

# VarianceThreshold with a threshold of 0.0, fitted on the features only (no target needed).
vt = VarianceThreshold(threshold=0.0).fit(X)
vt_selected = np.array(iris.feature_names)[vt.get_support()]

# Feature names kept by each selector.
(rfe_selected, vt_selected)


## 8) Clustering
`KMeans`, `DBSCAN`, `AgglomerativeClustering`

In [None]:
# Standardize the Iris features before clustering.
X_u = StandardScaler().fit_transform(iris.data)

# Three clustering estimators; KMeans is seeded for repeatable output.
kmeans = KMeans(n_clusters=3, n_init="auto", random_state=42)
dbscan = DBSCAN(eps=0.6, min_samples=5)
agglo = AgglomerativeClustering(n_clusters=3)

# fit_predict fits each estimator and returns one cluster label per row.
labels_k, labels_d, labels_a = (
    clusterer.fit_predict(X_u) for clusterer in (kmeans, dbscan, agglo)
)

# First few rows, with the three label assignments side by side.
pd.DataFrame(
    dict(KMeans=labels_k, DBSCAN=labels_d, Agglomerative=labels_a)
).head()


## 9) Dimensionality reduction / visualization
`PCA`, `TruncatedSVD`, `TSNE`

In [None]:
# Standardize the Iris features before projecting them to two dimensions.
X_std = StandardScaler().fit_transform(iris.data)

# Three reducers, all seeded and configured for 2 output components.
pca = PCA(n_components=2, random_state=42)
svd = TruncatedSVD(n_components=2, random_state=42)
# t-SNE can be slow on large datasets; Iris is small.
tsne = TSNE(n_components=2, init="pca", learning_rate="auto", random_state=42)

# Fit each reducer and project the standardized data, in the order PCA, SVD, t-SNE.
X_pca, X_svd, X_tsne = (
    reducer.fit_transform(X_std) for reducer in (pca, svd, tsne)
)

# First three projected rows from each method.
(X_pca[:3], X_svd[:3], X_tsne[:3])


## ✅ Done
You now have runnable examples for the commonly used scikit-learn methods and classes covered above.