In [45]:
import sklearn
import pandas as pd

# 4. Dataset Transformations (preprocessing)

## 4.1 Scaling

In [None]:
from sklearn.preprocessing import (StandardScaler, MinMaxScaler, RobustScaler)

# Standardisation (zero mean, unit variance per feature) -- the usual default.
# The scaler learns its statistics from the training split and the very same
# fitted object is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Rescale every feature into the [0, 1] interval.
minmax = MinMaxScaler()
X_train_minmax = minmax.fit(X_train).transform(X_train)

# Median/IQR based scaling, which is less sensitive to outliers.
robust = RobustScaler()
X_train_robust = robust.fit(X_train).transform(X_train)

## 4.2 Encoding

In [41]:
from sklearn.preprocessing import ( LabelEncoder, OneHotEncoder )

# LabelEncoder (ordinal encoding for target)
le = LabelEncoder()
y_encoded = le.fit_transform(y_train)
# y_decoded = le.inverse_transform(y_encoded)

# OneHotEncoder (for categorical features)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# X_ohe = ohe.fit_transform(X_categorical)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


## 4.3 Imputation

In [42]:
from sklearn.impute import SimpleImputer, KNNImputer

# Single-statistic imputation; besides 'mean', `strategy` also accepts
# 'median', 'most_frequent' and 'constant'.
imputer = SimpleImputer(
    strategy='mean',
)
# X_imputed = imputer.fit_transform(X_with_nan)

# Imputation from the 5 nearest neighbours of each incomplete sample.
knn_imputer = KNNImputer(
    n_neighbors=5,
)
# X_knn_imputed = knn_imputer.fit_transform(X_with_nan)

## 4.4 Feature Engineering

In [43]:
from sklearn.preprocessing import PolynomialFeatures, Binarizer, Normalizer

# Degree-2 polynomial expansion of the training features, without the
# constant (bias) column.
poly = PolynomialFeatures(include_bias=False, degree=2)
X_poly = poly.fit(X_train).transform(X_train)

# Threshold each value at 0.0 to obtain a binary matrix.
binarizer = Binarizer(
    threshold=0.0,
)
X_binary = binarizer.transform(X_train)

# Per-sample normalisation to unit L2 norm ('l1' is the other common choice).
normalizer = Normalizer(
    norm='l2',
)
X_normalized = normalizer.transform(X_train)

## 4.5 Pipelines

In [36]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
# These three estimators are used below but were not imported in this cell;
# importing them here keeps the cell runnable on its own.
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Simple Pipeline: scale -> project onto 2 principal components -> classify.
# Every step is fitted on X_train only when pipeline.fit is called.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('classifier', RandomForestClassifier(random_state=42))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Same chain built with make_pipeline, which names the steps automatically.
pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    RandomForestClassifier(random_state=42)
)
pipe.fit(X_train, y_train)

# ColumnTransformer (different transformations for different columns).
# Columns are addressed by position here.
numeric_features = [0, 1, 2, 3]
categorical_features = []  # no categorical columns are configured in this example

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # ('cat', OneHotEncoder(), categorical_features)
    ])

# Full pipeline: column-wise preprocessing followed by the classifier.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
full_pipeline.fit(X_train, y_train)


In [None]:
# ----------------------------------------------------------------------------
# Quick-reference cheat sheet: prints a banner followed by the most common
# scikit-learn usage patterns as plain text (nothing below is executed).
# ----------------------------------------------------------------------------
print("\n" + "=" * 80, "CODESIGNAL QUICK PATTERNS", "=" * 80, sep="\n")
print("""
# PATTERN 1: Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# PATTERN 2: Fit and Predict
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# PATTERN 3: Evaluate
from sklearn.metrics import accuracy_score, mean_squared_error
score = accuracy_score(y_test, y_pred)  # Classification
mse = mean_squared_error(y_test, y_pred)  # Regression

# PATTERN 4: Cross-Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
print(f"Mean CV Score: {scores.mean():.3f}")

# PATTERN 5: Grid Search
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [50, 100], 'max_depth': [3, 5]}
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_

# PATTERN 6: Pipeline
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), PCA(n_components=2), RandomForestClassifier())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# PATTERN 7: Scale Data (ALWAYS fit on train only!)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Same scaler!

# PATTERN 8: Handle Missing Values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
""")

# 1. Supervised Learning

In [15]:
from sklearn.datasets import load_iris, load_diabetes, make_blobs, make_classification

from sklearn.model_selection import train_test_split

# Classification data: iris features and class labels.
iris = load_iris()
X_class = iris.data
y_class = iris.target

# Regression data: diabetes features and continuous target.
diabetes = load_diabetes()
X_reg = diabetes.data
y_reg = diabetes.target

# Hold out 20% of the classification data; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_class,
    y_class,
    test_size=0.2,
    random_state=42,
)

## 1.1 Linear Models

In [16]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression

# Ordinary least squares: fit, then predict the held-out samples.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Ridge regression (L2 penalty, strength alpha).
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Lasso regression (L1 penalty, strength alpha).
lasso = Lasso(alpha=0.1).fit(X_train, y_train)

# Logistic regression classifier; besides hard labels it also exposes
# per-class probabilities through predict_proba.
logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)

## 1.2. Linear and Quadratic Discriminant Analysis

In [17]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Linear discriminant analysis: fit and label the held-out samples.
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
y_pred = lda.predict(X_test)

# Quadratic discriminant analysis; y_pred is overwritten with its labels.
qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred = qda.predict(X_test)

## 1.3 Kernel Ridge Regression

In [18]:

from sklearn.kernel_ridge import KernelRidge

# Ridge regression in an RBF kernel space, with penalty strength alpha.
kr = KernelRidge(kernel='rbf', alpha=1.0).fit(X_train, y_train)
y_pred = kr.predict(X_test)

## 1.4 SVMs

In [19]:
from sklearn.svm import SVC, SVR

# Support vector classifier with an RBF kernel.
svc = SVC(C=1.0, kernel='rbf', gamma='scale').fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Support vector regressor with the same kernel settings; y_pred is
# overwritten with its predictions.
svr = SVR(C=1.0, kernel='rbf', gamma='scale').fit(X_train, y_train)
y_pred = svr.predict(X_test)

## 1.5 Stochastic Gradient Descent

In [20]:
from sklearn.linear_model import SGDClassifier, SGDRegressor

# Linear classifier trained by stochastic gradient descent on the hinge loss.
sgd_clf = SGDClassifier(
    loss='hinge',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Linear regressor trained by stochastic gradient descent on squared error.
sgd_reg = SGDRegressor(
    loss='squared_error',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

## 1.6 Nearest Neighbors

In [21]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# k-nearest-neighbours classifier with k = 5.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# k-nearest-neighbours regressor with k = 5; y_pred is overwritten.
knn_reg = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred = knn_reg.predict(X_test)

## 1.7 Gaussian Processes

In [22]:
from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# Gaussian process classifier using an RBF kernel.
gpc = GaussianProcessClassifier(kernel=RBF()).fit(X_train, y_train)

# Gaussian process regressor using an RBF kernel.
gpr = GaussianProcessRegressor(kernel=RBF()).fit(X_train, y_train)



## 1.8. Cross decomposition

In [23]:
from sklearn.cross_decomposition import PLSRegression

# Partial least squares regression with two latent components.
pls = PLSRegression(n_components=2).fit(X_train, y_train)
y_pred = pls.predict(X_test)

## 1.9 Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# Gaussian naive Bayes, suited to continuous features.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Multinomial naive Bayes, meant for count data. It is only constructed
# here because fitting requires non-negative features.
mnb = MultinomialNB()
# mnb.fit(X_train_counts, y_train)

# Bernoulli naive Bayes, meant for binary features.
bnb = BernoulliNB().fit(X_train, y_train)


## 1.10 Decision Trees

In [25]:
## 1.10 Decision Trees

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Depth-limited classification tree.
dt_clf = DecisionTreeClassifier(random_state=42, max_depth=5).fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)

# Depth-limited regression tree.
dt_reg = DecisionTreeRegressor(random_state=42, max_depth=5).fit(X_train, y_train)

# Per-feature importance scores learned by the classification tree.
feature_importance = dt_clf.feature_importances_

## 1.11 Ensembles: Gradient boosting, RFs, bagging, voting, stacking

In [26]:

from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor,
    AdaBoostClassifier, AdaBoostRegressor,
    BaggingClassifier, BaggingRegressor,
    VotingClassifier, VotingRegressor,
    StackingClassifier, StackingRegressor
)
# Random forest: 100 depth-limited trees.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Gradient boosting: 100 shallow trees added with learning rate 0.1.
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# AdaBoost with 50 boosting rounds.
ada = AdaBoostClassifier(
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)

# Bagging: 10 decision trees, each trained on a resampled training set.
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    random_state=42,
).fit(X_train, y_train)

# Voting: combine heterogeneous models. 'soft' averages the predicted
# probabilities, 'hard' would take a majority vote instead.
voting = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('gnb', GaussianNB()),
    ],
    voting='soft',
).fit(X_train, y_train)

# Stacking: a logistic-regression meta-learner on top of the base models.
stacking = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier()),
        ('lr', LogisticRegression()),
    ],
    final_estimator=LogisticRegression(),
).fit(X_train, y_train)



## 1.12. Multiclass and multioutput 

In [28]:
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
import numpy as np
# One-vs-rest wrapper around a logistic regression.
ovr = OneVsRestClassifier(LogisticRegression()).fit(X_train, y_train)

# One-vs-one wrapper around a support vector classifier.
ovo = OneVsOneClassifier(SVC()).fit(X_train, y_train)

# Multi-output classification: build a synthetic two-column target whose
# second column is the first one shifted by one modulo 3.
y_multi = np.column_stack([
    y_train,
    (y_train + 1) % 3,
])
multi_clf = MultiOutputClassifier(RandomForestClassifier()).fit(X_train, y_multi)

## 1.13. Feature selection

In [29]:
from sklearn.feature_selection import (
    SelectKBest, f_classif, f_regression, chi2,
    RFE, RFECV,
    SelectFromModel
)

# Univariate selection: keep the 2 features with the best ANOVA F-score.
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit_transform(X_train, y_train)
selected_features = selector.get_support()  # boolean mask of kept columns

# Recursive feature elimination down to 2 features, ranked by a random forest.
rfe = RFE(
    estimator=RandomForestClassifier(),
    n_features_to_select=2,
)
X_rfe = rfe.fit_transform(X_train, y_train)

# Model-based selection: keep features whose importance reaches the median.
sfm = SelectFromModel(
    RandomForestClassifier(),
    threshold='median',
)
X_sfm = sfm.fit_transform(X_train, y_train)


## 1.14. Semi-supervised learning

In [30]:
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

# Simulate a partially labelled training set: work on a copy of the labels
# and flag everything from index 50 onwards as unlabeled (-1).
y_semi = y_train.copy()
y_semi[50:] = -1

# Label propagation over the partially labelled data.
lp = LabelPropagation().fit(X_train, y_semi)

# Label spreading over the same data.
ls = LabelSpreading().fit(X_train, y_semi)

## 1.15. Isotonic regression

In [31]:
from sklearn.isotonic import IsotonicRegression

# Isotonic regression is univariate, so only the first feature column is used.
iso = IsotonicRegression().fit(X_train[:, 0], y_train)
y_pred = iso.predict(X_test[:, 0])


## 1.16 Probability calibration

In [32]:
from sklearn.calibration import CalibratedClassifierCV

# Wrap a Gaussian naive Bayes model in 3-fold isotonic calibration and
# read the calibrated class probabilities for the held-out samples.
calibrated = CalibratedClassifierCV(
    GaussianNB(),
    method='isotonic',
    cv=3,
).fit(X_train, y_train)
y_pred_proba_cal = calibrated.predict_proba(X_test)

## 1.17. Neural network models (supervised)

In [33]:
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Multi-layer perceptron classifier with two hidden layers (100 and 50 units).
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    random_state=42,
).fit(X_train, y_train)
y_pred = mlp_clf.predict(X_test)

# Multi-layer perceptron regressor with the same architecture.
mlp_reg = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    random_state=42,
).fit(X_train, y_train)

# 2. Unsupervised learning
2.1. Gaussian mixture models

2.2. Manifold learning

2.3. Clustering

2.4. Biclustering

2.5. Decomposing signals in components (matrix factorization problems)

2.6. Covariance estimation

2.7. Novelty and Outlier Detection

2.8. Density Estimation

2.9. Neural network models (unsupervised)


## 2.1 GAUSSIAN MIXTURE MODELS

In [None]:
from sklearn.mixture import GaussianMixture

# Three-component Gaussian mixture fitted on the training features.
gmm = GaussianMixture(random_state=42, n_components=3).fit(X_train)
labels = gmm.predict(X_train)        # hard component assignment per sample
probs = gmm.predict_proba(X_train)   # per-component membership probabilities

## 2.2 MANIFOLD LEARNING

In [44]:
from sklearn.manifold import TSNE, MDS, Isomap, LocallyLinearEmbedding

# t-SNE embedding into 2 dimensions (the usual choice for visualisation).
tsne = TSNE(
    n_components=2,
    random_state=42,
)
X_tsne = tsne.fit_transform(X_train)

# Multidimensional scaling into 2 dimensions.
mds = MDS(
    n_components=2,
    random_state=42,
)
X_mds = mds.fit_transform(X_train)

# Isomap embedding into 2 dimensions.
isomap = Isomap(
    n_components=2,
)
X_isomap = isomap.fit_transform(X_train)

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


## 2.3 Clustering

In [None]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MeanShift, SpectralClustering

# K-means with 3 clusters; keep the cluster centres as well as the labels.
kmeans = KMeans(
    n_clusters=3,
    random_state=42,
)
labels = kmeans.fit_predict(X_train)
centers = kmeans.cluster_centers_

# Density-based clustering; `labels` is overwritten by each method below.
dbscan = DBSCAN(
    eps=0.5,
    min_samples=5,
)
labels = dbscan.fit_predict(X_train)

# Agglomerative (hierarchical) clustering into 3 clusters.
agg = AgglomerativeClustering(
    n_clusters=3,
)
labels = agg.fit_predict(X_train)

# Mean shift with default settings.
ms = MeanShift()
labels = ms.fit_predict(X_train)

## 2.4 BICLUSTERING

In [None]:
from sklearn.cluster import SpectralBiclustering, SpectralCoclustering

# Spectral biclustering with a 3 x 3 (row x column) cluster grid; the row and
# column assignments are read back from the fitted estimator.
bicluster = SpectralBiclustering(
    n_clusters=(3, 3),
    random_state=42,
).fit(X_train)
row_labels = bicluster.row_labels_
col_labels = bicluster.column_labels_

## 2.5 DECOMPOSING SIGNALS (MATRIX FACTORIZATION)

In [None]:
from sklearn.decomposition import PCA, NMF, FastICA, TruncatedSVD, FactorAnalysis

# Principal component analysis down to 2 components, plus the share of
# variance explained by each component.
pca = PCA(
    n_components=2,
)
X_pca = pca.fit_transform(X_train)
explained_var = pca.explained_variance_ratio_

# Non-negative matrix factorisation; the input must be non-negative, hence
# the absolute value.
nmf = NMF(
    n_components=2,
    random_state=42,
)
X_nmf = nmf.fit_transform(np.abs(X_train))

# Independent component analysis.
ica = FastICA(
    n_components=2,
    random_state=42,
)
X_ica = ica.fit_transform(X_train)

# Truncated SVD (PCA-like, also applicable to sparse matrices).
svd = TruncatedSVD(
    n_components=2,
    random_state=42,
)
X_svd = svd.fit_transform(X_train)

## 2.6 COVARIANCE ESTIMATION

In [None]:
from sklearn.covariance import EmpiricalCovariance, LedoitWolf, OAS, ShrunkCovariance

# Empirical covariance estimate of the training features.
cov = EmpiricalCovariance().fit(X_train)
covariance_matrix = cov.covariance_

# Ledoit-Wolf shrinkage estimate.
lw = LedoitWolf().fit(X_train)

## 2.7 NOVELTY AND OUTLIER DETECTION

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

# Isolation forest with an assumed 10% contamination.
# fit_predict returns -1 for outliers and 1 for inliers.
iso_forest = IsolationForest(
    contamination=0.1,
    random_state=42,
)
outliers = iso_forest.fit_predict(X_train)

# Local outlier factor with the same contamination; `outliers` is overwritten.
lof = LocalOutlierFactor(
    contamination=0.1,
)
outliers = lof.fit_predict(X_train)

# One-class SVM with nu = 0.1; `outliers` is overwritten again.
oc_svm = OneClassSVM(
    nu=0.1,
)
outliers = oc_svm.fit_predict(X_train)

## 2.8 DENSITY ESTIMATION

In [None]:
from sklearn.neighbors import KernelDensity

# Gaussian kernel density estimate fitted on the training features, then
# evaluated (as log-density) on the held-out samples.
kde = KernelDensity(bandwidth=0.5, kernel='gaussian').fit(X_train)
log_density = kde.score_samples(X_test)

## 2.9 NEURAL NETWORK MODELS (UNSUPERVISED)

In [None]:
from sklearn.neural_network import BernoulliRBM

# BernoulliRBM models its inputs as binary values or probabilities, so the
# features are first rescaled into [0, 1]; feeding raw feature values would
# violate that assumption.
from sklearn.preprocessing import MinMaxScaler

X_train_unit = MinMaxScaler().fit_transform(X_train)

# Restricted Boltzmann machine with 50 hidden units, fitted on the rescaled
# features; transform returns the hidden-unit activations per sample.
rbm = BernoulliRBM(n_components=50, random_state=42)
rbm.fit(X_train_unit)
X_transformed = rbm.transform(X_train_unit)

# 3. Model selection and evaluation
3.1. Cross-validation: evaluating estimator performance
3.2. Tuning the hyper-parameters of an estimator
3.3. Tuning the decision threshold for class prediction
3.4. Metrics and scoring: quantifying the quality of predictions
3.5. Validation curves: plotting scores to evaluate models

## 3.1 CROSS-VALIDATION

In [None]:
from sklearn.model_selection import (
    cross_val_score, cross_validate,
    KFold, StratifiedKFold, TimeSeriesSplit
)

# 5-fold cross-validated accuracy of a random forest.
model = RandomForestClassifier(random_state=42)
scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)
print(f"CV Scores: {scores}")
print(f"Mean: {scores.mean():.3f} (+/- {scores.std():.3f})")

# Several metrics at once, including the scores on the training folds.
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    scoring=['accuracy', 'precision_macro', 'recall_macro'],
    cv=5,
    return_train_score=True,
)

# Manual K-fold iteration: slice features and labels for every fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in kf.split(X_train):
    X_fold_train = X_train[train_idx]
    X_fold_val = X_train[val_idx]
    y_fold_train = y_train[train_idx]
    y_fold_val = y_train[val_idx]

# Stratified K-fold keeps the class distribution in every fold; the loop
# body is intentionally a placeholder.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(X_train, y_train):
    pass

## 3.2. Tuning the hyper-parameters of an estimator

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Exhaustive grid search: every combination of the listed values is scored
# with 5-fold cross-validated accuracy, using all available cores.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
}
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)
print(f"Best params: {grid.best_params_}")
print(f"Best score: {grid.best_score_:.3f}")
best_model = grid.best_estimator_

# Randomised search: 20 candidates sampled from integer distributions,
# which scales better than a full grid for large parameter spaces.
from scipy.stats import randint, uniform
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 20),
}
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=20,
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

## 3.3. Tuning the decision threshold for class prediction

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

# Fit the classifier on all classes; max_iter matches the logistic regression
# used in the linear-models section.
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Threshold tuning and ROC analysis are binary concepts, while y_train/y_test
# hold more than two classes. Treat one class as "positive" and the rest as
# "negative" (one-vs-rest).
positive_class = 1
# Column of predict_proba that belongs to the positive class.
positive_col = list(model.classes_).index(positive_class)
y_proba = model.predict_proba(X_test)[:, positive_col]  # P(positive class)
y_test_binary = (y_test == positive_class).astype(int)

# Apply custom threshold: 1 = predicted positive class, 0 = any other class.
threshold = 0.7
y_pred_custom = (y_proba >= threshold).astype(int)

# Find the optimal threshold on the ROC curve. roc_curve only accepts a
# binary target, so the one-vs-rest labels are passed instead of y_test.
fpr, tpr, thresholds = roc_curve(y_test_binary, y_proba)
optimal_idx = np.argmax(tpr - fpr)  # maximises TPR - FPR (Youden's J)
optimal_threshold = thresholds[optimal_idx]

## 3.4. Metrics and scoring: quantifying the quality of predictions

In [None]:
from sklearn.metrics import (
    # Classification metrics
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report,
    roc_auc_score, roc_curve, auc,
    log_loss,
    # Regression metrics
    mean_squared_error, mean_absolute_error, r2_score,
    mean_absolute_percentage_error
)
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# CLASSIFICATION METRICS
# `model` is the classifier fitted in the previous cell.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)  # one probability column per class

print("=== CLASSIFICATION METRICS ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
# Macro averaging weights every class equally.
print(f"Precision: {precision_score(y_test, y_pred, average='macro'):.3f}")
print(f"Recall: {recall_score(y_test, y_pred, average='macro'):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='macro'):.3f}")

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f"\nConfusion Matrix:\n{cm}")

# Classification Report (comprehensive)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ROC AUC (for binary or multi-class with OvR)
# auc_score = roc_auc_score(y_test, y_proba, multi_class='ovr')

# Log Loss; passing the class list keeps the probability columns aligned
# even if a class happens to be absent from y_test.
ll = log_loss(y_test, y_proba, labels=model.classes_)
print(f"Log Loss: {ll:.3f}")

# REGRESSION METRICS
# Evaluate on the regression dataset (X_reg, y_reg) rather than on the
# classification labels, so that the reported errors are meaningful.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
lr = LinearRegression()
lr.fit(X_train_reg, y_train_reg)
y_pred_reg = lr.predict(X_test_reg)

print("\n=== REGRESSION METRICS ===")
print(f"MSE: {mean_squared_error(y_test_reg, y_pred_reg):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_reg, y_pred_reg)):.3f}")
print(f"MAE: {mean_absolute_error(y_test_reg, y_pred_reg):.3f}")
print(f"R² Score: {r2_score(y_test_reg, y_pred_reg):.3f}")

## 3.5. Validation curves: plotting scores to evaluate models

In [40]:
from sklearn.model_selection import validation_curve, learning_curve

# Validation curve: cross-validated accuracy as a single hyper-parameter
# (max_depth) sweeps over the listed values.
param_range = [1, 3, 5, 7, 10]
train_scores, val_scores = validation_curve(
    RandomForestClassifier(random_state=42),
    X_train,
    y_train,
    param_name='max_depth',
    param_range=param_range,
    scoring='accuracy',
    cv=5,
)

# Learning curve: cross-validated accuracy as the training-set size grows
# from 10% to 100% in ten steps.
train_sizes, train_scores_lc, val_scores_lc = learning_curve(
    RandomForestClassifier(random_state=42),
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy',
    cv=5,
)