<a href="https://colab.research.google.com/github/gulshan0201/DATA-Science/blob/main/LAB_6_Hyperparameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from scipy.stats import randint, uniform

# 1) Dataset
# Synthetic 3-class problem with imbalanced classes (60% / 30% / 10%) and
# flip_y=0.02 label noise; 20 of the 100 features are informative and another
# 20 are redundant.
X, y = make_classification(
    n_samples=5000,
    n_features=100,
    n_informative=20,
    n_redundant=20,
    n_classes=3,
    weights=[0.6, 0.3, 0.1],
    flip_y=0.02,
    random_state=42,
)

# Stratified 80/20 hold-out split so each class keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)


In [2]:
# Scoring metrics: plain accuracy plus macro-averaged F1.
scoring = {
    'acc': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score, average='macro'),
}

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Models with parameter grids.
# Keys follow the "<pipeline step>__<parameter>" convention, so each prefix
# (svc, rf, logreg, knn) must match a step name in the corresponding pipeline.
svm_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__gamma": ['scale', 0.01, 0.1],
}
random_forest_grid = {
    "rf__n_estimators": [100, 300],
    "rf__max_depth": [None, 20, 50],
    "rf__min_samples_split": [2, 5],
}
logistic_regression_grid = {
    "logreg__C": [0.1, 1, 10],
    "logreg__penalty": ['l2'],
}
knn_grid = {
    "knn__n_neighbors": [3, 5, 11],
    "knn__weights": ['uniform', 'distance'],
}

param_grids = {
    "SVM": svm_grid,
    "RandomForest": random_forest_grid,
    "LogisticRegression": logistic_regression_grid,
    "KNN": knn_grid,
}



In [3]:
# Pipelines
# SVM, logistic regression and KNN are standardized inside the pipeline, so the
# scaler is fit only on the training part of each CV fold. The random forest is
# used without scaling.
pipelines = {
    "SVM": Pipeline([('scaler', StandardScaler()), ('svc', SVC())]),
    "RandomForest": Pipeline([('rf', RandomForestClassifier(random_state=42))]),
    # `multi_class` is intentionally not passed: the argument is deprecated in
    # recent scikit-learn releases (it triggers a FutureWarning on every fit),
    # and the default lbfgs solver already fits a multinomial model for
    # multiclass targets.
    "LogisticRegression": Pipeline([('scaler', StandardScaler()), ('logreg', LogisticRegression(max_iter=2000))]),
    "KNN": Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])
}

# GridSearchCV
# Exhaustive search over `param_grids`, selecting by macro-F1 on the shared `cv` splitter.
print("=== GridSearchCV Results ===")
for name, pipe in pipelines.items():
    gs = GridSearchCV(pipe, param_grids[name], cv=cv, scoring='f1_macro', n_jobs=-1)
    gs.fit(X_train, y_train)
    print(f"{name}: Best Params={gs.best_params_}, Best CV F1={gs.best_score_:.3f}")

# RandomizedSearchCV with broader distributions
# scipy's uniform(loc, scale) samples from [loc, loc + scale], and
# randint(low, high) samples integers from [low, high).
# NOTE(review): C and gamma are drawn on a linear scale here, so most draws land
# at the high end of the range; a log-uniform distribution is the usual choice
# for such scale parameters — confirm whether the linear scale is intentional.
param_dists = {
    "SVM": {
        "svc__C": uniform(0.01, 100),
        "svc__gamma": uniform(0.001, 1)
    },
    "RandomForest": {
        "rf__n_estimators": randint(100, 500),
        "rf__max_depth": randint(10, 100),
        "rf__min_samples_split": randint(2, 10)
    },
    "LogisticRegression": {
        "logreg__C": uniform(0.01, 10)
    },
    "KNN": {
        "knn__n_neighbors": randint(3, 50),
        "knn__weights": ['uniform', 'distance']
    }
}

# Ten random draws per model; random_state=0 makes the draws reproducible.
print("\n=== RandomizedSearchCV Results ===")
for name, pipe in pipelines.items():
    rs = RandomizedSearchCV(pipe, param_distributions=param_dists[name], n_iter=10, cv=cv, scoring='f1_macro', random_state=0, n_jobs=-1)
    rs.fit(X_train, y_train)
    print(f"{name}: Best Params={rs.best_params_}, Best CV F1={rs.best_score_:.3f}")


=== GridSearchCV Results ===
SVM: Best Params={'svc__C': 10, 'svc__gamma': 'scale'}, Best CV F1=0.817
RandomForest: Best Params={'rf__max_depth': None, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}, Best CV F1=0.669




LogisticRegression: Best Params={'logreg__C': 10, 'logreg__penalty': 'l2'}, Best CV F1=0.674
KNN: Best Params={'knn__n_neighbors': 5, 'knn__weights': 'distance'}, Best CV F1=0.669

=== RandomizedSearchCV Results ===
SVM: Best Params={'svc__C': np.float64(54.89135039273247), 'svc__gamma': np.float64(0.7161893663724195)}, Best CV F1=0.248
RandomForest: Best Params={'rf__max_depth': 87, 'rf__min_samples_split': 2, 'rf__n_estimators': 365}, Best CV F1=0.664




LogisticRegression: Best Params={'logreg__C': np.float64(7.161893663724195)}, Best CV F1=0.674
KNN: Best Params={'knn__n_neighbors': 6, 'knn__weights': 'distance'}, Best CV F1=0.660


In [2]:
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
try:
    from scipy.stats import loguniform
except ImportError:
    # Older SciPy releases expose the same log-uniform distribution under the
    # name `reciprocal`. Only a failed import should trigger the fallback; any
    # other exception (e.g. KeyboardInterrupt) must propagate.
    from scipy.stats import reciprocal as loguniform

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# ---------- Config ----------
# Reproducibility: one seed reused for data generation, splitting, CV and models.
RANDOM_STATE = 42

# Dataset size and hold-out fraction.
N_SAMPLES = 800
N_FEATURES = 30
TEST_SIZE = 0.2

# Cross-validation folds and joblib parallelism for the searches.
CV_FOLDS = 2
N_JOBS = 1

# Number of sampled candidates for the randomized searches
# (original vs. expanded search spaces).
RAND_N_ITER_ORIG = 5
RAND_N_ITER_EXP = 8

# ---------- Data ----------
# Synthetic, imbalanced 3-class dataset sized by the config constants.
X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=10,
    n_redundant=5,
    n_classes=3,
    weights=[0.6, 0.3, 0.1],
    flip_y=0.02,
    random_state=RANDOM_STATE,
)

# Stratified hold-out split; the test part is only used for final reporting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Shared splitter so every search is scored on the same folds.
cv = StratifiedKFold(
    n_splits=CV_FOLDS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# ---------- Pipelines ----------
# Step names (svc, rf, logreg, knn) are the prefixes used by the parameter
# grids and distributions defined for these pipelines.
pipelines = {
    "SVM": Pipeline([("scaler", StandardScaler()), ("svc", SVC())]),
    "RandomForest": Pipeline([("rf", RandomForestClassifier(random_state=RANDOM_STATE))]),
    "LogisticRegression": Pipeline([
        ("scaler", StandardScaler()),
        # `multi_class` is intentionally not passed: the argument is deprecated
        # in recent scikit-learn releases (FutureWarning on every fit), and the
        # lbfgs solver already fits a multinomial model for multiclass targets.
        ("logreg", LogisticRegression(max_iter=1000, solver="lbfgs", random_state=RANDOM_STATE))
    ]),
    "KNN": Pipeline([("scaler", StandardScaler()), ("knn", KNeighborsClassifier())]),
}

# ---------- Small Grids ----------
# Discrete candidate lists for the exhaustive (grid) search.
param_grids_original = {
    "SVM": {
        "svc__C": [0.1, 1],
        "svc__gamma": ["scale", 0.01],
    },
    "RandomForest": {
        "rf__n_estimators": [50, 100],
        "rf__max_depth": [None, 20],
    },
    "LogisticRegression": {
        "logreg__C": [0.1, 1],
        "logreg__penalty": ["l2"],
    },
    "KNN": {
        "knn__n_neighbors": [3, 5],
        "knn__weights": ["uniform", "distance"],
    },
}

# Sampling distributions for the randomized search.
# scipy's uniform(loc, scale) covers [loc, loc + scale]; randint(low, high)
# covers the integers in [low, high).
param_dists_original = {
    "SVM": {
        "svc__C": uniform(0.01, 10),
        "svc__gamma": uniform(0.001, 0.05),
    },
    "RandomForest": {
        "rf__n_estimators": randint(50, 150),
        "rf__max_depth": randint(10, 40),
    },
    "LogisticRegression": {
        "logreg__C": uniform(0.01, 5),
    },
    "KNN": {
        "knn__n_neighbors": randint(3, 15),
        "knn__weights": ["uniform", "distance"],
    },
}

# ---------- Small Expanded Grids ----------
# Wider grids that also tune extra parameters (kernel, min_samples_split,
# class_weight, Minkowski power p).
param_grids_expanded = {
    "SVM": {
        "svc__C": [0.01, 0.1, 1, 5],
        "svc__gamma": ["scale", 1e-3, 1e-2],
        "svc__kernel": ["rbf"],
    },
    "RandomForest": {
        "rf__n_estimators": [50, 150, 300],
        "rf__max_depth": [None, 20, 50],
        "rf__min_samples_split": [2, 5],
    },
    "LogisticRegression": {
        "logreg__C": [0.01, 0.1, 1, 5],
        "logreg__penalty": ["l2"],
        "logreg__class_weight": [None, "balanced"],
    },
    "KNN": {
        "knn__n_neighbors": [3, 5, 7],
        "knn__weights": ["uniform", "distance"],
        "knn__p": [1, 2],
    },
}

# Expanded randomized spaces: scale-type parameters (C, gamma) are drawn
# log-uniformly instead of uniformly.
param_dists_expanded = {
    "SVM": {
        "svc__C": loguniform(1e-3, 10),
        "svc__gamma": loguniform(1e-4, 1e-1),
        "svc__kernel": ["rbf"],
    },
    "RandomForest": {
        "rf__n_estimators": randint(50, 300),
        "rf__max_depth": randint(10, 100),
    },
    "LogisticRegression": {
        "logreg__C": loguniform(1e-3, 10),
        "logreg__class_weight": [None, "balanced"],
    },
    "KNN": {
        "knn__n_neighbors": randint(3, 15),
        "knn__weights": ["uniform", "distance"],
        "knn__p": randint(1, 3),
    },
}

# ---------- Helpers ----------
def evaluate_on_test(estimator, X_test, y_test):
    """Score an already-fitted estimator on held-out data.

    Parameters
    ----------
    estimator : object exposing ``predict``
        Fitted model or pipeline.
    X_test : array-like
        Held-out feature matrix passed to ``estimator.predict``.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` computed from the predictions.
    """
    predictions = estimator.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    macro_f1 = f1_score(y_test, predictions, average="macro")
    return accuracy, macro_f1

def _search_result_row(search, name, method, label):
    """Turn one fitted search object into a summary record for the results table."""
    test_acc, test_f1 = evaluate_on_test(search.best_estimator_, X_test, y_test)
    return {
        "Model + Method": f"{name} — {method} {label}",
        "Best Params (CV)": search.best_params_,
        "Best CV F1 (mean)": float(search.best_score_),
        "Test Accuracy": float(test_acc),
        "Test Macro-F1": float(test_f1),
    }


def run_both_searches(pipes, grids, dists, rand_n_iter, label):
    """Run a grid search and a randomized search for every pipeline.

    Both searches score macro-F1 and accuracy, refit the best candidate by
    macro-F1, and are then evaluated on the held-out test split. The function
    reads the module-level ``cv``, ``X_train``, ``y_train``, ``X_test``,
    ``y_test``, ``N_JOBS`` and ``RANDOM_STATE``.

    Parameters
    ----------
    pipes : dict
        Model name -> pipeline to tune.
    grids : dict
        Model name -> parameter grid for ``GridSearchCV``.
    dists : dict
        Model name -> parameter distributions for ``RandomizedSearchCV``.
    rand_n_iter : int
        Number of candidates sampled by each randomized search.
    label : str
        Suffix appended to the "Model + Method" column (e.g. "(Original)").

    Returns
    -------
    pandas.DataFrame
        Two rows per model (grid first, then randomized) with the best
        parameters, best mean CV macro-F1, test accuracy and test macro-F1.
    """
    # The scorer mapping is the same for every model, so build it once.
    scoring = {"f1_macro": "f1_macro", "accuracy": "accuracy"}

    rows = []
    for name, pipe in pipes.items():
        gs = GridSearchCV(pipe, grids[name], scoring=scoring, refit="f1_macro",
                          cv=cv, n_jobs=N_JOBS, verbose=0)
        gs.fit(X_train, y_train)
        rows.append(_search_result_row(gs, name, "Grid", label))

        rs = RandomizedSearchCV(pipe, param_distributions=dists[name], n_iter=rand_n_iter,
                                scoring=scoring, refit="f1_macro",
                                cv=cv, n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0)
        rs.fit(X_train, y_train)
        rows.append(_search_result_row(rs, name, "Randomized", label))
    return pd.DataFrame(rows)

# ---------- Run ----------
# Same pipelines, tuned first on the original and then on the expanded spaces.
df_original = run_both_searches(
    pipelines,
    param_grids_original,
    param_dists_original,
    RAND_N_ITER_ORIG,
    "(Original)",
)
df_expanded = run_both_searches(
    pipelines,
    param_grids_expanded,
    param_dists_expanded,
    RAND_N_ITER_EXP,
    "(Expanded)",
)

# ---------- Output ----------
# Lift the column-width cap so long parameter dicts are not truncated.
pd.set_option("display.max_colwidth", None)
print(
    "\n=== Task 1 Results — ORIGINAL Searches (Ultra-Compact) ===",
    df_original.to_string(index=False),
    sep="\n",
)
print(
    "\n=== Task 1 Results — EXPANDED Searches (Ultra-Compact) ===",
    df_expanded.to_string(index=False),
    sep="\n",
)






=== Task 1 Results — ORIGINAL Searches (Ultra-Compact) ===
                            Model + Method                                                  Best Params (CV)  Best CV F1 (mean)  Test Accuracy  Test Macro-F1
                     SVM — Grid (Original)                              {'svc__C': 1, 'svc__gamma': 'scale'}           0.474726        0.75000       0.599814
               SVM — Randomized (Original) {'svc__C': 7.3299394181140505, 'svc__gamma': 0.03093292420985183}           0.580636        0.81875       0.757909
            RandomForest — Grid (Original)                   {'rf__max_depth': None, 'rf__n_estimators': 50}           0.510104        0.74375       0.616339
      RandomForest — Randomized (Original)                    {'rf__max_depth': 16, 'rf__n_estimators': 132}           0.505682        0.78125       0.669565
      LogisticRegression — Grid (Original)                         {'logreg__C': 1, 'logreg__penalty': 'l2'}           0.500106        0.71250       0