In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform
import numpy as np

## warning learn methods don't get hopes that your accuracy for this data set is misleading use this methods for real and big data sets

In [17]:
df = sns.load_dataset('iris')

In [18]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [19]:
X = df.drop('species',axis=1)
y = df['species']

In [20]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [22]:
# Define hyperparameter distributions for each base learner
# Using only training data - NO TEST DATA LEAKAGE

# Decision Tree parameters
dt_param_dist = {
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'criterion': ['gini', 'entropy']
}

# SVC parameters
svc_param_dist = {
    'C': uniform(0.1, 10),
    'gamma': ['scale', 'auto'] + list(uniform(0.001, 1).rvs(5)),
    'kernel': ['rbf', 'poly', 'sigmoid']
}

# KNN parameters
knn_param_dist = {
    'n_neighbors': randint(3, 15),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Logistic Regression parameters
lr_param_dist = {
    'C': uniform(0.01, 10),
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}

In [23]:
# Perform RandomizedSearchCV for each base learner
# IMPORTANT: Using only X_train and y_train - NO TEST DATA USED HERE
print("Tuning Decision Tree...")
dt_random = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_distributions=dt_param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
dt_random.fit(X_train, y_train)
print(f"Best DT params: {dt_random.best_params_}")
print(f"Best DT CV score: {dt_random.best_score_:.4f}\n")

print("Tuning SVC...")
svc_random = RandomizedSearchCV(
    SVC(probability=True, random_state=42),
    param_distributions=svc_param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
svc_random.fit(X_train, y_train)
print(f"Best SVC params: {svc_random.best_params_}")
print(f"Best SVC CV score: {svc_random.best_score_:.4f}\n")

print("Tuning KNN...")
knn_random = RandomizedSearchCV(
    KNeighborsClassifier(),
    param_distributions=knn_param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
knn_random.fit(X_train, y_train)
print(f"Best KNN params: {knn_random.best_params_}")
print(f"Best KNN CV score: {knn_random.best_score_:.4f}\n")

print("Tuning Logistic Regression...")
lr_random = RandomizedSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_distributions=lr_param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
lr_random.fit(X_train, y_train)
print(f"Best LR params: {lr_random.best_params_}")
print(f"Best LR CV score: {lr_random.best_score_:.4f}")

Tuning Decision Tree...
Best DT params: {'criterion': 'gini', 'max_depth': 19, 'min_samples_leaf': 3, 'min_samples_split': 13}
Best DT CV score: 0.9333

Tuning SVC...
Best SVC params: {'C': np.float64(6.932635188254582), 'gamma': np.float64(0.03715772347834456), 'kernel': 'rbf'}
Best SVC CV score: 0.9917

Tuning KNN...
Best KNN params: {'metric': 'minkowski', 'n_neighbors': 6, 'weights': 'uniform'}
Best KNN CV score: 0.9833

Tuning Logistic Regression...
Best LR params: {'C': np.float64(3.7554011884736247), 'penalty': 'l2', 'solver': 'lbfgs'}
Best LR CV score: 0.9667


In [24]:
# Create new objects with best parameters from RandomizedSearchCV
base_learners = [
    ('dt', DecisionTreeClassifier(**dt_random.best_params_, random_state=42)),
    ('svc', SVC(**svc_random.best_params_, probability=True, random_state=42)),
    ('knn', KNeighborsClassifier(**knn_random.best_params_)),
    ('lr', LogisticRegression(**lr_random.best_params_, max_iter=1000, random_state=42))
]

In [25]:
meta_learner = LogisticRegression(max_iter=100)

In [26]:
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv= 5
)

In [27]:
stacking_clf.fit(X_train,y_train)

0,1,2
,estimators,"[('dt', ...), ('svc', ...), ...]"
,final_estimator,LogisticRegression()
,cv,5
,stack_method,'auto'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,19
,min_samples_split,13
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,C,np.float64(6.932635188254582)
,kernel,'rbf'
,degree,3
,gamma,np.float64(0....5772347834456)
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,n_neighbors,6
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(3.7554011884736247)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [36]:
y_pred = stacking_clf.predict(X_test)

In [37]:
y_pred
accuracy = accuracy_score(y_test, y_pred)

In [30]:
accuracy

1.0

In [31]:
print(f"\nTest Set Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Test Set Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00        10
   virginica       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [33]:
# Compare individual optimized base learners vs stacking ensemble
print("\n" + "="*60)
print("COMPARISON: Individual Base Learners vs Stacking Ensemble")
print("="*60)

for name, model in base_learners:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    print(f"{name.upper()} accuracy: {acc:.4f}")

print(f"\nSTACKING ENSEMBLE accuracy: {accuracy:.4f}")
print("="*60)


COMPARISON: Individual Base Learners vs Stacking Ensemble
DT accuracy: 0.9667
SVC accuracy: 1.0000
KNN accuracy: 0.9667
LR accuracy: 1.0000

STACKING ENSEMBLE accuracy: 1.0000


In [38]:
train_pred = stacking_clf.predict(X_train)
train_acc = accuracy_score(y_train, train_pred)

In [39]:
train_acc

0.975

In [40]:
accuracy

1.0

In [44]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    stacking_clf, X_train, y_train, cv=5, scoring="accuracy"
)

print("CV scores:", cv_scores)
print("CV mean:", cv_scores.mean())


CV scores: [0.95833333 0.95833333 0.95833333 1.         1.        ]
CV mean: 0.975


In [43]:
from sklearn.model_selection import ShuffleSplit

rs = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
scores = cross_val_score(stacking_clf, X, y, cv=rs)

print(scores, scores.mean())


[1.         0.96666667 0.96666667 0.93333333 0.93333333] 0.9600000000000002


In [54]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [55]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=None
)

In [56]:
rf_model.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [57]:
y_pred_rf = rf_model.predict(X_test)

In [58]:
acc_rf = accuracy_score(y_test, y_pred_rf)

In [59]:
acc_rf

0.9

In [60]:
from xgboost import XGBClassifier

In [61]:
ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)

In [62]:
ada_model.fit(X_train, y_train)

0,1,2
,estimator,
,n_estimators,100
,learning_rate,1.0
,algorithm,'deprecated'
,random_state,42


In [63]:
y_pred_ada = ada_model.predict(X_test)

In [64]:
acc_ada = accuracy_score(y_test,y_pred_ada)

In [65]:
acc_ada

0.9333333333333333

In [91]:

gb_model = GradientBoostingClassifier(n_estimators=9998, learning_rate=0.1, random_state=42)

In [92]:
gb_model.fit(X_train,y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,9998
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [93]:
y_gb = gb_model.predict(X_test)

In [94]:
acc_gb = accuracy_score(y_test,y_gb)

In [95]:
acc_gb

0.9666666666666667

In [96]:
xgb_model = XGBClassifier(n_estimators=9999, learning_rate=1, max_depth=3, use_label_encoder=False, eval_metric='mlogloss', random_state=42)

     

In [97]:
xgb_model.fit(X_train,y_train)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [98]:
y_xg = xgb_model.predict(X_test)

In [99]:
ac_xg = accuracy_score(y_test,y_xg)

In [100]:
ac_xg

0.9333333333333333