# Best Model Selection

To select the best model, when using multiple models in pipeline, we can use the technique of cross-validation and evaluation metrics to compare the performance of the models. Here is a simple example to demonstrate how to select the best model using cross-validation and evaluation metrics.

In [11]:
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [12]:
# Load the data
titanic = sns.load_dataset('titanic')

# Select the features and the target
X = titanic[['pclass', 'age', 'sex', 'survived', 'embarked']]
y = titanic['survived']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# create a list of models to evaluate
models = [('RandomForestClassifier:', RandomForestClassifier(random_state=42)),
          ('GradientBoostingClassifier:', GradientBoostingClassifier(random_state=42)),
          ('XGBClassifier:', XGBClassifier(random_state=42))]

best_model = None
best_score = 0.0

# Iterate over the models and evaluate the model

for name, model in models:
    # Create a pipeline
    pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                         ('encoder', OneHotEncoder(handle_unknown='ignore')),
                         ('model', model)])

# perform cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5)

# calculate the mean score
mean_score = scores.mean()

# fit the model
pipeline.fit(X_train, y_train)

# make predictions on the test set
y_pred = pipeline.predict(X_test)

# calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# print the performance of the metrics
print('Model:', name)
print('Mean cross-validation score:', mean_score)
print('Test accuracy:', accuracy)

# check if the accuracy of the current model has the best accuracy
if accuracy > best_score:
    best_score = accuracy
    best_model = model

# retrieve the best model
print('Best model:', best_model)

Model: XGBClassifier:
Mean cross-validation score: 1.0
Test accuracy: 1.0
Best model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
