# Best Model Selection in Pipeline

In [8]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Load the Titanic dataset from seaborn
titanic_data = sns.load_dataset('titanic')

# Select the features and target
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = titanic_data['survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [('Random Forest', RandomForestClassifier(random_state=42)),
          ('Gradient Boosting', GradientBoostingClassifier(random_state=42))]

best_model = None
best_accuracy = 0

# Iterate over the models
for name, model in models:
    # Create a pipeline with the model and the necessary preprocessing steps
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    
# Perform cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5)

# Calculate the mean accuracy
mean_accuracy = np.mean(scores)

# Fit the model
pipeline.fit(X_train, y_train)

# Predict the target
y_pred = pipeline.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print the performance metrics
print(f'Model: {name}')
print(f'Cross-validation Accuracy: {mean_accuracy:.2f}')
print(f'Test Accuracy: {accuracy:.2f}')
print()

# Check if the current model is the best one
if accuracy > best_accuracy:
    best_accuracy = accuracy
    best_model = pipeline
    
# Retrieve the best model
print('Best model:', best_model)



Model: Gradient Boosting
Cross-validation Accuracy: 0.81
Test Accuracy: 0.80

Best model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', GradientBoostingClassifier(random_state=42))])
