In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load seaborn's example 'titanic' dataset as a DataFrame.
df = sns.load_dataset('titanic')

# Target vector: the 'survived' column.
y = df['survived']

# Feature frame: every column except the target and the columns dropped here.
# NOTE(review): 'alive' presumably restates the target in text form -- confirm.
X = df.drop(
    columns=['survived', 'deck', 'embark_town', 'alive', 'who', 'adult_male']
)

In [3]:
# Preview the first five feature rows to sanity-check which columns were kept.
X.head(n=5)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,alone
0,3,male,22.0,1,0,7.25,S,Third,False
1,1,female,38.0,1,0,71.2833,C,First,False
2,3,female,26.0,0,0,7.925,S,Third,True
3,1,female,35.0,1,0,53.1,S,First,False
4,3,male,35.0,0,0,8.05,S,Third,True


In [4]:
# Hold out 20% of the rows for the final evaluation. Stratifying on y keeps the
# target's class proportions the same in both partitions; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=43,
)

In [5]:
# Columns routed to each preprocessing branch. Any column of X that is not
# named in one of these two lists is discarded by the ColumnTransformer below.
numeric_features = ['age', 'fare']
categorical_features = ['sex', 'embarked', 'class', 'alone']

# Numeric branch: fill missing values with the column mean, then standardise.
num_preproc = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. The second step is named 'encoder' because it encodes rather
# than scales. handle_unknown='ignore' makes categories unseen during fit
# encode as all zeros instead of raising at predict time.
cat_preproc = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own columns and concatenate the results.
# remainder='drop' is the library default; it is spelled out here so it is
# obvious that unlisted columns (e.g. pclass, sibsp, parch) never reach the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_preproc, numeric_features),
        ('cat', cat_preproc, categorical_features),
    ],
    remainder='drop',
)

# Full model: preprocessing followed by a seeded random forest. The step name
# 'classifier' is what hyper-parameter keys of the form 'classifier__<param>'
# refer to.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [None]:
# Shuffled, stratified 5-fold splitter; the fixed seed keeps folds reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Random-forest settings to search. The 'classifier__' prefix addresses the
# 'classifier' step of the pipeline. 2 * 3 * 2 = 12 candidate combinations.
param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [3, 5, None],
    'classifier__max_features': ['sqrt', 'log2'],
}

# Exhaustive search scored by accuracy; preprocessing is re-fit inside every
# fold because the whole pipeline is the estimator being cross-validated.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
)
grid_search.fit(X_train, y_train)

In [None]:
# Predict on the held-out split using the best pipeline found by the search.
y_pred = grid_search.predict(X_test)

# Report the winning hyper-parameters and the held-out accuracy.
print("Best parameters used:", grid_search.best_params_)
# The label ends with a colon (a stray semicolon was printed previously).
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Best parameters used: {'classifier__max_depth': 5, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 100}
Accuracy; 0.8156424581005587


In [None]:
# Persist the best pipeline found by the search (preprocessing + classifier)
# to disk. The call's return value is displayed as the cell output.
joblib.dump(
    grid_search.best_estimator_,
    'hptuningtask.pkl',
)

['hptuningtask.pkl']