In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
data_train = pd.read_csv("../input/titanic/train.csv", index_col ='PassengerId')

In [3]:
(data_train.isnull().sum()*100)/data_train.shape[0]

Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
Cabin       77.104377
Embarked     0.224467
dtype: float64

In [4]:
data_train = data_train.drop(["Name", "Ticket", "Cabin"], axis=1)

In [5]:
X = data_train.drop("Survived", axis=1)
y = data_train["Survived"]

In [6]:
numerical = [col for col in X.select_dtypes(exclude='object')]
categorial = [col for col in X.select_dtypes(include='object')]

In [7]:
numerical_pipeline = Pipeline(steps=[
                    ('impute', SimpleImputer(strategy='constant'))
])

In [8]:
categorial_pipeline = Pipeline(steps = [
                                ("impute_cat", SimpleImputer(strategy="most_frequent")),
                                ("encode",OneHotEncoder(handle_unknown='ignore') )
])

In [9]:
preprocessing = ColumnTransformer(transformers=[
                            ("num", numerical_pipeline, numerical),
                            ("cat", categorial_pipeline, categorial)
])

In [10]:
model_2 = GradientBoostingClassifier() 
pipeline_2 = Pipeline(steps = [
                            ("preprocessing", preprocessing),
                            ("model",model_2)
])

In [11]:
param = {
        'model__n_estimators': np.arange(50, 1000, 100)
}

In [12]:
grid = GridSearchCV(pipeline_2, param_grid = param, cv=5)

In [13]:
grid.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer(strategy='constant'))]),
                                                                         ['Pclass',
                                                                          'Age',
                                                                          'SibSp',
                                                                          'Parch',
                                                                          'Fare']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('impute_cat',
                

In [14]:
grid.best_score_

0.8305442219571905

In [15]:
grid.best_estimator_

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='constant'))]),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('impute_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encode',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Sex', 'Embarked'])])),
                ('model', GradientBoostingClassifier(n_estimators=450)

In [16]:
data_test = pd.read_csv("../input/titanic/test.csv", index_col ="PassengerId")

In [17]:
Survived = grid.predict(data_test)

In [18]:
data = pd.DataFrame({"PassengerId":data_test.index, "Survived": Survived}) 

In [19]:
data.to_csv("submission.csv", index = False)