In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory,
# in the order os.walk yields them, and print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Packages

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, MissingIndicator
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

# Load Data

In [None]:
# Read the Titanic train and test CSV files.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

# Preview the first rows of each frame.
for split in (train, test):
    print(split.head())

# Target column, then the feature frames: the target and the
# PassengerId / Name / Ticket columns are removed from the features.
y_train = train['Survived']
X_train = train.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket'])
X_test = test.drop(columns=['PassengerId', 'Name', 'Ticket'])

# EDA and Feature Engineering

In [None]:
# Print the column summary of the raw training frame and of the test features.
for frame in (train, X_test):
    frame.info()

In [None]:
# Combine SibSp and Parch into a single 'Family' column (appended last),
# then drop the two source columns from both feature frames.
X_train = (
    X_train
    .assign(Family=X_train['SibSp'] + X_train['Parch'])
    .drop(columns=['SibSp', 'Parch'])
)
X_test = (
    X_test
    .assign(Family=X_test['SibSp'] + X_test['Parch'])
    .drop(columns=['SibSp', 'Parch'])
)

print(X_train.head())

In [None]:
def Floor(X):
    """Return the first character of every entry in ``X`` as a list.

    Missing entries are treated as the string ``'NAN'`` and therefore
    contribute ``'N'``. The input is not modified.

    Parameters
    ----------
    X : pandas.Series
        Series of strings, possibly containing missing values (the notebook
        passes the ``Cabin`` column).

    Returns
    -------
    list
        One single-character string per entry, in the same order as ``X``.
    """
    # Non-inplace fillna returns a new Series, so the caller's data (and the
    # DataFrame column it may be a view of) is left untouched.
    filled = X.fillna('NAN')
    return [value[0] for value in filled]

In [None]:
def Prefix(X):
    """Extract a normalized title from each name in ``X``.

    The title is the first run of ASCII letters that is preceded by a space
    and followed by a period (e.g. ``'Mr'`` in ``'Braund, Mr. Owen'``).
    ``'Ms'`` and ``'Mlle'`` are mapped to ``'Miss'``, ``'Mme'`` to ``'Mrs'``,
    and every value other than ``'Mr'``, ``'Miss'``, ``'Mrs'`` or ``'Master'``
    (including names where no title was found) becomes ``'Rare'``.

    Parameters
    ----------
    X : pandas.Series
        Series of name strings.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) aligned with ``X``'s index.
    """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence.
    titles = X.str.extract(r' ([A-Za-z]+)\.', expand=True)
    normalized = titles[0].replace({'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Mrs'})
    common = ['Mr', 'Miss', 'Mrs', 'Master']
    # Keep the common titles; everything else (NaN included) is bucketed as 'Rare'.
    titles[0] = normalized.where(normalized.isin(common), 'Rare')
    return titles

# Derive the 'Prefix' feature for each feature frame from the 'Name' column
# of the raw frame it was built from.
for features, raw in ((X_train, train), (X_test, test)):
    features['Prefix'] = Prefix(raw['Name'])


In [None]:
# Replace each Cabin value with the output of Floor (its first character)
# and remove the Embarked column from both feature frames.
X_train = X_train.assign(Cabin=Floor(X_train['Cabin'])).drop(columns=['Embarked'])
X_test = X_test.assign(Cabin=Floor(X_test['Cabin'])).drop(columns=['Embarked'])

print(X_train.head())

# Create Pipeline

In [None]:
# Single-step imputation pipelines for the Fare and Age columns.
# NOTE(review): each imputer is only ever given one column by the
# ColumnTransformer below — confirm IterativeImputer is what is wanted here.
fare = Pipeline([('num', IterativeImputer())])
age = Pipeline([('num', IterativeImputer())])

# One-hot encode Sex / Prefix / Cabin (unknown categories are ignored at
# predict time), impute Age and Fare, and pass all remaining columns through.
# Earlier experiments (KBinsDiscretizer binning of Age/Fare, a MissingIndicator
# on Cabin, imputing + one-hot encoding Embarked) are not part of this pipeline.
ct = ColumnTransformer(
    [
        ('encode', OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Prefix', 'Cabin']),
        ('age', age, ['Age']),
        ('fare', fare, ['Fare']),
    ],
    remainder='passthrough',
)

# Preprocessing followed by the XGBoost classifier.
titanic_pipe = Pipeline([
    ('preprocessor', ct),
    ('model', XGBClassifier(objective='binary:logistic', random_state=0, booster='gbtree')),
])

# Hyper-parameter grid; the 'model__' prefix addresses the 'model' pipeline step.
params = {
    'model__gamma': [1, 2, 3],
    'model__max_depth': [7],
    'model__learning_rate': [0.03, 0.04, 0.05],
}

# 5-fold cross-validated grid search, selecting by accuracy.
model = GridSearchCV(titanic_pipe, params, scoring='accuracy', cv=5)

In [None]:
# Run the grid search, then report the best cross-validated score and the
# parameter combination that achieved it.
model.fit(X_train, y_train)
print(model.best_score_, model.best_params_, sep='\n')

# Permutation importance of the fitted search, evaluated on the training data:
# mean and standard deviation over 10 shuffles.
res = permutation_importance(
    model, X_train, y_train,
    scoring='accuracy', n_repeats=10, random_state=0,
)
print(res.importances_mean, res.importances_std, sep='\n')

In [None]:
# Predict on the test features and write the PassengerId / Survived
# submission file (without the index column).
pred = model.predict(X_test)
output = test[['PassengerId']].assign(Survived=pred)
output.to_csv('my_submission.csv', index=False)