In [10]:
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import (GridSearchCV,
                                     StratifiedKFold)

from sklearn.ensemble import (RandomForestClassifier, 
                              AdaBoostClassifier, 
                              GradientBoostingClassifier, 
                              ExtraTreesClassifier, 
                              VotingClassifier)

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# Shared random_state passed to every estimator created below, so repeated runs match.
SEED = 17

In [12]:
# Read the labelled (train) and unlabelled (test) passenger tables.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
# Kept aside because the PassengerId column is dropped from the feature frame later
# but is still needed when the submission file is written.
PassengerId = test.PassengerId
# The first `train_len` rows of `df` are the training rows; used to split it back.
train_len = len(train)
# Stack both tables so feature engineering is applied to them uniformly.
# ignore_index=True gives every row a unique label: without it both tables start
# at 0, and label-based calls such as DataFrame.drop(label) would hit a train row
# and a test row at the same time.
df = pd.concat([train, test], ignore_index=True)

train.head(3)

In [13]:
# Summary of the combined train+test frame (columns, dtypes, non-null counts).
df.info()

In [14]:
# Fill gaps in Age and Fare across the combined frame, using medians computed
# from the training rows only.
train_medians = train[['Age', 'Fare']].median()
for column in ('Age', 'Fare'):
    df[column] = df[column].fillna(train_medians[column])

In [15]:
# Overlay the Age density of training passengers with Survived == 0 (red)
# and Survived == 1 (blue) on a single axes.
g = None
for survived_flag, colour in ((0, "Red"), (1, "Blue")):
    g = sns.kdeplot(train.loc[train["Survived"] == survived_flag, "Age"],
                    ax=g, color=colour, shade=True)
g.set_xlabel("Age")
g.set_ylabel("Frequency")
g = g.legend(["Not Survived", "Survived"])

In [16]:
# Overlay the Fare density of training passengers with Survived == 0 (red)
# and Survived == 1 (blue) on a single axes.
g = None
for survived_flag, colour in ((0, "Red"), (1, "Blue")):
    g = sns.kdeplot(train.loc[train["Survived"] == survived_flag, "Fare"],
                    ax=g, color=colour, shade=True)
g.set_xlabel("Fare")
g.set_ylabel("Frequency")
g = g.legend(["Not Survived", "Survived"])

In [17]:
# Age bucketed into 15-wide bands, each labelled by its lower edge (0, 15, 30, ...).
df['Age_cat'] = df['Age'].floordiv(15).mul(15)
# Fare cut into five quantile-based intervals over the combined frame.
df['Fare_cat'] = pd.qcut(df['Fare'], q=5)

In [18]:
# Family size is the sum of the SibSp and Parch counts.
df['Family_size'] = df['SibSp'].add(df['Parch'])

# Bar chart of Survived against Family_size.
g = sns.factorplot(data=df, x="Family_size", y="Survived", kind="bar", size=6)
g = g.despine(left=True).set_ylabels("Survival probability")

In [19]:
# Extract the title from Name: the run of letters that follows a space and ends
# with a period. The raw string keeps the pattern identical while avoiding the
# invalid "\." escape in a normal literal; expand=False makes extract return a
# Series regardless of the pandas version's default.
df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Count of passengers per extracted title, with tilted tick labels for legibility.
g = sns.countplot(x="Title", data=df)
g = plt.setp(g.get_xticklabels(), rotation=45)

In [20]:
# Collapse title variants into their common form and lump infrequent titles into 'Rare'.
# The result is assigned back instead of using inplace=True on the selected column:
# an in-place replace on `df['Title']` acts on an intermediate object and does not
# update `df` when pandas copy-on-write is active.
# 'Countess' replaces the former 'the Countess': the extraction pattern captures
# letters only, so a value containing a space can never occur in this column.
title_map = {'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss'}
title_map.update(dict.fromkeys(['Don', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col', 'Capt',
                                'Countess', 'Jonkheer', 'Dona'], 'Rare'))
df['Title'] = df['Title'].replace(title_map)

In [21]:
# Surname is the part of Name before the first comma.
df['Surname'] = df['Name'].str.split(',').str[0]

# 0.5 means "nothing known about companions"; overwritten with 1 or 0 below.
df['Relatives_survival'] = 0.5


def _propagate_group_survival(frame, group_keys, unresolved_only=False):
    """Set 'Relatives_survival' from the 'Survived' values of each row's group mates.

    Rows of ``frame`` are grouped by ``group_keys``. In every group with more than
    one row, each row looks at the 'Survived' values of the *other* rows:
    if any is 1 the row's 'Relatives_survival' becomes 1, otherwise if any is 0
    it becomes 0. When all other values are missing the row is left unchanged.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'PassengerId', 'Survived' and 'Relatives_survival';
        modified in place.
    group_keys : str or list of str
        Column(s) defining which rows belong together.
    unresolved_only : bool, default False
        When True, only rows whose current 'Relatives_survival' is 0 or 0.5
        are reconsidered; other rows are skipped.
    """
    # Materialise the groups up front so the writes below cannot interact with
    # the group iteration.
    for _, members in list(frame.groupby(group_keys)):
        if len(members) < 2:
            continue
        for _, member in members.iterrows():
            if unresolved_only and member['Relatives_survival'] not in (0, 0.5):
                continue
            pass_id = member['PassengerId']
            # Exclude the current passenger by id rather than by index label:
            # dropping by label removes every row sharing that label, which is
            # wrong whenever the frame's index contains repeated labels.
            others_survived = members.loc[members['PassengerId'] != pass_id, 'Survived']
            target = frame['PassengerId'] == pass_id
            # max()/min() skip missing values; if every companion is unlabelled
            # both are NaN and neither branch fires.
            if others_survived.max() == 1:
                frame.loc[target, 'Relatives_survival'] = 1
            elif others_survived.min() == 0:
                frame.loc[target, 'Relatives_survival'] = 0


# Pass 1: passengers sharing both surname and fare.
_propagate_group_survival(df, ['Surname', 'Fare'])
# Pass 2: passengers sharing a ticket, only for rows pass 1 left at 0 or 0.5.
_propagate_group_survival(df, 'Ticket', unresolved_only=True)

In [22]:
# Replace each categorical column with integer codes; every column gets its own
# freshly fitted encoder.
# NOTE(review): 'Title' is encoded here and then removed by the drop below — confirm
# whether it was meant to stay as a feature.
for column in ('Age_cat', 'Fare_cat', 'Title', 'Sex'):
    df[column] = LabelEncoder().fit_transform(df[column])

# Remove the columns that are not passed to the models.
unused_columns = ['PassengerId', 'Age', 'Fare', 'Name', 'SibSp', 'Parch',
                  'Ticket', 'Cabin', 'Title', 'Embarked', 'Surname']
df.drop(unused_columns, axis=1, inplace=True)

In [23]:
# Preview the engineered feature frame after encoding and column removal.
df.head(3)

In [24]:
# Split the combined frame back by position: the first train_len rows are the
# training rows, the remainder are the test rows.
train = df.iloc[:train_len]
test = df.iloc[train_len:].drop("Survived", axis=1)
# Target as integers, features as everything except the target.
y_train = train["Survived"].astype(int)
x_train = train.drop("Survived", axis=1)

In [25]:
# Hyper-parameter grids searched by GridSearchCV, one per base model.

# Gradient boosting.
gbc_params = {'loss' : ["deviance"],
              'n_estimators' : [100,200,300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1] }

# Random forest and extra-trees search the same space, so it is defined once.
# max_features previously included 10. NOTE(review): after the column drop above
# x_train appears to hold only about 6 feature columns — confirm. An integer
# max_features larger than the feature count is rejected by the scikit-learn
# releases that still accept loss="deviance", so every candidate using it fails.
# None ("use all features") expresses the same upper end for any column count.
_tree_ensemble_grid = {"max_depth": [None],
                       "max_features": [1, 3, None],
                       "min_samples_split": [2, 3, 10],
                       "min_samples_leaf": [1, 3, 10],
                       "bootstrap": [False],
                       "n_estimators" :[100,300],
                       "criterion": ["gini"]}

rfc_params = dict(_tree_ensemble_grid)

# Support vector classifier (RBF kernel only).
svc_params = {'kernel': ['rbf'], 
              'gamma': [ 0.001, 0.01, 0.1, 1],
              'C': [1, 10, 50, 100,200, 300, 1000]}

ext_params = dict(_tree_ensemble_grid)

# AdaBoost; the "base_estimator__" keys are forwarded to the wrapped base estimator.
ada_params = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "algorithm" : ["SAMME","SAMME.R"],
              "n_estimators" :[1,2],
              "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}

# Five stratified folds shared by every grid search.
skf = StratifiedKFold(n_splits=5)

In [26]:
# Untuned base estimators, all seeded with the same random_state.
seeded = {"random_state": SEED}

gbc = GradientBoostingClassifier(**seeded)
rfc = RandomForestClassifier(**seeded)
# probability=True so the SVC can take part in soft voting.
svc = SVC(probability=True, **seeded)
ext = ExtraTreesClassifier(**seeded)
# The decision tree is the estimator that AdaBoost boosts.
dtc = DecisionTreeClassifier(**seeded)
ada = AdaBoostClassifier(dtc, **seeded)

In [27]:
# Pair every base estimator with its search grid, as [estimator, grid] entries.
base_models_params = [
    [estimator, grid]
    for estimator, grid in zip(
        (gbc, rfc, svc, ext, ada),
        (gbc_params, rfc_params, svc_params, ext_params, ada_params),
    )
]

In [28]:
# Tune each base model by accuracy on the shared folds and keep the refitted winner.
best_models = []
for estimator, param_grid in base_models_params:
    gridsearch = GridSearchCV(estimator, param_grid, scoring='accuracy',
                              cv=skf, n_jobs=-1, verbose=1)
    gridsearch.fit(x_train, y_train)
    best_models.append(gridsearch.best_estimator_)

In [29]:
# Label each tuned model with its class name, as VotingClassifier expects
# (name, estimator) pairs.
named_estimators = [(type(model).__name__, model) for model in best_models]

# Two ensembles over the same models: majority vote ('hard') and averaged
# probabilities ('soft'). Each gets its own list object.
voting_clf_hard = VotingClassifier(list(named_estimators),
                                   voting='hard', verbose=1, n_jobs=-1)
voting_clf_soft = VotingClassifier(list(named_estimators),
                                   voting='soft', verbose=1, n_jobs=-1)

# Fit both ensembles on the training data, then predict the test rows.
for ensemble in (voting_clf_hard, voting_clf_soft):
    ensemble.fit(x_train, y_train)

prediction_hard = voting_clf_hard.predict(test)
prediction_soft = voting_clf_soft.predict(test)

In [30]:
def write_to_submission_file(predictions, PassengerID, out_file='submission.csv', 
                             columns=['PassengerID', 'Survived']):
    """Write a two-column submission CSV and return it as a DataFrame.

    Parameters
    ----------
    predictions : sequence of int-like
        Predicted label for each passenger.
    PassengerID : sequence of int-like
        Passenger identifiers, in the same order as ``predictions``.
    out_file : str
        Path of the CSV file to write (no index column).
    columns : list of str
        Header names for the id column and the prediction column, in that order.
        NOTE(review): the competition's sample file appears to spell the id header
        'PassengerId'; the default here is 'PassengerID' — confirm before submitting.

    Returns
    -------
    pandas.DataFrame
        The integer-typed frame that was written.

    Raises
    ------
    ValueError
        If the ids and the predictions differ in length.
    """
    if len(PassengerID) != len(predictions):
        raise ValueError('PassengerID and predictions must have the same length')
    # Use the PassengerID argument; the function previously ignored it and read
    # the notebook-level `PassengerId` variable instead.
    predicted_df = pd.DataFrame(np.array([PassengerID, predictions]).T, columns=columns).astype(int)
    predicted_df.to_csv(out_file, index=False)
    return predicted_df

In [31]:
# Write one submission file per voting strategy.
for predictions, csv_name in ((prediction_hard, 'submission_hard.csv'),
                              (prediction_soft, 'submission_soft.csv')):
    write_to_submission_file(predictions, PassengerId, out_file=csv_name)