In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files the input directory provides.
input_dir_entries = os.listdir("../input")
print(input_dir_entries)

# Any results you write to the current directory are saved as output.

In [None]:
# load data
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Stack train on top of test so the feature engineering sees every passenger.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# The original row labels are kept on purpose (exactly as append did), so the
# label-aligned copies back into train_df / test_df further down keep working.
# NOTE: this means data_df carries duplicate index labels.
data_df = pd.concat([train_df, test_df], sort=False)

In [None]:
# Preview the first rows of the combined train + test frame.
data_df.head()

In [None]:
# create new title column extracted title from name ('Mrs', 'Mr', ...)
# The pattern captures the run of letters immediately before a literal dot.
# A raw string is used because '\.' in a normal literal is an invalid escape
# sequence; expand=False makes the single capture group come back as a Series.
data_df['Title'] = data_df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
data_df.head()

In [None]:
# impute missing value of age
# Replacing rare titles with more common ones
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
data_df['Title'] = data_df['Title'].replace(mapping)

# Median age per title, computed once. Iterating label -> value avoids the
# positional lookup into a string-indexed Series, which depended on the sorted
# unique titles lining up with the groupby order and is deprecated in pandas.
median_age_by_title = data_df.groupby('Title')['Age'].median()

for title, imputing_age in median_age_by_title.items():
    # Only rows of this title whose age is missing are filled.
    needs_age = (data_df['Title'] == title) & data_df['Age'].isnull()
    data_df.loc[needs_age, 'Age'] = imputing_age

In [None]:
# Remove the helper Title column when it is present; do nothing otherwise.
data_df.drop(columns="Title", errors="ignore", inplace=True)
data_df.head()

In [None]:
# Family_Size is the element-wise sum of the Parch and SibSp columns.
data_df['Family_Size'] = data_df['Parch'].add(data_df['SibSp'])
data_df.head()

In [None]:
# imputing fare
# Missing fares are replaced by the mean fare. The result is assigned back to
# the column: calling fillna(..., inplace=True) on a column selection is
# deprecated and leaves data_df untouched under pandas Copy-on-Write.
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].mean())
data_df.head()

In [None]:
# adding family survival

# add Last_Name column: the part of Name before the first comma
data_df['Last_Name'] = data_df['Name'].str.split(',').str[0]

# Passengers without any group information keep this neutral value.
DEFAULT_SURVIVAL_VALUE = 0.5
data_df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

# Passengers sharing both last name and fare are treated as one group.
family_columns = ['Survived', 'PassengerId', 'Last_Name', 'Fare']
for _, grp_df in data_df[family_columns].groupby(['Last_Name', 'Fare']):
    if len(grp_df) == 1:
        continue
    for pass_id in grp_df['PassengerId']:
        # Survival of everyone else in the group. The current passenger is
        # excluded by PassengerId rather than by index label, because data_df
        # has duplicate labels and a label-based drop could remove another
        # group member as well. (Assumes PassengerId is unique - verify.)
        others_survived = grp_df.loc[grp_df['PassengerId'] != pass_id, 'Survived']
        target_rows = data_df['PassengerId'] == pass_id
        if others_survived.max() == 1:
            # at least one other member is known to have survived
            data_df.loc[target_rows, 'Family_Survival'] = 1
        elif others_survived.min() == 0:
            # no known survivor, but at least one known non-survivor
            data_df.loc[target_rows, 'Family_Survival'] = 0

print("Number of passengers with family survival information:",
      data_df.loc[data_df['Family_Survival'] != DEFAULT_SURVIVAL_VALUE].shape[0])

In [None]:
# Preview the combined frame, now including Last_Name and Family_Survival.
data_df.head()

In [None]:
# Second pass: passengers sharing a ticket form a group. Only passengers whose
# Family_Survival is still 0 or 0.5 (as seen when the group was formed) are
# reconsidered.
for _, grp_df in data_df.groupby('Ticket'):
    if len(grp_df) == 1:
        continue
    for _, row in grp_df.iterrows():
        if not (row['Family_Survival'] == 0 or row['Family_Survival'] == 0.5):
            continue
        pass_id = row['PassengerId']
        # Exclude the current passenger by PassengerId, not by index label:
        # data_df has duplicate labels, so a label-based drop could also remove
        # another member of the group. (Assumes PassengerId is unique - verify.)
        others_survived = grp_df.loc[grp_df['PassengerId'] != pass_id, 'Survived']
        target_rows = data_df['PassengerId'] == pass_id
        if others_survived.max() == 1.0:
            data_df.loc[target_rows, 'Family_Survival'] = 1
        elif others_survived.min() == 0.0:
            data_df.loc[target_rows, 'Family_Survival'] = 0

print("Number of passenger with family/group survival information: "
      + str(data_df[data_df['Family_Survival'] != 0.5].shape[0]))

# # Family_Survival in TRAIN_DF and TEST_DF:
# data_df is train rows followed by test rows, so split by position. Copying
# raw values avoids depending on how data_df's index happens to be labelled,
# and len(train_df) replaces the hard-coded row count.
n_train = len(train_df)
train_df['Family_Survival'] = data_df['Family_Survival'].iloc[:n_train].values
test_df['Family_Survival'] = data_df['Family_Survival'].iloc[n_train:].values

In [None]:
# make fare bins

# fill fare missing value with median
# Assigned back to the column: fillna(..., inplace=True) on a column selection
# is deprecated and has no effect under pandas Copy-on-Write.
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].median())

# making bins: five quantile-based intervals
data_df['FareBin'] = pd.qcut(data_df['Fare'], 5)

# encode bins as integer labels
le = LabelEncoder()
data_df['FareBin_Code'] = le.fit_transform(data_df['FareBin'])

# data_df is train rows followed by test rows; copy the codes back by position
# so the result does not depend on data_df's index labels.
n_train = len(train_df)
train_df['FareBin_Code'] = data_df['FareBin_Code'].iloc[:n_train].values
test_df['FareBin_Code'] = data_df['FareBin_Code'].iloc[n_train:].values

# The raw fare is replaced by its bin code in the model frames.
# drop() no longer accepts axis positionally (pandas 2.0), so use columns=.
train_df.drop(columns=['Fare'], inplace=True)
test_df.drop(columns=['Fare'], inplace=True)

In [None]:
# Preview the combined frame, now including FareBin and FareBin_Code.
data_df.head()

In [None]:
# make age bins: four quantile-based intervals
data_df['AgeBin'] = pd.qcut(data_df['Age'], 4)

# encode bins as integer labels
label = LabelEncoder()
data_df['AgeBin_Code'] = label.fit_transform(data_df['AgeBin'])

# data_df is train rows followed by test rows; copy the codes back by position
# so the result does not depend on data_df's index labels.
n_train = len(train_df)
train_df['AgeBin_Code'] = data_df['AgeBin_Code'].iloc[:n_train].values
test_df['AgeBin_Code'] = data_df['AgeBin_Code'].iloc[n_train:].values

# The raw age is replaced by its bin code in the model frames.
# drop() no longer accepts axis positionally (pandas 2.0), so use columns=.
train_df.drop(columns=['Age'], inplace=True)
test_df.drop(columns=['Age'], inplace=True)

In [None]:
# Preview the combined frame, now including AgeBin and AgeBin_Code.
data_df.head()

In [None]:
# map sex to 0,1 value
# Uses the same encoding as train_df / test_df (male -> 0, female -> 1) so the
# combined frame shown here agrees with the frames the models are fitted on.
sex_mapping = {"male": 0, "female": 1}
# Replace on the column and assign back instead of a frame-wide inplace replace.
data_df['Sex'] = data_df['Sex'].replace(sex_mapping)
data_df.head()

In [None]:
# encode sex as 0/1, then drop redundant columns
sex_codes = {'male': 0, 'female': 1}
redundant_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin',
                     'Embarked']

# Assign the result back: replace(..., inplace=True) on a column selection is
# deprecated and leaves the frame unchanged under pandas Copy-on-Write, which
# would hand string labels to the models.
train_df['Sex'] = train_df['Sex'].replace(sex_codes)
test_df['Sex'] = test_df['Sex'].replace(sex_codes)

# Same column list for both frames so train and test keep identical features.
train_df.drop(columns=redundant_columns, inplace=True)
test_df.drop(columns=redundant_columns, inplace=True)

In [None]:
# Preview the training frame after the redundant columns were dropped.
train_df.head()

In [None]:
# Separate the target column from the remaining feature columns.
train_label = train_df['Survived']
train_data = train_df.drop(columns=["Survived"])

In [None]:
# Candidate hyper-parameters for an SVC grid search.
params = {
    "kernel": ['rbf', 'linear', 'poly'],
    "C": [1, 10, 100, 1000],
    "gamma": [0.1, 0.01, 0.001],
}

# SVC with library defaults.
svm = SVC()

# The grid search over `params` is currently disabled:
# grid_search_svc = GridSearchCV(svm, param_grid=params, n_jobs=-1, cv=5)
# grid_search_svc.fit(train_data, train_label)


In [None]:
# grid_search_svc only exists when the GridSearchCV lines in the previous cell
# are uncommented. Guard the lookup so a fresh "Restart & Run All" does not
# stop here with a NameError.
if 'grid_search_svc' in globals():
    print(grid_search_svc)
else:
    print("grid_search_svc is not defined (grid search is disabled)")

In [None]:
# Fit the SVC on the training features and labels.
svm.fit(train_data, train_label)

In [None]:
# Score on the same data the model was fitted on (not a held-out estimate).
svm.score(train_data, train_label)

In [None]:
# tree based model
# A single decision tree, capped at depth 100 and at 100 leaf nodes.
tree_settings = {"max_depth": 100, "max_leaf_nodes": 100}
dt = DecisionTreeClassifier(**tree_settings).fit(train_data, train_label)

# Score on the training data itself.
dt.score(train_data, train_label)

In [None]:
# Random forest with library defaults. The seed is fixed so that refitting
# gives the same forest, and therefore the same submission, on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_data, train_label)

# Score on the training data itself (not a held-out estimate).
rf.score(train_data, train_label)

In [None]:
# PassengerId was dropped from test_df, so read it again from the raw file.
temp = pd.read_csv("../input/test.csv", usecols=['PassengerId'])

# Predict with the random forest and write the two-column submission file.
predicted_survival = rf.predict(test_df)
temp['Survived'] = predicted_survival
temp.to_csv("../working/submission.csv", index=False)