In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score 
from xgboost.sklearn import XGBClassifier

In [2]:
data_train = pd.read_csv('../input/train.csv')
data_test = pd.read_csv('../input/test.csv')

In [20]:
def extract_name_title(name_column):
    return name_column.apply(lambda name: re.findall(r'.+, (\w+). .+', name)[0])

def update_nan_age(data, target_index=5, title_index=-2):
    data_name_title_group = data.groupby('NameTitle')
    data_copied = pd.DataFrame.copy(data)
    for v in data_copied.values:
        mean_age = data_name_title_group.get_group(v[title_index]).Age.mean()
        
        if pd.isna(v[target_index]) or pd.isnull(v)[target_index]:
            data_copied.loc[data_copied.PassengerId == v[0], 'Age'] = 0 if pd.isna(mean_age) else mean_age
    return data_copied

def convert_sex(sex_column):
    sex_map = {'male': 0, 'female': 1}
    return sex_column.apply(lambda sex: sex_map[sex])

def convert_embarked(embarked_column):
    dic = {'S': 0, 'C': 1, 'Q': 2}
    return embarked_column.apply(lambda embarked: dic.get(embarked, 0))

def update_nan_cabin(data, target_index=10, pclass_index=2):
    data_copied = pd.DataFrame.copy(data)
    cabin_map = {key: index for index, key in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ')}
    data_copied.Cabin = data_copied.Cabin.apply(lambda cabin: cabin if pd.isna(cabin) else cabin_map[cabin[0]])
    pclass_group = data_copied.groupby('Pclass')
    print(pclass_group.get_group(1).Cabin.value_counts().index[0])
    for v in data_copied.values:
        if pd.isna(v[target_index]) or pd.isnull(v)[target_index]:
            data_copied.loc[data_copied.PassengerId == v[0], 'Cabin'] = pclass_group.get_group(v[pclass_index]).Cabin.value_counts().index[0]
    return data_copied

def convert_family_size(data):
    return data.SibSp + data.Parch

def convert_fare(data):
    data_copied = pd.DataFrame.copy(data)
    data_copied.Fare = data_train.Fare.fillna(data_train.Fare.mean())
#     data_copied['FareBand'] = pd.qcut(data_copied['Fare'], 4)
    data_copied.loc[ data_copied['Fare'] <= 8, 'Fare'] = 0
    data_copied.loc[(data_copied['Fare'] > 8) & (data_copied['Fare'] <= 16), 'Fare'] = 1
    data_copied.loc[(data_copied['Fare'] > 16) & (data_copied['Fare'] <= 32), 'Fare']   = 2
    data_copied.loc[ data_copied['Fare'] > 32, 'Fare'] = 3
    data_copied['Fare'] = data_copied['Fare'].astype(int)
#     data_copied = data_copied.drop(['FareBand'], axis=1)
    return data_copied

def convert_age(data):
    data_copied = pd.DataFrame.copy(data)
#     data_copied['AgeBand'] = pd.cut(data_copied['Age'], 6)
    data_copied.Age = data_copied.Age / data_copied.FamilySize
    data_copied.loc[ data_copied['Age'] <= 14, 'Age'] = 0
    data_copied.loc[(data_copied['Age'] > 14) & (data_copied['Age'] <= 28), 'Age'] = 1
    data_copied.loc[(data_copied['Age'] > 28) & (data_copied['Age'] <= 42), 'Age'] = 2
    data_copied.loc[(data_copied['Age'] > 42) & (data_copied['Age'] <= 56), 'Age'] = 3
    data_copied.loc[(data_copied['Age'] > 56) & (data_copied['Age'] <= 70), 'Age'] = 4
    data_copied.loc[(data_copied['Age'] > 70)] = 5
    data_copied['Age'] = data_copied['Age'].astype(int)
#     data_copied = data_copied.drop(['AgeBand'], axis=1)
    return data_copied

def drop_unnecessary(data):
    return data.drop(['Ticket', 'Name', 'NameTitle', 'SibSp', 'Parch', 'Embarked'], axis=1)

In [5]:
xgb_model = XGBClassifier(
    learning_rate =0.05,
     n_estimators=235,
     max_depth=5,
     min_child_weight=0.45,
     gamma=1.7,
     subsample=0.6,
     colsample_bytree=0.825,
     objective= 'binary:logistic',
     nthread=8,
     seed=27)

In [21]:
tmp = update_nan_cabin(data_train)

2.0
