In [40]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score 
from xgboost.sklearn import XGBClassifier

In [76]:
data_train = pd.read_csv('../input/train.csv')
data_test = pd.read_csv('../input/test.csv')
target = data_train.Survived

data_train.drop(['Survived'], axis=1, inplace=True)
print(data_train.describe())
print(data_test.describe())
data_all = pd.concat([data_train, data_test])
print(data_all.Name.describe())

       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean    446.000000    2.308642   29.699118    0.523008    0.381594   32.204208
std     257.353842    0.836071   14.526497    1.102743    0.806057   49.693429
min       1.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    2.000000   20.125000    0.000000    0.000000    7.910400
50%     446.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    3.000000   38.000000    1.000000    0.000000   31.000000
max     891.000000    3.000000   80.000000    8.000000    6.000000  512.329200
       PassengerId      Pclass         Age       SibSp       Parch        Fare
count   418.000000  418.000000  332.000000  418.000000  418.000000  417.000000
mean   1100.500000    2.265550   30.272590    0.447368    0.392344   35.627188
std     120.810458    0.841838   14.181209    0.8967

In [120]:

def construct_name_title_age_map(data):
    _data = pd.DataFrame.copy(data)
    _data['NameTitle'] = _data.Name.apply(lambda name: re.findall(r'.+, (\w+). .+', name)[0])
    titles = _data.NameTitle.unique()
    title_group = _data.groupby('NameTitle')
    return {title: title_group.get_group(title).Age.mean().astype(int) for title in titles}

def fix_age_using_name_title(data, title_age_map):
    _data = pd.DataFrame.copy(data)
    _data['NameTitle'] = _data.Name.apply(lambda name: re.findall(r'.+, (\w+). .+', name)[0])
    for title, mage in title_age_map.items():
        _data.loc[(_data.Age.isna().any()) & (_data.NameTitle == title), 'Age'] = mage
    return _data

        
name_title_age_map = construct_name_title_age_map(data_all)
# Following use to fix data's missing Age with mean age of the same title
fix_age_using_name_title(data_train, name_title_age_map)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NameTitle
0,1,3,"Braund, Mr. Owen Harris",male,32.0,1,0,A/5 21171,7.2500,,S,Mr
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,36.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,3,"Heikkinen, Miss. Laina",female,21.0,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,36.0,1,0,113803,53.1000,C123,S,Mrs
4,5,3,"Allen, Mr. William Henry",male,32.0,0,0,373450,8.0500,,S,Mr
5,6,3,"Moran, Mr. James",male,32.0,0,0,330877,8.4583,,Q,Mr
6,7,1,"McCarthy, Mr. Timothy J",male,32.0,0,0,17463,51.8625,E46,S,Mr
7,8,3,"Palsson, Master. Gosta Leonard",male,5.0,3,1,349909,21.0750,,S,Master
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,36.0,0,2,347742,11.1333,,S,Mrs
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,36.0,1,0,237736,30.0708,,C,Mrs


In [None]:
def convert_sex(sex_column):
    """
    Convert Sex to 
    """
    sex_map = {'male': 0, 'female': 1}
    return sex_column.apply(lambda sex: sex_map[sex])

def convert_embarked(embarked_column):
    """
    Convert Embarked
    """
    dic = {'S': 0, 'C': 1, 'Q': 2}
    return embarked_column.apply(lambda embarked: dic.get(embarked, 0))

def construct_family_size(data):
    """
    Convert FamilySize
    """
    return data.SibSp + data.Parch

def convert_age(data):
    """
    Convert Age
    """
    data_copied = pd.DataFrame.copy(data)
#     data_copied['AgeBand'] = pd.cut(data_copied['Age'], 6)
# Here should use in price
    data_copied.Age = data_copied.Age / data_copied.FamilySize
    data_copied.loc[ data_copied['Age'] <= 14, 'Age'] = 0
    data_copied.loc[(data_copied['Age'] > 14) & (data_copied['Age'] <= 28), 'Age'] = 1
    data_copied.loc[(data_copied['Age'] > 28) & (data_copied['Age'] <= 42), 'Age'] = 2
    data_copied.loc[(data_copied['Age'] > 42) & (data_copied['Age'] <= 56), 'Age'] = 3
    data_copied.loc[(data_copied['Age'] > 56) & (data_copied['Age'] <= 70), 'Age'] = 4
    data_copied.loc[(data_copied['Age'] > 70)] = 5
    data_copied['Age'] = data_copied['Age'].astype(int)
#     data_copied = data_copied.drop(['AgeBand'], axis=1)
    return data_copied

def update_nan_age(data, target_index=5, title_index=-2):
    data_name_title_group = data.groupby('NameTitle')
    data_copied = pd.DataFrame.copy(data)
    for v in data_copied.values:
        mean_age = data_name_title_group.get_group(v[title_index]).Age.mean()
        
        if pd.isna(v[target_index]) or pd.isnull(v)[target_index]:
            data_copied.loc[data_copied.PassengerId == v[0], 'Age'] = 0 if pd.isna(mean_age) else mean_age
    return data_copied



def update_nan_cabin(data, target_index=10, pclass_index=2):
    data_copied = pd.DataFrame.copy(data)
    cabin_map = {key: index for index, key in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ')}
    data_copied.Cabin = data_copied.Cabin.apply(lambda cabin: cabin if pd.isna(cabin) else cabin_map[cabin[0]])
    pclass_group = data_copied.groupby('Pclass')
    print(pclass_group.get_group(1).Cabin.value_counts().index[0])
    for v in data_copied.values:
        if pd.isna(v[target_index]) or pd.isnull(v)[target_index]:
            data_copied.loc[data_copied.PassengerId == v[0], 'Cabin'] = pclass_group.get_group(v[pclass_index]).Cabin.value_counts().index[0]
    return data_copied



def drop_unnecessary(data):
    return data.drop(['Ticket', 'Name', 'NameTitle', 'SibSp', 'Parch', 'Embarked'], axis=1)

2.0


In [None]:
xgb_model = XGBClassifier(
    learning_rate =0.05,
     n_estimators=235,
     max_depth=5,
     min_child_weight=0.45,
     gamma=1.7,
     subsample=0.6,
     colsample_bytree=0.825,
     objective= 'binary:logistic',
     nthread=8,
     seed=27)