In [1]:
import numpy as np
import pandas as pd
# XGBoost is an optimized distributed gradient boosting library
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin

In [2]:
# Read the training and test sets from CSV files under data/ (paths are
# relative to the notebook's working directory).
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [3]:
# Impute missing values using:
#   the median for numeric columns and
#   the most common value for string (object-dtype) columns
class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for DataFrames that mix numeric and string columns.

    ``fit`` learns one fill value per column and ``transform`` applies them
    with ``DataFrame.fillna``. ``fit_transform`` is not defined here; it is
    expected to come from ``TransformerMixin``.
    """

    def fit(self, X, y=None):
        """Compute the per-column fill values from ``X``.

        Object-dtype columns use their most frequent non-missing value; every
        other column uses its median.

        Parameters:
            X: DataFrame whose columns are to be imputed.
            y: Ignored; accepted for transformer-API compatibility.

        Returns:
            self, with the fill values stored in ``self.fill`` (a Series
            indexed by the column labels of ``X``).
        """
        fill_values = []
        for c in X:
            column = X[c]
            if column.dtype == np.dtype('O'):
                counts = column.value_counts()
                # value_counts() drops NaN, so a column with no observed
                # values yields an empty result; indexing [0] would raise
                # IndexError. Fall back to NaN (i.e. leave it unfilled).
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(column.median())
        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing entries replaced by the fitted values.

        Parameters:
            X: DataFrame to fill; columns are matched to ``self.fill`` by label.
            y: Ignored; accepted for transformer-API compatibility.

        Raises:
            AttributeError: if called before ``fit`` (``self.fill`` is unset).
        """
        return X.fillna(self.fill)

In [4]:
# Columns used as model features; the ones listed in nonnumeric_columns are
# numerically encoded in a later cell.
feature_columns_to_use = ['Pclass','Sex','Age','Fare','Parch']
nonnumeric_columns = ['Sex']

# Stack the train rows on top of the test rows so the imputation statistics
# are computed over both sets. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0. As with append, the
# original row labels are kept (not reset), so index labels repeat.
big_X = pd.concat([train_df[feature_columns_to_use], test_df[feature_columns_to_use]])
big_X_imputed = DataFrameImputer().fit_transform(big_X)

In [5]:
# Replace every non-numeric feature column (currently only 'Sex') with
# integer codes produced by a LabelEncoder.
# Note: the same encoder object is re-fitted for each column, so once the
# loop finishes `le` only holds the classes of the last column processed.
le = LabelEncoder()
for feature in nonnumeric_columns:
    raw_values = big_X_imputed[feature]
    encoded_values = le.fit_transform(raw_values)
    big_X_imputed[feature] = encoded_values

In [6]:
# Split the combined, imputed frame back into train/test sets.
# The first train_df.shape[0] rows are the training rows because the train
# frame was stacked first; .iloc makes the positional slicing explicit (the
# row labels repeat after stacking, so label-based selection would be wrong).
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
n_train = train_df.shape[0]
train_X = big_X_imputed.iloc[:n_train].values
test_X = big_X_imputed.iloc[n_train:].values
train_y = train_df['Survived']

In [7]:
# This is the first xgboost submission using parameters from the tutorial
# gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(train_X, train_y)
# predictions = gbm.predict(test_X)

In [8]:
# Next: generate several submissions, each with differently tuned parameters

In [27]:
# Tune max_depth: train one classifier per depth in the range below (other
# hyper-parameters fixed) and write one submission file per depth.
for i in range(7, 9):
    gbm = xgb.XGBClassifier(max_depth=i, n_estimators=300, learning_rate=0.05)
    gbm.fit(train_X, train_y)
    predictions = gbm.predict(test_X)

    # Kaggle submission: one row per test passenger, id plus predicted label.
    out_path = 'submissions/submission_xgboost_max_depth_{}.csv'.format(i)
    submission = pd.DataFrame({'PassengerId': test_df['PassengerId'],
                               'Survived': predictions})
    submission.to_csv(out_path, index=False)

# Recorded results from earlier runs of this cell (presumably keyed by
# max_depth, in the order the runs were made):
# submissions: 3 (acc ~75%), 6 (acc ~77%), 9 (acc ~74%)
# submissions: 0 (acc ~63%)
# submissions: 1 (acc ~76%), 2 (acc ~73%)
# submissions: 4 (acc ~75%), 5 (acc ~77%), 7 (acc ~77%), 8 (acc ~76%)

In [10]:
# Tune n_estimators (in steps of 100) with max_depth held at 6.
# The previous version passed max_depth=i, so the tree depth varied together
# with the number of estimators even though the output file name says
# "max_d_6"; the depth is now actually fixed at 6. The file name is also
# tagged with the real n_estimators value (i*100) rather than the loop
# counter, matching the values listed in the result notes below.
for i in range(40, 41):
    n_estimators = i * 100
    gbm = xgb.XGBClassifier(max_depth=6, n_estimators=n_estimators, learning_rate=0.05).fit(train_X, train_y)
    predictions = gbm.predict(test_X)

    # Kaggle submission: one row per test passenger, id plus predicted label.
    submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
                                'Survived': predictions })
    submission.to_csv('submissions/xgb_max_d_6_n_est_{}.csv'.format(n_estimators), index=False)

# Recorded results from earlier runs (made before max_depth was pinned to 6,
# so they may not be reproduced exactly):
# submissions: 100 (acc ~77%), 200 (acc ~75%), 300 (acc ~75%), 400 (acc ~76%),
#              500 (acc ~77%),  600 (acc ~76%), 700 (acc ~74%), 800 (acc ~73%)

# 4000 (acc ~73%)