Some of my main findings thus far:
* Started without feature engineering, only performed data cleaning, ended with XGBoost algorithm with Grid Search which gave me roughly 88% training accuracy
* Then followed Sima's excellent feature engineering post, just to get some experience with different techniques for handling different data types, used the same model with Grid Search, and ended with an 83% training accuracy...ended up here that without GridSearch, using the default XGBoost, I was able to get better test set performance (via the submission)
* Next step will be to ensemble the models, using another Kernel

In [135]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from xgboost import XGBClassifier
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [136]:
%matplotlib inline

In [137]:
training = pd.read_csv("train.csv")
testing = pd.read_csv("test.csv")

full_data = [training,testing]

PassengerId = testing['PassengerId']

# Data Preprocessing

In [138]:
for dataset in full_data:
    
    dataset["Pclass"] = dataset["Pclass"].astype("category")
    dataset["Sex"] = dataset["Sex"].astype("category")
    dataset["Embarked"] = dataset["Embarked"].astype("category")

training["Survived"] = training["Survived"].astype("int64")

# Feature Engineering
Most of this section is credited to Sina, for her excellent feature engineering Kernel

In [139]:
#Empty Embarked values, fill with the most common
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

#Creating a Family Size feature that combines number of siblings/spouse and number of children/parents
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    
#IsAlone - based off Family Size to separate those with no family from those with family
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
#Categorize Fare into buckets
training['CategoricalFare'] = pd.qcut(training['Fare'],4)

#To Deal with Age, we can impute values for the 177 missing Ages, then categorize them like the Fare
#Using mean +/- 2*standard deviation as the range of values that the missing values can take
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg-(2*age_std), age_avg+(2*age_std),size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
    
#Then split these ages for the Training set
training['CategoricalAge'] = pd.cut(training['Age'],5)

#Apply the binning for Age and Fare from the training set to the test set
testing.loc[ testing['Fare'] <= 7.91,'CategoricalFare'] = "(-0.001,7.91]"
testing.loc[(testing['Fare'] > 7.91) & (testing['Fare'] <= 14.454), 'CategoricalFare'] = "(7.91,14.454]"
testing.loc[(testing['Fare'] > 14.454) & (testing['Fare'] <= 31.0), 'CategoricalFare'] = "(14.454, 31.0]"
testing.loc[testing['Fare'] > 31.0, 'CategoricalFare'] = "(31.0, 512.329]"

testing.loc[ testing['Age'] <= 16.0,'CategoricalAge'] = "(-0.08, 16.0]"
testing.loc[(testing['Age'] > 16.0) & (testing['Age'] <= 32.0), 'CategoricalAge'] = "(16.0, 32.0]"
testing.loc[(testing['Age'] > 32.0) & (testing['Age'] <= 48.0), 'CategoricalAge'] = "(32.0, 48.0]"
testing.loc[(testing['Age'] > 48.0) & (testing['Age'] <= 64.0), 'CategoricalAge'] = "(48.0, 64.0]"
testing.loc[ testing['Age'] > 64.0,'CategoricalAge'] = "(64.0, 80.0]"

#Final data conversions
for dataset in full_data:
    dataset["FamilySize"] = dataset["FamilySize"].astype("category")
    dataset['IsAlone'] = dataset['IsAlone'].astype('category')
    dataset['CategoricalFare'] = dataset['CategoricalFare'].astype('category')
    dataset['CategoricalAge'] = dataset['CategoricalAge'].astype('category')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



# One-hot encoding
Taking all categorical variables and performing one-hot encoding.  Then, removing one of the categories (can be done via the parameter within get_dummies, but doing it here explicitly

In [6]:
for column in training.select_dtypes(include=['category']).columns:
    dummy_columns = pd.get_dummies(data=training[column],prefix=column)
    training = pd.merge(training, pd.DataFrame(dummy_columns.iloc[:,1:]), left_index=True, right_index=True)

for column in testing.select_dtypes(include=['category']).columns:
    dummy_columns = pd.get_dummies(data=testing[column],prefix=column)
    testing = pd.merge(testing, pd.DataFrame(dummy_columns.iloc[:,1:]), left_index=True, right_index=True)
    
for column in training.select_dtypes(include=['uint8']).columns:
    training[column] = training[column].astype(np.int32)

for column in testing.select_dtypes(include=['uint8']).columns:
    testing[column] = testing[column].astype(np.int32)

# Prep for Modeling - Encoded Features

In [7]:
training= training.drop(['Age','SibSp','Parch','Pclass','Sex','Embarked','FamilySize','IsAlone','CategoricalFare','CategoricalAge','Name','Ticket','Fare','Cabin'],axis=1)
testing= testing.drop(['Age','SibSp','Parch','Pclass','Sex','Embarked','FamilySize','IsAlone','CategoricalFare','CategoricalAge','Name','Ticket','Fare','Cabin'],axis=1)

y_train = training.iloc[:,1]
X_train = training.drop(["Survived","PassengerId"],axis=1)
X_test = testing.drop(["PassengerId"], axis=1)

#Remove [] from variable names:
X_train.columns = [['Pclass_2', 'Pclass_3', 'Sex_male', 'Embarked_Q', 'Embarked_S',
       'FamilySize_2', 'FamilySize_3', 'FamilySize_4', 'FamilySize_5',
       'FamilySize_6', 'FamilySize_7', 'FamilySize_8', 'FamilySize_11',
       'IsAlone_1', 'CategoricalFare_(7.91-14.454)',
       'CategoricalFare_(14.454-31.0)', 'CategoricalFare_(31.0-512.329)',
       'CategoricalAge_(16.0-32.0)', 'CategoricalAge_(32.0-48.0)',
       'CategoricalAge_(48.0-64.0)', 'CategoricalAge_(64.0-80.0)']]
X_test.columns = [['Pclass_2', 'Pclass_3', 'Sex_male', 'Embarked_Q', 'Embarked_S',
       'FamilySize_2', 'FamilySize_3', 'FamilySize_4', 'FamilySize_5',
       'FamilySize_6', 'FamilySize_7', 'FamilySize_8', 'FamilySize_11',
       'IsAlone_1', 'CategoricalFare_(7.91-14.454)',
       'CategoricalFare_(14.454-31.0)', 'CategoricalFare_(31.0-512.329)',
       'CategoricalAge_(16.0-32.0)', 'CategoricalAge_(32.0-48.0)',
       'CategoricalAge_(48.0-64.0)', 'CategoricalAge_(64.0-80.0)']]

# Prep for Modeling - Non-Encoded Features

In [140]:
for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare']         = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare']       = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age']      = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4 ;




training = training.drop(['PassengerId', 'Name', 'Ticket', 'CategoricalFare', 'CategoricalAge', 'Cabin'], axis=1)
testing = testing.drop(['PassengerId', 'Name', 'Ticket', 'CategoricalFare', 'CategoricalAge', 'Cabin'], axis=1) 
y_train = training.iloc[:,0]
X_train = training.drop(["Survived"],axis=1)
X_test = testing

In [141]:
training.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,0,3,1,1,1,0,0,0,2,0
1,1,1,0,2,1,0,3,1,2,0
2,1,3,0,1,0,0,1,0,1,1
3,1,1,0,2,1,0,3,0,2,0
4,0,3,1,2,0,0,1,0,1,1


# Data Modeling

In [142]:
# Some useful parameters which will come in handy later on
ntrain = training.shape[0]
ntest = testing.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

In [143]:
#I do not understand this part, need to gain understanding here in order to really understand the ensemble.  He says "you cannot train the base models on the full training data, generate predictions of the test set, then output these for the second-level training"
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        
        #changed this to fit, I didn't override scikit-learn
        clf.fit(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In order to ensemble, I will need to generate multiple models, will start with the same ones he uses, in addition to my XGBoost above

In [144]:
rf = RandomForestClassifier(n_jobs = -1, 
                            n_estimators = 500,
                            criterion = "entropy",
                            warm_start = True,
                            max_depth = 6,
                            min_samples_leaf = 2,
                            max_features = 'sqrt',
                            verbose=0,
                            random_state = 0)
#rf.fit(X_train, y_train)

et = ExtraTreesClassifier(n_jobs = -1,
                         n_estimators = 500,
                         max_depth = 8,
                         min_samples_leaf = 2,
                         verbose = 0)
#et.fit(X_train,y_train)

ada = AdaBoostClassifier(n_estimators = 500, learning_rate = 0.75)
#ada.fit(X_train,y_train)

gb = GradientBoostingClassifier(n_estimators = 500,
                               max_depth = 5,
                               min_samples_leaf = 2,
                               verbose = 0)
#gb.fit(X_train, y_train)


xg = XGBClassifier(learning_rate=1, 
                   gamma=0.2,
                   max_depth=2, 
                   n_estimators=200,
                   base_score=0.1,
                   min_child_weight=1)

In [145]:
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
x_train = X_train.values # Creates an array of the train data
x_test = X_test.values # Creats an array of the test data

In [146]:

# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
xg_oof_train, xg_oof_test = get_oof(xg, x_train, y_train, x_test) #XG Boost

print("Training is complete")

Training is complete


In [147]:
rf_feature = rf.feature_importances_
et_feature = et.feature_importances_
ada_feature = ada.feature_importances_
gb_feature = gb.feature_importances_
xg_feature = xg.feature_importances_

In [148]:
cols = X_train.columns.values
# Create a dataframe with features
feature_dataframe = pd.DataFrame( {'features': cols,
     'Random Forest feature importances': rf_feature,
     'Extra Trees  feature importances': et_feature,
      'AdaBoost feature importances': ada_feature,
    'Gradient Boost feature importances': gb_feature,
    'XG Boost feature importances': xg_feature
    })

In [149]:
feature_dataframe

Unnamed: 0,AdaBoost feature importances,Extra Trees feature importances,Gradient Boost feature importances,Random Forest feature importances,XG Boost feature importances,features
0,0.044,0.173857,0.14464,0.186106,0.105263,Pclass
1,0.018,0.548245,0.091364,0.392919,0.087719,Sex
2,0.028,0.049631,0.171638,0.073164,0.245614,Age
3,0.258,0.027406,0.071209,0.042813,0.087719,SibSp
4,0.216,0.023102,0.047434,0.033303,0.070175,Parch
5,0.018,0.074748,0.182166,0.126488,0.070175,Fare
6,0.016,0.038717,0.13933,0.039577,0.087719,Embarked
7,0.388,0.042434,0.111802,0.080798,0.245614,FamilySize
8,0.014,0.02186,0.040416,0.024831,0.0,IsAlone


In [150]:
# Create the new column containing the average of values

feature_dataframe['mean'] = feature_dataframe.mean(axis= 1)
feature_dataframe.head()

Unnamed: 0,AdaBoost feature importances,Extra Trees feature importances,Gradient Boost feature importances,Random Forest feature importances,XG Boost feature importances,features,mean
0,0.044,0.173857,0.14464,0.186106,0.105263,Pclass,0.130773
1,0.018,0.548245,0.091364,0.392919,0.087719,Sex,0.22765
2,0.028,0.049631,0.171638,0.073164,0.245614,Age,0.11361
3,0.258,0.027406,0.071209,0.042813,0.087719,SibSp,0.097429
4,0.216,0.023102,0.047434,0.033303,0.070175,Parch,0.078003


In [151]:
y = feature_dataframe['mean'].values
x = feature_dataframe['features'].values
data = [go.Bar(
            x= x,
             y= y,
            width = 0.5,
            marker=dict(
               color = feature_dataframe['mean'].values,
            colorscale='Portland',
            showscale=True,
            reversescale = False
            ),
            opacity=0.6
        )]

layout= go.Layout(
    autosize= True,
    title= 'Barplots of Mean Feature Importance',
    hovermode= 'closest',
#     xaxis= dict(
#         title= 'Pop',
#         ticklen= 5,
#         zeroline= False,
#         gridwidth= 2,
#     ),
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-direct-labels')

In [152]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel(),
     'XGBoost': xg_oof_train.ravel()
    })
base_predictions_train.head()

Unnamed: 0,AdaBoost,ExtraTrees,GradientBoost,RandomForest,XGBoost
0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0
2,1.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0


In [153]:
#Show a heatmap of the correlation between predictions, ideally, we'd have little-to-no correlation
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x=base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Portland',
            showscale=True,
            reversescale = True
    )
]
py.iplot(data, filename='labelled-heatmap')

In [154]:
x_train_step2 = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, xg_oof_train), axis=1)
x_test_step2 = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, xg_oof_test), axis=1)

In [155]:
gbm = XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 n_jobs= -1,
 scale_pos_weight=1).fit(x_train_step2, y_train)
predictions = gbm.predict(x_test_step2)

In [156]:
#Predict the Test set results
y_pred_train = gbm.predict(x_train_step2)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train,y_pred_train)
cm



array([[504,  45],
       [ 94, 248]])

In [157]:

output = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': predictions })
output.to_csv("StackingSubmission.csv", index=False)
