In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Get train/test dfs

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# Separate X,y from train dataset:
tot_cols = train_df.columns
y_train_all = train_df.loc[:,'Survived']

# I consider only these as useful cols:
X_cols = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']
# I don't consider Name (probably useless), Ticket/Fare (same cleaner info is in Pclass), 
# 'Cabin' (could be interesting - maybe cabin number is related to lacation on boat - 
# but almost 80% of entries are nans.
# I decide to keep 'Embarked' as it maight be realted to cabin location/vicinity with escape exits
X_train_all = train_df.loc[:,X_cols]
X_test_all = test_df.loc[:,X_cols]

# Set PassengerId as index:
X_train_all = X_train_all.set_index('PassengerId')
X_test_all = X_test_all.set_index('PassengerId')

# In order to explore model accuracy before submitting to Kaggle competition, 
# I want to test accuracy on different models. I get 
X_train, X_test, y_train, y_test = train_test_split(X_train_all, y_train_all, train_size=0.8, test_size=0.2)

In [5]:
# Which cols have nans?

print('X_train:')
for col in X_train.columns:
    print(col, X_train[col].isnull().sum())

print('\nX_test:')
for col in X_test.columns:
    print(col, X_test[col].isnull().sum())

X_train:
Pclass 0
Sex 0
Age 137
SibSp 0
Parch 0
Embarked 2

X_test:
Pclass 0
Sex 0
Age 40
SibSp 0
Parch 0
Embarked 0


In [6]:
# Age column has some nans:
# print('Nans in Age / total: ', X_train['Age'].isna().sum()/len(X_train['Age']))
# Around 20% entries. But I expect other columns (as Pclass) to be higly informative, 
# so I'd rather not throw away any line.
# I substitute nans in Age column them with the mean:
X_train.Age = X_train.Age.fillna(X_train.Age.mean())
X_test.Age = X_test.Age.fillna(X_test.Age.mean())

# Embarked columns has 2 Nan. I fill them with most common entries:
X_train.Embarked = X_train.Embarked.fillna(X_train.Embarked.mode()[0])
X_test.Embarked = X_test.Embarked.fillna(X_test.Embarked.mode()[0])

# Sex and Age are cathegorical data. Let's switch them to numerical
# Since they are simple and few, I choose the Ordinal Encoding:
object_cols = ['Sex', 'Embarked']
ordinal_encoder = OrdinalEncoder()
X_train.loc[:,object_cols] = ordinal_encoder.fit_transform(X_train.loc[:,object_cols])
X_test.loc[:,object_cols] = ordinal_encoder.transform(X_test.loc[:,object_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [7]:
X_test

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
306,1,1.0,0.920000,1,2,2.0
450,1,1.0,52.000000,0,0,2.0
719,3,1.0,29.756619,0,0,1.0
501,3,1.0,17.000000,0,0,2.0
120,3,0.0,2.000000,4,2,2.0
...,...,...,...,...,...,...
52,3,1.0,21.000000,0,0,2.0
651,3,1.0,29.756619,0,0,2.0
60,3,1.0,11.000000,5,2,2.0
655,3,0.0,18.000000,0,0,1.0


# Testing different models:

Let's try SVM:

In [8]:
# Try different values for C parameter (regularization)
C_vals = [0.01, 0.1, 1.0, 10., 100.]
for C_val in C_vals:
    svm = SVC(kernel = 'linear', C=C_val).fit(X_train, y_train)
    print('C = ', C_val, ' --> accuracy = ', svm.score(X_test, y_test))
    
# Looks like a default C=1 value is ok.

C =  0.01  --> accuracy =  0.7877094972067039
C =  0.1  --> accuracy =  0.7597765363128491
C =  1.0  --> accuracy =  0.7597765363128491
C =  10.0  --> accuracy =  0.7597765363128491
C =  100.0  --> accuracy =  0.7597765363128491


In [9]:
# Let's try kernelized SVM:

# Try different values for gamma parameter (rbf Kernel width)
gamma_vals = [0.001, 0.01, 1.0, 10.0]
for gamma_val in gamma_vals:
    clf = SVC(kernel = 'rbf', gamma=gamma_val).fit(X_train, y_train)
    print('gamma = {:.2f} -- > accuracy = {:.2f}'.format(gamma_val, clf.score(X_test, y_test)))

gamma = 0.00 -- > accuracy = 0.61
gamma = 0.01 -- > accuracy = 0.78
gamma = 1.00 -- > accuracy = 0.78
gamma = 10.00 -- > accuracy = 0.70


In [10]:
# Let's tune C and gamma at the same time:
C_vals = [0.01, 0.1, 1.0, 10., 100.]
gamma_vals = [0.001, 0.01, 1.0, 10.0]
for C_val in C_vals:
    for gamma_val in gamma_vals:
        clf = SVC(kernel = 'rbf', C=C_val, gamma=gamma_val).fit(X_train, y_train)
        print('C = {:.2f}  gamma = {:.2f} -- > accuracy = {:.4f}'.format(C_val, gamma_val, clf.score(X_test, y_test)))
        
#Best combination seems: C = 10.00  gamma = 0.01 -- > accuracy = 0.8156

C = 0.01  gamma = 0.00 -- > accuracy = 0.6201
C = 0.01  gamma = 0.01 -- > accuracy = 0.6201
C = 0.01  gamma = 1.00 -- > accuracy = 0.6201
C = 0.01  gamma = 10.00 -- > accuracy = 0.6201
C = 0.10  gamma = 0.00 -- > accuracy = 0.6201
C = 0.10  gamma = 0.01 -- > accuracy = 0.6145
C = 0.10  gamma = 1.00 -- > accuracy = 0.6201
C = 0.10  gamma = 10.00 -- > accuracy = 0.6201
C = 1.00  gamma = 0.00 -- > accuracy = 0.6145
C = 1.00  gamma = 0.01 -- > accuracy = 0.7765
C = 1.00  gamma = 1.00 -- > accuracy = 0.7821
C = 1.00  gamma = 10.00 -- > accuracy = 0.6983
C = 10.00  gamma = 0.00 -- > accuracy = 0.7933
C = 10.00  gamma = 0.01 -- > accuracy = 0.7989
C = 10.00  gamma = 1.00 -- > accuracy = 0.7709
C = 10.00  gamma = 10.00 -- > accuracy = 0.6983
C = 100.00  gamma = 0.00 -- > accuracy = 0.7989
C = 100.00  gamma = 0.01 -- > accuracy = 0.7877
C = 100.00  gamma = 1.00 -- > accuracy = 0.7709
C = 100.00  gamma = 10.00 -- > accuracy = 0.6983


In [11]:
#Same as above cell but applying also MinMax scaler:

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Let's tune C and gamma at the same time:
C_vals = [0.01, 0.1, 1.0, 10., 100.]
gamma_vals = [0.001, 0.01, 1.0, 10.0]
for C_val in C_vals:
    for gamma_val in gamma_vals:
        clf = SVC(kernel = 'rbf', C=C_val, gamma=gamma_val).fit(X_train_scaled, y_train)
        print('C = {:.2f}  gamma = {:.2f} -- > accuracy = {:.4f}'.format(C_val, gamma_val, clf.score(X_test_scaled, y_test)))
        
#You can notice a slight overall improvement, but not on the highest achievable accuracy
# Now best combination seems C = 10.00  gamma = 1.00 -- > accuracy = 0.8156
# Different gamma param but same accuracy

C = 0.01  gamma = 0.00 -- > accuracy = 0.6201
C = 0.01  gamma = 0.01 -- > accuracy = 0.6201
C = 0.01  gamma = 1.00 -- > accuracy = 0.6983
C = 0.01  gamma = 10.00 -- > accuracy = 0.6201
C = 0.10  gamma = 0.00 -- > accuracy = 0.6201
C = 0.10  gamma = 0.01 -- > accuracy = 0.6201
C = 0.10  gamma = 1.00 -- > accuracy = 0.7709
C = 0.10  gamma = 10.00 -- > accuracy = 0.8045
C = 1.00  gamma = 0.00 -- > accuracy = 0.6201
C = 1.00  gamma = 0.01 -- > accuracy = 0.7598
C = 1.00  gamma = 1.00 -- > accuracy = 0.8101
C = 1.00  gamma = 10.00 -- > accuracy = 0.7989
C = 10.00  gamma = 0.00 -- > accuracy = 0.7598
C = 10.00  gamma = 0.01 -- > accuracy = 0.7598
C = 10.00  gamma = 1.00 -- > accuracy = 0.8101
C = 10.00  gamma = 10.00 -- > accuracy = 0.7877
C = 100.00  gamma = 0.00 -- > accuracy = 0.7598
C = 100.00  gamma = 0.01 -- > accuracy = 0.7598
C = 100.00  gamma = 1.00 -- > accuracy = 0.7877
C = 100.00  gamma = 10.00 -- > accuracy = 0.7765


In [12]:
# As above but adding cross-validation (to get more reliable predictions, since the dataset is small):

# Without variables rescaling:
C_vals = [0.01, 0.1, 1.0, 10., 100.]
gamma_vals = [0.001, 0.01, 1.0, 10.0]
print('WITHOUT VARIABLES RESCALING:\n')
for C_val in C_vals:
    for gamma_val in gamma_vals:
        clf = SVC(kernel = 'rbf', C=C_val, gamma=gamma_val)
        cv_scores = cross_val_score(clf, X_train, y_train)
        print('C = {:.2f}  gamma = {:.2f}:'.format(C_val, gamma_val))
        print('Cross-validation scores (3-fold):', cv_scores)
        print('Mean cross-validation score (3-fold): {:.3f}'.format(np.mean(cv_scores)))
        
        
# With variables rescaling:
C_vals = [0.01, 0.1, 1.0, 10., 100.]
gamma_vals = [0.001, 0.01, 1.0, 10.0]
print('\n\nWITH VARIABLES RESCALING:\n')
for C_val in C_vals:
    for gamma_val in gamma_vals:
        clf = SVC(kernel = 'rbf', C=C_val, gamma=gamma_val)
        cv_scores = cross_val_score(clf, X_train_scaled, y_train)
        print('C = {:.2f}  gamma = {:.2f}:'.format(C_val, gamma_val))
        print('Cross-validation scores (3-fold):', cv_scores)
        print('Mean cross-validation score (3-fold): {:.3f}'.format(np.mean(cv_scores)))

WITHOUT VARIABLES RESCALING:

C = 0.01  gamma = 0.00:
Cross-validation scores (3-fold): [0.61538462 0.61538462 0.61267606 0.61267606 0.61971831]
Mean cross-validation score (3-fold): 0.615
C = 0.01  gamma = 0.01:
Cross-validation scores (3-fold): [0.61538462 0.61538462 0.61267606 0.61267606 0.61971831]
Mean cross-validation score (3-fold): 0.615
C = 0.01  gamma = 1.00:
Cross-validation scores (3-fold): [0.61538462 0.61538462 0.61267606 0.61267606 0.61971831]
Mean cross-validation score (3-fold): 0.615
C = 0.01  gamma = 10.00:
Cross-validation scores (3-fold): [0.61538462 0.61538462 0.61267606 0.61267606 0.61971831]
Mean cross-validation score (3-fold): 0.615
C = 0.10  gamma = 0.00:
Cross-validation scores (3-fold): [0.61538462 0.61538462 0.61267606 0.61267606 0.61971831]
Mean cross-validation score (3-fold): 0.615
C = 0.10  gamma = 0.01:
Cross-validation scores (3-fold): [0.62937063 0.64335664 0.64788732 0.64788732 0.64788732]
Mean cross-validation score (3-fold): 0.643
C = 0.10  gamma

Let's try random forests:

In [13]:
# Simple example:

clf = RandomForestClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

0.8100558659217877

Let's try gradient boosting:

In [15]:
clf = XGBClassifier().fit(X_train, y_train)
print(clf.score(X_test, y_test))

n_est_vals = [350,500,750]
learning_rate_vals = [0.01, 0.05, 0.1, 0.5, 1.0]

for n_est_val in n_est_vals:
    for learning_rate_val in learning_rate_vals:
        clf = XGBClassifier(n_estimators=n_est_val, learning_rate = learning_rate_val, early_stopping_rounds=5)\
        .fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        print('n_estimators = {:.2f}  learning_rate = {:.2f} --> accuracy = {:.3f}'\
              .format(n_est_val, learning_rate_val ,clf.score(X_test, y_test)))



0.8044692737430168
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


n_estimators = 350.00  learning_rate = 0.01 --> accuracy = 0.821
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


n_estimators = 350.00  learning_rate = 0.05 --> accuracy = 0.821
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoos



n_estimators = 350.00  learning_rate = 0.10 --> accuracy = 0.810
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


n_estimators = 350.00  learning_rate = 0.50 --> accuracy = 0.804
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 350.00  learning_rate = 1.00 --> accuracy = 0.804
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 500.00  learning_rate = 0.01 --> accuracy = 0.838
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 500.00  learning_rate = 0.05 --> accuracy = 0.804
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 500.00  learning_rate = 0.10 --> accuracy = 0.810
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 500.00  learning_rate = 0.50 --> accuracy = 0.804
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 500.00  learning_rate = 1.00 --> accuracy = 0.804
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 750.00  learning_rate = 0.01 --> accuracy = 0.844
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 750.00  learning_rate = 0.05 --> accuracy = 0.804
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 750.00  learning_rate = 0.10 --> accuracy = 0.793
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 750.00  learning_rate = 0.50 --> accuracy = 0.804
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






n_estimators = 750.00  learning_rate = 1.00 --> accuracy = 0.804


In [28]:
# Same as above but with cross-validation:

for n_est_val in n_est_vals:
    for learning_rate_val in learning_rate_vals:
        clf = XGBClassifier(n_estimators=n_est_val, learning_rate = learning_rate_val)#, early_stopping_rounds=5)
        cv_scores = cross_val_score(clf, X_train, y_train)
        print('n_estimators = {:.2f}  learning_rate = {:.2f} --> '\
              .format(n_est_val, learning_rate_val))
        print('Cross-validation scores (3-fold):', cv_scores)
        print('Mean cross-validation score (3-fold): {:.3f}'.format(np.mean(cv_scores)))
        
        
# Best one appears to be n_estimators = 500.00  learning_rate = 0.01 --> 
# Cross-validation scores (3-fold): [0.83216783 0.85314685 0.82394366 0.81690141 0.81690141]
# Mean cross-validation score (3-fold): 0.829











n_estimators = 350.00  learning_rate = 0.01 --> 
Cross-validation scores (3-fold): [0.83216783 0.78321678 0.79577465 0.8028169  0.79577465]
Mean cross-validation score (3-fold): 0.802








n_estimators = 350.00  learning_rate = 0.05 --> 
Cross-validation scores (3-fold): [0.81818182 0.76223776 0.80985915 0.80985915 0.81690141]
Mean cross-validation score (3-fold): 0.803








n_estimators = 350.00  learning_rate = 0.10 --> 
Cross-validation scores (3-fold): [0.7972028  0.74825175 0.78169014 0.80985915 0.80985915]
Mean cross-validation score (3-fold): 0.789








n_estimators = 350.00  learning_rate = 0.50 --> 
Cross-validation scores (3-fold): [0.7972028  0.76223776 0.78873239 0.79577465 0.81690141]
Mean cross-validation score (3-fold): 0.792
















n_estimators = 350.00  learning_rate = 1.00 --> 
Cross-validation scores (3-fold): [0.81118881 0.75524476 0.81690141 0.78169014 0.82394366]
Mean cross-validation score (3-fold): 0.798
















n_estimators = 500.00  learning_rate = 0.01 --> 
Cross-validation scores (3-fold): [0.83916084 0.78321678 0.78873239 0.8028169  0.79577465]
Mean cross-validation score (3-fold): 0.802
















n_estimators = 500.00  learning_rate = 0.05 --> 
Cross-validation scores (3-fold): [0.81118881 0.74825175 0.79577465 0.80985915 0.80985915]
Mean cross-validation score (3-fold): 0.795
















n_estimators = 500.00  learning_rate = 0.10 --> 
Cross-validation scores (3-fold): [0.78321678 0.76223776 0.78169014 0.8028169  0.80985915]
Mean cross-validation score (3-fold): 0.788
















n_estimators = 500.00  learning_rate = 0.50 --> 
Cross-validation scores (3-fold): [0.78321678 0.76223776 0.78873239 0.79577465 0.81690141]
Mean cross-validation score (3-fold): 0.789
















n_estimators = 500.00  learning_rate = 1.00 --> 
Cross-validation scores (3-fold): [0.81818182 0.75524476 0.79577465 0.78873239 0.82394366]
Mean cross-validation score (3-fold): 0.796
















n_estimators = 750.00  learning_rate = 0.01 --> 
Cross-validation scores (3-fold): [0.83216783 0.76223776 0.78169014 0.80985915 0.8028169 ]
Mean cross-validation score (3-fold): 0.798
















n_estimators = 750.00  learning_rate = 0.05 --> 
Cross-validation scores (3-fold): [0.8041958  0.75524476 0.77464789 0.80985915 0.80985915]
Mean cross-validation score (3-fold): 0.791
















n_estimators = 750.00  learning_rate = 0.10 --> 
Cross-validation scores (3-fold): [0.77622378 0.75524476 0.78873239 0.78873239 0.81690141]
Mean cross-validation score (3-fold): 0.785
















n_estimators = 750.00  learning_rate = 0.50 --> 
Cross-validation scores (3-fold): [0.78321678 0.76223776 0.80985915 0.79577465 0.80985915]
Mean cross-validation score (3-fold): 0.792
















n_estimators = 750.00  learning_rate = 1.00 --> 
Cross-validation scores (3-fold): [0.81118881 0.76223776 0.78873239 0.78169014 0.82394366]
Mean cross-validation score (3-fold): 0.794


In [30]:
# Same as above but with cross-validation + rescaled variables:

for n_est_val in n_est_vals:
    for learning_rate_val in learning_rate_vals:
        clf = XGBClassifier(n_estimators=n_est_val, learning_rate = learning_rate_val)#, early_stopping_rounds=5)
        cv_scores = cross_val_score(clf, X_train_scaled, y_train)
        print('n_estimators = {:.2f}  learning_rate = {:.2f} --> '\
              .format(n_est_val, learning_rate_val))
        print('Cross-validation scores (3-fold):', cv_scores)
        print('Mean cross-validation score (3-fold): {:.3f}'.format(np.mean(cv_scores)))
        
# Basically no difference compared to before

















n_estimators = 350.00  learning_rate = 0.01 --> 
Cross-validation scores (3-fold): [0.83216783 0.78321678 0.79577465 0.8028169  0.79577465]
Mean cross-validation score (3-fold): 0.802
















n_estimators = 350.00  learning_rate = 0.05 --> 
Cross-validation scores (3-fold): [0.81818182 0.76223776 0.80985915 0.80985915 0.81690141]
Mean cross-validation score (3-fold): 0.803
















n_estimators = 350.00  learning_rate = 0.10 --> 
Cross-validation scores (3-fold): [0.7972028  0.74825175 0.78169014 0.80985915 0.80985915]
Mean cross-validation score (3-fold): 0.789
















n_estimators = 350.00  learning_rate = 0.50 --> 
Cross-validation scores (3-fold): [0.7972028  0.76223776 0.78873239 0.79577465 0.81690141]
Mean cross-validation score (3-fold): 0.792
















n_estimators = 350.00  learning_rate = 1.00 --> 
Cross-validation scores (3-fold): [0.81118881 0.75524476 0.81690141 0.78169014 0.82394366]
Mean cross-validation score (3-fold): 0.798
















n_estimators = 500.00  learning_rate = 0.01 --> 
Cross-validation scores (3-fold): [0.83916084 0.78321678 0.78873239 0.8028169  0.79577465]
Mean cross-validation score (3-fold): 0.802
















n_estimators = 500.00  learning_rate = 0.05 --> 
Cross-validation scores (3-fold): [0.81118881 0.74825175 0.79577465 0.80985915 0.80985915]
Mean cross-validation score (3-fold): 0.795
















n_estimators = 500.00  learning_rate = 0.10 --> 
Cross-validation scores (3-fold): [0.78321678 0.76223776 0.78169014 0.8028169  0.80985915]
Mean cross-validation score (3-fold): 0.788
















n_estimators = 500.00  learning_rate = 0.50 --> 
Cross-validation scores (3-fold): [0.78321678 0.76223776 0.78873239 0.79577465 0.81690141]
Mean cross-validation score (3-fold): 0.789
















n_estimators = 500.00  learning_rate = 1.00 --> 
Cross-validation scores (3-fold): [0.81818182 0.75524476 0.79577465 0.78873239 0.82394366]
Mean cross-validation score (3-fold): 0.796
















n_estimators = 750.00  learning_rate = 0.01 --> 
Cross-validation scores (3-fold): [0.83216783 0.76223776 0.78169014 0.80985915 0.8028169 ]
Mean cross-validation score (3-fold): 0.798
















n_estimators = 750.00  learning_rate = 0.05 --> 
Cross-validation scores (3-fold): [0.8041958  0.75524476 0.77464789 0.80985915 0.80985915]
Mean cross-validation score (3-fold): 0.791
















n_estimators = 750.00  learning_rate = 0.10 --> 
Cross-validation scores (3-fold): [0.77622378 0.75524476 0.78873239 0.78873239 0.81690141]
Mean cross-validation score (3-fold): 0.785
















n_estimators = 750.00  learning_rate = 0.50 --> 
Cross-validation scores (3-fold): [0.78321678 0.76223776 0.80985915 0.79577465 0.80985915]
Mean cross-validation score (3-fold): 0.792
















n_estimators = 750.00  learning_rate = 1.00 --> 
Cross-validation scores (3-fold): [0.81118881 0.76223776 0.78873239 0.78169014 0.82394366]
Mean cross-validation score (3-fold): 0.794


# Final model and prediction:
(choosing best model from above tests)

In [31]:
# Train model over the full original training dataset

train_df = train_df.loc[:,X_cols]
test_df = test_df.loc[:,X_cols]
#Set PassengerId as index:
X_train = X_train_all
X_test = X_test_all
y_train = y_train_all

# Remove Nan:
X_train.Age = X_train.Age.fillna(X_train.Age.mean())
X_test.Age = X_test.Age.fillna(X_test.Age.mean())
# Embarked columns has 2 Nan. I fill them with most common entries:
X_train.Embarked = X_train.Embarked.fillna(X_train.Embarked.mode()[0])
X_test.Embarked = X_test.Embarked.fillna(X_test.Embarked.mode()[0])

# Sex and Age are cathegorical data. Let's switch them to numerical
object_cols = ['Sex', 'Embarked']
ordinal_encoder = OrdinalEncoder()
X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])

# Chosen model is GradientBoosting with n_estimators = 500.00  learning_rate = 0.01
# (but also SVM with tuned params would give similar accuracy on test data)
clf = XGBClassifier(n_estimators=500, learning_rate = 0.01)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)





In [32]:
output = pd.DataFrame({'PassengerId': X_test.index,
                       'Survived': predictions})
output.to_csv('submission.csv', index=False)

This predicted output scored an accuracy of almost 80% after submission to Kaggle competition.