In [1]:
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titaniccleaningfeatureengineering/train_withFeatures.csv
/kaggle/input/titaniccleaningfeatureengineering/__results__.html
/kaggle/input/titaniccleaningfeatureengineering/__output__.json
/kaggle/input/titaniccleaningfeatureengineering/custom.css
/kaggle/input/titaniccleaningfeatureengineering/__notebook__.ipynb
/kaggle/input/titaniccleaningfeatureengineering/cleaned_test.csv
/kaggle/input/titaniccleaningfeatureengineering/cleaned_train.csv
/kaggle/input/titaniccleaningfeatureengineering/test_withFeatures.csv
/kaggle/input/titaniccleaningfeatureengineering/__results___files/__results___16_1.png
/kaggle/input/titaniccleaningfeatureengineering/__results___files/__results___14_3.png
/kaggle/input/titaniccleaningfeatureengineering/__results___files/__results___21_0.png


In [2]:
train = pd.read_csv('../input/titaniccleaningfeatureengineering/cleaned_train.csv')
test = pd.read_csv('../input/titaniccleaningfeatureengineering/cleaned_test.csv')
submission = pd.read_csv('../input/titanic/gender_submission.csv')

X = train[train.columns[1:]]
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Voting Classifier

In [3]:
ensemble_lin_rbf = VotingClassifier(estimators = 
                                    [('KNN', KNeighborsClassifier(n_neighbors = 5)),
                                    ('RBF', SVC(probability = True, kernel = 'rbf',C = 0.5, gamma = 0.1)),
                                    ('svm', SVC(kernel = 'linear', probability = True)),
                                    ('RFor', RandomForestClassifier(n_estimators = 500, random_state = 0)),
                                    ('LR', LogisticRegression(C = 0.05)),
                                    ('DT', DecisionTreeClassifier(random_state = 0)),
                                    ('NB', GaussianNB())], 
                                   voting = 'soft').fit(X_train, y_train)
print('The accuracy for ensembled model (voting classifier) is:', ensemble_lin_rbf.score(X_test, y_test))
    
cross = cross_val_score(ensemble_lin_rbf, X, y, cv = 10, scoring = "accuracy")
print('The cross validated score is', cross.mean())

The accuracy for ensembled model (voting classifier) is: 0.8156424581005587
The cross validated score is 0.8003583588695948


In [4]:
submission['Survived'] = ensemble_lin_rbf.predict(test)
submission.to_csv("submission_votingclassifier.csv", index = False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


## Bagging

In [5]:
# bagged KNN
knn = BaggingClassifier(base_estimator = KNeighborsClassifier(n_neighbors = 5), random_state = 0, n_estimators = 700)
knn.fit(X_train, y_train)
prediction = knn.predict(X_test)
print('The accuracy for bagged KNN is:', metrics.accuracy_score(prediction, y_test))

result = cross_val_score(knn, X, y, cv = 10, scoring = 'accuracy')
print('The cross validated score for bagged KNN is:', result.mean())

The accuracy for bagged KNN is: 0.7262569832402235
The cross validated score for bagged KNN is: 0.7196461809102258


In [6]:
submission['Survived'] = knn.predict(test)
submission.to_csv("submission_baggedKNN.csv", index = False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [7]:
# II.2 bagged decision tree
dtc = BaggingClassifier(base_estimator = DecisionTreeClassifier(), random_state = 0, n_estimators = 100)
dtc.fit(X_train, y_train)
prediction = dtc.predict(X_test)
print('The accuracy for bagged Decision Tree is:', metrics.accuracy_score(prediction, y_test))

result = cross_val_score(dtc, X, y, cv = 10, scoring = 'accuracy')
print('The cross validated score for bagged Decision Tree is:', result.mean())

The accuracy for bagged Decision Tree is: 0.8491620111731844
The cross validated score for bagged Decision Tree is: 0.8138417886732494


In [8]:
submission['Survived'] = dtc.predict(test)
submission.to_csv("submission_baggedDT.csv", index = False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


## Boosting

In [9]:
# AdaBoost (with default decision tree base estimator)
ada = AdaBoostClassifier(n_estimators = 200, random_state = 0, learning_rate = 0.1)
result = cross_val_score(ada, X, y, cv = 10, scoring = 'accuracy')
print('The cross validated score for AdaBoost is:', result.mean())

n_estimators = list(range(100,1100,100))
learn_rate = [0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
hyper = {'n_estimators':n_estimators, 'learning_rate':learn_rate}

adaGS = GridSearchCV(estimator = AdaBoostClassifier(), param_grid = hyper, verbose = True)
adaGS.fit(X, y)
print(adaGS.best_score_)
print(adaGS.best_estimator_)

The cross validated score for AdaBoost is: 0.8070619112473045
Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  7.2min finished


0.8114478114478114
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.7,
                   n_estimators=300, random_state=None)


In [10]:
submission['Survived'] = adaGS.predict(test)
submission.to_csv("submission_adaboost.csv", index = False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [11]:
# Stochastic Gradient Boosting
gbc = GradientBoostingClassifier(n_estimators = 500, random_state = 0, learning_rate = 0.1)
result = cross_val_score(gbc, X, y, cv = 10, scoring = 'accuracy')
print('The cross validated score for Gradient Boosting is:', result.mean())

gbcGS = GridSearchCV(estimator = GradientBoostingClassifier(), param_grid = hyper, verbose = True)
gbcGS.fit(X, y)
print(gbcGS.best_score_)
print(gbcGS.best_estimator_)

The cross validated score for Gradient Boosting is: 0.8305453410509591
Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.8215488215488216
GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  2.8min finished


In [12]:
submission['Survived'] = gbcGS.predict(test)
submission.to_csv("submission_gradientboost.csv", index = False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [13]:
# III.3 XGBoost
xgboost = xgb.XGBClassifier(n_estimators = 900,learning_rate = 0.1)
result = cross_val_score(xgboost, X, y, cv=10, scoring = 'accuracy')
print('The cross validated score for XGBoost is:', result.mean())

xgbGS = GridSearchCV(estimator = xgb.XGBClassifier(), param_grid = hyper, verbose = True)
xgbGS.fit(X, y)
print(xgbGS.best_score_)
print(xgbGS.best_estimator_)

The cross validated score for XGBoost is: 0.8216062308478038
Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.8271604938271605
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=400, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)


[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  1.2min finished


In [14]:
submission['Survived'] = xgbGS.predict(test)
submission.to_csv("submission_xgboost.csv", index = False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
