# Titanic: Machine Learning from Disaster

In [138]:
# https://www.kaggle.com/c/titanic

In [139]:
import sklearn
import pandas as pd
import warnings
warnings.simplefilter('ignore')

In [140]:
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')
test_ = test
train = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
# train = train.dropna(axis=0, how='any')
# test = test.dropna(axis=0, how='any')

In [141]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [142]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [143]:
train['Sex'].replace(to_replace=dict(female=1, male=0), inplace=True)
train['Embarked'].replace(to_replace=dict(Q=2,C=1, S=0), inplace=True)
test['Sex'].replace(to_replace=dict(female=1, male=0), inplace=True)
test['Embarked'].replace(to_replace=dict(Q=2,C=1, S=0), inplace=True)

In [144]:
# train_dummies = pd.get_dummies(train, columns=[ 'Pclass', 'Embarked'])
# test_dummies = pd.get_dummies(test, columns=[ 'Pclass', 'Embarked'])
train_dummies = train
test_dummies = test

In [145]:
train_dummies.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,0.0
1,1,1,1,38.0,1,0,71.2833,1.0
2,1,3,1,26.0,0,0,7.925,0.0
3,1,1,1,35.0,1,0,53.1,0.0
4,0,3,0,35.0,0,0,8.05,0.0


In [146]:
test_dummies.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,34.5,0,0,7.8292,2
1,3,1,47.0,1,0,7.0,0
2,2,0,62.0,0,0,9.6875,2
3,3,0,27.0,0,0,8.6625,0
4,3,1,22.0,1,1,12.2875,0


## Preprocess data

In [147]:
# Extract features
# X_train = train_dummies.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
# X_test = test_dummies.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

X_train = train_dummies.drop(['Survived'], axis=1)
print(X_train.shape)
X_full = train_dummies.drop(['Survived'], axis=1)
X_full = X_full.append(test_dummies)
print(X_train.shape,X_full.shape)
X_full.head()

(891, 7)
(891, 7) (1309, 7)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,0.0
1,1,1,38.0,1,0,71.2833,1.0
2,3,1,26.0,0,0,7.925,0.0
3,1,1,35.0,1,0,53.1,0.0
4,3,0,35.0,0,0,8.05,0.0


In [148]:
# Extract targets
y_train = train_dummies['Survived']

In [149]:
#Fill NA values
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
imp.fit(X_full)
X_full = imp.transform(X_full)
X_train = imp.transform(X_train)
X_test = imp.transform(test_dummies)

In [150]:
# Scale features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(copy=True, with_mean=False, with_std=True)
scaler.fit(X_full)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [151]:
import numpy as np
print(np.mean(X_train, axis = 0))
print(np.var(X_train, axis = 0))

[ 2.75653501  0.73601191  2.30894938  0.50228343  0.44103174  0.62267519
  0.55473276]
[ 0.99543457  0.99544176  1.0181976   1.12032114  0.86692458  0.92216386
  0.94569426]


In [152]:
print(X_train)
print(X_test)

[[ 3.58202142  0.          1.70830382 ...,  0.          0.14018029  0.        ]
 [ 1.19400714  2.08849241  2.95070659 ...,  0.          1.37827771
   1.53123551]
 [ 3.58202142  2.08849241  2.01890451 ...,  0.          0.15323155  0.        ]
 ..., 
 [ 3.58202142  2.08849241  2.32027552 ...,  2.31152519  0.45341072  0.        ]
 [ 1.19400714  0.          2.01890451 ...,  0.          0.58005636
   1.53123551]
 [ 3.58202142  0.          2.48480555 ...,  0.          0.14984789
   3.06247101]]
[[ 3.58202142  0.          2.67893098 ...,  0.          0.15137924
   3.06247101]
 [ 3.58202142  2.08849241  3.64955815 ...,  0.          0.13534648  0.        ]
 [ 2.38801428  0.          4.81431075 ...,  0.          0.18730987
   3.06247101]
 ..., 
 [ 3.58202142  0.          2.98953168 ...,  0.          0.14018029  0.        ]
 [ 3.58202142  0.          2.32027552 ...,  0.          0.15564846  0.        ]
 [ 3.58202142  0.          2.32027552 ...,  1.15576259  0.43230247
   1.53123551]]


## Train model and make predictions

In [153]:
# Fit logistic regression
from sklearn.linear_model import LogisticRegression
# LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, 
#                    fit_intercept=True, intercept_scaling=1, class_weight=None, 
#                    random_state=None, solver='liblinear', max_iter=100,
#                    multi_class='ovr', verbose=0, warm_start=False, n_jobs=1)

In [154]:
#Оценка качества методом кроссвалидации
from sklearn import cross_validation
from sklearn.model_selection import GridSearchCV

log_reg = LogisticRegression(verbose=0, intercept_scaling=1,n_jobs=1 ,dual=False , warm_start=False,random_state = 42, )

#Подбор параметров по сетке
param_grid_l2 = {
    'penalty': ['l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'tol': [0.001, 0.01, 0.1, 1],
    'solver': [ 'newton-cg', 'lbfgs', 'liblinear', 'sag'],
    'max_iter': [10,50,100, 500, 1000, 5000, 10000],
    'class_weight': [None, 'balanced']}

param_grid_l1 = {
    'penalty': ['l1'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'tol': [0.001, 0.01, 0.1, 1],
    'solver': [  'liblinear'],
    'max_iter': [10, 50 ,100, 500, 1000, 5000, 10000],
    'class_weight': [None, 'balanced']}

train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(X_train, y_train, 
                                                                                    test_size = 0.25,
                                                                                    random_state = 42)
#кроссвалидация на 5 фолда - отложеная выборка 10%
cv = cross_validation.StratifiedShuffleSplit(train_labels, n_iter = 4, test_size = 0.25, random_state = 42)

grid_search_l2 = GridSearchCV(log_reg, param_grid_l2,scoring = 'accuracy', cv = cv )
grid_search_l1 = GridSearchCV(log_reg, param_grid_l1,scoring = 'accuracy', cv = cv )


In [155]:
%%time
grid_search_l2.fit(train_data,train_labels)

CPU times: user 23.8 s, sys: 237 ms, total: 24 s
Wall time: 24.3 s


GridSearchCV(cv=StratifiedShuffleSplit(labels=[1 0 ..., 1 0], n_iter=4, test_size=0.25, random_state=42),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'tol': [0.001, 0.01, 0.1, 1], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'], 'max_iter': [10, 50, 100, 500, 1000, 5000, 10000], 'class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [156]:
%%time
grid_search_l1.fit(train_data,train_labels)

CPU times: user 3.92 s, sys: 46 ms, total: 3.97 s
Wall time: 4.03 s


GridSearchCV(cv=StratifiedShuffleSplit(labels=[1 0 ..., 1 0], n_iter=4, test_size=0.25, random_state=42),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'tol': [0.001, 0.01, 0.1, 1], 'solver': ['liblinear'], 'max_iter': [10, 50, 100, 500, 1000, 5000, 10000], 'class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [157]:
grid_search_l2.best_estimator_

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='sag', tol=0.01, verbose=0,
          warm_start=False)

In [158]:
grid_search_l1.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.01,
          verbose=0, warm_start=False)

In [159]:
print(grid_search_l2.best_score_)
print(grid_search_l2.best_params_)

0.793413173653
{'C': 0.01, 'class_weight': None, 'max_iter': 10, 'penalty': 'l2', 'solver': 'sag', 'tol': 0.01}


In [160]:
print(grid_search_l1.best_score_)
print(grid_search_l1.best_params_)

0.782934131737
{'C': 1, 'class_weight': None, 'max_iter': 50, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.01}


In [161]:
# grid_search_l2.best_estimator_.fit(X_train, y_train)
# y_pred = grid_search_l2.best_estimator_.predict(X_test)

grid_search_l1.best_estimator_.fit(X_train, y_train)
y_pred = grid_search_l1.best_estimator_.predict(X_test)

In [162]:
# Your submission scored  0.77033 без fit 0.76077 c cv на отложеной выборки 25% -0.77033- {'C': 1, 'class_weight': None, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg', 'tol': 0.001}
# Your submission scored 0.77033 без fit 0.76077 - {'C': 10, 'class_weight': None, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
#Результаты при данной стратегии кроссвалидации при различных параметрах совпали и  близки к собсвенной оценке 0.799382716049
# при размере отложеннной выборки в 30%  оценки сильнее разлицаются - но ресультаты немного лучше 

In [163]:
print(grid_search_l2.best_estimator_.coef_)
print(grid_search_l1.best_estimator_.coef_)

[[-0.35417321  0.65238812 -0.14772808 -0.11235065  0.0039124   0.17706887
   0.09467578]]
[[-0.82771281  1.27741314 -0.42833061 -0.31923402 -0.07120385  0.16220672
   0.15646548]]


In [164]:
#Оценка качества методом кроссвалидации
from sklearn import cross_validation
from sklearn.model_selection import GridSearchCV

train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(X_train, y_train, 
                                                                                    test_size = 0.2,
                                                                                    random_state = 42)
#кроссвалидация на 5 фолда - отложеная выборка 10%
cv = cross_validation.StratifiedShuffleSplit(train_labels, n_iter = 10, test_size = 0.2, random_state = 42)

In [165]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


# random_forest = RandomForestClassifier(bootstrap=True, class_weight='balanced', criterion='gini',
#             max_depth=13, max_features='sqrt', max_leaf_nodes=None,
#             min_impurity_split=1e-07, min_samples_leaf=2,
#             min_samples_split=2, min_weight_fraction_leaf=0.0,
#             n_estimators=1500, n_jobs=-1, oob_score=False,
#             random_state=42, verbose=0, warm_start=True)

random_forest = ExtraTreesClassifier(
    max_features='auto',
    bootstrap=True,
    oob_score=True,
    n_estimators=100,
    max_depth=None,
#     min_samples_split=2,
#    class_weight={0: 0.7, 1: 0.3},
    class_weight='balanced',
#     min_weight_fraction_leaf=0.02,
    random_state=42,
    n_jobs=-1
    )

random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print(random_forest.oob_score_)
print(random_forest)


# clf_ext = ExtraTreesClassifier(
#     max_features='auto',
#     bootstrap=True,
#     oob_score=True,

rf_scoring = cross_validation.cross_val_score(random_forest, X_train, y_train, scoring = 'f1', cv = cv)
print( 'Log mean:{}, max:{}, min:{}, std:{}'.format(rf_scoring.mean(), rf_scoring.max(), 
                                                   rf_scoring.min(), rf_scoring.std()) )

0.813692480359
ExtraTreesClassifier(bootstrap=True, class_weight='balanced',
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=True, random_state=42, verbose=0, warm_start=False)
Log mean:0.7373588274445697, max:0.784, min:0.6538461538461539, std:0.03679409512212857


In [166]:
#Your submission scored - 0.78469 - оценка oob - 0.793490460157 
#Пробовал подбирать параметры по сетке - параметров много считает долго и результат часто хуже параметров по умолчанию(максимальный результат - 0.78947)
#возможно еще неочень подходит выбранная стратегия кросс валидации или нужно по другому готовить данные
0.76077

0.76077

## Create submission

In [215]:
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(test_['PassengerId'], y_pred):
        out.write('%s,%s\n' % (passenger, y))

In [2]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from mlxtend.classifier import StackingClassifier, StackingCVClassifier
from sklearn import svm
import xgboost as xgb




In [3]:
LogisticRegression()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [1]:
# clf1 = KNeighborsClassifier(n_neighbors=1)
clf1 = GradientBoostingClassifier(
    random_state = 0,
#     n_estimators = 575,
#     max_depth=13,
#     min_samples_leaf=2,
)
clf2 = RandomForestClassifier(
    n_estimators=575,
    max_depth=13,
    min_samples_leaf=2,max_features = None,
    random_state=0, n_jobs=-1, warm_start= True,verbose = 0 )
clf3 = AdaBoostClassifier(    
    n_estimators=575,
    learning_rate=0.95)
# clf4 = LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
#           intercept_scaling=1, max_iter=10, multi_class='ovr', n_jobs=1,
#           penalty='l2', random_state=42, solver='sag', tol=0.01, verbose=0,
#           warm_start=False)
clf4 = svm.SVC(
    #kernel= 'linear', C=0.025
)
clf5 = ExtraTreesClassifier(
    n_estimators=575,
    max_depth=6,
    min_samples_leaf=3,max_features = 'auto',
    random_state=0, n_jobs=-1, warm_start= True,verbose = 0 
    )


clf3 = xgb.XGBClassifier()



lr = xgb.XGBClassifier()




sclf = StackingClassifier(classifiers=[clf1, clf2,clf3], 
                          meta_classifier=lr)

print(' cross validation:\n')

for clf, label in zip([clf1,clf2,clf3, sclf], 
                      [ 'GradientBoostingClassifier',
                       'Random Forest', 
                       'XGBClassifier',
#                       'SVC ',
#                       'ExtraTreesClassifier',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X_train, y_train, 
                                              cv=cv, scoring='log_loss')
    print("f1: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

NameError: name 'GradientBoostingClassifier' is not defined

In [214]:
sclf.fit(X_train, y_train)
y_pred = sclf.predict(X_test)

In [186]:

params = {
#          'randomforestclassifier__n_estimators': [10, 100, 500],
#           'meta-logisticregression__C': [0.1, 1, 10.0],
#          'meta-logisticregression__penalty':['l1','l2']
}

grid = GridSearchCV(estimator=sclf, 
                    param_grid=params, 
                    cv=cv,
                    refit=True)
grid.fit(X_train, y_train)

cv_keys = ('mean_test_score', 'std_test_score', 'params')

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)
grid.best_estimator_.fit(X_train, y_train)
y_pred = grid.best_estimator_.predict(X_test)

0.787 +/- 0.01 {'randomforestclassifier__n_estimators': 10}
0.793 +/- 0.02 {'randomforestclassifier__n_estimators': 100}
0.787 +/- 0.02 {'randomforestclassifier__n_estimators': 500}
Best parameters: {'randomforestclassifier__n_estimators': 100}
Accuracy: 0.79


In [218]:
from mlxtend.classifier import EnsembleVoteClassifier

eclf = EnsembleVoteClassifier(clfs=[clf1, clf2,clf3], weights=[1,1,1])



params = {
#          'randomforestclassifier__n_estimators': [10, 100, 500],
#           'meta-logisticregression__C': [0.1, 1, 10.0],
#          'meta-logisticregression__penalty':['l1','l2']
}

grid = GridSearchCV(estimator=eclf, 
                    param_grid=params, 
                    cv=cv,
                    refit=True, scoring='f1')
grid.fit(X_train, y_train)

cv_keys = ('mean_test_score', 'std_test_score', 'params')

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

grid.best_estimator_.fit(X_train, y_train)
y_pred = grid.best_estimator_.predict(X_test)

0.767 +/- 0.03 {}
Best parameters: {}
Accuracy: 0.77


In [1]:
LogisticRegression()

NameError: name 'LogisticRegression' is not defined

In [224]:
#Практикум по Pandas

In [225]:
pd.__version__

'0.20.1'