In [16]:
import pandas as pd

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

X, y = train_data.drop(['Survived'], axis=1), train_data['Survived']

print("X.shape: {} y.shape: {}".format(X.shape, y.shape))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X.shape: (891, 11) y.shape: (891,)


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(X_train.select_dtypes(exclude=['object']).columns)),
        ("imputer", SimpleImputer(strategy="median")),
    ])

print(num_pipeline.fit_transform(X_train))

[[332.       1.      45.5      0.       0.      28.5   ]
 [734.       2.      23.       0.       0.      13.    ]
 [383.       3.      32.       0.       0.       7.925 ]
 ...
 [861.       3.      41.       2.       0.      14.1083]
 [436.       1.      14.       1.       2.     120.    ]
 [103.       1.      21.       0.       1.      77.2875]]


In [67]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [20]:
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(X_train.select_dtypes(include=['object']).columns)),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown = 'ignore')),
    ])

In [21]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [110]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

kfold = StratifiedKFold(n_splits=5)

import xgboost


pipe = Pipeline([
    ('preprocessing', preprocess_pipeline),
    ('classifier', xgboost.XGBClassifier())
])

grid = {'classifier__learning__rate':[0.01, 0.1, 1, 2, 5],
        'classifier__reg_lambda':[0.01, 0.1, 0.5, 1, 2, 5],
        'classifier__n_estimators':[1,2,5, 7, 10, 15]}

search = GridSearchCV(pipe, grid, cv=kfold, scoring='accuracy', n_jobs=-1)
search.fit(X_train, y_train)



Parameters: { "learning__rate" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('preprocessing',
                                        FeatureUnion(transformer_list=[('num_pipeline',
                                                                        Pipeline(steps=[('select_numeric',
                                                                                         DataFrameSelector(attribute_names=Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))),
                                                                                        ('imputer',
                                                                                         SimpleImputer(strategy='med...
                                                      min_child_weight=None,
                                                      missing=nan,
                                                      monotone_constraints=None,
                

In [112]:
print(search.best_params_)
print(search.best_score_)

{'classifier__learning__rate': 0.01, 'classifier__n_estimators': 7, 'classifier__reg_lambda': 2}
0.8243967300305328


In [107]:
Y_pred = search.predict(test_data)

result = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": Y_pred
    })
result.to_csv('./result_xgBoost.csv', index=False)
#najwiecej: 0.78947

In [111]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np

kfold = StratifiedKFold(n_splits=5)


pipe2 = Pipeline([
    ('preprocessing', preprocess_pipeline),
    ('classifier', ExtraTreesClassifier())
])

grid2 ={'classifier__n_estimators' : np.r_[10:210:10],
              'classifier__max_depth': np.r_[5:20:2],
        'classifier__max_leaf_nodes': [16, 32, 64]
             }

search2 = GridSearchCV(pipe2, grid2, cv=kfold, scoring='accuracy', n_jobs=-1)
search2.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('preprocessing',
                                        FeatureUnion(transformer_list=[('num_pipeline',
                                                                        Pipeline(steps=[('select_numeric',
                                                                                         DataFrameSelector(attribute_names=Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object'))),
                                                                                        ('imputer',
                                                                                         SimpleImputer(strategy='med...
                                                                                        ('cat_encoder',
                                                                                         OneHotEncoder(handle_unknown='ignore',
         

In [113]:
print(search2.best_params_)
print(search2.best_score_)

{'classifier__max_depth': 13, 'classifier__max_leaf_nodes': 64, 'classifier__n_estimators': 30}
0.8244361272530287


In [115]:
Y_pred2 = search2.predict(test_data)

result2 = pd.DataFrame({
        "PassengerId": test_data["PassengerId"],
        "Survived": Y_pred2
    })
result2.to_csv('./result_Extra.csv', index=False)
#najwiecej: 0.79186