In [1]:
# =============================================================================
# #import Library
# =============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# =============================================================================
# Load raw data
# =============================================================================
# Both CSV files get the same treatment: (PassengerId, Name) becomes the row
# index and the Cabin / Ticket columns are discarded.

dataset = (
    pd.read_csv('train.csv')
    .set_index(['PassengerId', 'Name'])
    .drop(columns=['Cabin', 'Ticket'])
)
predict = (
    pd.read_csv("test.csv")
    .set_index(['PassengerId', 'Name'])
    .drop(columns=['Cabin', 'Ticket'])
)


In [3]:
# =============================================================================
# Feature engineering
# =============================================================================

# Numeric columns: fill missing values with the column median, then standardise.
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
Imp_num = SimpleImputer(strategy='median')
Std_Scal = StandardScaler()
numeric_transformer = make_pipeline(Imp_num, Std_Scal)

# Categorical columns: fill missing values with the constant 'S', then one-hot
# encode. Categories not seen while fitting are ignored at transform time.
# NOTE(review): the 'S' fill value is applied to all three columns, not only
# 'Embarked' -- confirm that 'Sex' and 'Pclass' never contain missing values.
categorical_features = ['Embarked', 'Sex', 'Pclass']
Imp_cat = SimpleImputer(strategy='constant', fill_value='S')
onehot = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = make_pipeline(Imp_cat, onehot)

# Columns not listed in either group are not passed to any transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

y = dataset['Survived'].values
train_features = dataset.drop(columns=['Survived'])

# Fit the preprocessing ONCE, on the labelled training data, and reuse the
# fitted imputers / scaler / encoder for the unlabelled test set. Fitting a
# second time on `predict` would scale it with different statistics and could
# produce a different set of one-hot columns than the models were trained on.
X = preprocessor.fit_transform(train_features)
predict_X = preprocessor.transform(predict)

# =============================================================================
# Split train & hold-out set
# =============================================================================
# NOTE(review): the preprocessor above is fitted on all labelled rows, so the
# hold-out rows contribute to the imputation / scaling statistics. Split before
# fitting if a strictly clean hold-out estimate is required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=14)



In [4]:
# =============================================================================
# Classifiers
# =============================================================================

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

# --- Base estimators ---------------------------------------------------------
# Every estimator that accepts a seed gets one, so that re-running the notebook
# reproduces the same scores and the same submission file.
knn_clf = KNeighborsClassifier(n_neighbors=15, metric='minkowski', p=2, algorithm='brute')
log_clf = LogisticRegression(solver='liblinear', random_state=0)
rnd_clf = RandomForestClassifier(n_estimators=15, random_state=0)
svm_clf = SVC(probability=True, gamma=0.6, kernel='poly', degree=2, random_state=0)
dtc_clf = DecisionTreeClassifier(random_state=0, min_samples_split=3, min_samples_leaf=5)
BNB_clf = BernoulliNB()

# --- Soft-voting ensemble over all base estimators ---------------------------
# No fit call here: the evaluation loop below fits every model, including this
# ensemble, exactly once.
voting_clf = VotingClassifier(estimators=[('lr', log_clf),
                                          ('rf', rnd_clf),
                                          ('svc', svm_clf),
                                          ('knn', knn_clf),
                                          ('dtc', dtc_clf),
                                          ('BNB', BNB_clf)],
                              voting='soft',
                              flatten_transform=True)

# --- Bagging ensemble built on the random forest -----------------------------
bag_clf = BaggingClassifier(rnd_clf,
                            n_estimators=30,
                            max_samples=100,
                            bootstrap=True,
                            n_jobs=-1,
                            oob_score=True,
                            random_state=0)

# --- Fit each model on the training split and report hold-out accuracy -------
for clf in (log_clf, rnd_clf, svm_clf, knn_clf, dtc_clf, BNB_clf, voting_clf, bag_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))


LogisticRegression 0.8
RandomForestClassifier 0.8
SVC 0.8444444444444444
KNeighborsClassifier 0.8222222222222222
DecisionTreeClassifier 0.8444444444444444
BernoulliNB 0.7555555555555555
VotingClassifier 0.8
BaggingClassifier 0.8222222222222222


In [None]:
# =============================================================================
# Output results
# =============================================================================
# Build the submission table directly: one row per test passenger, holding the
# PassengerId (looked up by index-level name rather than by position in the
# index tuples) and the label predicted by the voting ensemble.
predict_y = pd.DataFrame({
    "PassengerId": predict.index.get_level_values("PassengerId"),
    "Survived": voting_clf.predict(predict_X),
})
predict_y.to_csv('predications_VotingClassifier_2.csv', sep=',', index=False, encoding='utf-8')