In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

train_data = pd.read_csv('train.csv')

In [2]:
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived'].copy()

X_test = pd.read_csv('test.csv')

In [3]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder

class FeatureEngineering(BaseEstimator, TransformerMixin):

    def __init__(self, group_age = True, group_fare = True, group_fam = True):
        self.group_age = group_age
        self.group_fare = group_fare
        self.group_fam = group_fam
    
    def fit(self, X):
        return self
    
    def transform(self, X):

        def extract_title(name):
            title_search = re.search(' ([A-Za-z]+)\.', name)
            if title_search:
                return title_search.group(1)
            return ""
        
        if 'Name' in X.columns:
            X['Title'] = X['Name'].apply(extract_title)
            X = X.drop(['Name'], axis=1)
        
        if 'Age' in X.columns and self.group_age:
            age_bins = [0, 12, 18, 30, 50, 100]
            age_labels = ['Child', 'Teenage', 'Young Adult', 'Adult', 'Elderly']
            X['AgeGroup'] = pd.cut(X['Age'], bins=age_bins, labels=age_labels, right=False)
            X = X.drop(['Age'], axis=1)

        if 'Fare' in X.columns and self.group_fare:
            fare_bins = [0, 50, 100, 150, 200, 300, 1000]
            fare_labels = ['0-50', '50-100', '100-150', '150-200', '200-300', '300+']
            X['FareGroup'] = pd.cut(X['Fare'], bins=fare_bins, labels=fare_labels, right=False)
            X = X.drop(['Fare'], axis=1)
        
        if 'SibSp' in X.columns and 'Parch' in X.columns and self.group_fam:
            X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
            X = X.drop(['SibSp', 'Parch'], axis=1)
            
        X = X.drop(['PassengerId', 'Cabin', 'Ticket'], axis=1)
        return X


In [5]:
feat_eng = FeatureEngineering()
X_train = feat_eng.fit_transform(X_train)
X_test = feat_eng.fit_transform(X_test)

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X_train['Sex'] = label_encoder.fit_transform(X_train[['Sex']])
X_train['Title'] = label_encoder.fit_transform(X_train['Title'])
X_train['Embarked'] = label_encoder.fit_transform(X_train['Embarked'])

X_test['Sex'] = label_encoder.fit_transform(X_test[['Sex']])
X_test['Title'] = label_encoder.fit_transform(X_test['Title'])
X_test['Embarked'] = label_encoder.fit_transform(X_test['Embarked'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

num_attribs = ['Pclass', 'FamilySize','Sex', 'Embarked', 'Title']
ord_attribs = ['FareGroup', 'AgeGroup']
                
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

ord_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
])

final_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('ord', ord_pipeline, ord_attribs),
])


X_train = final_pipeline.fit_transform(X_train)
X_test = final_pipeline.fit_transform(X_test)

In [8]:
X_train.shape

(891, 7)

In [9]:
X_test.shape

(418, 7)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np

best_log_clf = LogisticRegression(C=10, max_iter = 1000, penalty='l2')
best_log_clf.fit(X_train, y_train)
cross_val_score(best_log_clf, X_train, y_train).mean()

0.7868055991463185

In [11]:
from sklearn.svm import SVC

best_svm_clf = SVC(C=10, coef0=-1, degree=2)
best_svm_clf.fit(X_train, y_train)

cross_val_score(best_svm_clf, X_train, y_train, scoring='accuracy').mean()

0.8126043562864854

In [12]:
from sklearn.neighbors import KNeighborsClassifier

best_knn_clf = KNeighborsClassifier(n_neighbors=10, p=1)
best_knn_clf.fit(X_train, y_train)

cross_val_score(best_knn_clf, X_train, y_train, scoring='accuracy').mean()

0.8193082669010107

In [13]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(max_features=3, n_estimators=40)
rnd_clf.fit(X_train, y_train)
cross_val_score(rnd_clf, X_train, y_train, scoring='accuracy').mean()

0.8047705730964786

In [14]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(n_estimators=290, estimator=DecisionTreeClassifier(max_depth=4),
                            max_samples = 0.8, max_features=0.95, bootstrap=False)

bag_clf.fit(X_train, y_train)
cross_val_score(bag_clf, X_train, y_train, scoring='accuracy').mean()

0.824913690289373

In [15]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(n_estimators=70, estimator=DecisionTreeClassifier(max_depth=2),
                            learning_rate = 0.01)

ada_clf.fit(X_train, y_train)
cross_val_score(ada_clf, X_train, y_train, scoring='accuracy').mean()

0.7867993220764546

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=4,
                                    max_features = 'log2', subsample = 0.9)
gb_clf.fit(X_train, y_train)
cross_val_score(gb_clf, X_train, y_train, scoring='accuracy').mean()

0.8226916075575922

In [17]:
from xgboost import XGBRFClassifier

xgbrf_clf = XGBRFClassifier(n_estimators=100, learning_rate=0.01, max_depth=4)

xgbrf_clf.fit(X_train, y_train)
cross_val_score(xgbrf_clf, X_train, y_train, scoring='accuracy').mean()

0.821568012051974

In [18]:
from sklearn.ensemble import VotingClassifier

param_grid = {
    'voting': ['hard', 'soft'],
    'weights': [
        [1, 3, 1, 1],
        [2, 3, 2, 2],
        [2, 3, 3, 2],
        [2, 3, 2, 3],
        [1, 3, 2, 1],
    ]
}

voting_clf = VotingClassifier(
    estimators=[('knn', best_knn_clf),
                ('bag', bag_clf),
                ('gb', gb_clf),
                ('xgb', xgbrf_clf)]
)

grid_search = GridSearchCV(voting_clf, param_grid, cv=3, 
                           scoring='accuracy',
                           return_train_score=True)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
best_voting_clf = grid_search.best_estimator_
print(cross_val_score(best_voting_clf, X_train, y_train, scoring='accuracy').mean())

{'voting': 'hard', 'weights': [2, 3, 3, 2]}
0.8260435628648548


In [19]:
y_pred = best_voting_clf.predict(X_test)
X_test = pd.read_csv('test.csv')
output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Survived': y_pred})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
