In [284]:
import plotly.graph_objects as go
import plotly.express as px

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

import pandas as pd
import numpy as np

In [285]:
def plot_numeric(column, df):
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=df[df['Survived']==1][column], name='Survived'))
    fig.add_trace(go.Histogram(x=df[df['Survived']==0][column], name='No Survived'))
    
    # Overlay both histograms
    fig.update_layout(barmode='overlay',title=column)
    # Reduce opacity to see both histograms
    fig.update_traces(opacity=0.75)
    fig.show()
    
def plot_categorical(column, df):
    df_aux = df.groupby([column,'Survived']).count()['PassengerId'].reset_index()
    df_aux.columns = [column,'Survived','total']
    df_aux['percent'] = df_aux.apply(lambda x : (x['total']*100.0) / df_aux[df_aux[column]==x[column]]['total'].sum(),axis=1)
    df_aux['total_percent_bar'] = df_aux.apply(lambda x : (df_aux[df_aux[column]==x[column]]['total'].sum()*100.0) / df.shape[0],axis=1)
    fig = px.bar(df_aux, x="total", y=column, color="Survived", title=column, orientation='h', hover_data=['percent','total_percent_bar'])
    fig.show()

In [286]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [287]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics_columns = df_train.select_dtypes(include=numerics).columns
numerics_columns = list(numerics_columns)
numerics_columns
for _ in ['Survived', 'PassengerId']:
    numerics_columns.remove(_)
for c in numerics_columns:
    plot_numeric(c, df_train)

In [288]:
def get_main_cabin(x):
    if pd.isnull(x):
        return np.nan
    if ~pd.isnull(x):
        return x[:1]


def categorical_fare(x):
    if x>= 0 and x <= 50:
        return '0 - 50'
    if x>= 51 and x <= 100:
        return '51 - 100'
    if x>= 101 and x <= 150:
        return '101 - 150'
    if x>= 151 and x <= 200:
        return '151 - 200'
    elif x >= 201:
        return '201+'

def check_is_alone(x):
    if x == 0:
        return True
    if x > 0:
        return False

df_train['fare_categorical'] = df_train['Fare'].apply(categorical_fare)
df_train['main_cabin'] = df_train['Cabin'].apply(check_main_cabin)
df_train['total_aboard'] = df_train['SibSp'] + df_train['Parch']
df_train['is_alone'] = df_train['total_aboard'].apply(check_is_alone)
df_train['no_had_siblings'] = df_train['SibSp'].apply(check_is_alone)
df_train['no_had_parents'] = df_train['Parch'].apply(check_is_alone)

In [289]:
texts_columns = df_train.select_dtypes(include=['object', 'bool']).columns
texts_columns = list(texts_columns) + ['Pclass']
for _ in ['Name', 'Ticket', 'Cabin']:
    texts_columns.remove(_)
texts_columns

for c in texts_columns:
    plot_categorical(c, df_train)


In [290]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

class createCategoricals(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X['fare_categorical'] = X['Fare'].apply(categorical_fare)
        X['total_aboard'] = X['SibSp'] + X['Parch']
        X['is_alone'] = X['total_aboard'].apply(check_is_alone)
        X['no_had_siblings'] = X['SibSp'].apply(check_is_alone)
        X['no_had_parents'] = X['Parch'].apply(check_is_alone)
        return X

class DebugEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        display(X)
        return X

In [291]:
cat_pipeline = Pipeline([
        ("create_cat", createCategoricals()),
        ("select_cat", DataFrameSelector(['Sex', 'Embarked', 'fare_categorical', 'is_alone', 'no_had_siblings', 'no_had_parents', 'Pclass'])),
        ("imputer", MostFrequentImputer()),
        ("debug", DebugEncoder()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

In [292]:
df_train = pd.read_csv('train.csv')
X_data = cat_pipeline.fit_transform(df_train)

Unnamed: 0,Sex,Embarked,fare_categorical,is_alone,no_had_siblings,no_had_parents,Pclass
0,male,S,0 - 50,False,False,True,3
1,female,C,51 - 100,False,False,True,1
2,female,S,0 - 50,True,True,True,3
3,female,S,51 - 100,False,False,True,1
4,male,S,0 - 50,True,True,True,3
...,...,...,...,...,...,...,...
886,male,S,0 - 50,True,True,True,2
887,female,S,0 - 50,True,True,True,1
888,female,S,0 - 50,False,False,False,3
889,male,C,0 - 50,True,True,True,1


In [295]:
y = df_train[['Survived']].values
validation_index = round(len(x[0])*.6)
X_train = X_data[:validation_index]
y_train = y[:validation_index]
X_validation = X_data[validation_index:]
y_validation = y[validation_index:]
X_train.shape, y_train.shape

((535, 19), (535, 1))

In [296]:
X_validation.shape, y_validation.shape

((356, 19), (356, 1))

In [298]:
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train.ravel())
forest_scores = cross_val_score(forest_clf, X_train, y_train.ravel(), cv=10)
forest_scores.mean()

0.7853948287910553

In [299]:
y_validation_pred = forest_clf.predict(X_validation)
print("Precision: {:.2f}%".format(100 * precision_score(y_validation.ravel(), y_validation_pred, average='micro')))
print("Recall: {:.2f}%".format(100 * recall_score(y_validation.ravel(), y_validation_pred, average='micro')))

Precision: 81.46%
Recall: 81.46%


In [300]:
df_test = pd.read_csv('test.csv')
df_passengerid = df_test[['PassengerId']].copy()
X_test = cat_pipeline.fit_transform(df_test)
predictions = forest_clf.predict(X_test)
predictions[:10]

Unnamed: 0,Sex,Embarked,fare_categorical,is_alone,no_had_siblings,no_had_parents,Pclass
0,male,Q,0 - 50,True,True,True,3
1,female,S,0 - 50,False,False,True,3
2,male,Q,0 - 50,True,True,True,2
3,male,S,0 - 50,True,True,True,3
4,female,S,0 - 50,False,False,False,3
...,...,...,...,...,...,...,...
413,male,S,0 - 50,True,True,True,3
414,female,C,101 - 150,True,True,True,1
415,male,S,0 - 50,True,True,True,3
416,male,S,0 - 50,True,True,True,3


array([0, 0, 0, 0, 0, 0, 1, 1, 1, 0])

In [301]:
output = pd.DataFrame({'PassengerId': df_passengerid['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index = False)