In [413]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [414]:
df = pd.read_csv('train.csv')

In [415]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [416]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [417]:
#df['Name']

In [418]:
titles = {
    'Mr': 'Mr',
    'Miss': 'Miss',
    'Mrs': 'Mrs',
    'Master': 'Master',
    'Dr': 'Dr',
    'Rev': 'Rev',
    'Col': 'Officer',
    'Mlle': 'Miss',
    'Major': 'Officer',
    'Ms': 'Mrs',
    'Capt': 'Officer',
    'Lady': 'Royal',
    'the Countess': 'Royal',
    'Jonkheer': 'Royal',
    'Mme': 'Mrs',
    'Don': 'Royal',
    'Sir': 'Royal'
}

In [419]:
df.drop(labels=['PassengerId', 'Ticket'], axis=1, inplace=True)

In [420]:
from sklearn.base import TransformerMixin

In [421]:
class FeatureTransformer(TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        df = X.copy()
        df['Cabin'] = df['Cabin'].fillna('U').apply(lambda x: x[0])
        df['Cabin'].replace(['T', 'G'], 'Z', inplace=True)
        df['Family'] = df['Parch'] + df['SibSp']
        df['AgeGroup'] = pd.cut(df['Age'], [0, 15, 30, 45, 60, np.inf])
        df['Name'] = df['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())
        df['Name'] = df['Name'].map(titles)
        return df

In [422]:
df = FeatureTransformer().fit_transform(df)

In [423]:
display(df['Cabin'].value_counts())
display(df[['Cabin', 'Survived']].groupby(['Cabin']).mean())

U    687
C     59
B     47
D     33
E     32
A     15
F     13
Z      5
Name: Cabin, dtype: int64

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
U,0.299854
Z,0.4


In [424]:
display(df['Family'].value_counts())
display(df[['Family', 'Survived']].groupby(['Family']).mean())

0     537
1     161
2     102
3      29
5      22
4      15
6      12
10      7
7       6
Name: Family, dtype: int64

Unnamed: 0_level_0,Survived
Family,Unnamed: 1_level_1
0,0.303538
1,0.552795
2,0.578431
3,0.724138
4,0.2
5,0.136364
6,0.333333
7,0.0
10,0.0


In [425]:
display(df['AgeGroup'].value_counts())
display(df[['AgeGroup', 'Survived']].groupby(['AgeGroup']).mean())

(15.0, 30.0]    326
(30.0, 45.0]    202
(0.0, 15.0]      83
(45.0, 60.0]     81
(60.0, inf]      22
Name: AgeGroup, dtype: int64

Unnamed: 0_level_0,Survived
AgeGroup,Unnamed: 1_level_1
"(0.0, 15.0]",0.590361
"(15.0, 30.0]",0.358896
"(30.0, 45.0]",0.425743
"(45.0, 60.0]",0.407407
"(60.0, inf]",0.227273


In [426]:
display(df['Name'].value_counts())
display(df[['Name', 'Survived']].groupby(['Name']).mean())

Mr         517
Miss       184
Mrs        127
Master      40
Dr           7
Rev          6
Royal        5
Officer      5
Name: Name, dtype: int64

Unnamed: 0_level_0,Survived
Name,Unnamed: 1_level_1
Dr,0.428571
Master,0.575
Miss,0.701087
Mr,0.156673
Mrs,0.795276
Officer,0.4
Rev,0.0
Royal,0.6


In [427]:
class FeatureSelector(TransformerMixin):
    def __init__(self, features):
        self.features = features
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.features]

In [428]:
cat_features = ['Name', 'Sex', 'Cabin', 'Embarked', 'AgeGroup']
cat_selector = FeatureSelector(cat_features)
num_features = ['Age', 'SibSp', 'Parch', 'Fare', 'Family']
num_selector = FeatureSelector(num_features)

In [429]:
from sklearn.impute import SimpleImputer
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='most_frequent')

In [430]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [431]:
from sklearn.preprocessing import LabelEncoder

In [432]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()

In [433]:
from sklearn.pipeline import make_pipeline

In [434]:
cat_pipeline = make_pipeline(cat_selector, cat_imputer, encoder)
num_pipeline = make_pipeline(num_selector, num_imputer, scaler)

In [435]:
from sklearn.pipeline import make_union

In [436]:
transformed_df = make_union(cat_pipeline, num_pipeline)

In [437]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(labels='Survived', axis=1),
                                                    df['Survived'], test_size=0.2, 
                                                    stratify=df['Survived'])

In [438]:
name_age = X_train[['Name', 'Age']].groupby(['Name'])

In [439]:
def fill_age(df):
    if df['Missing']:
        return name_age.get_group(df['Name']).mean()[0]
    return df['Age']

In [440]:
df['Missing'] = df['Age'].isna()

In [441]:
df['Age'] = df.apply(fill_age, axis='columns')

In [442]:
X_train = transformed_df.fit_transform(X_train)
X_test = transformed_df.transform(X_test)

In [443]:
from sklearn.metrics import accuracy_score

In [444]:
import h2o
from h2o.automl import H2OAutoML

In [445]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,54 mins 31 secs
H2O cluster timezone:,America/Fortaleza
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.5
H2O cluster version age:,"7 days, 9 hours and 45 minutes"
H2O cluster name:,H2O_from_python_flycher_wgf2q3
H2O cluster total nodes:,1
H2O cluster free memory:,1.900 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [447]:
df = pd.DataFrame(X_train)

In [449]:
df[10] = y_train.values

In [451]:
df.columns = df.columns.astype(str)

In [453]:
h2o_df = h2o.H2OFrame(df)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [455]:
h2o_df

0,1,2,3,4,5,6,7,8,9,10
2,0,6,2,1,-0.488514,-0.470833,-0.475432,-0.440854,-0.56345,0
3,1,2,0,1,-0.790295,0.465572,-0.475432,1.4738,0.0663402,0
4,0,6,2,2,0.869502,-0.470833,-0.475432,-0.326579,-0.56345,1
3,1,6,0,1,-0.337623,-0.470833,-0.475432,-0.491267,-0.56345,0
3,1,6,2,1,-0.563959,-0.470833,-0.475432,-0.479187,-0.56345,0
1,1,6,2,0,-2.07287,4.21119,1.95115,0.275481,3.84508,0
2,0,1,0,1,-0.488514,-0.470833,1.95115,0.325734,0.69613,1
3,1,6,2,1,-0.563959,-0.470833,-0.475432,-0.480234,-0.56345,0
4,0,6,2,1,-0.337623,0.465572,-0.475432,-0.128469,0.0663402,1
3,1,6,2,2,0.26594,0.465572,-0.475432,-0.324646,0.0663402,0




In [460]:
h2o_df['10'] = h2o_df['10'].asfactor()

In [461]:
aml = H2OAutoML(max_models=10, max_runtime_secs=120, seed=42)

In [462]:
aml.train(x=x, y=y, training_frame=h2o_df)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [463]:
test = pd.DataFrame(X_test)
test.columns = test.columns.astype(str)
test = h2o.H2OFrame(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [464]:
y_pred = aml.predict(test)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [468]:
y_pred = h2o.as_list(y_pred['predict']).values
y_pred = y_pred.reshape([-1])

In [469]:
accuracy_score(y_pred, y_test)

0.8547486033519553

In [470]:
df_test = pd.read_csv('test.csv')

In [471]:
df_test.drop(labels=['Ticket'], axis=1, inplace=True)

In [472]:
passenger = df_test['PassengerId']
df_test = FeatureTransformer().fit_transform(df_test)
df_test['Missing'] = df_test['Age'].isna()
df_test['Age'] = df_test.apply(fill_age, axis='columns')
df_test = transformed_df.transform(df_test)

In [474]:
df_test = pd.DataFrame(df_test)
df_test.columns = df_test.columns.astype(str)
df_test = h2o.H2OFrame(df_test)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [481]:
survived = h2o.as_list(aml.predict(df_test)['predict']).values.reshape([-1])

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [482]:
answer = pd.DataFrame(data={'PassengerID': passenger, 'Survived': survived.astype(int)})

In [483]:
answer

Unnamed: 0,PassengerID,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [484]:
answer.to_csv('submission.csv', index=False)