In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tpot import TPOTClassifier
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np



In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df.drop(['PassengerId','Ticket', 'Cabin', 'Name'], axis=1, inplace=True) 
test_df.drop(['Ticket', 'Cabin', 'Name'], axis=1, inplace=True) 

In [3]:
train_df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [8]:
def preprocess_df(df):
    dummy_cols = [
        'Pclass',
#         'SibSp',
#         'Parch',
        'Sex',
        'Embarked'
    ]
    df.dropna(subset=['Embarked'],inplace=True)
    df[dummy_cols] = df[dummy_cols].astype(str)
    df = pd.get_dummies(df,columns=dummy_cols,drop_first=True) # create dummies
#     df = df.loc[np.random.permutation(len(df))].reset_index(drop=True) # shuffle the df
    scaler = MinMaxScaler()
    comb_dummy_df = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
    
    imputer = KNNImputer(n_neighbors=5)
    imputed_df = pd.DataFrame(imputer.fit_transform(comb_dummy_df),columns = comb_dummy_df.columns)
#     imputed_df['Survived'] = imputed_df['Survived'].astype(int)


#     comb_dummy_df["Age"].fillna(comb_dummy_df["Age"].mean(), inplace=True)
#     comb_dummy_df["Fare"].fillna(comb_dummy_df["Fare"].mean(), inplace=True)
    
    return imputed_df

In [10]:
train_dummy_df = preprocess_df(train_df)
train_dummy_df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,0.0,0.271174,0.125,0.000000,0.014151,0.0,1.0,1.0,0.0,1.0
1,1.0,0.472229,0.125,0.000000,0.139136,0.0,0.0,0.0,0.0,0.0
2,1.0,0.321438,0.000,0.000000,0.015469,0.0,1.0,0.0,0.0,1.0
3,1.0,0.434531,0.125,0.000000,0.103644,0.0,0.0,0.0,0.0,1.0
4,0.0,0.434531,0.000,0.000000,0.015713,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
884,0.0,0.334004,0.000,0.000000,0.025374,1.0,0.0,1.0,0.0,1.0
885,1.0,0.233476,0.000,0.000000,0.058556,0.0,0.0,0.0,0.0,1.0
886,0.0,0.273687,0.125,0.333333,0.045771,0.0,1.0,0.0,0.0,1.0
887,1.0,0.321438,0.000,0.000000,0.058556,0.0,0.0,1.0,0.0,0.0


In [11]:
train_dummy_df

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,0.0,0.271174,0.125,0.000000,0.014151,0.0,1.0,1.0,0.0,1.0
1,1.0,0.472229,0.125,0.000000,0.139136,0.0,0.0,0.0,0.0,0.0
2,1.0,0.321438,0.000,0.000000,0.015469,0.0,1.0,0.0,0.0,1.0
3,1.0,0.434531,0.125,0.000000,0.103644,0.0,0.0,0.0,0.0,1.0
4,0.0,0.434531,0.000,0.000000,0.015713,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
884,0.0,0.334004,0.000,0.000000,0.025374,1.0,0.0,1.0,0.0,1.0
885,1.0,0.233476,0.000,0.000000,0.058556,0.0,0.0,0.0,0.0,1.0
886,0.0,0.273687,0.125,0.333333,0.045771,0.0,1.0,0.0,0.0,1.0
887,1.0,0.321438,0.000,0.000000,0.058556,0.0,0.0,1.0,0.0,0.0


In [12]:
train_dummy_df = preprocess_df(train_df) 
train_class = train_dummy_df['Survived'].values
training_indices, validation_indices = training_indices, testing_indices = train_test_split(train_dummy_df.index,
                                                                                            stratify = train_class,
                                                                                            train_size=0.75, test_size=0.25)

In [13]:
training_indices.size, validation_indices.size

(666, 223)

In [14]:
tpot = TPOTClassifier(generations=5,verbosity=2)

tpot.fit(train_dummy_df.drop('Survived',axis=1).loc[training_indices].values,
         train_dummy_df.loc[training_indices,'Survived'].values)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.8468297609695881

Generation 2 - Current best internal CV score: 0.8468297609695881

Generation 3 - Current best internal CV score: 0.8543036696218158

Generation 4 - Current best internal CV score: 0.8543036696218158

Generation 5 - Current best internal CV score: 0.8543597800471329

Best pipeline: GradientBoostingClassifier(OneHotEncoder(input_matrix, minimum_fraction=0.1, sparse=False, threshold=10), learning_rate=0.1, max_depth=4, max_features=0.6000000000000001, min_samples_leaf=14, min_samples_split=19, n_estimators=100, subsample=0.9000000000000001)


TPOTClassifier(generations=5, verbosity=2)

In [16]:
tpot.score(train_dummy_df.drop('Survived',axis=1).loc[validation_indices].values,
           train_dummy_df.loc[validation_indices, 'Survived'].values)

0.8116591928251121

In [12]:
# 1st attemp score: 0.8071748878923767
# 2nd attempt score: 0.8026905829596412 --> added Parch, SibSp as dummy
# 3 attempt score: 0.8340807174887892 --> scaled data, used knn imputer, removed Parch,SubSp as dummies

In [13]:
tpot.export('tpot_titanic_pipeline2.py')

In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = train_dummy_df
features = train_dummy_df.drop('Survived', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, train_dummy_df['Survived'], random_state=None)

# Average CV score on the training set was: 0.8607788127034002
exported_pipeline = RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.3, min_samples_leaf=4, min_samples_split=2, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)


In [18]:
test_dummy_df = preprocess_df(test_df)
passenger_id = test_dummy_df['PassengerId']
test_dummy_df.drop(columns=['PassengerId'],inplace=True)
results = exported_pipeline.predict(test_dummy_df)
# need to run test df through the same pre-processing as train
# predict
# submit to kaggle

In [19]:
results

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0.,
       1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
       1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1.,
       0., 1., 0., 1., 0.

In [20]:
results_df = pd.DataFrame()

In [21]:
results_df['PassengerId'] = passenger_id
results_df['Survived'] = results

In [23]:
results_df.sort_values(by='PassengerId',inplace=True)

In [114]:
results_df.to_csv('kaggle-submission-1.csv', index=False)

In [24]:
results_df

Unnamed: 0,PassengerId,Survived
0,0.000000,0.0
1,0.002398,0.0
2,0.004796,0.0
3,0.007194,0.0
4,0.009592,0.0
...,...,...
413,0.990408,0.0
414,0.992806,1.0
415,0.995204,0.0
416,0.997602,0.0
