In [75]:
import pandas as pd
from pycaret.classification import *

session_id = 1234

In [76]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [77]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [79]:
df.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

Based on the number of unique variables the following data types should change
1. Pclass - ordinal
2. Sex - categorical
3. Embarked - categorical

PassengerId, Cabin, Name, and Ticket will be dropped because they are unique identifiers or missing to much data. 

We can create new feature variables from some of our existing columns such as total family size and the fare per person

### Updating Train Data

In [80]:
# imputing mean for age and mode for embarked
most_common_embarked = df['Embarked'].mode()[0] 
df = df.fillna({'Age':df['Age'].mean(), 'Embarked':most_common_embarked})

# adding feature variables
df['Family_Size'] = df['SibSp'] + df['Parch'] + 1
df['Fare_Per_Person'] = df['Fare'] / df['Family_Size']

### Updating Test Data

In [81]:
# imputing mean for age and mode for embarked
test_df = test_df.fillna({'Age':test_df['Age'].mean(), 'Embarked':most_common_embarked})

# adding feature variables
test_df['Family_Size'] = test_df['SibSp'] + test_df['Parch'] + 1
test_df['Fare_Per_Person'] = test_df['Fare'] / test_df['Family_Size']

In [82]:
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Size,Fare_Per_Person
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,3.625
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,35.64165
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,26.55
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,8.05
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,,Q,1,8.4583
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,51.8625
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,5,4.215
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,3,3.7111
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,2,15.0354


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PassengerId      891 non-null    int64  
 1   Survived         891 non-null    int64  
 2   Pclass           891 non-null    int64  
 3   Name             891 non-null    object 
 4   Sex              891 non-null    object 
 5   Age              891 non-null    float64
 6   SibSp            891 non-null    int64  
 7   Parch            891 non-null    int64  
 8   Ticket           891 non-null    object 
 9   Fare             891 non-null    float64
 10  Cabin            204 non-null    object 
 11  Embarked         891 non-null    object 
 12  Family_Size      891 non-null    int64  
 13  Fare_Per_Person  891 non-null    float64
dtypes: float64(3), int64(6), object(5)
memory usage: 97.6+ KB


## Pycaret Set up

In [84]:
models = setup(data=df,
               target='Survived', train_size=0.8,
               ignore_features=['PassengerId', 'Name', 'Ticket', 'Cabin'],
               numeric_features=['SibSp', 'Parch', 'Fare','Family_Size','Fare_Per_Person'],
               categorical_features=['Sex', 'Embarked'],
               ordinal_features={'Pclass':['1','2','3']},
               normalize=True,
               session_id=session_id
               )

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 14)"
4,Transformed data shape,"(891, 12)"
5,Transformed train set shape,"(712, 12)"
6,Transformed test set shape,"(179, 12)"
7,Ignore features,4
8,Ordinal features,2
9,Numeric features,5


In [85]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8245,0.8848,0.7402,0.7902,0.7621,0.6237,0.6266,0.019
catboost,CatBoost Classifier,0.8231,0.8904,0.7401,0.7868,0.7608,0.621,0.6235,0.339
lightgbm,Light Gradient Boosting Machine,0.8174,0.8703,0.7581,0.7624,0.7588,0.6122,0.6136,0.066
rf,Random Forest Classifier,0.8005,0.8742,0.7358,0.745,0.7378,0.5772,0.5798,0.034
ada,Ada Boost Classifier,0.7964,0.849,0.7507,0.7282,0.7383,0.5718,0.573,0.018
knn,K Neighbors Classifier,0.7949,0.842,0.7033,0.7491,0.7232,0.5609,0.5639,0.014
et,Extra Trees Classifier,0.7949,0.8457,0.7357,0.7338,0.7311,0.5659,0.5695,0.03
lr,Logistic Regression,0.7879,0.8307,0.6702,0.7553,0.7083,0.5428,0.547,0.012
ridge,Ridge Classifier,0.7851,0.0,0.6776,0.7442,0.707,0.5383,0.542,0.01
lda,Linear Discriminant Analysis,0.7851,0.8275,0.6776,0.7442,0.707,0.5383,0.542,0.01


In [86]:
tune = tune_model(best)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7917,0.8255,0.6429,0.7826,0.7059,0.547,0.5533
1,0.8194,0.8933,0.7143,0.8,0.7547,0.6126,0.615
2,0.7887,0.8157,0.6667,0.75,0.7059,0.5419,0.5442
3,0.8873,0.9082,0.8519,0.8519,0.8519,0.7609,0.7609
4,0.7746,0.8035,0.6667,0.72,0.6923,0.5149,0.5159
5,0.8451,0.8822,0.8519,0.7667,0.807,0.6782,0.6808
6,0.8592,0.9276,0.7778,0.84,0.8077,0.6968,0.6981
7,0.7887,0.8552,0.7037,0.7308,0.717,0.5485,0.5488
8,0.7746,0.8872,0.7407,0.6897,0.7143,0.5286,0.5296
9,0.8592,0.9352,0.7857,0.8462,0.8148,0.7014,0.7027


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [90]:
out = predict_model(tune, test_df)
out.rename(columns={'prediction_label': 'Survived'}, inplace=True)
out[['PassengerId','Survived']].to_csv('submission.csv', index=False)