# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pandas_profiling as pp
from pycaret.classification import *

# Variables to Set

In [2]:
data_file = 'classification_dataset.csv'
target_column = 'Survived'

# Load/Transform Data

In [37]:
df = pd.read_csv(data_file)
df[target_column] = df[target_column].astype(object)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Feature Engineering

In [45]:
age_bin_labels = [ 'Child', 'Young', 'Middle', 'Elderly']
df['Age_Bin_Label'] = pd.qcut(df['Age'],
                              q=4,
                              labels=age_bin_labels)

df['Dr'] = np.where(df['Name'].str.contains('Dr. '),1,0)
df['Age_Impute'] = df['Age'].fillna(df.groupby('Sex')['Age'].transform('mean'))
df['Fare_Max'] = np.where(df['Fare']>=45,45,df['Fare'])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Bin_Label,Dr,Age_Impute,Fare_Max
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Young,0,22.0,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Middle,0,38.0,45.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Young,0,26.0,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Middle,0,35.0,45.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Middle,0,35.0,8.05


In [46]:
df.dtypes

PassengerId         int64
Survived           object
Pclass              int64
Name               object
Sex                object
Age               float64
SibSp               int64
Parch               int64
Ticket             object
Fare              float64
Cabin              object
Embarked           object
Age_Bin_Label    category
Dr                  int32
Age_Impute        float64
Fare_Max          float64
dtype: object

# Profile Data

In [47]:
pp.ProfileReport(df)

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=29.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))






# Split into Training/Test and Validation Data

In [48]:
data = df.sample(frac=0.75, random_state=786)
data_unseen = df.drop(data.index)

data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (668, 16)
Unseen Data For Predictions: (223, 16)


# Setup Dataset with Pycaret

In [49]:
cla = setup(data = data, 
            target = target_column , 
            session_id=123,
            silent = True,
            verbose = True) 

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Survived
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(668, 16)"
5,Missing Values,True
6,Numeric Features,6
7,Categorical Features,9
8,Ordinal Features,False
9,High Cardinality Features,False


# Compare Classification Models

In [50]:
compare_models(
#         include = ['']
#         exclude = ['']
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8138,0.874,0.6842,0.8292,0.7443,0.6017,0.6115,0.247
rf,Random Forest Classifier,0.8119,0.8594,0.7105,0.8046,0.7479,0.6005,0.6078,0.212
dt,Decision Tree Classifier,0.8052,0.7919,0.7211,0.7915,0.7447,0.5897,0.5989,0.023
ridge,Ridge Classifier,0.8029,0.0,0.7158,0.7782,0.7412,0.5836,0.5881,0.032
lightgbm,Light Gradient Boosting Machine,0.7988,0.8522,0.7211,0.7678,0.7348,0.5753,0.5813,0.213
lr,Logistic Regression,0.7965,0.8595,0.7053,0.7724,0.7323,0.5699,0.5752,0.997
et,Extra Trees Classifier,0.7925,0.8617,0.7,0.7771,0.7275,0.5623,0.5706,0.222
ada,Ada Boost Classifier,0.7622,0.828,0.6684,0.7236,0.6908,0.4991,0.5031,0.129
knn,K Neighbors Classifier,0.5994,0.5893,0.3789,0.5056,0.4297,0.1339,0.1376,0.05
lda,Linear Discriminant Analysis,0.5978,0.6043,0.5526,0.5019,0.5244,0.1774,0.1797,0.077


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

# Build a Logistic Regression Model

In [51]:
lr = create_model('lr')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8723,0.8891,0.8421,0.8421,0.8421,0.735,0.735
1,0.8085,0.9041,0.7368,0.7778,0.7568,0.5991,0.5996
2,0.8085,0.8233,0.7368,0.7778,0.7568,0.5991,0.5996
3,0.8085,0.8684,0.6842,0.8125,0.7429,0.5921,0.5976
4,0.766,0.7199,0.5263,0.8333,0.6452,0.4835,0.512
5,0.8936,0.9474,0.8421,0.8889,0.8649,0.7773,0.778
6,0.6596,0.7256,0.4211,0.6154,0.5,0.2554,0.266
7,0.7826,0.9025,0.7368,0.7368,0.7368,0.5517,0.5517
8,0.8478,0.9376,0.8947,0.7727,0.8293,0.6933,0.6994
9,0.7174,0.8772,0.6316,0.6667,0.6486,0.4126,0.413


# Tune Logistic Regression Model

In [52]:
tuned_lr = tune_model(lr)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8936,0.8872,0.8947,0.85,0.8718,0.781,0.7817
1,0.8298,0.9173,0.7895,0.7895,0.7895,0.6466,0.6466
2,0.8085,0.8402,0.7368,0.7778,0.7568,0.5991,0.5996
3,0.8511,0.8684,0.7895,0.8333,0.8108,0.6882,0.6888
4,0.766,0.7068,0.5263,0.8333,0.6452,0.4835,0.512
5,0.8936,0.9192,0.8421,0.8889,0.8649,0.7773,0.778
6,0.7234,0.6974,0.5789,0.6875,0.6286,0.4108,0.4147
7,0.7826,0.9064,0.7368,0.7368,0.7368,0.5517,0.5517
8,0.8043,0.9337,0.8421,0.7273,0.7805,0.6057,0.611
9,0.6957,0.8772,0.6316,0.6316,0.6316,0.3723,0.3723


In [53]:
print(tuned_lr)

LogisticRegression(C=6.718000000000001, class_weight={}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


# Evaluate Logistic Regression Model

In [54]:
evaluate_model(lr)

Unnamed: 0,Parameters
C,1.0
class_weight,
dual,False
fit_intercept,True
intercept_scaling,1
l1_ratio,
max_iter,1000
multi_class,auto
n_jobs,
penalty,l2


# Predict on Test Dataset

In [55]:
predict_model(tuned_lr)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7662,0.856,0.8406,0.617,0.7117,0.5227,0.5404


Unnamed: 0,PassengerId,Fare,Dr,Age_Impute,Fare_Max,Pclass_1,Pclass_2,Pclass_3,Name_Abelson Mrs. Samuel (Hannah Wizosky),Name_Aks Mrs. Sam (Leah Rosen),...,Embarked_Q,Embarked_S,Age_Bin_Label_Child,Age_Bin_Label_Elderly,Age_Bin_Label_Middle,Age_Bin_Label_Young,Age_Bin_Label_not_available,Survived,Label,Score
0,395.0,16.700001,0,24.000000,16.700001,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,1,0.5866
1,231.0,83.474998,0,35.000000,45.000000,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,1,0.9789
2,542.0,31.275000,0,9.000000,31.275000,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0.6618
3,131.0,7.895800,0,33.000000,7.895800,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.6011
4,239.0,10.500000,0,19.000000,10.500000,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0,1,0.5780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,688.0,10.170800,0,19.000000,10.170800,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0,0,0.6544
197,719.0,15.500000,0,30.726645,15.500000,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.7007
198,110.0,24.150000,0,27.915709,24.150000,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0.9479
199,3.0,7.925000,0,26.000000,7.925000,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,1,0.8859


# Predict on Validation Data

In [56]:
unseen_predictions = predict_model(tuned_lr, data=data_unseen)
unseen_predictions

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Bin_Label,Dr,Age_Impute,Fare_Max,Label,Score
0,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Middle,0,35.000000,45.0000,1,0.9937
1,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,,0,30.726645,8.4583,0,0.7783
2,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,Child,0,2.000000,21.0750,0,0.9874
3,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S,Elderly,0,58.000000,26.5500,1,0.9488
4,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.2750,,S,Elderly,0,39.000000,31.2750,0,0.9839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C,Child,0,15.000000,7.2250,1,0.9933
219,877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20.0,0,0,7534,9.8458,,S,Child,0,20.000000,9.8458,0,0.6417
220,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,Elderly,0,56.000000,45.0000,1,0.9918
221,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,Young,0,22.000000,10.5167,1,0.9453


In [57]:
from pycaret.utils import check_metric
check_metric(unseen_predictions[target_column], unseen_predictions.Label, 'AUC')

0.7788

# Predict on Kaggle Titanic Data

In [58]:
kaggle_test = pd.read_csv('classification_dataset_validation.csv')
kaggle_predictions = predict_model(tuned_lr, data=kaggle_test)
kaggle_predictions

KeyError: "['Fare_Max', 'Age_Bin_Label', 'Dr', 'Age_Impute'] not in index"