#### Import Library 

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from pycaret.classification import *


#### Load Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Feature Engineering

In [3]:
def fill_missing_values(df):
    # 填補年齡的缺失值為中位數
    df['Age'].fillna(df['Age'].median(), inplace=True)
    # 填補登船港口的缺失值為出現最多的港口
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    # 填補票價的缺失值為中位數
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    # 填補艙房的缺失值為 'U' (Unknown)
    df['Cabin'].fillna('U', inplace=True)
    return df

def encode_categorical_features(df):
    # 將性別轉換為數值
    df['Sex'] = LabelEncoder().fit_transform(df['Sex'])

    return df

def create_new_features(df):
    # 創建家庭規模特徵
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    # 創建是否獨自一人特徵
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    # 創建艙房類型特徵 (取第一個字母)
    df['CabinType'] = df['Cabin'].apply(lambda x: x[0])

    return df

# 使用這些函數進行特徵工程
def feature_engineering(df):
    df = fill_missing_values(df)
    df = encode_categorical_features(df)
    df = create_new_features(df)
    return df

feature_train = feature_engineering(train)
feature_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,CabinType
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,U,S,2,0,U
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,2,0,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,U,S,1,1,U
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,S,2,0,C
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,U,S,1,1,U
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,U,S,1,1,U
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,S,1,1,B
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,28.0,1,2,W./C. 6607,23.4500,U,S,4,0,U
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,C,1,1,C


#### Data Preparation

In [4]:
feature_train.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
train_data, test_data = train_test_split(feature_train.drop(['PassengerId','IsAlone','CabinType'], axis=1), random_state=100, train_size=0.8)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
408,0,3,1,21.0,0,0,7.775,S,1
480,0,3,1,9.0,5,2,46.9,S,8
510,1,3,1,29.0,0,0,7.75,Q,1
609,1,1,0,40.0,0,0,153.4625,S,1
547,1,2,1,28.0,0,0,13.8625,C,1


#### PyCaret setting

In [5]:
clf1 = setup(data = train_data, 
             target = 'Survived', 
             categorical_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked','FamilySize'],
             normalize=True, ) # 数据标准化)


Unnamed: 0,Description,Value
0,Session id,1914
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(712, 9)"
4,Transformed data shape,"(712, 32)"
5,Transformed train set shape,"(498, 32)"
6,Transformed test set shape,"(214, 32)"
7,Categorical features,8
8,Preprocess,True
9,Imputation type,simple


#### Compare Model

In [6]:
compare_models(fold = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7671,0.8057,0.7063,0.6851,0.6951,0.5068,0.5074,0.116
lda,Linear Discriminant Analysis,0.7671,0.8427,0.6845,0.6944,0.6874,0.5023,0.504,0.066
ridge,Ridge Classifier,0.7651,0.8431,0.6792,0.6923,0.6838,0.4974,0.4991,0.072
lightgbm,Light Gradient Boosting Machine,0.7651,0.8106,0.7064,0.6818,0.6931,0.5032,0.5041,0.102
rf,Random Forest Classifier,0.7632,0.8196,0.6954,0.6846,0.6889,0.4979,0.499,0.172
xgboost,Extreme Gradient Boosting,0.7592,0.8097,0.7063,0.6705,0.6876,0.492,0.4928,0.094
et,Extra Trees Classifier,0.7572,0.8074,0.7058,0.6684,0.6858,0.4882,0.4896,0.138
lr,Logistic Regression,0.7512,0.8283,0.6314,0.6896,0.6563,0.4623,0.4659,1.096
ada,Ada Boost Classifier,0.7471,0.7947,0.6474,0.6673,0.6545,0.4558,0.4582,0.104
knn,K Neighbors Classifier,0.747,0.8035,0.6364,0.6772,0.6541,0.4553,0.4576,0.106


#### Ensemble Model

In [7]:
lr = create_model('lr',fold = 5)
ridge = create_model('ridge',fold = 5)
svm = create_model('svm',fold = 5)
gbc = create_model('gbc',fold = 5)
lightgbm = create_model('lightgbm', fold = 5)
# xgboost = create_model('xgboost', fold = 5)

stacker = stack_models(estimator_list = [ridge], meta_model = lr)
stacker

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.72,0.8168,0.6757,0.6098,0.641,0.4125,0.414
1,0.76,0.8606,0.6316,0.7059,0.6667,0.4801,0.4819
2,0.68,0.7315,0.5526,0.5833,0.5676,0.3139,0.3142
3,0.8182,0.8814,0.7297,0.7714,0.75,0.6073,0.6079
4,0.7778,0.8511,0.5676,0.7778,0.6562,0.4979,0.5114
Mean,0.7512,0.8283,0.6314,0.6896,0.6563,0.4623,0.4659
Std,0.0476,0.0527,0.0662,0.0805,0.0583,0.0971,0.0982


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.75,0.8434,0.7297,0.6429,0.6835,0.4783,0.4809
1,0.81,0.8916,0.7368,0.7568,0.7467,0.5947,0.5948
2,0.7,0.7455,0.6053,0.6053,0.6053,0.3633,0.3633
3,0.8182,0.8823,0.7568,0.7568,0.7568,0.6116,0.6116
4,0.7475,0.8524,0.5676,0.7,0.6269,0.4392,0.4447
Mean,0.7651,0.8431,0.6792,0.6923,0.6838,0.4974,0.4991
Std,0.0438,0.0519,0.0772,0.0606,0.0611,0.0941,0.0933


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.71,0.7529,0.6486,0.6,0.6234,0.3882,0.389
1,0.71,0.8249,0.5789,0.6286,0.6027,0.375,0.3758
2,0.65,0.6768,0.5526,0.5385,0.5455,0.261,0.261
3,0.7879,0.8317,0.7297,0.7105,0.72,0.5493,0.5494
4,0.7475,0.8232,0.5676,0.7,0.6269,0.4392,0.4447
Mean,0.7211,0.7819,0.6155,0.6355,0.6237,0.4025,0.404
Std,0.0457,0.0599,0.0659,0.0641,0.0563,0.0937,0.0941


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.81,0.834,0.7838,0.725,0.7532,0.5992,0.6004
1,0.79,0.8309,0.7368,0.7179,0.7273,0.5566,0.5567
2,0.69,0.7133,0.6053,0.5897,0.5974,0.3454,0.3455
3,0.7475,0.7927,0.7027,0.65,0.6753,0.4692,0.4702
4,0.798,0.8577,0.7027,0.7429,0.7222,0.5637,0.5642
Mean,0.7671,0.8057,0.7063,0.6851,0.6951,0.5068,0.5074
Std,0.0439,0.0507,0.0586,0.0572,0.0549,0.0913,0.0915


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.79,0.837,0.7838,0.6905,0.7342,0.5618,0.5649
1,0.79,0.8245,0.7368,0.7179,0.7273,0.5566,0.5567
2,0.68,0.7141,0.5789,0.5789,0.5789,0.3209,0.3209
3,0.7475,0.8182,0.7027,0.65,0.6753,0.4692,0.4702
4,0.8182,0.859,0.7297,0.7714,0.75,0.6073,0.6079
Mean,0.7651,0.8106,0.7064,0.6818,0.6931,0.5032,0.5041
Std,0.0482,0.0502,0.0689,0.0648,0.0624,0.1015,0.1019


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8,0.8533,0.7778,0.7,0.7368,0.5763,0.5784
1,0.78,0.8879,0.7368,0.7,0.7179,0.5378,0.5383
2,0.84,0.916,0.7895,0.7895,0.7895,0.6604,0.6604
3,0.78,0.8659,0.6316,0.75,0.6857,0.5184,0.5229
4,0.72,0.781,0.5789,0.6471,0.6111,0.3934,0.3949
5,0.68,0.7538,0.5789,0.5789,0.5789,0.3209,0.3209
6,0.8,0.8727,0.5789,0.8462,0.6875,0.5479,0.5693
7,0.82,0.8557,0.7895,0.75,0.7692,0.6218,0.6224
8,0.7143,0.8396,0.5,0.6429,0.5625,0.3553,0.3614
9,0.8367,0.9086,0.6667,0.8571,0.75,0.6316,0.6426


#### Tuning Model

In [9]:
tuned_stacker = tune_model(
    stacker,
    n_iter=50,  # 調整迭代次數
    optimize='Accuracy',  # 優化指標
    search_library='scikit-learn',  # 使用的搜索庫
    search_algorithm='random'  # 使用的搜索算法
)

# 評估調整後的模型
evaluate_model(tuned_stacker)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.74,0.8533,0.8333,0.6,0.6977,0.48,0.5
1,0.84,0.8879,0.9474,0.72,0.8182,0.68,0.7005
2,0.82,0.916,0.7895,0.75,0.7692,0.6218,0.6224
3,0.82,0.8659,0.8421,0.7273,0.7805,0.6293,0.6342
4,0.72,0.781,0.6316,0.6316,0.6316,0.4058,0.4058
5,0.64,0.7538,0.6316,0.5217,0.5714,0.2659,0.2695
6,0.86,0.8727,0.7368,0.875,0.8,0.6935,0.6996
7,0.82,0.8557,0.8421,0.7273,0.7805,0.6293,0.6342
8,0.7551,0.8396,0.7222,0.65,0.6842,0.4851,0.4869
9,0.8367,0.9086,0.7778,0.7778,0.7778,0.6487,0.6487


Fitting 10 folds for each of 50 candidates, totalling 500 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [10]:
tune_pred = predict_model(tuned_stacker, data = test_data)
accuracy_score(tune_pred['Survived'], test_data['Survived'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0.7821,0.8412,0.7333,0.7432,0.7383,0.5517,0.5517


1.0