<a href="https://colab.research.google.com/github/eduion/AIOT/blob/main/HW4/HW4_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install pycaret
!pip install catboost
import pandas as pd
from sklearn.model_selection import train_test_split
from pycaret.classification import *

def simplify_ages(df):
    """Replace the numeric ``Age`` column with a labelled age-group category.

    Missing ages are filled with -0.5 so that they land in the first bin,
    ``'Unknown'``.  Bins are right-closed (the ``pd.cut`` default), so an age
    equal to an edge belongs to the lower group, e.g. 5 is ``'Baby'``.
    Ages outside ``(-1, 120]`` become missing values.

    Args:
        df: DataFrame with an ``Age`` column.  Modified in place.

    Returns:
        The same DataFrame object, with ``Age`` now categorical.
    """
    age_edges = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    age_labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young_Adult', 'Adult', 'Senior']

    # The sentinel -0.5 sits inside (-1, 0], i.e. the 'Unknown' bin.
    filled_ages = df['Age'].fillna(-0.5)
    df['Age'] = pd.cut(filled_ages, bins=age_edges, labels=age_labels)
    return df

def simplify_cabins(df):
    """Reduce each ``Cabin`` value to its first character.

    Missing cabins are filled with ``'N'`` first, so they end up as ``'N'``.

    Args:
        df: DataFrame with a ``Cabin`` column.  Modified in place.

    Returns:
        The same DataFrame object, with ``Cabin`` holding single characters.
    """
    def first_char(cabin):
        return cabin[0]

    filled_cabins = df['Cabin'].fillna('N')
    df['Cabin'] = filled_cabins.map(first_char)
    return df

def simplify_fares(df):
    """Replace the numeric ``Fare`` column with a labelled fare-band category.

    Missing fares are filled with -0.5 so that they land in the first bin,
    ``'Unknown'``.  Bins are right-closed (the ``pd.cut`` default), so a fare
    of exactly 0 also falls into ``'Unknown'`` and a fare of exactly 8 is
    ``'1_quartile'``.  Fares outside ``(-1, 1000]`` become missing values.

    Args:
        df: DataFrame with a ``Fare`` column.  Modified in place.

    Returns:
        The same DataFrame object, with ``Fare`` now categorical.
    """
    fare_edges = (-1, 0, 8, 15, 31, 1000)
    fare_labels = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']

    # The sentinel -0.5 sits inside (-1, 0], i.e. the 'Unknown' bin.
    filled_fares = df['Fare'].fillna(-0.5)
    df['Fare'] = pd.cut(filled_fares, bins=fare_edges, labels=fare_labels)
    return df

def format_name(df):
    """Add ``Lname`` and ``NamePrefix`` columns derived from ``Name``.

    ``Name`` is split on single spaces.  The first token becomes ``Lname``
    and the second becomes ``NamePrefix``.  Tokens are kept verbatim, so any
    punctuation attached to them (such as a trailing comma) is preserved.

    A name with fewer than two tokens, or a missing name, yields a missing
    value in the affected column instead of raising ``IndexError`` /
    ``AttributeError``.

    Args:
        df: DataFrame with a ``Name`` column.  Modified in place.

    Returns:
        The same DataFrame object, with the two new columns added.
    """
    # Vectorised split; `.str[i]` gives NaN when a row has no i-th token.
    name_tokens = df['Name'].str.split(' ')
    df['Lname'] = name_tokens.str[0]
    df['NamePrefix'] = name_tokens.str[1]
    return df

def drop_features(df):
    """Return a copy of ``df`` without the ``Ticket``, ``Name`` and ``Embarked`` columns.

    Args:
        df: DataFrame containing all three columns; a missing one raises
            ``KeyError``.  The input is not modified.

    Returns:
        A new DataFrame with the three columns removed.
    """
    unwanted = ['Ticket', 'Name', 'Embarked']
    return df.drop(columns=unwanted)

def transform_features(df):
    """Run the full feature-engineering pipeline on ``df``.

    The steps are applied in this order: bin ages, shorten cabins, bin fares,
    derive the name columns, then drop the columns that are no longer needed.
    ``format_name`` has to run before ``drop_features`` because the latter
    removes ``Name``.

    Args:
        df: Raw passenger DataFrame.

    Returns:
        The DataFrame produced by the last step.
    """
    pipeline = (
        simplify_ages,
        simplify_cabins,
        simplify_fares,
        format_name,
        drop_features,
    )
    for step in pipeline:
        df = step(df)
    return df

def rename_features(df):
    """Replace every non-word character in the column names with ``_``.

    Args:
        df: DataFrame whose column labels are strings.  Modified in place.

    Returns:
        The same DataFrame object, with sanitised column names.
    """
    sanitized_columns = df.columns.str.replace(r'[^\w]', '_', regex=True)
    df.columns = sanitized_columns
    return df

# Main program: load the data, engineer features, then compare, blend and
# tune PyCaret classifiers and predict on a held-out split.
if __name__ == "__main__":
    # Load the dataset
    data_train = pd.read_csv('./train.csv')  # change this path to where the data actually lives

    # Transform the features, then sanitise the column names
    data_train = transform_features(data_train)
    data_train = rename_features(data_train)

    # Split the data: PassengerId is dropped, 80% of the rows go to
    # train_data and the rest to test_data (fixed seed for reproducibility)
    train_data, test_data = train_test_split(data_train.drop(['PassengerId'], axis=1), random_state=100, train_size=0.8)

    # Configure PyCaret on the training portion only; 'Survived' is the target
    # and the binned/derived columns are declared as categorical
    clf1 = setup(data=train_data,
                 target='Survived',
                 categorical_features=['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Lname', 'NamePrefix'],
                 session_id=123)

    # 1. Compare models and pick the best one (ranked by accuracy, 5-fold CV)
    best_model = compare_models(fold=5, sort='Accuracy')

    # 2. Ensemble the models
    top_models = [create_model(m, fold=5) for m in ['lr', 'knn', 'nb', 'dt', 'rf']]  # hand-picked models
    tuned_models = [tune_model(m, fold=5, optimize='Accuracy') for m in top_models]  # hyperparameter tuning
    ensemble_model = blend_models(estimator_list=tuned_models, fold=5)

    # 3. Hyperparameter optimisation of the blended model
    optimized_model = tune_model(ensemble_model, fold=5, optimize='Accuracy')

    # Finalize the tuned blend and predict on the held-out test_data.
    # NOTE(review): best_model from step 1 is only printed below; the
    # predictions come from the tuned blend, not from best_model.
    final_model = finalize_model(optimized_model)
    predictions = predict_model(final_model, data=test_data)

    # Print the best model and the predictions
    print("Best Model:", best_model)
    print("Predictions:")
    print(predictions)





Unnamed: 0,Description,Value
0,Session id,123
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(712, 10)"
4,Transformed data shape,"(712, 30)"
5,Transformed train set shape,"(498, 30)"
6,Transformed test set shape,"(214, 30)"
7,Numeric features,2
8,Categorical features,7
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8031,0.8557,0.6782,0.7711,0.7198,0.5692,0.5735,0.152
lr,Logistic Regression,0.8011,0.871,0.6993,0.7542,0.7219,0.5679,0.5724,0.26
knn,K Neighbors Classifier,0.7972,0.8454,0.6519,0.7714,0.7057,0.553,0.5581,0.264
et,Extra Trees Classifier,0.7771,0.829,0.6947,0.7066,0.7001,0.5228,0.5233,0.452
rf,Random Forest Classifier,0.7691,0.8276,0.6945,0.6948,0.6926,0.5078,0.5099,0.344
gbc,Gradient Boosting Classifier,0.7671,0.7646,0.7003,0.6865,0.693,0.5054,0.5058,0.262
catboost,CatBoost Classifier,0.7511,0.7845,0.6161,0.6935,0.6465,0.4571,0.463,0.948
lightgbm,Light Gradient Boosting Machine,0.7489,0.7494,0.7161,0.6501,0.6812,0.4748,0.4767,1.182
dt,Decision Tree Classifier,0.7449,0.7351,0.6952,0.6521,0.6722,0.4642,0.4655,0.166
ada,Ada Boost Classifier,0.7408,0.7487,0.6687,0.6652,0.659,0.4517,0.4582,0.248


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.79,0.8863,0.6486,0.75,0.6957,0.5366,0.5399
1,0.81,0.8873,0.7895,0.7317,0.7595,0.6028,0.604
2,0.84,0.9107,0.8421,0.7619,0.8,0.6672,0.6695
3,0.7475,0.8184,0.5676,0.7,0.6269,0.4392,0.4447
4,0.8182,0.8524,0.6486,0.8276,0.7273,0.5939,0.6038
Mean,0.8011,0.871,0.6993,0.7542,0.7219,0.5679,0.5724
Std,0.0313,0.0322,0.101,0.0422,0.0588,0.0766,0.0759


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.75,0.813,0.6216,0.6765,0.6479,0.4546,0.4556
1,0.83,0.9003,0.7105,0.8182,0.7606,0.6298,0.6336
2,0.81,0.8374,0.6842,0.7879,0.7324,0.5862,0.5897
3,0.7374,0.7916,0.5135,0.7037,0.5938,0.4066,0.4177
4,0.8586,0.8849,0.7297,0.871,0.7941,0.6877,0.6939
Mean,0.7972,0.8454,0.6519,0.7714,0.7057,0.553,0.5581
Std,0.0465,0.0414,0.0782,0.0721,0.074,0.1061,0.1052


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.71,0.7849,0.7838,0.58,0.6667,0.42,0.435
1,0.76,0.8073,0.5789,0.7333,0.6471,0.469,0.4766
2,0.61,0.7944,0.9211,0.493,0.6422,0.2914,0.3641
3,0.7374,0.7369,0.6757,0.641,0.6579,0.445,0.4454
4,0.7778,0.794,0.8108,0.6667,0.7317,0.5451,0.5527
Mean,0.719,0.7835,0.7541,0.6228,0.6691,0.4341,0.4548
Std,0.0591,0.0244,0.1173,0.0814,0.0324,0.0827,0.0613


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.72,0.722,0.7297,0.6,0.6585,0.4251,0.4309
1,0.76,0.7453,0.6842,0.6842,0.6842,0.4907,0.4907
2,0.77,0.7585,0.7105,0.6923,0.7013,0.5144,0.5145
3,0.7071,0.6953,0.6486,0.6,0.6234,0.3843,0.3851
4,0.7677,0.7546,0.7027,0.6842,0.6933,0.5064,0.5065
Mean,0.7449,0.7351,0.6952,0.6521,0.6722,0.4642,0.4655
Std,0.0262,0.0236,0.0275,0.0427,0.0283,0.0508,0.0498


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.73,0.8057,0.6216,0.6389,0.6301,0.4176,0.4177
1,0.77,0.8563,0.7105,0.6923,0.7013,0.5144,0.5145
2,0.8,0.8391,0.8158,0.7045,0.7561,0.5881,0.5927
3,0.7273,0.7609,0.6486,0.6316,0.64,0.4206,0.4206
4,0.8182,0.8758,0.6757,0.8065,0.7353,0.5985,0.6039
Mean,0.7691,0.8276,0.6945,0.6948,0.6926,0.5078,0.5099
Std,0.0364,0.0405,0.0674,0.0628,0.0502,0.0781,0.0802


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.79,0.8833,0.6757,0.7353,0.7042,0.5419,0.543
1,0.82,0.8809,0.7895,0.75,0.7692,0.6218,0.6224
2,0.83,0.914,0.7368,0.8,0.7671,0.6336,0.6349
3,0.7475,0.8088,0.5676,0.7,0.6269,0.4392,0.4447
4,0.8081,0.852,0.6216,0.8214,0.7077,0.5689,0.5811
Mean,0.7991,0.8678,0.6782,0.7613,0.715,0.5611,0.5652
Std,0.0291,0.0354,0.0791,0.044,0.0521,0.0696,0.0684


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.82,0.8007,0.7297,0.7714,0.75,0.6095,0.6101
1,0.81,0.9051,0.6579,0.8065,0.7246,0.5819,0.5889
2,0.82,0.8345,0.7895,0.75,0.7692,0.6218,0.6224
3,0.7071,0.7831,0.5946,0.6111,0.6027,0.3708,0.3709
4,0.798,0.8749,0.7027,0.7429,0.7222,0.5637,0.5642
Mean,0.791,0.8397,0.6949,0.7364,0.7138,0.5496,0.5513
Std,0.0427,0.0453,0.0658,0.0664,0.0581,0.0917,0.0923


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.77,0.8213,0.7297,0.675,0.7013,0.5148,0.5158
1,0.77,0.8219,0.6316,0.7273,0.6761,0.4991,0.5021
2,0.79,0.8343,0.6842,0.7429,0.7123,0.5474,0.5486
3,0.7475,0.7827,0.6486,0.6667,0.6575,0.4576,0.4577
4,0.798,0.8045,0.7297,0.7297,0.7297,0.5684,0.5684
Mean,0.7751,0.8129,0.6848,0.7083,0.6954,0.5175,0.5185
Std,0.0177,0.0178,0.0404,0.0312,0.0257,0.0385,0.0384


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.81,0.7823,0.6757,0.7812,0.7246,0.5808,0.5843
1,0.79,0.7848,0.7632,0.7073,0.7342,0.561,0.5621
2,0.79,0.7797,0.7368,0.7179,0.7273,0.5566,0.5567
3,0.7475,0.7167,0.5946,0.6875,0.6377,0.4454,0.4482
4,0.7576,0.7193,0.5676,0.7241,0.6364,0.4585,0.4662
Mean,0.779,0.7565,0.6676,0.7236,0.692,0.5205,0.5235
Std,0.023,0.0316,0.0766,0.0314,0.045,0.0567,0.0552


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.75,0.8606,0.7568,0.6364,0.6914,0.4839,0.489
1,0.81,0.8665,0.8158,0.7209,0.7654,0.6068,0.6101
2,0.82,0.9005,0.8158,0.7381,0.775,0.6256,0.6278
3,0.7172,0.7936,0.7297,0.6,0.6585,0.4211,0.4269
4,0.8485,0.8577,0.8649,0.7619,0.8101,0.6849,0.6887
Mean,0.7891,0.8558,0.7966,0.6915,0.7401,0.5645,0.5685
Std,0.0482,0.0347,0.0479,0.0623,0.0562,0.0971,0.096


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8,0.8786,0.7027,0.7429,0.7222,0.5662,0.5667
1,0.85,0.9056,0.8421,0.7805,0.8101,0.6865,0.6878
2,0.81,0.8988,0.7632,0.7436,0.7532,0.5988,0.599
3,0.7677,0.8372,0.6757,0.6944,0.6849,0.501,0.5011
4,0.8182,0.8511,0.7027,0.7879,0.7429,0.6029,0.6053
Mean,0.8092,0.8742,0.7373,0.7499,0.7427,0.5911,0.592
Std,0.0267,0.0265,0.0597,0.0333,0.041,0.0601,0.0605


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.84,0.8782,0.7297,0.8182,0.7714,0.649,0.6515
1,0.86,0.9043,0.8421,0.8,0.8205,0.7059,0.7065
2,0.85,0.8911,0.8158,0.7949,0.8052,0.6833,0.6834
3,0.7576,0.8337,0.6216,0.697,0.6571,0.4706,0.4724
4,0.8081,0.8633,0.6486,0.8,0.7164,0.5738,0.581
Mean,0.8231,0.8741,0.7316,0.782,0.7541,0.6165,0.619
Std,0.0371,0.0244,0.0875,0.0433,0.0602,0.0856,0.0846


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8045,0.8646,0.6133,0.8846,0.7244,0.5805,0.6039


Best Model: RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, positive=False, random_state=123, solver='auto',
                tol=0.0001)
Predictions:
     Pclass     Sex          Age  SibSp  Parch        Fare Cabin  \
205       3  female         Baby      0      1  2_quartile     G   
44        3  female      Student      0      0  1_quartile     N   
821       3    male  Young_Adult      0      0  2_quartile     N   
458       2  female        Adult      0      0  2_quartile     N   
795       2    male        Adult      0      0  2_quartile     N   
..      ...     ...          ...    ...    ...         ...   ...   
247       2  female      Student      0      2  2_quartile     N   
259       2  female        Adult      0      1  3_quartile     N   
353       3    male      Student      1      0  3_quartile     N   
783       3    male      Unknown      1      2  3_quartile     N   
852       3  female        Child      1 