# Data preprocessing

In [1]:
import pandas as pd

In [2]:
# Function for data preprocessing
def preprocess_dataframe(df):
    # 轉換資料類型
    ## 先全部轉類別
    df = df.astype('category')
    
    ## 然後把數值轉回來
    df[['locdt', 'loctm', 'flam1', 'csmam']] = df[['locdt', 'loctm', 'flam1', 'csmam']].astype('int64')
    df[['conam', 'iterm']] = df[['conam', 'iterm']].astype('float64')

    ## label 也要轉成數值，不轉模型訓練會出現 error
    if 'label' in df.columns:
        df['label'] = df['label'].astype('int64')

    # 缺失值填充
    columns_to_fill = ['etymd', 'mcc', 'stocn', 'scity', 'hcefg', 'csmcu']
    for column in columns_to_fill:
        df[column] = df[column].cat.add_categories('-1').fillna('-1')

    # 刪除不需要的欄位
    ## stscd(狀態碼)幾乎全部都是缺失值，也應該不是重要特徵，先就刪掉這欄
    ## txkey全部都是唯一資料，刪除不用
    df.drop(['stscd', 'txkey'], axis=1, inplace=True)

    return df

In [4]:
# 讀取資料
public_df = pd.read_csv('/Users/chunyu/Desktop/ESun_data_project/Other_lgbm_exp/public.csv')
train_df = pd.read_csv('/Users/chunyu/Desktop/ESun_data_project/Other_lgbm_exp/training.csv')
private_df = pd.read_csv('/Users/chunyu/Desktop/ESun_data_project/Other_lgbm_exp/private_1_processed.csv')

# 準備上傳用資料(只保留 txkey 欄位df)
final_df = private_df[['txkey']]

# 預處理資料
public_df = preprocess_dataframe(public_df)
train_df = preprocess_dataframe(train_df)
private_df = preprocess_dataframe(private_df)

## Split test set for comparison

In [21]:
from sklearn.model_selection import train_test_split

#分割特徵和目標變數
X = public_df.drop('label', axis=1) # 特徵
y = public_df['label'] # 目標變數

# 分割出測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# 組合出訓練df
y_train_df = pd.DataFrame(y_train, columns=['label'])
public_train_df = pd.concat([X_train, y_train_df], axis=1)

# 將 y 轉為一維 NumPy 陣列
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Create pycaret object

In [8]:
from pycaret.classification import ClassificationExperiment
exp = ClassificationExperiment()

# Experiment: Compare the Preformance of Original Model and the Incremental Model

## Import origin model trained by pycaret

In [9]:
bst_pycaret_lgbm = exp.load_model('17_bst_threshold_caliberate')

Transformation Pipeline and Model Successfully Loaded


## Incremental training with LightGBM

In [10]:
import numpy as np
import lightgbm as lgb
from imblearn.over_sampling import RandomOverSampler # oversampling 過採樣
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_curve, auc

In [11]:
lgb_model = bst_pycaret_lgbm[-1]
lgb_model

In [12]:
# 設定categorical variables
# 取得剩下的名稱列表
all_columns = train_df.columns.to_list()
# select categroy columns and without "label"
categorical_columns = [col for col in train_df.columns if train_df[col].dtype == 'category' and col != 'label']

# 針對少數樣本進行 resample (Oversampling)
# 創建 RandomOverSampler 對象
oversample = RandomOverSampler(sampling_strategy='minority')
# 應用 Oversampling
X_train_resampled, y_train_resampled = oversample.fit_resample(X_train, y_train)

# 將資料轉換為 LightGBM 格式
train_data = lgb.Dataset(X_train_resampled, categorical_feature=categorical_columns, label=y_train_resampled)
test_data = lgb.Dataset(X_test, categorical_feature=categorical_columns, label=y_test, reference=train_data)

# 設置 LightGBM 參數
params = {
    'objective': 'binary',  # 二元分類問題
    'boosting_type': 'gbdt',  # 使用梯度提升決策樹
    'metric': 'binary_logloss',  # 使用二元對數損失作為評估指標
    'bagging_fraction': 0.9,
    'bagging_freq': 3,
    'feature_fraction': 0.5,
    'learning_rate': 0.4,
    'min_child_samples': 6,
    'min_split_gain': 0.3,
    'n_estimators': 20,
    'n_jobs': 7,
    'num_leaves': 150,
    'random_state': 123,
    'reg_alpha': 0.005,
    'reg_lambda': 0.0005
}

incremental_train_trial = lgb.train(
    params,
    train_set=train_data,
    num_boost_round=100,
    valid_sets=test_data,
    init_model=lgb_model,
)

[LightGBM] [Info] Number of positive: 485460, number of negative: 485460
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017870 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17246
[LightGBM] [Info] Number of data points in the train set: 970920, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


## Predictions of incremental model

In [13]:
y_pred_incremental = incremental_train_trial.predict(X_test, num_iteration=incremental_train_trial.best_iteration)
y_pred_incremental_binary = (y_pred_incremental > 0.5).astype(int)

# 評估模型性能
accuracy = accuracy_score(y_test, y_pred_incremental_binary)
print("Accuracy:", accuracy)

# 顯示分類報告
print("Classification Report:")
print(classification_report(y_test, y_pred_incremental_binary))


Accuracy: 0.9988178116379877
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    121396
           1       0.92      0.72      0.80       412

    accuracy                           1.00    121808
   macro avg       0.96      0.86      0.90    121808
weighted avg       1.00      1.00      1.00    121808



In [14]:
# Optimize threshold by AUC(PR)
# 計算 PR 曲線
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_incremental)

# 計算 AUC(PR)
area = auc(recall, precision)
print("PR AUC:", area)

# 尋找最佳閾值
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]
print("Best Threshold:", best_threshold)

# 使用最佳閾值重新分類
y_pred_incremental_binary = (y_pred_incremental > best_threshold).astype(int)

# 評估模型性能
accuracy = accuracy_score(y_test, y_pred_incremental_binary)
print("Accuracy:", accuracy)

# 顯示更新的 classification report
print("Updated Classification Report:")
print(classification_report(y_test, y_pred_incremental_binary))

PR AUC: 0.7774099361497807
Best Threshold: 0.4608699092212707
Accuracy: 0.9988096019965848
Updated Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    121396
           1       0.91      0.72      0.80       412

    accuracy                           1.00    121808
   macro avg       0.96      0.86      0.90    121808
weighted avg       1.00      1.00      1.00    121808



## Predictions of original model

In [16]:
y_pred_origin = exp.predict_model(bst_pycaret_lgbm, data=X_test)



In [17]:
#y_pred_origin['prediction_label'].to_numpy()
y_pred_origin_binary = y_pred_origin['prediction_label'].values

# 評估模型性能
accuracy = accuracy_score(y_test, y_pred_origin_binary)
print("Accuracy:", accuracy)

# 顯示分類報告
print("Classification Report:")
print(classification_report(y_test, y_pred_origin_binary))

Accuracy: 0.9983498620780245
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    121396
           1       0.91      0.57      0.70       412

    accuracy                           1.00    121808
   macro avg       0.95      0.79      0.85    121808
weighted avg       1.00      1.00      1.00    121808



# Experiment: create and train new model with new data added

In [25]:
# Combine original data and 70% new data of public df
train_cb = pd.concat([train_df, public_train_df], ignore_index=True)


In [None]:
# init setup on exp
exp.setup(train_cb, target='label',
          fix_imbalance=True, fix_imbalance_method='RandomOverSampler',
          n_jobs=7,
          fold=5,
          log_experiment=True, experiment_name='03_add_t1',
          session_id=123)

In [None]:
lgbm_add =  exp.create_model('lgbm')

In [None]:
tuned_lgbm_add = exp.tune_model(lgbm_add, optimize='F1')
exp.save_model(tune_lgbm_add, '31_tuned_lgbm_add')

In [None]:
tune_lgbm_threshold_add = exp.optimize_threshold(tuned_lgbm_add, optimize='F1')
exp.save_model(tune_lgbm_threshold_add, '32_tune_lgbm_threshold_add')

In [None]:
tune_lgbm_threshold_add_caliberate_add = exp.caliberate_model(tune_lgbm_threshold_add)
exp.save_model(tune_lgbm_threshold_add, '33_tune_lgbm_threshold_add_caliberate_add')

In [None]:
final_lgbm_add = exp.finalize_model(tune_lgbm_threshold_add_caliberate_add)
exp.save_model(tune_lgbm_threshold_add, '34_final_lgbm_add')

## Prediction using re-traininf model (0.2 not use for training)

In [None]:
y_pred_retrain = exp.predict_model(final_lgbm_add, data=X_test)

In [None]:
#y_pred_retrain['prediction_label'].to_numpy()
y_pred_retrain_binary = y_pred_retrain['prediction_label'].values

# 評估模型性能
accuracy = accuracy_score(y_test, y_pred_retrain_binary)
print("Accuracy:", accuracy)

# 顯示分類報告
print("Classification Report:")
print(classification_report(y_test, y_pred_retrain_binary))

# Predict using original model

In [14]:
prediction_bst_lgbm = exp.predict_model(bst_pycaret_lgbm, data=private_df)
final_df['label'] = prediction_bst_lgbm[['prediction_label']]
#print(final_df['label'].value_counts())
final_df.to_csv('Original_final_lgbm_model_prediction.csv', index=False)

0    752565
1      1574
Name: label, dtype: int64


# Predict using incremental model (0.3 not use for training)

In [50]:
# 預測
y_pred = incremental_train_trial.predict(private_df, num_iteration=incremental_train_trial.best_iteration)

y_pred_binary = (y_pred > 0.3855).astype(int)

print(y_pred_binary)
print('---Prediction finished---')

### 製作上傳檔案
final_df['label'] = y_pred_binary
final_df.to_csv('Submission_incremental_model_with_30percent_not_used.csv', index=False)

[0 0 0 ... 0 0 0]
---Prediction finished---


In [51]:
final_df['label'].value_counts()

0    753326
1       813
Name: label, dtype: int64

In [21]:
origin_df = pd.read_csv('Original_final_lgbm_model_prediction.csv')

In [22]:
origin_df['label'].value_counts()

0    752565
1      1574
Name: label, dtype: int64