# Data preprocessing and setup

In [2]:
import pandas as pd

In [3]:
# Function for data preprocessing
def preprocess_dataframe(df):
    # 轉換資料類型
    ## 先全部轉類別
    df = df.astype('category')
    
    ## 然後把數值轉回來
    df[['locdt', 'loctm', 'flam1', 'csmam']] = df[['locdt', 'loctm', 'flam1', 'csmam']].astype('int64')
    df[['conam', 'iterm']] = df[['conam', 'iterm']].astype('float64')

    ## label 也要轉成數值，不轉模型訓練會出現 error
    if 'label' in df.columns:
        df['label'] = df['label'].astype('int64')

    # 缺失值填充
    columns_to_fill = ['etymd', 'mcc', 'stocn', 'scity', 'hcefg', 'csmcu']
    for column in columns_to_fill:
        df[column] = df[column].cat.add_categories('-1').fillna('-1')

    # 刪除不需要的欄位
    ## stscd(狀態碼)幾乎全部都是缺失值，也應該不是重要特徵，先就刪掉這欄
    ## txkey全部都是唯一資料，刪除不用
    df.drop(['stscd', 'txkey'], axis=1, inplace=True)

    return df

In [66]:
# 讀取資料
public_df = pd.read_csv('/Users/chunyu/Desktop/ESun_data_project/Other_lgbm_exp/public.csv')
train_df = pd.read_csv('/Users/chunyu/Desktop/ESun_data_project/Other_lgbm_exp/training.csv')
private_df = pd.read_csv('/Users/chunyu/Desktop/ESun_data_project/Other_lgbm_exp/private_1_processed.csv')

# 準備上傳用資料(只保留 txkey 欄位df)
final_df = private_df[['txkey']]

# 預處理資料
public_df = preprocess_dataframe(public_df)
train_df = preprocess_dataframe(train_df)
private_df = preprocess_dataframe(private_df)

In [11]:
len(public_df)

609040

In [12]:
len(private_df)

754139

In [13]:
len(train_df)

8688526

# Import original model trained by pycaret

In [31]:
from pycaret.classification import ClassificationExperiment
exp = ClassificationExperiment()

In [6]:
bst_pycaret_lgbm = exp.load_model('17_bst_threshold_caliberate')

Transformation Pipeline and Model Successfully Loaded


# Experiment: Compare the Preformance of Original Model and the Incremental Model

## Split test set for comparison

In [18]:
from sklearn.model_selection import train_test_split

#分割特徵和目標變數
X = public_df.drop('label', axis=1) # 特徵
y = public_df['label'] # 目標變數

# 分割50%出來做測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=123)

# 將 y 轉為一維 NumPy 陣列
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

## Incremental training with LightGBM

In [28]:
import lightgbm as lgb
from imblearn.over_sampling import RandomOverSampler # oversampling 過採樣
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_curve, auc

In [21]:
lgb_model = bst_pycaret_lgbm[-1]
lgb_model

In [26]:
# 設定categorical variables
# 取得剩下的名稱列表
all_columns = train_df.columns.to_list()
# select categroy columns and without "label"
categorical_columns = [col for col in train_df.columns if train_df[col].dtype == 'category' and col != 'label']

# 針對少數樣本進行 resample (Oversampling)
# 創建 RandomOverSampler 對象
oversample = RandomOverSampler(sampling_strategy='minority')
# 應用 Oversampling
X_train_resampled, y_train_resampled = oversample.fit_resample(X_train, y_train)

# 將資料轉換為 LightGBM 格式
train_data = lgb.Dataset(X_train_resampled, categorical_feature=categorical_columns, label=y_train_resampled)
test_data = lgb.Dataset(X_test, categorical_feature=categorical_columns, label=y_test, reference=train_data)

# 設置 LightGBM 參數
params = {
    'bagging_fraction': 0.9,
    'bagging_freq': 3,
    'feature_fraction': 0.5,
    'learning_rate': 0.4,
    'min_child_samples': 6,
    'min_split_gain': 0.3,
    'n_estimators': 20,
    'n_jobs': 7,
    'num_leaves': 150,
    'random_state': 123,
    'reg_alpha': 0.005,
    'reg_lambda': 0.0005
}

incremental_train_trial = lgb.train(
    params,
    train_set=train_data,
    num_boost_round=500,
    valid_sets=test_data,
    init_model=lgb_model
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16422
[LightGBM] [Info] Number of data points in the train set: 606794, number of used features: 23
[LightGBM] [Info] Start training from score 0.500000


## Predictions of incremental model

In [69]:
y_pred_incremental = incremental_train_trial.predict(X_test, num_iteration=incremental_train_trial.best_iteration)
y_pred_incremental_binary = (y_pred_incremental > 0.5).astype(int)

# 評估模型性能
accuracy = accuracy_score(y_test, y_pred_incremental_binary)
print("Accuracy:", accuracy)

# 顯示分類報告
print("Classification Report:")
print(classification_report(y_test, y_pred_incremental_binary))


Accuracy: 0.9975305398660187
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    303459
           1       0.64      0.67      0.65      1061

    accuracy                           1.00    304520
   macro avg       0.82      0.84      0.83    304520
weighted avg       1.00      1.00      1.00    304520



## Predictions of original model

In [72]:
y_pred_origin = exp.predict_model(bst_pycaret_lgbm, data=X_test)



In [73]:
#y_pred_origin['prediction_label'].to_numpy()
y_pred_origin_binary = y_pred_origin['prediction_label'].values

# 評估模型性能
accuracy = accuracy_score(y_test, y_pred_origin_binary)
print("Accuracy:", accuracy)

# 顯示分類報告
print("Classification Report:")
print(classification_report(y_test, y_pred_origin_binary))

Accuracy: 0.9983580717194273
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    303459
           1       0.91      0.58      0.71      1061

    accuracy                           1.00    304520
   macro avg       0.96      0.79      0.86    304520
weighted avg       1.00      1.00      1.00    304520



# Predict using original model

In [14]:
prediction_bst_lgbm = exp.predict_model(bst_pycaret_lgbm, data=private_df)
final_df['label'] = prediction_bst_lgbm[['prediction_label']]
#print(final_df['label'].value_counts())
final_df.to_csv('Original_final_lgbm_model_prediction.csv', index=False)

0    752565
1      1574
Name: label, dtype: int64
