In [33]:
import lightgbm as lgb
import joblib
from sklearn.metrics import roc_auc_score as metric
from sklearn.model_selection import train_test_split
import itertools
import pandas as pd
import warnings
from sklearn.model_selection import GridSearchCV


In [6]:
# Load the dataset
tr_tr_new = joblib.load('../joblib/tr_tr_encoded.joblib')

In [7]:
te_tr_new = joblib.load('../joblib/te_tr_encoded.joblib')

In [8]:
X = tr_tr_new.drop('isFraud', axis=1)
y = tr_tr_new['isFraud']

In [32]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
# Base parameters
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'tree_learner': 'serial',
    'seed': 0,
    'device_type': 'gpu',
    'metric': 'auc',
}

# Parameters for grid search
params_grid = {
    'learning_rate': [0.05, 0.08],
    'colsample_bytree': [0.5, 0.6],
    'subsample': [0.7, 0.8],
    'n_estimators': [2000, 3000],
    'early_stopping_rounds': [100]
}

In [34]:

# Set up LightGBM classifier
lgb_classifier = lgb.LGBMClassifier(**lgb_params)


# Set up grid search with cross-validation
grid_search = GridSearchCV(
    lgb_classifier, 
    param_grid=params_grid, 
    scoring='roc_auc', 
    cv=5,  # number of cross-validation folds, adjust as needed
    verbose=2
)

In [35]:
# Run the grid search
grid_search.fit(X_train, y_train, eval_metric='auc', eval_set=[(X_valid, y_valid)])

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters found: ", best_params)
print("Best AUC score: ", best_score)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[LightGBM] [Info] Number of positive: 13289, number of negative: 364656
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 19815
[LightGBM] [Info] Number of data points in the train set: 377945, number of used features: 213
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 60 dense feature groups (21.63 MB) transferred to GPU in 0.030479 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035161 -> initscore=-3.312018
[LightGBM] [Info] Start training from score -3.312018
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's auc: 0.962532
[CV] END colsample_bytree=0.5, early_stopping_rounds=100, lea

In [38]:
filename = 'gridSearchLightgbm.joblib'
joblib.dump(grid_search, filename)

['gridSearchLightgbm.joblib']

In [71]:
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

In [72]:
best_model = lgb.train(best_params,
                  train_data,
                  valid_sets=[train_data, valid_data])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.182460 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19841
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 214
[LightGBM] [Info] Start training from score 0.035161
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[3000]	training's l2: 0.00851498	valid_1's l2: 0.0128258


In [73]:
best_model

<lightgbm.basic.Booster at 0x2a9dcc9eb50>

In [74]:
pred_train_p = best_model.predict(X_train)
pred_val_p = best_model.predict(X_valid)

In [68]:
# Compute AUC
auc_train = metric(y_train, pred_train_p)
auc_val = metric(y_valid, pred_val_p)
# If you have a separate test set, uncomment the line below
# auc_test = roc_auc_score(y_test, pred_test_p)

print(f"Validation AUC: {auc_val:.4f}")
print('Metric train = %.4f - Metric val = %.4f' % (auc_train, auc_val))

Validation AUC: 0.9534
Metric train = 0.9864 - Metric val = 0.9534


In [75]:
filename = 'lightgbm_model[0.9534].joblib'
joblib.dump(best_model, filename)

['lightgbm_model[0.9534].joblib']

In [46]:
pred_te_tr_new_p = best_model.predict(te_tr_new)

# Create the output DataFrame
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': pred_te_tr_new_p
})

# Save the DataFrame to a CSV file
output_df.to_csv('predicted_fraud_lightgbm[5].csv', index=False)
