In [1]:
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [2]:

# Load the training table; it carries the 'isFraud' label column that is split
# off from the features further down.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
# NOTE(review): the file name suggests the features are already encoded - confirm.
tr_tr_new = joblib.load('../joblib/tr_tr_encoded.joblib')

In [14]:
# Load the table that is scored for the submission CSVs written later on.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
# NOTE(review): presumably has the same feature columns as the training table
# minus 'isFraud' - verify before predicting on it.
te_tr_new = joblib.load('../joblib/te_tr_encoded.joblib')

In [3]:
# Pull the label out first, then keep every remaining column as a feature.
y = tr_tr_new['isFraud']
X = tr_tr_new.drop(columns=['isFraud'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Hyper-parameters passed to lgb.train for the binary fraud classifier.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    learning_rate=0.007,
    num_leaves=256,               # i.e. 2**8 leaves per tree
    max_depth=-1,                 # no explicit depth cap; num_leaves bounds tree size
    tree_learner='serial',        # single-machine tree learner
    colsample_bytree=0.5,         # each tree sees half of the feature columns
    subsample_freq=1,             # re-draw the row subsample every iteration
    subsample=0.7,
    n_estimators=10000,           # upper bound on rounds; early stopping ends training sooner
    max_bin=255,
    verbose=-1,
    seed=0,
    early_stopping_rounds=100,    # stop after 100 rounds without validation improvement
    device_type='gpu',            # needs a GPU-enabled LightGBM build
)

In [24]:
# Wrap both splits in LightGBM's Dataset container. The validation set points
# back at the training set through `reference`.
train_data = lgb.Dataset(data=X_train, label=y_train)
valid_data = lgb.Dataset(
    data=X_valid,
    label=y_valid,
    reference=train_data,
)


In [25]:
# Fit the booster. Both splits are passed as evaluation sets so the configured
# metric is tracked on training and validation data alike.
model = lgb.train(
    params=lgb_params,
    train_set=train_data,
    valid_sets=[train_data, valid_data],
)



In [26]:
# Score the training and validation splits with the fitted booster.
pred_train_p = model.predict(X_train)
pred_val_p = model.predict(X_valid)

# ROC AUC per split. A labelled test split (X_test / y_test) could be scored
# the same way if one becomes available.
auc_train, auc_val = (
    roc_auc_score(labels, scores)
    for labels, scores in ((y_train, pred_train_p), (y_valid, pred_val_p))
)

print(f"Validation AUC: {auc_val:.4f}")
print('Metric train = %.4f - Metric val = %.4f' % (auc_train, auc_val))


Validation AUC: 0.9763
Metric train = 1.0000 - Metric val = 0.9763


In [27]:
import pandas as pd

# Fraud scores from the LightGBM booster for every row of the test table.
pred_te_tr_new_p = model.predict(te_tr_new)

# Two-column submission frame: the transaction id next to its predicted score.
output_df = pd.DataFrame(
    dict(
        TransactionID=te_tr_new.reset_index()['TransactionID'],
        isFraud=pred_te_tr_new_p,
    )
)

# Write the submission without the positional index column.
output_df.to_csv(path_or_buf='predicted_fraud_lightgbm[4].csv', index=False)


In [28]:
# Stacked ensemble: blend the LightGBM booster (`model`) and a saved XGBoost
# model with a logistic-regression meta-model.
#
# The meta-model is fitted on base-model predictions for the held-out
# validation split, not the training split. LightGBM scores its own training
# rows almost perfectly (train AUC 1.0000 vs. 0.9763 on validation), so
# in-sample scores would teach the meta-model to trust it far more than it
# deserves on unseen data.
# NOTE(review): X_valid was also used for LightGBM early stopping, and the data
# the saved XGBoost model was trained on is not visible here - confirm it did
# not include the X_valid rows.

# NOTE(review): joblib.load unpickles the file; only load model artifacts you trust.
model_xgb = joblib.load('../models_libjob/xgboost_model[0.9728].joblib')

# Base-model fraud probabilities on the held-out rows (meta-model training data).
valid_preds_lgbm = model.predict(X_valid)
valid_preds_xgb = model_xgb.predict_proba(X_valid)[:, 1]
stacked_valid_predictions = np.column_stack((valid_preds_lgbm, valid_preds_xgb))

# Fit the blender: one input column per base model, label = isFraud.
meta_model = LogisticRegression().fit(stacked_valid_predictions, y_valid)

# Same two columns, in the same order, for the rows to be submitted.
test_preds_lgbm = model.predict(te_tr_new)
test_preds_xgb = model_xgb.predict_proba(te_tr_new)[:, 1]
stacked_test_predictions = np.column_stack((test_preds_lgbm, test_preds_xgb))

# Final blended fraud probability.
final_preds = meta_model.predict_proba(stacked_test_predictions)[:, 1]

# Two-column submission frame: the transaction id next to its blended score.
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': final_preds
})

# Write the submission without the positional index column.
output_df.to_csv('stacked_predictions.csv', index=False)


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)

    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [None]:
# Stacked ensemble that scores XGBoost through its native Booster on DMatrix
# inputs. Uses `model` (the LightGBM booster) and `model_xgb` (the loaded
# XGBoost model) from earlier cells.
#
# The meta-model is fitted on the held-out validation split: LightGBM scores
# its own training rows almost perfectly (train AUC 1.0000 vs. 0.9763 on
# validation), so in-sample scores would mislead the blender.
# NOTE(review): confirm the saved XGBoost model was not trained on X_valid.

# DMatrix inputs for the native XGBoost prediction path.
dvalid = xgb.DMatrix(X_valid)
dtest = xgb.DMatrix(te_tr_new)

# LightGBM: Booster.predict() returns probability scores for the binary objective.
valid_preds_lgbm = model.predict(X_valid)
test_preds_lgbm = model.predict(te_tr_new)

# XGBoost: `model_xgb` is the scikit-learn wrapper (it exposes predict_proba).
# Its predict() takes array-like features and yields class labels, so DMatrix
# inputs have to go through the underlying native Booster instead.
# NOTE(review): assumes a binary:logistic objective, for which Booster.predict
# returns P(isFraud = 1) - confirm against the saved model's parameters.
booster_xgb = model_xgb.get_booster()
# The native Booster may score with every tree even when early stopping picked
# an earlier round; cap it explicitly so the scores match predict_proba.
best_iteration = getattr(booster_xgb, 'best_iteration', None)
xgb_iteration_range = (0, 0) if best_iteration is None else (0, best_iteration + 1)
valid_preds_xgb = booster_xgb.predict(dvalid, iteration_range=xgb_iteration_range)
test_preds_xgb = booster_xgb.predict(dtest, iteration_range=xgb_iteration_range)

# One column per base model, in the same order for both splits.
stacked_valid_predictions = np.column_stack((valid_preds_lgbm, valid_preds_xgb))
stacked_test_predictions = np.column_stack((test_preds_lgbm, test_preds_xgb))

# Fit the blender on held-out predictions, then score the submission rows.
meta_model = LogisticRegression().fit(stacked_valid_predictions, y_valid)
final_preds = meta_model.predict_proba(stacked_test_predictions)[:, 1]

# Two-column submission frame: the transaction id next to its blended score.
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': final_preds
})

# Write the submission without the positional index column.
output_df.to_csv('stacked_predictions.csv', index=False)


In [None]:
# Stacked ensemble that scores XGBoost through its native Booster on DMatrix
# inputs and writes 'stacked_predictions[1].csv'. Uses `model` (the LightGBM
# booster) and `model_xgb` (the loaded XGBoost model) from earlier cells.
#
# The meta-model is fitted on the held-out validation split: LightGBM scores
# its own training rows almost perfectly (train AUC 1.0000 vs. 0.9763 on
# validation), so in-sample scores would mislead the blender.
# NOTE(review): confirm the saved XGBoost model was not trained on X_valid.

# DMatrix inputs for the native XGBoost prediction path.
dvalid = xgb.DMatrix(X_valid)
dtest = xgb.DMatrix(te_tr_new)

# LightGBM: Booster.predict() returns probability scores for the binary objective.
valid_preds_lgbm = model.predict(X_valid)
test_preds_lgbm = model.predict(te_tr_new)

# XGBoost: `model_xgb` is the scikit-learn wrapper (it exposes predict_proba).
# Its predict() takes array-like features and yields class labels, so DMatrix
# inputs have to go through the underlying native Booster instead.
# NOTE(review): assumes a binary:logistic objective, for which Booster.predict
# returns P(isFraud = 1) - confirm against the saved model's parameters.
booster_xgb = model_xgb.get_booster()
# The native Booster may score with every tree even when early stopping picked
# an earlier round; cap it explicitly so the scores match predict_proba.
best_iteration = getattr(booster_xgb, 'best_iteration', None)
xgb_iteration_range = (0, 0) if best_iteration is None else (0, best_iteration + 1)
valid_preds_xgb = booster_xgb.predict(dvalid, iteration_range=xgb_iteration_range)
test_preds_xgb = booster_xgb.predict(dtest, iteration_range=xgb_iteration_range)

# One column per base model, in the same order for both splits.
stacked_valid_predictions = np.column_stack((valid_preds_lgbm, valid_preds_xgb))
stacked_test_predictions = np.column_stack((test_preds_lgbm, test_preds_xgb))

# Fit the blender on held-out predictions, then score the submission rows.
meta_model = LogisticRegression().fit(stacked_valid_predictions, y_valid)
final_preds = meta_model.predict_proba(stacked_test_predictions)[:, 1]

# Two-column submission frame: the transaction id next to its blended score.
output_df = pd.DataFrame({
    'TransactionID': te_tr_new.reset_index()['TransactionID'],
    'isFraud': final_preds
})

# Write the submission without the positional index column.
output_df.to_csv('stacked_predictions[1].csv', index=False)
