In [1]:
import numpy as np 
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import os

import warnings
warnings.filterwarnings('ignore')


ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Read the train and test CSV files from the local competition folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./santander-customer-transaction-prediction/train.csv",
        "./santander-customer-transaction-prediction/test.csv",
    )
)

# Display the column index of the training frame.
train.columns

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# (rows, columns) of the training frame and of the test frame.
tuple(frame.shape for frame in (train, test))

In [None]:
## missing data: does either frame contain at least one null cell?

tuple(frame.isnull().values.any() for frame in (train, test))

## no missing data

In [None]:
## class imbalance: 2 classes : 1 and 0
# Relative frequency (fraction of rows) of each target value.
train["target"].value_counts(normalize=True)

In [None]:
# Bar chart of the number of rows per target class in the original training data.
# Plot the column straight from `train`: the bare name `target` is only assigned
# further down the notebook (after upsampling), so using it here fails on a
# fresh "Run All" and, if run out of order, would show the rebalanced classes
# instead of the raw imbalance.
ax = sns.countplot(x=train["target"])
ax.set(title="Target class counts (original training data)", xlabel="target", ylabel="rows");

About 90% of the rows have target class 0 and only about 10% have target class 1, so the classes are highly imbalanced. We therefore need either up-/down-sampling or stratified sampling.

In [None]:
## Resampling: split the training rows by target class
from sklearn.utils import resample

# Rows labelled 1 (positive) and rows labelled 0 (negative).
train_pos = train.loc[train['target'].eq(1)]
train_neg = train.loc[train['target'].eq(0)]

# Absolute number of rows per class before any resampling.
train['target'].value_counts()

In [None]:
## upsample minority class: https://elitedatascience.com/imbalanced-classes
## minority class - positive class

# Draw positive rows with replacement until there are as many of them as
# there are negative rows; the fixed seed makes the draw repeatable.
train_pos_upsampled = resample(
    train_pos,
    replace=True,
    n_samples=train_neg.shape[0],
    random_state=42,
)

# Stack the untouched negatives first, followed by the resampled positives.
# NOTE(review): the duplicated positive rows are created before any
# train/validation split, so a later K-fold split over this frame can put
# copies of the same row on both sides of a fold -- confirm this is intended.
train_upsampled = pd.concat([train_neg, train_pos_upsampled])

# Row count per class after upsampling.
train_upsampled['target'].value_counts()


In [None]:
# Keep the label column aside, then strip the identifier and the label from
# the frame so only the remaining columns are left as model inputs.
target = train_upsampled.loc[:, 'target']
train_upsampled = train_upsampled.drop(columns=['ID_code', 'target'])

In [None]:
# Cross-validation setup: number of folds, the splitter, and the accumulator
# that will hold the fold-averaged test predictions.
kfold = 10

# The upsampled frame is built by concatenating all negatives followed by all
# positives, so an unshuffled split yields contiguous validation folds that
# contain a single class. Shuffling fixes that and is also what makes
# `random_state` meaningful (recent scikit-learn raises a ValueError when
# `random_state` is set without `shuffle=True`). Stratifying on the target
# keeps the class proportions the same in every fold.
folds = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)

# One slot per test row; each fold later adds its share of the average.
predictions = np.zeros(len(test))

In [None]:
## lightgbm parameters:
# Same keys, values and insertion order as a literal dict; written with
# keyword arguments so each setting sits on its own commented line.
param = dict(
    bagging_freq=5,                 # re-draw the row subsample every 5 iterations
    bagging_fraction=0.335,         # fraction of rows used in each subsample
    boost_from_average='false',     # passed to LightGBM as the string 'false'
    boost='gbdt',
    feature_fraction=0.041,         # fraction of columns considered per tree
    learning_rate=0.0083,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=-1,
)

In [None]:
# Shapes of the feature frame and of the label series; their first
# dimensions should agree before training.
tuple(obj.shape for obj in (train_upsampled, target))

In [None]:
# Train one LightGBM model per fold and average their test-set predictions.

# Reset the accumulator here so that re-running this cell does not add a
# second round of predictions on top of the first.
predictions = np.zeros(len(test))

# Score the test set on exactly the training columns, in the training order.
# `Index.difference` returns a lexicographically *sorted* index, which need not
# match the training column order and would silently feed features to the
# model in the wrong positions. Selecting by the training columns also fails
# loudly (KeyError) if the test frame is missing a feature.
feature_cols = list(train_upsampled.columns)
X_test = test[feature_cols]

for i, (trn_idx, val_idx) in enumerate(folds.split(train_upsampled.values, target.values)):
    print("Fold idx:{}".format(i + 1))

    # Keep both splits as DataFrames so training, validation and test data all
    # carry the same feature names.
    X_train, y_train = train_upsampled.iloc[trn_idx], target.iloc[trn_idx]
    X_valid, y_valid = train_upsampled.iloc[val_idx], target.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    # `reference` makes the validation set reuse the training set's binning.
    val_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

    # Early stopping and periodic logging are passed as callbacks; the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from `lgb.train` in LightGBM 4 (callbacks need LightGBM >= 3.3).
    clf = lgb.train(
        param,
        train_data,
        num_boost_round=1000000,
        valid_sets=[train_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=4000),
            lgb.log_evaluation(period=5000),
        ],
    )

    # Each fold contributes an equal share of the final averaged prediction.
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    
    

In [None]:
# Assemble the submission table (identifier column first, then the averaged
# prediction) and write it next to the input data without the row index.
result = pd.DataFrame(
    {
        "ID_code": test["ID_code"].values,
        "target": predictions,
    }
)
result.to_csv(
    "./santander-customer-transaction-prediction/submission.csv",
    index=False,
)