# Import Libararies

In [2]:
# need the latest version of lightgbm 3.3.1
!pip install lightgbm --upgrade

Collecting lightgbm
  Downloading lightgbm-3.3.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 34.7 MB/s 
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 2.2.3
    Uninstalling lightgbm-2.2.3:
      Successfully uninstalled lightgbm-2.2.3
Successfully installed lightgbm-3.3.1


In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn import metrics
import datetime
print(lgb.__version__)

3.3.1


In [2]:
pip install pipreqs

Collecting pipreqs
  Downloading pipreqs-0.4.11-py2.py3-none-any.whl (32 kB)
Collecting yarg
  Downloading yarg-0.1.9-py2.py3-none-any.whl (19 kB)
Installing collected packages: yarg, pipreqs
Successfully installed pipreqs-0.4.11 yarg-0.1.9


# Dataset 1: Click Fraud Data

The original text here https://ibimapublishing.com/articles/JIACS/2019/263928/ 

## Importing Data

In [None]:
# dataset taken from https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/data?select=train.csv
!gdown --id 1-6p65jg0jIK3LsjK4PBAcGIHx85lHMyW

Downloading...
From: https://drive.google.com/uc?id=1-6p65jg0jIK3LsjK4PBAcGIHx85lHMyW
To: /content/combined.csv
100% 79.9M/79.9M [00:00<00:00, 153MB/s]


Since the dataset is large, we decided to download part of the dataset and store it on gdrive for easy of running this notebook. Note that as detailed in our report, we decided to take all the positive samples and some of the negative samples in a rough 1:2 ratio to do our testing due to the highly imbalanced nature of the dataset

In [None]:
df = pd.read_csv('combined.csv')
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,103,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1
1,1504,29692,9,1,22,215,2017-11-06 16:00:02,2017-11-07 10:05:22,1
2,1798,64516,35,1,13,21,2017-11-06 16:00:02,2017-11-06 23:40:50,1
3,2102,172429,35,1,46,274,2017-11-06 16:00:03,2017-11-07 00:55:29,1
4,3056,199085,35,1,13,274,2017-11-06 16:00:04,2017-11-06 23:04:54,1


## LightGBM on Click Fraud Data

### Data Preprocessing for LightGBM

In [None]:
df = df.drop(['Unnamed: 0', 'attributed_time'], axis = 1)
df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,204158,35,1,13,21,2017-11-06 15:41:07,1
1,29692,9,1,22,215,2017-11-06 16:00:02,1
2,64516,35,1,13,21,2017-11-06 16:00:02,1
3,172429,35,1,46,274,2017-11-06 16:00:03,1
4,199085,35,1,13,274,2017-11-06 16:00:04,1


In [None]:
def find_day(timestamp):
  timestamp_read = datetime.datetime.fromisoformat(timestamp)
  return  timestamp_read.weekday()

def find_hour(timestamp):
  timestamp_read = datetime.datetime.fromisoformat(timestamp)
  return timestamp_read.hour

In [None]:
df['day'] = df['click_time'].apply(find_day)
df['hour'] = df['click_time'].apply(find_hour)
df.drop('click_time', inplace = True, axis = 1)
df.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour
0,204158,35,1,13,21,1,0,15
1,29692,9,1,22,215,1,0,16
2,64516,35,1,13,21,1,0,16
3,172429,35,1,46,274,1,0,16
4,199085,35,1,13,274,1,0,16


In [None]:
labels = df['is_attributed']
df = df.drop('is_attributed', axis = 1)
df.head()

Unnamed: 0,ip,app,device,os,channel,day,hour
0,204158,35,1,13,21,0,15
1,29692,9,1,22,215,0,16
2,64516,35,1,13,21,0,16
3,172429,35,1,46,274,0,16
4,199085,35,1,13,274,0,16


In [None]:
data = lgb.Dataset(df, label = labels, categorical_feature=list(df.columns),free_raw_data=False)

### Running LightGBM

In [None]:
# these params were specified in the paper 
params = {
    'learning_rate': 0.2,
    'max_depth': 3,
    'num_leaves': 7,
    'min_child_samples': 100,
    'min_child_weight': 0,
    'subsample': 0.7,
    'subsample_freq': 1,
    'colsample_bytree': 0.9,
    'scale_pos_weight': 200,
    'metric': ['auc', 'cross_entropy', 'l2'],
    'objective': 'binary'
}

In [None]:
start = datetime.datetime.now()
eval = lgb.cv(params=params, train_set = data,nfold=5, verbose_eval = 10, early_stopping_rounds = 100, return_cvbooster=True)
time_taken = datetime.datetime.now() - start
print(time_taken)



[LightGBM] [Info] Number of positive: 365476, number of negative: 806629
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 1172105.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17093
[LightGBM] [Info] Number of data points in the train set: 1172105, number of used features: 7
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 293027.000000
[LightGBM] [Info] Number of positive: 365477, number of negative: 806628
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 1172105.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] 



[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.311812 -> initscore=-0.791660
[LightGBM] [Info] Start training from score -0.791660
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.311812 -> initscore=-0.791660
[LightGBM] [Info] Start training from score -0.791660
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.311812 -> initscore=-0.791660
[LightGBM] [Info] Start training from score -0.791660
[10]	cv_agg's auc: 0.999061 + 7.6901e-05	cv_agg's cross_entropy: 0.190919 + 0.00272429	cv_agg's l2: 0.0362355 + 0.00104985
[20]	cv_agg's auc: 0.999365 + 8.31143e-05	cv_agg's cross_entropy: 0.148217 + 0.00238545	cv_agg's l2: 0.0280225 + 0.000626433
[30]	cv_agg's auc: 0.999428 + 6.71091e-05	cv_agg's cross_entropy: 0.128012 + 0.00265146	cv_agg's l2: 0.0241218 + 0.00067179
[40]	cv_agg's auc: 0.999443 + 7.02495e-05	cv_agg's cross_entropy: 0.116853 + 0.0024518	cv_agg's l2: 0.022494 + 0.000532216
[50]	cv_agg's auc: 0.999457 + 6.75982e-05	cv_agg's cross_entropy: 0.109139 + 0.00245913	cv_agg's l2: 0.

### Feature Importance

In [None]:
def getfeatureimpt(eval): 
  feature_importance = {}
  c = 1
  for b in eval['cvbooster'].boosters:
    feature_importance[c] = {k:v for k,v in zip(b.feature_name(), b.feature_importance(importance_type='gain'))}
    c+=1
  fi = pd.DataFrame(feature_importance)
  means = pd.DataFrame(fi.mean(axis = 1))
  means.columns = ['mean']
  means.sort_values(by = 'mean', inplace = True, ascending=False)
  means['perc'] = means['mean']/means['mean'].sum()
  return means

In [None]:
# mean refers to mean information gain, perc is the % of total information gain for the feature
features = getfeatureimpt(eval)
features

Unnamed: 0,mean,perc
hour,3294346.0,0.349809
day,2341006.0,0.248579
channel,1490126.0,0.158229
app,1422214.0,0.151017
ip,687388.5,0.07299
os,172354.0,0.018301
device,10118.26,0.001074


## XGBoost on Click Fraud Data

In [None]:
from xgboost import XGBClassifier
import pandas as pd
import datetime
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import gc
import numpy as np

### XGBoost Data Preparation

In [None]:
df = pd.read_csv('combined.csv')
df_process = df.drop(['Unnamed: 0', 'attributed_time'], axis = 1)
df_process.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,204158,35,1,13,21,2017-11-06 15:41:07,1
1,29692,9,1,22,215,2017-11-06 16:00:02,1
2,64516,35,1,13,21,2017-11-06 16:00:02,1
3,172429,35,1,46,274,2017-11-06 16:00:03,1
4,199085,35,1,13,274,2017-11-06 16:00:04,1


In [None]:
df_process['hour'] = df_process['click_time'].apply(find_hour)
df_process['day'] =  df_process['click_time'].apply(find_day)

In [None]:
df_process = df_process.drop(['click_time', 'ip'], axis = 1)

In [None]:
for col in df_process.columns:
  df_process[col] = df_process[col].astype('category')

df_process.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465132 entries, 0 to 1465131
Data columns (total 7 columns):
 #   Column         Non-Null Count    Dtype   
---  ------         --------------    -----   
 0   app            1465132 non-null  category
 1   device         1465132 non-null  category
 2   os             1465132 non-null  category
 3   channel        1465132 non-null  category
 4   is_attributed  1465132 non-null  category
 5   hour           1465132 non-null  category
 6   day            1465132 non-null  category
dtypes: category(7)
memory usage: 15.5 MB


In [None]:
df_process_X = df_process.drop('is_attributed', axis = 1)
x_train, x_val, y_train, y_val = train_test_split(df_process_X, df_process['is_attributed'], test_size=0.33, random_state=42)

### Train XGBoost

In [None]:
xgbmodel = XGBClassifier(objective='binary:logistic',  n_estimators = 1000, booster = 'gbtree', eval_metric = ['auc', 'logloss'])
start = datetime.datetime.now()
xgbmodel.fit(x_train.values, y_train.values, eval_set = [(x_val.values, y_val.values)], early_stopping_rounds = 20)
time_taken = datetime.datetime.now() - start
print(time_taken)


[0]	validation_0-auc:0.996676	validation_0-logloss:0.597095
Multiple eval metrics have been passed: 'validation_0-logloss' will be used for early stopping.

Will train until validation_0-logloss hasn't improved in 20 rounds.
[1]	validation_0-auc:0.996676	validation_0-logloss:0.519941
[2]	validation_0-auc:0.996676	validation_0-logloss:0.455719
[3]	validation_0-auc:0.998345	validation_0-logloss:0.404203
[4]	validation_0-auc:0.998345	validation_0-logloss:0.357222
[5]	validation_0-auc:0.998345	validation_0-logloss:0.316317
[6]	validation_0-auc:0.998345	validation_0-logloss:0.28373
[7]	validation_0-auc:0.998345	validation_0-logloss:0.253091
[8]	validation_0-auc:0.998345	validation_0-logloss:0.226941
[9]	validation_0-auc:0.998345	validation_0-logloss:0.203752
[10]	validation_0-auc:0.998345	validation_0-logloss:0.183313
[11]	validation_0-auc:0.998345	validation_0-logloss:0.164972
[12]	validation_0-auc:0.998345	validation_0-logloss:0.14837
[13]	validation_0-auc:0.998345	validation_0-logloss:0.

In [None]:
predicted_y = xgbmodel.predict(x_val.values)
accuracy_score(y_val.values, predicted_y)

0.9979110392269605

### XGBoost Feature Importance

In [None]:
mean_dict = {}
counter = 0
fi_array = xgbmodel.get_booster().get_score(importance_type = 'gain')

for col in df_process_X.columns:
  mean_dict[col] = fi_array['f'+str(counter)]
  counter += 1

fi_dict = {}
fi_dict['mean'] = mean_dict
fi_df = pd.DataFrame(fi_dict, index=df_process_X.columns)
fi_df['perc'] = fi_df['mean']/fi_df['mean'].sum()

In [None]:
fi_df

Unnamed: 0,mean,perc
app,60.309516,0.001262
device,37.497279,0.000785
os,4.640709,9.7e-05
channel,25.214387,0.000528
hour,2182.430285,0.045665
day,45482.147025,0.951664


## Reducing Dataset back to original balance

Out of curiosity, we reduced the positive samples in our training data back to the percentage in the original dataset. Originally, there were only 0.24% positive observations. We describe the motivation for doing this in our report. 

In [None]:
exp = pd.read_csv('combined.csv')
# do the same data preprocessing 
exp.drop(['Unnamed: 0', 'attributed_time'], axis = 1, inplace=True)
exp['day'] = exp['click_time'].apply(find_day)
exp['hour'] = exp['click_time'].apply(find_hour)
exp.drop('click_time', inplace = True, axis = 1)
exp.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour
0,204158,35,1,13,21,1,0,15
1,29692,9,1,22,215,1,0,16
2,64516,35,1,13,21,1,0,16
3,172429,35,1,46,274,1,0,16
4,199085,35,1,13,274,1,0,16


In [None]:
# number of positives we want
num_pos = round(len(exp[exp['is_attributed'] == 0]) /0.9966 * 0.0024)
pos = exp[exp['is_attributed'] == 1].sample(num_pos)
pos.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour
10071,186150,18,1,19,134,1,0,21
214375,108877,19,0,0,213,1,2,4
405019,27782,9,1,14,334,1,3,9
116142,5524,35,1,10,274,1,1,11
366912,109938,35,1,19,21,1,3,5


In [None]:
# put it back together with all the neg samples
original = pd.concat([pos, exp[exp['is_attributed'] == 0]])
original.reset_index(drop=True, inplace=True)
original.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,day,hour
0,186150,18,1,19,134,1,0,21
1,108877,19,0,0,213,1,2,4
2,27782,9,1,14,334,1,3,9
3,5524,35,1,10,274,1,1,11
4,109938,35,1,19,21,1,3,5


In [None]:
# create the lgb dataset
o_labels = original['is_attributed']
original.drop('is_attributed', inplace = True, axis = 1)
o_data = lgb.Dataset(original, label = o_labels, categorical_feature=list(original.columns),free_raw_data=False)

In [None]:
#run the model
o_start = datetime.datetime.now()
o_eval = lgb.cv(params=params, train_set = o_data,nfold=5, verbose_eval = 10, early_stopping_rounds = 100, return_cvbooster=True)
o_time_taken = datetime.datetime.now() - o_start
print(o_time_taken)



[LightGBM] [Info] Number of positive: 1942, number of negative: 806629
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 808571.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17870
[LightGBM] [Info] Number of data points in the train set: 808571, number of used features: 7
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 202143.000000
[LightGBM] [Info] Number of positive: 1942, number of negative: 806629
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 808571.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17870
[LightGBM] [Info] Number of data points in the train set: 808571, number of used features: 7
[LightGBM] [Info] 



[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002402 -> initscore=-6.029145
[LightGBM] [Info] Start training from score -6.029145
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002402 -> initscore=-6.029145
[LightGBM] [Info] Start training from score -6.029145
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002403 -> initscore=-6.028629
[LightGBM] [Info] Start training from score -6.028629
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002403 -> initscore=-6.028631
[LightGBM] [Info] Start training from score -6.028631
[10]	cv_agg's auc: 0.993371 + 0.0108773	cv_agg's cross_entropy: 0.049495 + 0.0807091	cv_agg's l2: 0.00250057 + 0.00340373
[20]	cv_agg's auc: 0.993232 + 0.00971855	cv_agg's cross_entropy: 0.0510266 + 0.092686	cv_agg's l2: 0.00286271 + 0.00464846
[30]	cv_agg's auc: 0.993367 + 0.00939004	cv_agg's cross_entropy: 0.036748 + 0.0697115	cv_agg's l2: 0.00225226 + 0.00394181
[40]	cv_agg's auc: 0.993335 + 0.00935829	cv_agg's cross_entropy: 0.0330238 + 0.0631704	cv_agg's 

In [None]:
o_means = getfeatureimpt(o_eval)
o_means

Unnamed: 0,mean,perc
day,90019140.0,0.772663
hour,16859450.0,0.14471
app,3553834.0,0.030504
channel,2909602.0,0.024974
os,2421528.0,0.020785
device,447469.6,0.003841
ip,294020.6,0.002524


#  Dataset 2: Credit Card Fraud Data

## LightGBM on Credit Card Data

### Download Data

In [None]:
# original dataset https://www.kaggle.com/mlg-ulb/creditcardfraud 
!gdown --id 1tWqDuPaoKRiBiPJTi-jG5uP17AVrgzyc

Downloading...
From: https://drive.google.com/uc?id=1tWqDuPaoKRiBiPJTi-jG5uP17AVrgzyc
To: /content/creditcard.csv
100% 151M/151M [00:01<00:00, 117MB/s]


In [None]:
cc = pd.read_csv('/content/creditcard.csv')
cc.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


The data has already treaed with PCA, and so there is not much preprocessing that can be done to pretreat the data since all the features are now Principal Components

### Formatting the Data to work with LightGBM

In [None]:
cc_labels = cc['Class']
cc.drop('Class', axis = 1, inplace = True)
cc.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [None]:
cc_data = lgb.Dataset(cc, label = cc_labels,free_raw_data=False)

### LightGBM on Credit Card Fraud Dataset

In [None]:
cc_start = datetime.datetime.now()
cc_eval = lgb.cv(params=params, train_set = cc_data,nfold=5, verbose_eval = 10, return_cvbooster=True, early_stopping_rounds= 100)
cc_time_taken = datetime.datetime.now() - cc_start
print(cc_time_taken)



[LightGBM] [Info] Number of positive: 393, number of negative: 227452
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 227845.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 227845, number of used features: 30
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 56962.000000
[LightGBM] [Info] Number of positive: 393, number of negative: 227452
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 227845.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 227845, number of used features: 30
[LightGBM] [Info] [cr



[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001729 -> initscore=-6.358344
[LightGBM] [Info] Start training from score -6.358344
[10]	cv_agg's auc: 0.922736 + 0.0278212	cv_agg's cross_entropy: 0.633141 + 0.318649	cv_agg's l2: 0.0247395 + 0.0121967
[20]	cv_agg's auc: 0.928314 + 0.01686	cv_agg's cross_entropy: 0.648176 + 0.123587	cv_agg's l2: 0.0282438 + 0.00397796
[30]	cv_agg's auc: 0.934345 + 0.0153228	cv_agg's cross_entropy: 0.596831 + 0.145142	cv_agg's l2: 0.0249607 + 0.00511037
[40]	cv_agg's auc: 0.915962 + 0.0121748	cv_agg's cross_entropy: 0.664768 + 0.210615	cv_agg's l2: 0.0271095 + 0.00708526
[50]	cv_agg's auc: 0.921098 + 0.0154862	cv_agg's cross_entropy: 0.72792 + 0.15334	cv_agg's l2: 0.0305358 + 0.00455112
[60]	cv_agg's auc: 0.826001 + 0.169724	cv_agg's cross_entropy: 0.755782 + 0.132616	cv_agg's l2: 0.0313731 + 0.00402272
[70]	cv_agg's auc: 0.887872 + 0.0345232	cv_agg's cross_entropy: 1.03941 + 0.192447	cv_agg's l2: 0.0411843 + 0.00646205
[80]	cv_agg's auc: 0.907956 + 0.0

In [None]:
cc_fi = getfeatureimpt(cc_eval)
cc_fi.head()

Unnamed: 0,mean,perc
V17,248707400000.0,0.3113
V10,200561600000.0,0.251037
V18,89584410000.0,0.11213
V21,82197960000.0,0.102885
V3,59991340000.0,0.075089


In [None]:
#best score
cc_eval['auc-mean'][-1]

0.9373038335116363

In [None]:
# train time 
print(cc_time_taken)

0:00:21.298477


## XGBoost on Credit Card Data

### Download Data

In [None]:
!gdown --id 1tWqDuPaoKRiBiPJTi-jG5uP17AVrgzyc

Downloading...
From: https://drive.google.com/uc?id=1tWqDuPaoKRiBiPJTi-jG5uP17AVrgzyc
To: /content/creditcard.csv
100% 151M/151M [00:01<00:00, 141MB/s]


In [None]:
df_cc = pd.read_csv('creditcard.csv')
df_cc.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


### Data Preprocessing

In [None]:
df_cc_X = df_cc.drop('Class', axis = 1)
df_cc_X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [None]:
x_train_cc, x_val_cc, y_train_cc, y_val_cc = train_test_split(df_cc_X, df_cc['Class'], test_size=0.33, random_state=42)

### Run XGBoost

In [None]:
xgbmodel_cc = XGBClassifier(objective='binary:logistic',  n_estimators = 1000, booster = 'gbtree', eval_metric = ['auc', 'logloss'])
start = datetime.datetime.now()
xgbmodel_cc.fit(x_train_cc.values, y_train_cc.values, eval_set = [(x_val_cc.values, y_val_cc.values)], early_stopping_rounds = 20)
time_taken = datetime.datetime.now() - start
print(time_taken)

[0]	validation_0-auc:0.919213	validation_0-logloss:0.598059
Multiple eval metrics have been passed: 'validation_0-logloss' will be used for early stopping.

Will train until validation_0-logloss hasn't improved in 20 rounds.
[1]	validation_0-auc:0.919237	validation_0-logloss:0.520193
[2]	validation_0-auc:0.919255	validation_0-logloss:0.456273
[3]	validation_0-auc:0.925971	validation_0-logloss:0.401311
[4]	validation_0-auc:0.925977	validation_0-logloss:0.354323
[5]	validation_0-auc:0.925974	validation_0-logloss:0.314671
[6]	validation_0-auc:0.925958	validation_0-logloss:0.279634
[7]	validation_0-auc:0.929317	validation_0-logloss:0.248951
[8]	validation_0-auc:0.929308	validation_0-logloss:0.22228
[9]	validation_0-auc:0.932646	validation_0-logloss:0.198931
[10]	validation_0-auc:0.932647	validation_0-logloss:0.178033
[11]	validation_0-auc:0.93265	validation_0-logloss:0.159589
[12]	validation_0-auc:0.932676	validation_0-logloss:0.143505
[13]	validation_0-auc:0.932681	validation_0-logloss:0.

### Feature Importance

In [None]:
mean_dict_cc = {}
counter = 0
fi_array_cc = xgbmodel_cc.get_booster().get_score(importance_type = 'gain')

for col in df_cc_X.columns:
  mean_dict_cc[col] = fi_array_cc['f'+str(counter)]
  counter += 1

fi_dict_cc = {}
fi_dict_cc['mean'] = mean_dict_cc
fi_df_cc = pd.DataFrame(fi_dict_cc, index=df_cc_X.columns)
fi_df_cc['perc'] = fi_df_cc['mean']/fi_df_cc['mean'].sum()

In [None]:
fi_df_cc.sort_values(by = 'perc', ascending = False)

Unnamed: 0,mean,perc
V17,51.822475,0.251742
V14,24.218101,0.117646
V10,10.367123,0.050361
V7,8.612096,0.041836
V4,7.358882,0.035748
V1,6.874751,0.033396
V20,5.604944,0.027228
V21,5.519701,0.026814
V28,5.478786,0.026615
V12,5.386324,0.026166
