In [23]:
# !pip3 install sklearn
# !pip3 install xgboost

In [85]:
import os
import gc
import sys

import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [81]:
from sklearn import preprocessing

In [59]:
import pickle
# with open(<filename>, 'wb') as file: -> pickle.dump(<object>, file)
# with open(<filename>, 'rb') as file: -> <var> = pickle.load(file)

In [70]:
# sys.path.append('..')
# from Other.utils.ReduceMem import reduce_mem_usage

In [84]:
sample_submission = pd.read_csv('../../Data/sample_submission.csv')

In [68]:
train_tr = pd.read_csv('../../Data/train_transaction.csv')
train_id = pd.read_csv('../../Data/train_identity.csv')
test_tr = pd.read_csv('../../Data/test_transaction.csv')
test_id = pd.read_csv('../../Data/test_identity.csv')

In [69]:
train = train_id.merge(train_tr, on='TransactionID')
test = test_id.merge(test_tr, on='TransactionID')
# del train_tr, train_id, test_tr, test_id

In [94]:
print('train.shape: ', train.shape)
print('test.shape: ', test.shape)
print('----------------')
print('train_tr.shape: ', train_tr.shape)
print('train_id.shape: ', train_id.shape)
print('test_tr.shape: ', test_tr.shape)
print('test_id.shape: ', test_id.shape)
print('sample_submission.shape: ', sample_submission.shape)


train.shape:  (144233, 423)
test.shape:  (141907, 422)
----------------
train_tr.shape:  (590540, 394)
train_id.shape:  (144233, 41)
test_tr.shape:  (506691, 393)
test_id.shape:  (141907, 41)
sample_submission.shape:  (506691, 2)


In [72]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [73]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
print('training set shape: ', train.shape)
print('test set shape: ', test.shape)

Mem. usage decreased to 169.60 Mb (64.6% reduction)
Mem. usage decreased to 168.90 Mb (64.1% reduction)
training set shape:  (144233, 434)
test set shape:  (141907, 433)


In [75]:
cols_to_drop = ['dist1', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9', 'D2', 'D11']
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [76]:
y_train = train['isFraud'].copy()

In [82]:
X_train = train.drop('isFraud', axis=1)
X_test = test.copy()

# del train, test

for c in X_train.columns:
    if X_train[c].dtype=='float16' or  X_train[c].dtype=='float32' or  X_train[c].dtype=='float64':
        X_train[c].fillna(X_train[c].mean())
        X_test[c].fillna(X_train[c].mean())

X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

for f in X_train.columns:
    if X_train[f].dtype=='object' or X_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(X_train[f].values) + list(X_test[f].values))
        X_train[f] = lbl.transform(list(X_train[f].values))
        X_test[f] = lbl.transform(list(X_test[f].values))  

In [83]:
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True)

print(folds)

KFold(n_splits=5, random_state=None, shuffle=True)


In [87]:
xgb_submission = sample_submission.copy()
xgb_submission['isFraud'] = 0

for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
    xgbclf = xgb.XGBClassifier(
        n_estimators=1000,
        max_depth=9,
        learning_rate=0.048,
        subsample=0.85,
        colsample_bytree=0.85,
        missing=-999,
        tree_method='auto', # auto, exact, approx, hist, gpu_hist (https://xgboost.readthedocs.io/en/latest/parameter.html)
        reg_alpha=0.15,
        reg_lamdba=0.85
    )
    
    X_train_, X_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    xgbclf.fit(X_train_,y_train_)
    del X_train_,y_train_
    pred=xgbclf.predict_proba(X_test)[:,1]
    val=xgbclf.predict_proba(X_valid)[:,1]
    del xgbclf, X_valid
    print('ROC accuracy: {}'.format(roc_auc_score(y_valid, val)))
    del val,y_valid
    xgb_submission['isFraud'] = xgb_submission['isFraud']+pred/n_fold
    del pred
    gc.collect()

ROC accuracy: 0.9814702070456137


ValueError: operands could not be broadcast together with shapes (506691,) (141907,) 

In [None]:
exp_name = '1_first_default_xgboost'
xgb_submission.to_csv('./{directory}/sub_xgboost.csv'.format(directory=exp_name))
xgb_submission.head()

In [96]:
len(sample_submission['TransactionID'].unique())

506691

In [100]:
xgb_submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0
1,3663550,0
2,3663551,0
3,3663552,0
4,3663553,0
