# Learning, Fraud Detection - Cross Validation

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc

%matplotlib inline

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn import metrics
from sklearn.model_selection import KFold

from IPython.display import HTML

In [107]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        #else:
            #df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [108]:
train_identity = pd.read_csv('data/train_identity.csv')
print("Train identity:{}".format(train_identity.shape))
train_transaction = pd.read_csv('data/train_transaction.csv')
print("Train transaction:{}".format(train_transaction.shape))

Train identity:(144233, 41)
Train transaction:(590540, 394)


In [109]:
train_identity = reduce_mem_usage(train_identity)
train_transaction = reduce_mem_usage(train_transaction)

Memory usage of dataframe is 45.12 MB --> 25.86 MB (Decreased by 42.7%)
Memory usage of dataframe is 1775.15 MB --> 542.35 MB (Decreased by 69.4%)


In [110]:
# Join Transaction and Identity dataframes
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## Feature Engineering

In [111]:
# Delete identity and transaction dataframes from memory
del train_identity, train_transaction
gc.collect()

5945

In [112]:
# http://localhost:8888/notebooks/IEEE-CIS%20Fraud%20Detection%20_%20EDA.ipynb
import datetime

START_DATE = '2017-11-30'
startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")
train['Date'] = train['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds=x)))
train['_ymd'] = train['Date'].dt.year.astype(str) + '-' + train['Date'].dt.month.astype(str) + '-' + train['Date'].dt.day.astype(str)
train['_year_month'] = train['Date'].dt.year.astype(str) + '-' + train['Date'].dt.month.astype(str)
train['_weekday'] = train['Date'].dt.dayofweek
train['_hour'] = train['Date'].dt.hour
train['_day'] = train['Date'].dt.day


In [113]:
# https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt
# id_30 OS
train.loc[train['id_30'].str.contains('Windows', na=False), 'id_30'] = 'Windows'
train.loc[train['id_30'].str.contains('iOS', na=False), 'id_30'] = 'iOS'
train.loc[train['id_30'].str.contains('Mac OS', na=False), 'id_30'] = 'Mac'
train.loc[train['id_30'].str.contains('Android', na=False), 'id_30'] = 'Android'
train['id_30'].fillna("NAN", inplace=True)

train['id_30'].value_counts()

NAN        512975
Windows     36739
iOS         19782
Mac         13580
Android      6303
Linux        1136
other          15
func           10
Name: id_30, dtype: int64

In [114]:
# https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt
# id_31 Browser
train.loc[train['id_31'].str.contains('chrome', na=False), 'id_31'] = 'Chrome'
train.loc[train['id_31'].str.contains('firefox', na=False), 'id_31'] = 'Firefox'
train.loc[train['id_31'].str.contains('safari', na=False), 'id_31'] = 'Safari'
train.loc[train['id_31'].str.contains('edge', na=False), 'id_31'] = 'Edge'
train.loc[train['id_31'].str.contains('ie', na=False), 'id_31'] = 'IE'
train.loc[train['id_31'].str.contains('samsung', na=False), 'id_31'] = 'Samsung'
train.loc[train['id_31'].str.contains('opera', na=False), 'id_31'] = 'Opera'
train['id_31'].fillna("NAN", inplace=True)
train.loc[train.id_31.isin(train.id_31.value_counts()[train.id_31.value_counts() < 200].index), 'id_31'] = "Others"

train['id_31'].value_counts()

NAN        450258
Chrome      76059
Safari      37281
IE          10018
Firefox      7012
Edge         6401
Samsung      2044
Others        706
Opera         449
other         312
Name: id_31, dtype: int64

In [115]:
# Drop Transaction ID, Transaction Date, year month day, year month, and date

train.drop('TransactionID', axis=1, inplace=True)
train.drop('TransactionDT', axis=1, inplace=True)
train.drop('_ymd', axis=1, inplace=True)
train.drop('_year_month', axis=1, inplace=True)
train.drop('Date', axis=1, inplace=True)

In [116]:
# Label encode the categorical features

cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 
            'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 
            'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4',
            'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2','P_emaildomain',
            'R_emaildomain', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 
            'M8', 'M9']
for col in cat_cols:
    if col in train.columns:
        le = preprocessing.LabelEncoder()
        le.fit(list(train[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))

In [117]:
# Separate Features and Labels

y = train['isFraud']
X = train.drop('isFraud', axis=1)

del train
gc.collect()

109

In [118]:
# Fill NaN with mean values of the column

X.fillna(X.mean(), inplace=True)

In [127]:
# Precision, Recall and F1 Scores

def get_scores(y_true, y_pred, auc):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred) 
    
    f1 = f1_score(y_true, y_pred)
    
    return [precision, recall, f1, auc]

def get_auc_roc(label_data, prediction_data, metrics_title):
    fpr, tpr, thresholds = metrics.roc_curve(y_train, train_predictions)
    auc = metrics.auc(fpr, tpr)

    return auc

In [124]:
X.head(10)

Unnamed: 0,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,...,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,_weekday,_hour,_day
0,68.5,4,3417,500,42,1,38,1,166,65,...,4,2,2,2,2,2,1742,4,0,1
1,29.0,4,7922,303,42,2,2,1,173,65,...,4,2,2,2,2,2,1742,4,0,1
2,59.0,4,9383,389,42,4,58,2,178,65,...,4,2,2,2,2,2,1742,4,0,1
3,50.0,4,6991,466,42,2,14,2,282,65,...,4,2,2,2,2,2,1742,4,0,1
4,50.0,1,9262,413,42,2,2,1,241,65,...,3,1,0,1,1,1,954,4,0,1
5,49.0,4,10366,454,42,4,108,2,132,65,...,4,2,2,2,2,2,1742,4,0,1
6,159.0,4,2009,259,42,4,58,2,17,65,...,4,2,2,2,2,2,1742,4,0,1
7,422.5,4,2360,389,42,4,108,2,173,65,...,4,2,2,2,2,2,1742,4,0,1
8,15.0,1,7962,0,42,4,108,2,183,65,...,2,1,0,0,1,1,1727,4,0,1
9,117.0,4,6370,10,42,2,106,2,78,65,...,4,2,2,2,2,2,1742,4,0,1


## Train and Test LightGBM models

### Cross Validation

In [128]:
%%time

# Tuning LGBM for overfitting
# The best model
params={'learning_rate': 0.03,
        'objective': 'binary',
        'metric': 'auc',
        'max_bin': 256,
        'num_leaves': 256,
        'min_data_in_leaf': 10,
        'verbose': 1,
        'random_state': 42,
        'bagging_fraction': 0.85,
        'bagging_freq': 10,
        'feature_fraction': 0.9,
        'max_depth': 128,
       }

scores = []

lgbm = lgb.LGBMClassifier(**params, n_estimators=3000)

cv = KFold(n_splits=5, random_state=42, shuffle=False)
for train_index, test_index in cv.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y[test_index]
    lgbm.fit(X_train, y_train)
    
    test_preds = lgbm.predict(X_test, num_iteration=lgbm.best_iteration_)
    test_predictions = lgbm.predict_proba(X_test, num_iteration=lgbm.best_iteration_)[:,1]
    
    auc = get_auc_roc(y_test, test_predictions, 'Test')
    scores.append(get_scores(y_test, test_preds, auc))
    
print('Scores')
for score in scores:
    print('precision: {0:.3f}, recall: {1:.3f}, f1: {2:.3f}, auc: {3:.3f}'.format(
        score[0], score[1], score[2], score[3]))
    
    
    
    

Scores
precision: 0.832, recall: 0.358, f1: 0.501, auc: 0.498
precision: 0.946, recall: 0.422, f1: 0.583, auc: 0.498
precision: 0.929, recall: 0.430, f1: 0.588, auc: 0.496
precision: 0.907, recall: 0.434, f1: 0.587, auc: 0.494
precision: 0.888, recall: 0.382, f1: 0.534, auc: 0.495
CPU times: user 11h 54min 49s, sys: 2min 47s, total: 11h 57min 37s
Wall time: 1h 54s


In [129]:
%%time

# Tuning LGBM for overfitting
# The best model
params={'learning_rate': 0.01,
        'objective': 'binary',
        'metric': 'auc',
        'max_bin': 512,
        'num_leaves': 512,
        'min_data_in_leaf': 10,
        'verbose': 1,
        'random_state': 42,
        'bagging_fraction': 1,
        'bagging_freq': 10,
        'feature_fraction': 1,
        'max_depth': 128,
       }

scores = []

lgbm = lgb.LGBMClassifier(**params, n_estimators=3000)

cv = KFold(n_splits=5, random_state=42, shuffle=False)
for train_index, test_index in cv.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y[test_index]
    lgbm.fit(X_train, y_train)
    
    train_preds = lgbm.predict(X_train, num_iteration=lgbm.best_iteration_)
    train_predictions = lgbm.predict_proba(X_train, num_iteration=lgbm.best_iteration_)[:,1]
    auc = get_auc_roc(y_train, train_predictions, 'Train')
    scores.append(get_scores(y_test, test_preds, auc))
    
    test_preds = lgbm.predict(X_test, num_iteration=lgbm.best_iteration_)
    test_predictions = lgbm.predict_proba(X_test, num_iteration=lgbm.best_iteration_)[:,1]
    auc = get_auc_roc(y_test, test_predictions, 'Test')
    scores.append(get_scores(y_test, test_preds, auc))
    
print('Scores')
for score in scores:
    print('precision: {0:.3f}, recall: {1:.3f}, f1: {2:.3f}, auc: {3:.3f}'.format(
        score[0], score[1], score[2], score[3]))
    
    
    
    

KeyboardInterrupt: 

In [60]:
# Predict Training
train_preds = lgbm.predict(X_train, num_iteration=lgbm.best_iteration_)
show_scores(y_train, train_preds)

Confusion Matrix
true positive 16420, false positive: 0
false negative: 1, true negative: 456011
Precision: 1.000, Recall: 1.000
F1 score: 1.000


In [61]:
# Predict Test
test_preds = lgbm.predict(X_test, num_iteration=lgbm.best_iteration_)
show_scores(y_test, test_preds)

Confusion Matrix
true positive 2705, false positive: 42
false negative: 1537, true negative: 113824
Precision: 0.985, Recall: 0.638
F1 score: 0.774
