In [1]:
import pandas as pd
import numpy as np

# Undersample Data

In [2]:
# Get undersampled data
un_df = pd.read_pickle('../../Data/creditcardUNDER.pkl')

In [3]:
from sklearn.model_selection import train_test_split

# Create a test set
X, y = un_df.drop('Class',1), un_df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [4]:
import xgboost as xgb

# Build XGBClassifier

xg_cls = xgb.XGBClassifier(tree_method='gpu_exact')
xg_cls.fit(X_train,y_train)

XGBpreds = xg_cls.predict(X_test)

XGBtrain_preds = xg_cls.predict(X_train)

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Build RFClassifer

clf = RandomForestClassifier()
clf.fit(X_train,y_train)


CLFpreds = clf.predict(X_test)
CLFtrain_preds = clf.predict(X_train)



In [6]:
# Accuracy tables on the training and test set for RFC

table_train = pd.crosstab(y_train, CLFtrain_preds, margins=True)
table_test = pd.crosstab(y_test, CLFpreds, margins=True)
test_score = clf.score(X_test, y_test) 
train_score = clf.score(X_train, y_train) 

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    '****************RFC****************\n'
    '********** Under Sampled **********\n'
    '***********************************\n\n'    
    'Training set accuracy:\n'
    'Percent Type I errors:  {:.8f}\n'
    'Percent Type II errors: {:.8f}\n'
    'Score: {:.8f}\n\n'
    
    'Test set accuracy:\n'
    'Percent Type I errors: {:.8f}\n'
    'Percent Type II errors: {:.8f}\n'
    'Score: {:.8f}\n\n'
    '***********************************\n'
    '* Type I errors are normal purchases that have been flagged.\n'
    '* Type II errors are frauds that are not caught!\n'
).format(train_tI_errors, train_tII_errors, train_score, test_tI_errors, test_tII_errors, test_score))

****************RFC****************
********** Under Sampled **********
***********************************

Training set accuracy:
Percent Type I errors:  0.00000000
Percent Type II errors: 0.01014493
Score: 0.98985507

Test set accuracy:
Percent Type I errors: 0.00578035
Percent Type II errors: 0.06936416
Score: 0.92485549

***********************************
* Type I errors are normal purchases that have been flagged.
* Type II errors are frauds that are not caught!



In [7]:
# Accuracy tables on the training and test set for XGB

table_train = pd.crosstab(y_train, XGBtrain_preds, margins=True)
table_test = pd.crosstab(y_test, XGBpreds, margins=True)
Xtest_score = xg_cls.score(X_test, y_test) 
Xtrain_score = xg_cls.score(X_train, y_train) 

Xtrain_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
Xtrain_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

Xtest_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
Xtest_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    '****************XGB****************\n'
    '********** Under Sampled **********\n'
    '***********************************\n\n'    
    'Training set accuracy:\n'
    'Percent Type I errors:  {:.8f}\n'
    'Percent Type II errors: {:.8f}\n'
    'Score: {:.8f}\n\n'
    
    'Test set accuracy:\n'
    'Percent Type I errors: {:.8f}\n'
    'Percent Type II errors: {:.8f}\n'
    'Score: {:.8f}\n\n'
    '***********************************\n'
    '* Type I errors are normal purchases that have been flagged.\n'
    '* Type II errors are frauds that are not caught!\n'
).format(Xtrain_tI_errors, Xtrain_tII_errors, Xtrain_score, Xtest_tI_errors, Xtest_tII_errors, Xtest_score))

****************XGB****************
********** Under Sampled **********
***********************************

Training set accuracy:
Percent Type I errors:  0.00000000
Percent Type II errors: 0.00144928
Score: 0.99855072

Test set accuracy:
Percent Type I errors: 0.00578035
Percent Type II errors: 0.04624277
Score: 0.94797688

***********************************
* Type I errors are normal purchases that have been flagged.
* Type II errors are frauds that are not caught!



In [8]:
dfvs = pd.DataFrame([[train_tI_errors, test_tI_errors, 'Train','RFC'],
                     [train_tII_errors, test_tII_errors, 'Test', 'RFC'],
                     [Xtrain_tI_errors, Xtest_tI_errors,'Train', 'XGB'],
                     [Xtrain_tII_errors, Xtest_tII_errors, 'Test', 'XGB']], 
                    columns=['Type_I', 'Type_II', 'Type', 'Model'])


In [9]:
1 - dfvs.iloc[3]['Type_II'] /dfvs.iloc[1]['Type_II']

0.33333333333333326

In [10]:
dfvs

Unnamed: 0,Type_I,Type_II,Type,Model
0,0.0,0.00578,Train,RFC
1,0.010145,0.069364,Test,RFC
2,0.0,0.00578,Train,XGB
3,0.001449,0.046243,Test,XGB


### You can see that XBG is 33% better on the test for Type II errors

# Upsample Data

In [11]:
up_df = pd.read_pickle('../../Data/dedicatedTrain.pkl')
up_dftest = pd.read_pickle('../../Data/dedicatedTest.pkl')

In [12]:
X_train, y_train = up_df.drop('Class',1), up_df['Class']
X_test, y_test = up_dftest.drop('Class',1), up_dftest['Class']

In [13]:
# Build XGBClassifier

xg_cls = xgb.XGBClassifier(tree_method='gpu_exact')
xg_cls.fit(X_train,y_train)

XGBpreds = xg_cls.predict(X_test)

XGBtrain_preds = xg_cls.predict(X_train)

In [14]:
# Build RFClassifer

clf = RandomForestClassifier()
clf.fit(X_train,y_train)


CLFpreds = clf.predict(X_test)
CLFtrain_preds = clf.predict(X_train)



In [15]:
# Accuracy tables on the training and test set for RFC

table_train = pd.crosstab(y_train, CLFtrain_preds, margins=True)
table_test = pd.crosstab(y_test, CLFpreds, margins=True)
test_score = clf.score(X_test, y_test) 
train_score = clf.score(X_train, y_train) 

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    '****************RFC****************\n'
    '********** Under Sampled **********\n'
    '***********************************\n\n'    
    'Training set accuracy:\n'
    'Percent Type I errors:  {:.8f}\n'
    'Percent Type II errors: {:.8f}\n'
    'Score: {:.8f}\n\n'
    
    'Test set accuracy:\n'
    'Percent Type I errors: {:.8f}\n'
    'Percent Type II errors: {:.8f}\n'
    'Score: {:.8f}\n\n'
    '***********************************\n'
    '* Type I errors are normal purchases that have been flagged.\n'
    '* Type II errors are frauds that are not caught!\n'
).format(train_tI_errors, train_tII_errors, train_score, test_tI_errors, test_tII_errors, test_score))

****************RFC****************
********** Under Sampled **********
***********************************

Training set accuracy:
Percent Type I errors:  0.00000176
Percent Type II errors: 0.00000000
Score: 0.99999824

Test set accuracy:
Percent Type I errors: 0.00000000
Percent Type II errors: 0.10569106
Score: 0.89430894

***********************************
* Type I errors are normal purchases that have been flagged.
* Type II errors are frauds that are not caught!



In [16]:
# Accuracy tables on the training and test set for XGB

table_train = pd.crosstab(y_train, XGBtrain_preds, margins=True)
table_test = pd.crosstab(y_test, XGBpreds, margins=True)
Xtest_score = xg_cls.score(X_test, y_test) 
Xtrain_score = xg_cls.score(X_train, y_train) 

Xtrain_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
Xtrain_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

Xtest_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
Xtest_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    '****************XGB****************\n'
    '********** Under Sampled **********\n'
    '***********************************\n\n'    
    'Training set accuracy:\n'
    'Percent Type I errors:  {:.8f}\n'
    'Percent Type II errors: {:.8f}\n'
    'Score: {:.8f}\n\n'
    
    'Test set accuracy:\n'
    'Percent Type I errors: {:.8f}\n'
    'Percent Type II errors: {:.8f}\n'
    'Score: {:.8f}\n\n'
    '***********************************\n'
    '* Type I errors are normal purchases that have been flagged.\n'
    '* Type II errors are frauds that are not caught!\n'
).format(Xtrain_tI_errors, Xtrain_tII_errors, Xtrain_score, Xtest_tI_errors, Xtest_tII_errors, Xtest_score))

****************XGB****************
********** Under Sampled **********
***********************************

Training set accuracy:
Percent Type I errors:  0.00284139
Percent Type II errors: 0.00621763
Score: 0.99094098

Test set accuracy:
Percent Type I errors: 0.00406504
Percent Type II errors: 0.06097561
Score: 0.93495935

***********************************
* Type I errors are normal purchases that have been flagged.
* Type II errors are frauds that are not caught!



In [17]:
dfvs = pd.DataFrame([[train_tI_errors, test_tI_errors, 'Train','RFC'],
                     [train_tII_errors, test_tII_errors, 'Test', 'RFC'],
                     [Xtrain_tI_errors, Xtest_tI_errors,'Train', 'XGB'],
                     [Xtrain_tII_errors, Xtest_tII_errors, 'Test', 'XGB']], 
                    columns=['Type_I', 'Type_II', 'Type', 'Model'])

In [18]:
1 - dfvs.iloc[3]['Type_II'] /dfvs.iloc[1]['Type_II']

0.42307692307692313

In [19]:
dfvs

Unnamed: 0,Type_I,Type_II,Type,Model
0,2e-06,0.0,Train,RFC
1,0.0,0.105691,Test,RFC
2,0.002841,0.004065,Train,XGB
3,0.006218,0.060976,Test,XGB


### You can see that XBG is 42% better on the test for Type II errors