In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import recall_score, classification_report
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import boxcox
from sklearn.feature_selection import RFE

# Handle to the S3 bucket that stores the data files.
bucket_name = 'grant-gonnerman-data-445'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

# Object keys of the two data files inside the bucket.
file_key, file_key2 = 'train.csv', 'test.csv'

# Request both objects and keep the streaming 'Body' of each response.
file_content_stream, file_content_stream2 = (
    bucket.Object(key).get().get('Body') for key in (file_key, file_key2)
)

# Both files are pipe-delimited.
train = pd.read_csv(file_content_stream, delimiter = '|')
test = pd.read_csv(file_content_stream2, delimiter = '|')

# Preview the first rows of the training data.
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [2]:
# Box-Cox transformation of totalScanTimeInSeconds.
# The lambda is estimated on the training data only and then reused for the test
# data, so both frames are transformed with the same function. Estimating a
# separate lambda per frame would put the same column on two different scales and
# make the fixed threshold used for DT_1 below mean different things in each frame.
# NOTE: the raw column is overwritten in place, so re-running this cell without
# reloading the data applies the transformation a second time.
train_scan_time, scan_time_lambda = boxcox(train['totalScanTimeInSeconds'])
train['totalScanTimeInSeconds'] = train_scan_time
test['totalScanTimeInSeconds'] = boxcox(test['totalScanTimeInSeconds'], lmbda = scan_time_lambda)


def add_engineered_features(df):
    """Add the interaction, heredity and decision-tree features to `df` in place.

    `df` must already contain the (Box-Cox transformed) totalScanTimeInSeconds
    column together with trustLevel, grandTotal, lineItemVoids,
    scannedLineItemsPerSecond, valuePerSecond and lineItemVoidsPerPosition.
    The new columns are appended in a fixed order so that train and test end up
    with identical layouts.
    """
    # engineering interactions
    df['interaction_1'] = df['totalScanTimeInSeconds'] * df['scannedLineItemsPerSecond']
    df['interaction_2'] = df['totalScanTimeInSeconds'] * df['valuePerSecond']
    df['interaction_3'] = df['grandTotal'] * df['valuePerSecond']
    df['interaction_4'] = df['lineItemVoids'] * df['lineItemVoidsPerPosition']
    df['interaction_5'] = df['scannedLineItemsPerSecond'] * df['valuePerSecond']
    df['interaction_6'] = df['scannedLineItemsPerSecond'] * df['lineItemVoidsPerPosition']

    # engineering by strong heredity (products that reuse interaction_1)
    df['heredity_1'] = df['trustLevel'] * df['interaction_1']
    df['heredity_2'] = df['trustLevel'] * df['scannedLineItemsPerSecond']
    df['heredity_3'] = df['interaction_1'] * df['scannedLineItemsPerSecond']

    # engineering by decision tree: 1 when all three conditions hold, else 0
    # NOTE(review): the cut-offs are hard-coded; presumably they come from a
    # previously fitted tree on the transformed scale -- confirm.
    df['DT_1'] = np.where((df['trustLevel'] <= 1.5) & (df['heredity_1'] <= 4.412) & (df['totalScanTimeInSeconds'] <= 281.406), 1, 0)


# apply exactly the same feature engineering to both frames
for frame in (train, test):
    add_engineered_features(frame)

In [None]:
# defining input and target variables
x = train.drop(columns = 'fraud')
y = train['fraud']

# lists to store supports (one RFE selection mask per iteration and model)
logit_support, rf_support, ada_support = list(), list(), list()

for i in range(0,100):
    print(i)
    # splitting the data (stratified 80/20); the iteration index is used as the
    # seed so that re-running the cell reproduces the same 100 splits
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = i)

    ## Defining scaler for logistic regression
    scaler = MinMaxScaler()
    ## Scaling the data: the scaler is fitted on the training split only and the
    ## hold-out split is transformed with those training min/max values
    x_train_logit = pd.DataFrame(scaler.fit_transform(x_train), columns = x.columns, index = x_train.index)
    x_test_logit = pd.DataFrame(scaler.transform(x_test), columns = x.columns, index = x_test.index)

    ## Running RFE with logistic regression (on the scaled inputs)
    logit_rfe = RFE(estimator = LogisticRegression(), n_features_to_select = 5).fit(x_train_logit, y_train)
    logit_support.append(logit_rfe.support_)

    ## Running RFE with Random forest model
    rf_rfe = RFE(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = i),
                 n_features_to_select = 5).fit(x_train, y_train)
    rf_support.append(rf_rfe.support_)

    ## Running RFE with AdaBoost model (depth-3 trees as base estimator)
    ada_rfe = RFE(estimator = AdaBoostClassifier(estimator = DecisionTreeClassifier(max_depth = 3),
                                                n_estimators = 500, learning_rate = .01, random_state = i),
                  n_features_to_select = 5).fit(x_train, y_train)
    ada_support.append(ada_rfe.support_)

# one row per iteration, one column per candidate feature
logit_results = pd.DataFrame(logit_support, columns = x.columns)

rf_results = pd.DataFrame(rf_support, columns = x.columns)

ada_results = pd.DataFrame(ada_support, columns = x.columns)


In [5]:
# logistic regression: percentage of rows in logit_results in which each feature is flagged
logit_results.sum(axis = 0).mul(100).div(len(logit_results))

trustLevel                   100.0
totalScanTimeInSeconds       100.0
grandTotal                     0.0
lineItemVoids                  1.0
scansWithoutRegistration      61.0
quantityModifications          0.0
scannedLineItemsPerSecond      0.0
valuePerSecond                 0.0
lineItemVoidsPerPosition      38.0
interaction_1                100.0
interaction_2                  0.0
interaction_3                  0.0
interaction_4                  0.0
interaction_5                  0.0
interaction_6                  0.0
heredity_1                     0.0
heredity_2                     0.0
heredity_3                     0.0
DT_1                         100.0
dtype: float64

In [6]:
# random forest: percentage of rows in rf_results in which each feature is flagged
rf_results.sum(axis = 0).mul(100).div(len(rf_results))

trustLevel                   100.0
totalScanTimeInSeconds         2.0
grandTotal                     0.0
lineItemVoids                  0.0
scansWithoutRegistration       0.0
quantityModifications          0.0
scannedLineItemsPerSecond      0.0
valuePerSecond                 0.0
lineItemVoidsPerPosition       0.0
interaction_1                100.0
interaction_2                  0.0
interaction_3                  0.0
interaction_4                  0.0
interaction_5                  0.0
interaction_6                  0.0
heredity_1                   100.0
heredity_2                   100.0
heredity_3                    98.0
DT_1                           0.0
dtype: float64

In [7]:
# AdaBoost: percentage of rows in ada_results in which each feature is flagged
ada_results.sum(axis = 0).mul(100).div(len(ada_results))

trustLevel                     9.0
totalScanTimeInSeconds        92.0
grandTotal                     7.0
lineItemVoids                 39.0
scansWithoutRegistration      66.0
quantityModifications          0.0
scannedLineItemsPerSecond      0.0
valuePerSecond                 4.0
lineItemVoidsPerPosition       6.0
interaction_1                100.0
interaction_2                  1.0
interaction_3                  0.0
interaction_4                  9.0
interaction_5                 14.0
interaction_6                 16.0
heredity_1                    46.0
heredity_2                    91.0
heredity_3                     0.0
DT_1                           0.0
dtype: float64

In [27]:
# features by percentage of importance across all 3 models
# The denominator is the total number of rows in the three result frames, so it
# stays correct if the number of iterations or models changes.
total_runs = logit_results.shape[0] + rf_results.shape[0] + ada_results.shape[0]
(100*logit_results.sum(axis = 0)/total_runs) + (100*rf_results.sum(axis = 0)/total_runs) + (100*ada_results.sum(axis = 0)/total_runs)

trustLevel                    69.666667
totalScanTimeInSeconds        64.666667
grandTotal                     2.333333
lineItemVoids                 13.333333
scansWithoutRegistration      42.333333
quantityModifications          0.000000
scannedLineItemsPerSecond      0.000000
valuePerSecond                 1.333333
lineItemVoidsPerPosition      14.666667
interaction_1                100.000000
interaction_2                  0.333333
interaction_3                  0.000000
interaction_4                  3.000000
interaction_5                  4.666667
interaction_6                  5.333333
heredity_1                    48.666667
heredity_2                    63.666667
heredity_3                    32.666667
DT_1                          33.333333
dtype: float64

In [None]:
# From the results above, the top 5 features across all 3 models are: trustLevel, totalScanTimeInSeconds, interaction_1, heredity_1, and heredity_2.