In [21]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from scipy.stats import boxcox

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

## S3 bucket that holds the project data; `bucket` is reused by the cell that loads the test file
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## key (file name) of the training csv inside the bucket
file_key= 'train.csv'

## request the object from S3 and pull the 'Body' entry out of the response
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## parsing the pipe-delimited training file and previewing the first rows
train = pd.read_csv(file_content_stream, sep = '|')
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [22]:
## key (file name) of the test csv; it is read from the same `bucket` as the training file
file_key= 'test.csv'

## request the object from S3 and pull the 'Body' entry out of the response
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## parsing the pipe-delimited test file and previewing the first rows
test = pd.read_csv(file_content_stream, sep = '|')
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526


## Feature engineering: Box-Cox transformation and derived features
***

In [23]:
## feature engineering using boxcox transformation
## NOTE: this cell overwrites trustLevel in place, so run it only once per load of train/test

## fit the Box-Cox lambda on the training data only;
## boxcox returns (transformed values, fitted lambda) when no lambda is supplied
transformed_trust_level = boxcox(train['trustLevel'])
trust_level_lambda = transformed_trust_level[1]

train['trustLevel'] = transformed_trust_level[0]

## apply the SAME lambda to the test data so both frames are on one scale,
## and overwrite 'trustLevel' itself (not a new 'trust_level' column) because
## the later feature cells read test['trustLevel'] and compare it to thresholds
## expressed on the transformed scale
test['trustLevel'] = boxcox(test['trustLevel'], lmbda = trust_level_lambda)

In [24]:
## value scanned per second of scan time: grandTotal divided by totalScanTimeInSeconds
train['grandtotal_totalscantime'] = train['grandTotal'].div(train['totalScanTimeInSeconds'])

test['grandtotal_totalscantime'] = test['grandTotal'].div(test['totalScanTimeInSeconds'])

In [25]:
## product of scansWithoutRegistration and quantityModifications
train['scans_quantity'] = train['scansWithoutRegistration'].mul(train['quantityModifications'])

test['scans_quantity'] = test['scansWithoutRegistration'].mul(test['quantityModifications'])

In [26]:
## log base 10 of grandTotal
## NOTE(review): this cell emitted a numpy warning when it was run; presumably some
## grandTotal values are <= 0, which log10 maps to -inf / NaN -- confirm and decide
## how to handle those rows before the column is fed to a scaler or a model
train['grandTotal_log10'] = np.log10(train['grandTotal'])

test['grandTotal_log10'] = np.log10(test['grandTotal'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [27]:
## reciprocal of valuePerSecond (1 divided by the column)
train['1_valuePerSecond'] = train['valuePerSecond'].rdiv(1)

test['1_valuePerSecond'] = test['valuePerSecond'].rdiv(1)

In [28]:
## square of scannedLineItemsPerSecond
train['scannedLineItemsPerSecond_2'] = train['scannedLineItemsPerSecond'].pow(2)

test['scannedLineItemsPerSecond_2'] = test['scannedLineItemsPerSecond'].pow(2)

In [29]:
## product of scannedLineItemsPerSecond and valuePerSecond
train['scannedLineItemsPerSecond_valuePerSecond'] = train['scannedLineItemsPerSecond'].mul(train['valuePerSecond'])

test['scannedLineItemsPerSecond_valuePerSecond'] = test['scannedLineItemsPerSecond'].mul(test['valuePerSecond'])

In [30]:
## interaction_1: trustLevel times scannedLineItemsPerSecond
train['interaction_1'] = train['trustLevel'].mul(train['scannedLineItemsPerSecond'])
test['interaction_1'] = test['trustLevel'].mul(test['scannedLineItemsPerSecond'])

## interaction_2: squared scan rate (scannedLineItemsPerSecond_2) times trustLevel
train['interaction_2'] = train['scannedLineItemsPerSecond_2'].mul(train['trustLevel'])
test['interaction_2'] = test['scannedLineItemsPerSecond_2'].mul(test['trustLevel'])

## interaction_3: squared scan rate times the scan rate itself (i.e. its cube)
train['interaction_3'] = train['scannedLineItemsPerSecond_2'].mul(train['scannedLineItemsPerSecond'])
test['interaction_3'] = test['scannedLineItemsPerSecond_2'].mul(test['scannedLineItemsPerSecond'])


In [31]:
## binary flags built from decision tree split rules: 1 when every condition holds, else 0

## interaction_4: low trustLevel, low scan rate and short total scan time
train['interaction_4'] = np.where(train['trustLevel'].le(0.431)
                                  & train['scannedLineItemsPerSecond'].le(0.012)
                                  & train['totalScanTimeInSeconds'].le(895), 1, 0)

test['interaction_4'] = np.where(test['trustLevel'].le(0.431)
                                 & test['scannedLineItemsPerSecond'].le(0.012)
                                 & test['totalScanTimeInSeconds'].le(895), 1, 0)

## interaction_5: low trustLevel and total scan time up to 1304
## NOTE(review): the trustLevel <= 1.212 bound is implied by trustLevel <= 0.431, so it
## never changes the result -- confirm against the tree whether a different split was meant
train['interaction_5'] = np.where(train['trustLevel'].le(0.431)
                                  & train['trustLevel'].le(1.212)
                                  & train['totalScanTimeInSeconds'].le(1304), 1, 0)

test['interaction_5'] = np.where(test['trustLevel'].le(0.431)
                                 & test['trustLevel'].le(1.212)
                                 & test['totalScanTimeInSeconds'].le(1304), 1, 0)

In [32]:
## target variable: the fraud label
y = train['fraud']

## input variables: every column of train except the target
x = train.drop('fraud', axis = 1)

In [33]:
## scaling the data
scaler = MinMaxScaler()

## fit_transform returns a bare numpy array, so hand the original column names
## and row index back to the DataFrame constructor; otherwise the scaled frame
## is labeled 0..n-1 and has to be relabeled by hand
x = pd.DataFrame(scaler.fit_transform(x), columns = x.columns, index = x.index)

In [34]:
## preview train with the engineered columns (train itself is unscaled; the scaled inputs live in x)
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud,grandtotal_totalscantime,scans_quantity,grandTotal_log10,1_valuePerSecond,scannedLineItemsPerSecond_2,scannedLineItemsPerSecond_valuePerSecond,interaction_1,interaction_2,interaction_3,interaction_4,interaction_5
0,2.728286,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0,0.051898,0,1.737987,19.268739,0.000757,0.001428,0.075067,0.002065,2.082918e-05,0,0
1,1.561821,108,27.36,5,2,4,0.12963,0.253333,0.357143,0,0.253333,8,1.437116,3.947368,0.016804,0.03284,0.202458,0.026245,0.002178276,0,0
2,1.561821,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0,0.041003,50,1.793511,24.388674,7.4e-05,0.000352,0.013393,0.000115,6.305688e-07,0,0
3,3.240277,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0,0.051541,16,1.965249,19.402015,0.000262,0.000835,0.052467,0.00085,4.245289e-06,0,0
4,2.728286,430,81.53,3,7,2,0.062791,0.189605,0.111111,0,0.189605,14,1.911317,5.274132,0.003943,0.011905,0.171311,0.010757,0.0002475631,0,0


In [35]:
## label the scaled inputs with the feature names in the order the columns actually
## appear in train (every column except the 'fraud' target). Taking the names from
## train avoids a hand-typed list drifting out of order (interaction_1..3 come before
## interaction_4..5 in train), and a flat Index is assigned rather than a nested
## list, which would turn the columns into a MultiIndex
x.columns = train.drop(columns = ['fraud']).columns
              

### Recursive Feature Elimination (RFE)
***

In [37]:
## number of random train/hold-out splits over which RFE is repeated
n_runs = 10

## number of features every RFE run keeps (kept features receive rank 1);
## the same value is used for all three estimators so their rankings are comparable
n_features = 5

## creating lists to store the rankings from each run
logit_results = list()
rf_results = list()
ada_results = list()

## creating loop to run RFE n_runs times
for i in range(0, n_runs):
    
    print(i)
    
    ## splitting the data; seeding with the run number gives a different
    ## but reproducible stratified split on every iteration
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, 
                                                        random_state = i)
    
    ## running RFE with logistic regression, random forest and AdaBoost
    logit_rfe = RFE(estimator = LogisticRegression(), 
                    n_features_to_select = n_features).fit(x_train, y_train)
    rf_rfe = RFE(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = i), 
                 n_features_to_select = n_features).fit(x_train, y_train)
    ada_rfe = RFE(estimator = AdaBoostClassifier(estimator = DecisionTreeClassifier(max_depth = 3), 
                                                 n_estimators = 500, learning_rate = 0.01, 
                                                 random_state = i), 
                  n_features_to_select = n_features).fit(x_train, y_train)
    
    ## extracting rankings (one array per run, one entry per feature)
    logit_results.append(logit_rfe.ranking_)
    rf_results.append(rf_rfe.ranking_)
    ada_results.append(ada_rfe.ranking_)

    
## turning lists into dataframes: one row per run, one column per feature
rf_results = pd.DataFrame(rf_results)
rf_results.columns = x.columns

logit_results = pd.DataFrame(logit_results)
logit_results.columns = x.columns

ada_results = pd.DataFrame(ada_results)
ada_results.columns = x.columns

# (100*rf_results.apply(np.sum, axis = 0)) / (rf_results.shape[0])

0
1
2
3
4


KeyboardInterrupt: 

In [45]:
## stacking the three ranking tables row-wise into a single frame
all_md = pd.concat([rf_results, logit_results, ada_results], axis = 0)
all_md.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,1,1,2,3,4,5,1,1,1
1,1,1,2,3,4,5,1,1,1
2,1,1,3,2,4,5,1,1,1
3,1,1,2,3,4,5,1,1,1
4,1,1,3,2,4,5,1,1,1


In [46]:
## total ranking per feature across all rows, kept as a one-column frame (column label 0)
all_md = all_md.sum(axis = 0).to_frame()

In [47]:
## display the summed ranking of each feature
all_md

Unnamed: 0,0
trustLevel,36
totalScanTimeInSeconds,30
grandTotal,70
lineItemVoids,56
scansWithoutRegistration,72
quantityModifications,143
scannedLineItemsPerSecond,69
valuePerSecond,85
lineItemVoidsPerPosition,59


In [48]:
## ordering the features by their summed ranking, smallest total first
all_md = all_md.sort_values(0)

In [49]:
## display the features sorted by summed ranking
all_md

Unnamed: 0,0
totalScanTimeInSeconds,30
trustLevel,36
lineItemVoids,56
lineItemVoidsPerPosition,59
scannedLineItemsPerSecond,69
grandTotal,70
scansWithoutRegistration,72
valuePerSecond,85
quantityModifications,143


In [None]:
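## do we want to rank based on the sum or the average?
## (every feature has the same number of rankings, so the sum and the average give the same ordering)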