In [36]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
import precision_recall_cutoff

# Reading the csv file
train = pd.read_csv('train_dataset.csv')

# Defining the file to be read from s3 bucket
test = pd.read_csv('test_dataset.csv')
test = test.dropna()
test.shape

(498121, 44)

In [37]:
train.shape

(1879, 45)

In [38]:
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud,interaction_1,interaction_2,interaction_3,interaction_4,average_seconds_per_item,trustLevel_1,trustLevel_2,trustLevel_3,trustLevel_4,trustLevel_5,trustLevel_6,totalScanTimeInSeconds_0_1,grandTotal_0_1,lineItemVoids_0_1,scansWithoutRegistration_0_1,quantityModifications_0_1,scannedLineItemsPerSecond_0_1,valuePerSecond_0_1,lineItemVoidsPerPosition_0_1,average_seconds_per_item_01,Labels,Labels_0,Labels_1,Labels_2,Labels_3,Labels_4,Labels_5,interaction_5,interaction_6,interaction_7,interaction_8,heredity_1,heredity_2,heredity_3,interaction_9
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0,0,0,0,1,19.27,0,0,0,0,1,0,0.575178,0.547174,0.636364,0.0,0.6,0.004045,0.00137,0.021944,0.000131,0,1,0,0,0,0,0,0.0,0.002427,0.314722,0,0,0,0,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0,0,0,0,1,3.95,0,0,1,0,0,0,0.057955,0.273637,0.454545,0.2,0.8,0.019364,0.006689,0.032468,2.7e-05,1,0,1,0,0,0,0,0.090909,0.015491,0.015859,0,0,0,3,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0,0,0,0,1,24.39,0,0,1,0,0,0,0.827775,0.621811,0.272727,1.0,1.0,0.001204,0.001083,0.020979,0.000165,5,0,0,0,0,0,1,0.272727,0.001204,0.514719,0,0,0,15,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0,0,0,0,1,19.4,0,0,0,0,0,1,0.97813,0.923462,0.727273,0.4,0.8,0.002347,0.001361,0.025078,0.000132,2,0,0,1,0,0,0,0.290909,0.001877,0.903266,0,0,0,12,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0,0,0,0,1,5.27,0,0,0,0,1,0,0.234008,0.815608,0.272727,0.7,0.4,0.009337,0.005007,0.010101,3.6e-05,4,0,0,0,0,1,0,0.190909,0.003735,0.190858,0,0,0,20,0


In [57]:
# Defining input and target variables
X = train[['totalScanTimeInSeconds', 'interaction_1', 'interaction_4', 'heredity_1', 'interaction_9', 'heredity_2', 'interaction_5']]
Y = train['fraud']

# Splitting the data into train, test, and validation
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2)
X_val, X_test, Y_val, Y_test = train_test_split(X_val, Y_val, test_size = 0.2)


# Scaling the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns = X.columns)

X_test = scaler.fit_transform(X_test)
X_test = pd.DataFrame(X_test, columns = X.columns)

X_val = scaler.fit_transform(X_val)
X_val = pd.DataFrame(X_val, columns = X.columns)


X_val

Unnamed: 0,totalScanTimeInSeconds,interaction_1,interaction_4,heredity_1,interaction_9,heredity_2,interaction_5
0,0.195068,0.0,0.0,0.0,0.0,0.0,0.381818
1,0.327123,0.0,1.0,0.0,0.0,0.0,0.109091
2,0.551233,0.0,1.0,0.0,0.0,0.0,0.054545
3,1.000000,0.0,0.0,0.0,0.0,0.0,0.272727
4,0.741370,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...
295,0.350685,0.0,1.0,0.0,0.0,0.0,0.909091
296,0.757808,1.0,0.0,1.0,1.0,1.0,0.363636
297,0.939726,0.0,0.0,0.0,0.0,0.0,0.072727
298,0.076164,0.0,1.0,0.0,0.0,0.0,0.545455


In [59]:
# Defining top 7, 6, and 5 variables
# Train dataset
X_train_7 = X_train
X_train_6 = X_train.drop(['interaction_5'], axis = 1)
X_train_5 = X_train.drop(['interaction_5', 'heredity_2'], axis = 1)

# Test dataset
X_test_7 = X_test
X_test_6 = X_test[['totalScanTimeInSeconds', 'interaction_1', 'interaction_4', 'heredity_1', 'interaction_9', 'heredity_2']]
X_test_5 = X_test[['totalScanTimeInSeconds', 'interaction_1', 'interaction_4', 'heredity_1', 'interaction_9']]

# Validation dataset
X_val_7 = X_val
X_val_6 = X_val[['totalScanTimeInSeconds', 'interaction_1', 'interaction_4', 'heredity_1', 'interaction_9', 'heredity_2']]
X_val_5 = X_val[['totalScanTimeInSeconds', 'interaction_1', 'interaction_4', 'heredity_1', 'interaction_9']]


### Random Forest Model

### Fitting the model with the top 5 features

In [None]:
## Defining the hyper-parameters for RF
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15], 
                 'min_samples_leaf': [5, 7], 
                 'max_depth' : [3, 5, 7]}

# Defining customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# Performing GridSearch
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train_5, Y_train)

# Extracting the best model
RF_model = RF_grid_search.best_estimator_

# Predicting on validation and test
RF_val_pred = RF_model.predict(X_val_5)
RF_test_pred = RF_model.predict(X_test_6)

### Fitting the model with the top 6 features

In [None]:
# Defining customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# Performing GridSearch
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train_6, Y_train)

# Extracting the best model
RF_model = RF_grid_search.best_estimator_

# Predicting on validation and test
RF_val_pred = RF_model.predict(X_val_6)
RF_test_pred = RF_model.predict(X_test_6)

### Fitting the model with the top 7 features

In [None]:
# Defining customized scoring function
my_score_function = make_scorer(cost_function, greater_is_better = True, needs_proba = True)

# Performing GridSearch
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = my_score_function, n_jobs = -1).fit(X_train_7, Y_train)

# Extracting the best model
RF_model = RF_grid_search.best_estimator_

# Predicting on validation and test
RF_val_pred = RF_model.predict(X_val_7)
RF_test_pred = RF_model.predict(X_test_7)

### Support Vector Machine

### Fitting the model with the top 5 features

In [None]:
## Defining the hyper-parameters for svm
svm_param_grid = {'kernel': ['rbf', 'poly', 'sigmoid'], 
                  'C': [0.01, 0.1, 1, 10],
                  'gamma': [0.001, 0.01, 0.1, 1]}


svm_grid_search = GridSearchCV(SVR(), svm_param_grid, cv = 3, scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_train_5, Y_train)

# Extracting the best model
svm_md = svm_grid_search.best_estimator_

# Predicting on validation and test
svm_val_pred = svm_md.predict(X_val_5)
svm_test_pred = svm_md.predict(X_test_5)

### Fitting the model with the top 6 features

In [None]:
svm_grid_search = GridSearchCV(SVR(), svm_param_grid, cv = 3, scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_train_6, Y_train)

# Extracting the best model
svm_md = svm_grid_search.best_estimator_

# Predicting on validation and test
svm_val_pred = svm_md.predict(X_val_6)
svm_test_pred = svm_md.predict(X_test_6)

### Fitting the model with the top 7 features

In [None]:
svm_grid_search = GridSearchCV(SVR(), svm_param_grid, cv = 3, scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_train_7, Y_train)

# Extracting the best model
svm_md = svm_grid_search.best_estimator_

# Predicting on validation and test
svm_val_pred = svm_md.predict(X_val_7)
svm_test_pred = svm_md.predict(X_test_7)

### AdaBoost

### Fitting the model with the top 5 features

In [None]:
## Defining the hyper-parameters for RF
Ada_param_grid = {'n_estimators': [100, 300, 500],
                 'base_estimator__min_samples_split': [10, 15], 
                 'base_estimator__min_samples_leaf': [5, 7], 
                 'base_estimator__max_depth' : [3, 5, 7],
                 'learning_rate': [0.001, 0.01, 0.1]}

## Running grid search with 3 fold
Ada_grid_search = GridSearchCV(AdaBoostClassifier(base_estimator = DecisionTreeClassifier()), Ada_param_grid, cv = 3, scoring = 'f1', n_jobs = -1).fit(X_train_5, Y_train)

# Extracting the best model
ada_md = Ada_grid_search.best_estimator_

# Predicting on validation and test
ada_val_pred = ada_md.predict(X_val_5)
ada_test_pred = ada_md.predict(X_test_5)

### Fitting the model with the top 6 features

In [None]:
## Running grid search with 3 fold
Ada_grid_search = GridSearchCV(AdaBoostClassifier(base_estimator = DecisionTreeClassifier()), Ada_param_grid, cv = 3, scoring = 'f1', n_jobs = -1).fit(X_train_6, Y_train)

# Extracting the best model
ada_md = Ada_grid_search.best_estimator_

# Predicting on validation and test
ada_val_pred = ada_md.predict(X_val_6)
ada_test_pred = ada_md.predict(X_test_6)

### Fitting the model with the top 7 features

In [None]:
## Running grid search with 3 fold
Ada_grid_search = GridSearchCV(AdaBoostClassifier(base_estimator = DecisionTreeClassifier()), Ada_param_grid, cv = 3, scoring = 'f1', n_jobs = -1).fit(X_train_7, Y_train)

# Extracting the best model
ada_md = Ada_grid_search.best_estimator_

# Predicting on validation and test
ada_val_pred = ada_md.predict(X_val_7)
ada_test_pred = ada_md.predict(X_test_7)