In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
import precision_recall_cutoff


# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabriel-predictive-analytics'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key_train = "train.csv"
bucket_object = bucket.Object(file_key_train)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
train = pd.read_csv(file_content_stream, sep="|")

# Defining the file to be read from s3 bucket
file_key_test = "test.csv"
bucket_object_test = bucket.Object(file_key_test)
file_object_test = bucket_object_test.get()
file_content_stream_test = file_object_test.get('Body')

# Reading the csv file
test = pd.read_csv(file_content_stream_test, sep='|')
test = test.dropna()

In [2]:
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526


In [3]:
## Creating interactions/features from the decision tree
# interaction 1
test['interaction_1'] = np.where((test['trustLevel'] <= 1.5) & 
                                     (test['scannedLineItemsPerSecond'] > 0.012), 1, 0)

# interaction 2
test['interaction_2'] = np.where((test['trustLevel'] <= 1.5) & 
                                     (test['scannedLineItemsPerSecond'] <= 0.012), 1, 0)

# interaction 3
test['interaction_3'] = np.where((test['trustLevel'] > 1.5) & 
                                     (test['trustLevel'] <= 2.5), 1, 0)

# interaction 4
test['interaction_4'] = np.where((test['trustLevel'] > 2.5), 1, 0)
                                  
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,interaction_1,interaction_2,interaction_3,interaction_4
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429,0,0,0,1
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259,0,0,0,1
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0,0,1,0,0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857,0,0,0,1
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526,0,0,0,1


In [4]:
test['average_seconds_per_item'] = round(test['totalScanTimeInSeconds'] / test['grandTotal'], 1)
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,interaction_1,interaction_2,interaction_3,interaction_4,average_seconds_per_item
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429,0,0,0,1,5.3
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259,0,0,0,1,17.0
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0,0,1,0,0,11.6
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857,0,0,0,1,6.3
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526,0,0,0,1,21.1


In [5]:
## Changing sales to dummy variables
test = pd.concat([test, pd.get_dummies(test['trustLevel'], prefix= 'trustLevel')], axis = 1)


test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,interaction_1,interaction_2,interaction_3,interaction_4,average_seconds_per_item,trustLevel_1,trustLevel_2,trustLevel_3,trustLevel_4,trustLevel_5,trustLevel_6
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429,0,0,0,1,5.3,0,0,0,1,0,0
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259,0,0,0,1,17.0,0,0,1,0,0,0
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0,0,1,0,0,11.6,1,0,0,0,0,0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857,0,0,0,1,6.3,0,0,0,0,1,0
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526,0,0,0,1,21.1,0,0,0,0,1,0


In [6]:
test.columns

Index(['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition', 'interaction_1', 'interaction_2',
       'interaction_3', 'interaction_4', 'average_seconds_per_item',
       'trustLevel_1', 'trustLevel_2', 'trustLevel_3', 'trustLevel_4',
       'trustLevel_5', 'trustLevel_6'],
      dtype='object')

In [7]:
## Standardizing variables

In [10]:
scaler = MinMaxScaler()
test = test.dropna()
test[['totalScanTimeInSeconds_0_1', 'grandTotal_0_1', 'lineItemVoids_0_1',
      'scansWithoutRegistration_0_1', 'quantityModifications_0_1', 'scannedLineItemsPerSecond_0_1',
     'valuePerSecond_0_1', 'lineItemVoidsPerPosition_0_1']] = scaler.fit_transform(test[['totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
                                                                                                                                 'scansWithoutRegistration', 'quantityModifications', 'scannedLineItemsPerSecond',
                                                                                                                                 'valuePerSecond', 'lineItemVoidsPerPosition']])
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,interaction_1,interaction_2,interaction_3,interaction_4,average_seconds_per_item,trustLevel_1,trustLevel_2,trustLevel_3,trustLevel_4,trustLevel_5,trustLevel_6,totalScanTimeInSeconds_0_1,grandTotal_0_1,lineItemVoids_0_1,scansWithoutRegistration_0_1,quantityModifications_0_1,scannedLineItemsPerSecond_0_1,valuePerSecond_0_1,lineItemVoidsPerPosition_0_1
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429,0,0,0,1,5.3,0,0,0,1,0,0,0.254645,0.884888,0.363636,0.8,0.8,0.000481,0.0019,0.051948
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259,0,0,0,1,17.0,0,0,1,0,0,0,0.548087,0.589959,0.636364,0.6,0.2,0.000878,0.000589,0.023569
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0,0,1,0,0,11.6,1,0,0,0,0,0,0.087978,0.140014,0.363636,0.5,0.8,0.000188,0.000867,0.363636
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857,0,0,0,1,6.3,0,0,0,0,1,0,0.290164,0.847985,0.818182,0.3,0.8,0.000859,0.001598,0.058442
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526,0,0,0,1,21.1,0,0,0,0,1,0,0.485792,0.421642,0.363636,0.0,0.0,0.000693,0.000475,0.019139


In [13]:
test.columns

Index(['trustLevel', 'totalScanTimeInSeconds', 'grandTotal', 'lineItemVoids',
       'scansWithoutRegistration', 'quantityModifications',
       'scannedLineItemsPerSecond', 'valuePerSecond',
       'lineItemVoidsPerPosition', 'interaction_1', 'interaction_2',
       'interaction_3', 'interaction_4', 'average_seconds_per_item',
       'trustLevel_1', 'trustLevel_2', 'trustLevel_3', 'trustLevel_4',
       'trustLevel_5', 'trustLevel_6', 'totalScanTimeInSeconds_0_1',
       'grandTotal_0_1', 'lineItemVoids_0_1', 'scansWithoutRegistration_0_1',
       'quantityModifications_0_1', 'scannedLineItemsPerSecond_0_1',
       'valuePerSecond_0_1', 'lineItemVoidsPerPosition_0_1'],
      dtype='object')

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score

X = test[['totalScanTimeInSeconds_0_1', 'grandTotal_0_1', 'lineItemVoids_0_1', 'scansWithoutRegistration_0_1', 
       'quantityModifications_0_1', 'scannedLineItemsPerSecond_0_1', 'valuePerSecond_0_1', 'lineItemVoidsPerPosition_0_1',
        'trustLevel_1', 'trustLevel_2', 'trustLevel_3', 'trustLevel_4', 'trustLevel_5', 'trustLevel_6']]

# Defining the number of clusters to be considered
krange = list(range(2, 20))

# Defining list to store scores
CH_scores, DB_scores, St_scores = list(), list(), list()

for clusters in krange:
    
    # Defining the k-means model
    cluster_md = KMeans(n_clusters = clusters, n_init = 20).fit(X)
    cluster_assignments = cluster_md.labels_
    
    # Extracting the CH, DB, and St scores
    CH = calinski_harabasz_score(X, cluster_assignments)
    DB = davies_bouldin_score(X, cluster_assignments)
    St = silhouette_score(X, cluster_assignments)
    
    # Appending scores
    CH_scores.append(CH)
    DB_scores.append(DB)
    St_scores.append(St)
    
# Visualizing the scores
## calinski_harabasz_score
fig, axs = plt.subplots(1, 3, figsize = (15, 6))
axs[0].plot(krange, CH_scores)
axs[0].set_xlabel('Number of Clusters (K)')
axs[0].set_ylabel('CH_score')
axs[0].grid()

## davies_bouldin_score
axs[1].plot(krange, DB_scores)
axs[1].set_xlabel('Number of Clusters (K)')
axs[1].set_ylabel('DB_scores')
axs[1].grid()

## silhouette_score
axs[2].plot(krange, St_scores)
axs[2].set_xlabel('Number of Clusters (K)')
axs[2].set_ylabel('St_scores')
axs[2].grid()


### From the visualization, six clusters seems to be the appropriate number of clusters for the data.

### Labeling observations with KMeans model

In [None]:
cluster_md = KMeans(n_clusters = 6, n_init = 20).fit(X)
test['Labels'] = cluster_md.labels_
test.head()

In [None]:
## Changing sales to dummy variables
test = pd.concat([test, pd.get_dummies(test['Labels'], prefix= 'Labels')], axis = 1)

test.head()

In [None]:
cluster_md.cluster_centers_

In [None]:
cluster_md.labels_

In [None]:
test['interaction_5'] = test['lineItemVoids_0_1'] * test['scansWithoutRegistration_0_1']
test['interaction_6'] = test['quantityModifications_0_1'] * test['scannedLineItemsPerSecond_0_1']
test['interaction_7'] = test['totalScanTimeInSeconds_0_1'] * test['grandTotal_0_1']

test.head()

In [None]:
test['interaction_8'] = np.where((test['Labels'] == 1) & (test['trustLevel'] == 1), 1, 0)
test.head()