# Importing the Data

In [9]:
import re

# to handle datasets
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import StandardScaler

# to build the models
from sklearn.pipeline import make_pipeline 

# import datetime
import datetime

# encoders
from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# model creation
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV 
import pickle

# show every column (and at most 100 rows) whenever a DataFrame is displayed;
# set_option is called on the top-level pandas namespace, its documented home
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [10]:
# Read the raw challenge data set (per the original note, it is openly
# available online) and preview its first five rows
df = pd.read_csv(filepath_or_buffer='2019_Winter_Data_Science_Intern_Challenge_Data_Set.csv')
df.head(n=5)

Unnamed: 0,order_id,shop_id,user_id,order_amount,total_items,payment_method,created_at
0,1,53,746,224,2,cash,2017-03-13 12:36:56
1,2,92,925,90,1,cash,2017-03-03 17:38:52
2,3,44,861,144,1,cash,2017-03-14 4:23:56
3,4,18,935,156,1,credit_card,2017-03-26 12:43:37
4,5,18,883,156,1,credit_card,2017-03-01 4:35:11


In [None]:
# Count how often each order_id occurs, then count those frequencies.
# A result with the single row "1 -> <number of rows>" means every order_id
# is unique, so the column could be dropped without losing information.
df['order_id'].value_counts().value_counts()

# App Question

Question 1: Given some sample data, write a program to answer the following

On Shopify, we have exactly 100 sneaker shops, and each of these shops sells only one model of shoe. We want to do some analysis of the average order value (AOV). When we look at orders data over a 30 day window, we naively calculate an AOV of $3145.13. Given that we know these shops are selling sneakers, a relatively affordable item, something seems wrong with our analysis. 

Think about what could be going wrong with our calculation. Think about a better way to evaluate this data. 
What metric would you report for this dataset?
What is its value?


# Data Exploration

In [14]:
def wrangle(path):
    """Load the orders CSV and return it in chronological order.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file's rows with ``created_at`` converted to datetimes and the
        rows sorted by ``created_at`` (ascending). The original row labels
        are kept, so the index is no longer monotonically increasing.
    """
    return (
        pd.read_csv(path)
        # parse the timestamp strings so the sort below is chronological
        .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
        .sort_values(by='created_at')
    )

In [15]:
# Reload the data set through wrangle() so that created_at is parsed as a
# datetime and the rows are ordered chronologically; this rebinds `df`.
path = '2019_Winter_Data_Science_Intern_Challenge_Data_Set.csv'
df = wrangle(path)
df.head()

Unnamed: 0,order_id,shop_id,user_id,order_amount,total_items,payment_method,created_at
1862,1863,39,738,536,4,cash,2017-03-01 00:08:09
1741,1742,39,910,268,2,cash,2017-03-01 00:10:19
3228,3229,97,912,324,2,cash,2017-03-01 00:14:12
1267,1268,80,798,290,2,credit_card,2017-03-01 00:19:31
2689,2690,49,799,258,2,credit_card,2017-03-01 00:22:25


In [13]:
# Column dtypes and non-null counts: a quick check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   order_id        5000 non-null   int64         
 1   shop_id         5000 non-null   int64         
 2   user_id         5000 non-null   int64         
 3   order_amount    5000 non-null   int64         
 4   total_items     5000 non-null   int64         
 5   payment_method  5000 non-null   object        
 6   created_at      5000 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 273.6+ KB


In [17]:
# Last five rows; when df comes from wrangle() it is sorted by created_at,
# so these are the latest orders in the window
df.tail()

Unnamed: 0,order_id,shop_id,user_id,order_amount,total_items,payment_method,created_at
2630,2631,53,940,112,1,credit_card,2017-03-30 23:12:13
1685,1686,34,818,244,2,cash,2017-03-30 23:16:10
1474,1475,21,815,142,1,cash,2017-03-30 23:26:54
317,318,52,848,292,2,cash,2017-03-30 23:41:34
2457,2458,95,700,168,1,credit_card,2017-03-30 23:55:35


In [4]:
# Number of rows for each distinct payment_method value
df['payment_method'].value_counts()

credit_card    1735
debit          1671
cash           1594
Name: payment_method, dtype: int64

In [23]:
# Order the rows from the smallest to the largest order_amount so both
# extremes are visible in the (truncated) display
df.sort_values(by='order_amount')

Unnamed: 0,order_id,shop_id,user_id,order_amount,total_items,payment_method,created_at
4760,4761,92,937,90,1,debit,2017-03-20 07:37:28
1843,1844,92,987,90,1,debit,2017-03-06 07:01:09
2092,2093,92,986,90,1,debit,2017-03-04 06:44:05
4414,4415,92,927,90,1,credit_card,2017-03-17 09:57:01
228,229,92,757,90,1,debit,2017-03-13 23:57:51
...,...,...,...,...,...,...,...
4868,4869,42,607,704000,2000,credit_card,2017-03-22 04:00:00
15,16,42,607,704000,2000,credit_card,2017-03-07 04:00:00
1602,1603,42,607,704000,2000,credit_card,2017-03-17 04:00:00
2969,2970,42,607,704000,2000,credit_card,2017-03-28 04:00:00


In [None]:
# Isolate the very large orders: rows whose order_amount exceeds $10,000
# (original note: these all came from two different stores)
df.loc[df['order_amount'] > 10000]

# Using Interquartile Range to Find Outliers

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max); the 25% and 75%
# rows of order_amount are the quartiles behind the IQR outlier bounds
df.describe()

Unnamed: 0,order_id,shop_id,user_id,order_amount,total_items
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,50.0788,849.0924,3145.128,8.7872
std,1443.520003,29.006118,87.798982,41282.539349,116.32032
min,1.0,1.0,607.0,90.0,1.0
25%,1250.75,24.0,775.0,163.0,1.0
50%,2500.5,50.0,849.0,284.0,2.0
75%,3750.25,75.0,925.0,390.0,3.0
max,5000.0,100.0,999.0,704000.0,2000.0


In [22]:
# Take the 25% and 75% quantiles of order_amount directly from the data
# instead of copying them by hand from the describe() output, so the bounds
# stay correct if the data set (or an upstream filter) changes
Q1_Order_Amount_Outlier = df['order_amount'].quantile(0.25)
Q3_Order_Amount_Outlier = df['order_amount'].quantile(0.75)
IQR_Order_Amount = Q3_Order_Amount_Outlier - Q1_Order_Amount_Outlier

# Outlier fences: anything further than 1.5 * IQR below Q1 or above Q3
Minimum_Outlier = Q1_Order_Amount_Outlier - (1.5*IQR_Order_Amount)
Maximum_Outlier = Q3_Order_Amount_Outlier + (1.5*IQR_Order_Amount)

print('Minimum Outlier: ', Minimum_Outlier)
print('Maximum Outlier: ', Maximum_Outlier)

Minimum Outlier:  -177.5
Maximum Outlier:  730.5


In [24]:
# The lower fence is negative, so only the upper fence is applied:
# the mask is True for rows at or below Maximum_Outlier (i.e. the rows to keep)
Outlier_Condition = df['order_amount'].le(Maximum_Outlier)

# frame restricted to the rows that passed the check (outliers excluded)
df_no_outlier = df.loc[Outlier_Condition]

# sanity check: the largest remaining order amounts must not exceed the fence
df_no_outlier.sort_values(by='order_amount').tail()

Unnamed: 0,order_id,shop_id,user_id,order_amount,total_items,payment_method,created_at
3824,3825,43,736,724,4,credit_card,2017-03-25 17:34:51
2786,2787,43,876,724,4,credit_card,2017-03-01 01:41:43
2312,2313,79,745,724,4,debit,2017-03-27 09:26:31
4659,4660,47,894,725,5,cash,2017-03-01 04:49:59
1124,1125,52,994,730,5,credit_card,2017-03-07 06:54:05


In [25]:
# Summary statistics after outlier removal, for comparison with the
# describe() output of the full data set
df_no_outlier.describe()

Unnamed: 0,order_id,shop_id,user_id,order_amount,total_items
count,4859.0,4859.0,4859.0,4859.0,4859.0
mean,2497.395966,49.852645,849.905742,293.715374,1.950196
std,1443.356555,29.049171,86.887496,144.453395,0.919791
min,1.0,1.0,700.0,90.0,1.0
25%,1244.5,24.0,776.0,162.0,1.0
50%,2498.0,50.0,850.0,280.0,2.0
75%,3749.5,74.0,925.0,380.0,3.0
max,5000.0,100.0,999.0,730.0,5.0


# Average Order Value of Each Store

In [30]:
# Frequency-of-frequencies check: a single row "1 -> <number of rows>" means
# every order_id appears exactly once.
# NOTE(review): this cell does not compute a per-store average order value,
# although it sits under the "Average Order Value of Each Store" heading — TODO add it.
df['order_id'].value_counts().value_counts()

1    5000
Name: order_id, dtype: int64

# Answer

At first glance, the average (mean) order amount is so high because some values are extreme outliers (for example, there are orders of 2,000 items for a total amount of $704,000).

One solution is to report the median instead of the mean, since the median is far less sensitive to outliers. I found the median to be $284, which is a much more believable amount.

Another solution is to identify the outliers in the order amount and create a new dataset with them removed, since the outliers are outrageous in amount and are either a mistake or an unusually large type of transaction (e.g. a warehouse inventory order). After removing the outliers, I found the average (mean) order amount to be ~$294. Notably, the median barely changed, with the median order amount being $280. Therefore I'm pretty confident in using $294 as the average order amount.

However, since each store only sells one product, it might make sense to calculate the average order value for each store individually.