# Contents
1. Imports
2. Checks
3. Wrangling
4. Cleaning
5. Exports

# 1. Imports

In [35]:
#Libraries
import pandas as pd
import numpy as np
import os

In [36]:
#Path - project root folder that holds the '02 - Data' directory
#Set the ONLINE_GROCERY_STORE_PATH environment variable to run the notebook on another machine;
#when it is not set, the original local folder is used
path = os.environ.get('ONLINE_GROCERY_STORE_PATH',
                      r'/Users/davidgriesel/Documents/0 - Analytics Projects/Online Grocery Store')

[Link to data dictionary](https://gist.github.com/jeremystan/c3b39d947d9b88b3ccff3147dbcf6c6b)
- eval_set variable not relevant to analysis

In [37]:
#Columns to load from orders.csv (eval_set is left out as it is not relevant to the analysis)
list_vars = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]

In [38]:
#Display list to confirm the variables that will be imported
list_vars

['order_id',
 'user_id',
 'order_number',
 'order_dow',
 'order_hour_of_day',
 'days_since_prior_order']

In [39]:
#Dataset - read orders.csv from the Original Data folder, keeping only the columns named in list_vars
orders_csv = os.path.join(path, '02 - Data', 'Original Data', 'orders.csv')
df_orders = pd.read_csv(orders_csv, usecols = list_vars)

# 2. Checks

In [40]:
#Dimensions - (rows, columns) of the imported dataset
df_orders.shape

(3421083, 6)

In [41]:
#Preview - first five rows of the imported dataset
df_orders.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [42]:
#Summary stats - count, mean, std, min, quartiles and max for each variable
#A count below the number of rows points to missing values in that variable
df_orders.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


##### Observations:
- Dataset contains 3,421,083 records
- Possible missing values in the days_since_prior_order variable (count of 3,214,874 is lower than the 3,421,083 records)

# 3. Wrangling

## 3.1. Drop Columns

### 3.1.1. Identify Redundant Columns

In [43]:
#Check frequency distribution of order_id (dropna = False counts NaN as a value)
df_orders['order_id'].value_counts(dropna = False)

order_id
2539329    1
1591157    1
1354759    1
1971373    1
1558866    1
          ..
3266950    1
118963     1
9433       1
2938641    1
272231     1
Name: count, Length: 3421083, dtype: int64

##### Observations:
- 3,421,083 values ranging between 1 to 3,421,083
- With 3,421,083 records, each value occurring once, the values are unique and likely sequential, suggesting the dataset is complete
- Retain column

In [44]:
#Check frequency distribution of user_id, i.e. number of order records per customer (dropna = False counts NaN as a value)
df_orders['user_id'].value_counts(dropna = False)

user_id
152340    100
185641    100
185524    100
81678     100
70922     100
         ... 
13046       4
70208       4
111440      4
31762       4
47980       4
Name: count, Length: 206209, dtype: int64

##### Observations:
- 206,209 values ranging between 1 to 206,209
- Each customer has between 4 and 100 orders in the dataset, so all customers have placed at least one order
- Retain column

In [45]:
#Check frequency distribution of order_number (dropna = False counts NaN as a value)
df_orders['order_number'].value_counts(dropna = False)

order_number
1      206209
2      206209
3      206209
4      206209
5      182223
        ...  
96       1592
97       1525
98       1471
99       1421
100      1374
Name: count, Length: 100, dtype: int64

##### Observations:
- 100 values ranging between 1 and 100
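- order_number is the sequence number of each customer's orders; all 206,209 customers have orders 1 to 4, so customers placed between 4 and 100 orders each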
- Retain column

In [46]:
#Check frequency distribution of order_dow (dropna = False counts NaN as a value)
#sort_index() lists the days in numeric order instead of by frequency
df_orders['order_dow'].value_counts(dropna = False).sort_index()

order_dow
0    600905
1    587478
2    467260
3    436972
4    426339
5    453368
6    448761
Name: count, dtype: int64

In [47]:
#Count distinct values of order_dow (dropna = False would count NaN as a value too)
df_orders['order_dow'].nunique(dropna = False)

7

##### Observations:
- 7 values ranging between 0 to 6 with 0 = Saturday, 1 = Sunday, 2 = Monday, ...
- Orders have been placed on all seven days of the week
- Retain column

In [48]:
#Check frequency distribution of order_hour_of_day, most frequent hour first (dropna = False counts NaN as a value)
df_orders['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

In [49]:
#Count distinct values of order_hour_of_day (dropna = False would count NaN as a value too)
df_orders['order_hour_of_day'].nunique(dropna = False)

24

##### Observations:
- 24 values ranging between 0 to 23
- Orders have been placed at all times
- Retain column

In [50]:
#Check frequency distribution of days_since_prior_order
#dropna = False is needed here so that the NaN records appear in the distribution
df_orders['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

In [51]:
#Count distinct values of days_since_prior_order; dropna = False counts NaN as one of the values
df_orders['days_since_prior_order'].nunique(dropna = False)

32

##### Observations: 
- 32 distinct values: 31 values ranging from 0 to 30, plus NaN
- Customers take between 0 and 30 days to place orders
- 206,209 NaN records, equal to the number of unique values in user_id
- Retain column

### 3.1.2. Address Redundant Columns

##### Observations:
- No redundant columns identified

## 3.2. Rename Columns

### 3.2.1. Identify Unclear Descriptions

In [52]:
#Check column descriptions - list the current column names
df_orders.columns

Index(['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order'],
      dtype='object')

##### Observations:
- Unclear what difference is between order_id & order_number
- order_dow & days_since_prior_order can be described better

### 3.2.2. Address Unclear Descriptions

In [53]:
#Rename vague column descriptions
#The renamed frame is assigned back to df_orders instead of mutating it with inplace = True,
#which keeps the step explicit and chainable; the resulting df_orders is the same
df_orders = df_orders.rename(columns = {'order_number' : 'user_order_count',
                                        'order_dow' : 'order_day_of_week',
                                        'days_since_prior_order' : 'days_since_last_order'})

In [54]:
#Confirm results - the column names should now show the new descriptions
df_orders.columns

Index(['order_id', 'user_id', 'user_order_count', 'order_day_of_week',
       'order_hour_of_day', 'days_since_last_order'],
      dtype='object')

##### Observations:
- Variables successfully renamed

## 3.3. Data Types

### 3.3.1. Identify Inconsistent Data Types

In [55]:
#Check data types of each variable
df_orders.dtypes

order_id                   int64
user_id                    int64
user_order_count           int64
order_day_of_week          int64
order_hour_of_day          int64
days_since_last_order    float64
dtype: object

##### Observations:
- Data types are consistent with contents of variables

### 3.3.2. Address Inconsistent Data Types

##### Observations:
- No inconsistencies noted

# 4. Cleaning

## 4.1. Accuracy

### 4.1.1. Identify Inaccurate Values

In [56]:
#Review descriptive statistics to look for values outside the expected range of each variable
df_orders.describe()

Unnamed: 0,order_id,user_id,user_order_count,order_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [57]:
#Number of unique values per variable (NaN is not counted, as dropna is left at its default)
df_orders.nunique()

order_id                 3421083
user_id                   206209
user_order_count             100
order_day_of_week              7
order_hour_of_day             24
days_since_last_order         31
dtype: int64

##### Observations:
- The dataset contains 3,421,083 unique order_id's
- There are 206,209 unique user_id's in the dataset and all customers placed at least one order
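- user_order_count is each order's sequence number per customer and ranges from 1 to 100; the middle 50% of its values lie between 5 and 23, with a mean of 17. As every customer has at least 4 orders, customers placed between 4 and 100 orders each, about 16.6 on average (3,421,083 orders / 206,209 customers)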
- Orders occur on all weekdays, with a slight tendency toward earlier weekdays
- Orders occur at all times of the day, but concentrate between 10:00 and 16:00, with slight tendency toward early afternoon
- Customers take between 0 and 30 days between orders, most waiting between 4 and 15 days, with 11 days on average between orders
- 206,209 missing values in days_since_last_order

### 4.1.2. Address Inaccurate Values

##### Observations
- No inaccuracies noted

## 4.2. Missing Values

### 4.2.1. Identify missing values

In [58]:
#Number of missing values per variable (isnull() flags each missing cell, sum() counts them per column)
df_orders.isnull().sum()

order_id                      0
user_id                       0
user_order_count              0
order_day_of_week             0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

##### Observations:
- Confirmed 'days_since_last_order' has 206,209 'NaN' values

In [59]:
#Filter records where days_since_last_order is missing
#isnull() already returns a boolean mask, so it is used directly without comparing to True
df_orders[df_orders['days_since_last_order'].isnull()]

Unnamed: 0,order_id,user_id,user_order_count,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


##### Observations:
- Missing values seem to correspond with a 'user_order_count' of 1

In [60]:
#Filter records where user_order_count is 1 AND days_since_last_order is missing
#Each condition is a boolean mask; rows must satisfy both to be returned
first_order_mask = df_orders['user_order_count'] == 1
missing_days_mask = df_orders['days_since_last_order'].isnull()
df_orders[first_order_mask & missing_days_mask]

Unnamed: 0,order_id,user_id,user_order_count,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


##### Observations:
- Confirmed that all records where 'days_since_last_order' = NaN have a corresponding 'user_order_count' = 1
- The number of missing records agrees with the total number of customers, and the missing records appear to be first-time orders

### 4.2.2. Address Missing Values

##### Observations:
- Replacing NaN with 0 would suggest a previous order occurring on the same day as first-time order
- Best option is to add a flag variable explaining the missing values

In [61]:
#Add boolean variable which returns True if days_since_last_order is NaN and False otherwise
#The original NaN values are kept in days_since_last_order; the flag only explains them
df_orders['is_first_order'] = df_orders['days_since_last_order'].isna()

In [62]:
#Confirm results - records with user_order_count of 1 and missing days_since_last_order should show is_first_order = True
#isnull() already returns a boolean mask, so it is combined directly without comparing to True
df_orders[(df_orders['user_order_count'] == 1) & df_orders['days_since_last_order'].isnull()]

Unnamed: 0,order_id,user_id,user_order_count,order_day_of_week,order_hour_of_day,days_since_last_order,is_first_order
0,2539329,1,1,2,8,,True
11,2168274,2,1,2,11,,True
26,1374495,3,1,1,14,,True
39,3343014,4,1,6,11,,True
45,2717275,5,1,3,12,,True
...,...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,,True
3420934,3189322,206206,1,3,18,,True
3421002,2166133,206207,1,6,19,,True
3421019,2227043,206208,1,1,15,,True


In [63]:
#Check frequency of new variable - the True count should equal the number of missing days_since_last_order values
df_orders['is_first_order'].value_counts()

is_first_order
False    3214874
True      206209
Name: count, dtype: int64

##### Observations:
- Flag successfully added

## 4.3. Mixed Type Variables

### 4.3.1. Find Mixed Type Variables

In [64]:
#Finding mixed type data
#A column is reported when the Python type of any of its values differs from the type of its first value
for col in df_orders.columns.tolist():
    value_types = df_orders[col].map(type)
    #.any() answers the question directly, without building a filtered copy of the frame just to measure its length
    if (value_types != value_types.iloc[0]).any():
        print(col)

##### Observations:
- No variables returned

### 4.3.2. Address Mixed Type Variables

##### Observations:
- No variables identified with mixed type data

## 4.4. Duplicates

### 4.4.1. Find Duplicates

In [65]:
#Flag rows that repeat an earlier row in full, keep them as a subset, and view the result
duplicate_mask = df_orders.duplicated()
df_orders_duplicates = df_orders.loc[duplicate_mask]
df_orders_duplicates

Unnamed: 0,order_id,user_id,user_order_count,order_day_of_week,order_hour_of_day,days_since_last_order,is_first_order


In [66]:
#Confirm dimensions - zero rows means no fully duplicated records were found
df_orders_duplicates.shape

(0, 7)

##### Observations:
- No records returned

### 4.4.2. Address Duplicates

##### Observations
- No duplicate records identified

# 5. Exports

In [67]:
#Confirm dimensions before export - same number of rows as imported, plus the added is_first_order column
df_orders.shape

(3421083, 7)

In [68]:
#Export cleaned dataset as a pickle file in the Prepared Data folder
cleaned_orders_pkl = os.path.join(path, '02 - Data', 'Prepared Data', 'cleaned_orders.pkl')
df_orders.to_pickle(cleaned_orders_pkl)