# Table of Contents

### 0.1 Importing Libraries

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

### 0.2 Importing Data

In [4]:
# Identify the file pathway to data files
path = r'C:\Users\CJ\Documents\_CJ-Stuff\Career Foundry\Data Immersion\Ach 4 - Python\2023-03 Instacart Basket Analysis'

In [5]:
# Create 'vars_list' so we only import the variables we need from orders.csv
vars_list = ['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']

In [6]:
# Import data
df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), usecols = vars_list)

### 0.3 Exploring Original df

In [7]:
df.shape

(3421083, 6)

In [8]:
df.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [9]:
df.tail()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0
3421082,272231,206209,14,6,14,30.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   order_dow               int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB


All of the columns are useful, but some have inefficient datatypes and order_dow could be named more clearly.

In [11]:
df.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


The count of 'days_since_prior_order' is different from the count for the other variables (and the df as a whole) which indicates missing values to be explored.

The count of order_id and the max value for order_id is the same -- meaning that the order_id is likely a unique identifier.

For the columns that are quantitative (order_dow, order_hour_of_day, days_since_prior_order), the basic statistics (max, min, mean, median) all seem plausible.

### 0.4 Renaming columns

In [12]:
# Because order_dow could be potentially unclear, 
# we're renaming it to order_day_of_week
# where 0 = Sat, 1 = Sun, 2 = Mon, etc.
df.rename(columns = {'order_dow' : 'order_day_of_week'}, inplace = True)

In [13]:
# Confirming the change
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


### 0.5 Adjusting datatypes to be more appropriate for variables

In [14]:
# order_day_of_week ranges between 0 to 6
# order_hour_of_day ranges between 0 to 23
# so int8 (which goes up to 127) is more than sufficient
df['order_day_of_week'] = df['order_day_of_week'].astype('int8')
df['order_hour_of_day'] = df['order_hour_of_day'].astype('int8')

In [15]:
# order_number ranges between 1 to 100
# and int16 will allow it to grow up to over 32k
df['order_number'] = df['order_number'].astype('int16')

In [16]:
# order_id and user_id are each unique integer identifiers
# so int64 is more appropriate than float64
df['user_id'] = df['user_id'].astype('int64')
df['order_id'] = df['order_id'].astype('int64')

In [17]:
# days_since_prior_order ranges between 0 to 300
# so float16 is more than sufficient. 
df['days_since_prior_order'] = df['days_since_prior_order'].astype('float16')

In [18]:
# Confirming the changes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int16  
 3   order_day_of_week       int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
dtypes: float16(1), int16(1), int64(2), int8(2)
memory usage: 71.8 MB


### 0.6 Cleaning data

#### 0.6.1 Overall df checks

In [19]:
# Checking for nulls across df
df.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

When we clean the 'days_since_prior_order' column, those 206,209 missing values will need to be addressed.

In [20]:
# Check for mixed data types
for col in df.columns.tolist():
  weird = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df[weird]) > 0:
    print (col)

No mixed data types found.

In [21]:
# Checking for whole-row duplicates
dups = df[df.duplicated()]

In [22]:
dups.shape

(0, 6)

The dataframe of dups has zero rows which means that there were no duplicated rows.

#### 0.6.2 - order_id cleaning/wrangling

In [23]:
#Exploring the data for this column
df['order_id'].describe()

count    3.421083e+06
mean     1.710542e+06
std      9.875817e+05
min      1.000000e+00
25%      8.552715e+05
50%      1.710542e+06
75%      2.565812e+06
max      3.421083e+06
Name: order_id, dtype: float64

In [24]:
# Confirming uniqueness of order_id numbers
df.order_id.nunique()

3421083

The number of unique order_id numbers matches the number of rows in the df

#### 0.6.3 - user_id cleaning/wrangling

In [25]:
#Exploring the data for this column
df['user_id'].describe()

count    3.421083e+06
mean     1.029782e+05
std      5.953372e+04
min      1.000000e+00
25%      5.139400e+04
50%      1.026890e+05
75%      1.543850e+05
max      2.062090e+05
Name: user_id, dtype: float64

In [26]:
df.user_id.nunique()

206209

There are 206,209 unique users in this df.

In [27]:
# Checking the number of duplicated user_ids
df[df.duplicated('user_id')].shape


(3214874, 6)

This fits the data since 

3,421,083 total rows 

-- 206,209 first orders of unique users 

= 3,214,874 repeat orders (which would result in duplicated user_ids)

#### 0.6.4 - order_number cleaning/wrangling

In [28]:
#Exploring the data for this column
df['order_number'].describe()

count    3.421083e+06
mean     1.715486e+01
std      1.773316e+01
min      1.000000e+00
25%      5.000000e+00
50%      1.100000e+01
75%      2.300000e+01
max      1.000000e+02
Name: order_number, dtype: float64

In [29]:
df.order_number.nunique()

100

In [30]:
df['order_number'].value_counts()

1      206209
2      206209
3      206209
4      206209
5      182223
        ...  
96       1592
97       1525
98       1471
99       1421
100      1374
Name: order_number, Length: 100, dtype: int64

The order_number variable ranges from 1 to 100.  
All users in this df have ordered at least 4 times, and 1374 users have ordered 100 times. 

#### 0.6.5 - order_day_of_week cleaning/wrangling

In [31]:
#Exploring the data for this column
df['order_day_of_week'].describe()

count    3.421083e+06
mean     2.776219e+00
std      2.046829e+00
min      0.000000e+00
25%      1.000000e+00
50%      3.000000e+00
75%      5.000000e+00
max      6.000000e+00
Name: order_day_of_week, dtype: float64

In [32]:
df.order_day_of_week.nunique()

7

In [33]:
df['order_day_of_week'].value_counts().sort_index()

0    600905
1    587478
2    467260
3    436972
4    426339
5    453368
6    448761
Name: order_day_of_week, dtype: int64

The order_day_of_week variable ranges from 0 to 6 (where 0 = Saturday, 1 = Sunday, 2 = Monday, etc.)

#### 0.6.6 - order_hour_of_day cleaning/wrangling

In [34]:
#Exploring the data for this column
df['order_hour_of_day'].describe()

count    3.421083e+06
mean     1.345202e+01
std      4.226088e+00
min      0.000000e+00
25%      1.000000e+01
50%      1.300000e+01
75%      1.600000e+01
max      2.300000e+01
Name: order_hour_of_day, dtype: float64

In [35]:
df.order_hour_of_day.nunique()

24

In [36]:
df['order_hour_of_day'].value_counts().sort_index()

0      22758
1      12398
2       7539
3       5474
4       5527
5       9569
6      30529
7      91868
8     178201
9     257812
10    288418
11    284728
12    272841
13    277999
14    283042
15    283639
16    272553
17    228795
18    182912
19    140569
20    104292
21     78109
22     61468
23     40043
Name: order_hour_of_day, dtype: int64

The order_hour_of_day variable ranges from 0 to 23 (where 0 = Midnight, 1 = 1 a.m., 2 = 2 a.m. ... 12 = Noon, 13 = 1 p.m., etc.)

#### 0.6.7 - days_since_prior_order cleaning/wrangling

In [37]:
#Exploring the data for this column
df['days_since_prior_order'].describe()

count    3214874.0
mean           NaN
std            0.0
min            0.0
25%            4.0
50%            7.0
75%           15.0
max           30.0
Name: days_since_prior_order, dtype: float64

In [38]:
df.days_since_prior_order.nunique()

31

In [39]:
df['days_since_prior_order'].value_counts(dropna=False).sort_index()

0.0      67755
1.0     145247
2.0     193206
3.0     217005
4.0     221696
5.0     214503
6.0     240013
7.0     320608
8.0     181717
9.0     118188
10.0     95186
11.0     80970
12.0     76146
13.0     83214
14.0    100230
15.0     66579
16.0     46941
17.0     39245
18.0     35881
19.0     34384
20.0     38527
21.0     45470
22.0     32012
23.0     23885
24.0     20712
25.0     19234
26.0     19016
27.0     22013
28.0     26777
29.0     19191
30.0    369323
NaN     206209
Name: days_since_prior_order, dtype: int64

Zero days since prior order would indicate that there had been an earlier order that same day.  

The orders with NaN for days_since_prior_order would then refer to a customer who has *never* had a prior order due to them being a *new* customer.  Thus these NaN orders would be the customer's first order ever.

In [40]:
# Confirming that the orders with NaN for days_since_prior_order
# all also have an order_number = 1
df_days_null = df[df['days_since_prior_order'].isnull() == True]

In [41]:
df_days_null['order_number'].describe()

count    206209.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: order_number, dtype: float64

This shows that all 206,209 rows with null values for days_since_prior_order
are also labeled as having an order_number = 1.  So we can appropriately flag these rows as first_order_of_new_customer

This could also be confirmed using a crosstab.
This code is included here as a requirement of my CareerFoundry exercise.

In [42]:
# Confirming that the orders with an order_number = 1 never show up
# as having been ordered later than another order
# (i.e. have a non-null value for days_since_prior order)

crosstab = pd.crosstab(df['days_since_prior_order'], df['order_number'], dropna = False)

In [43]:
crosstab.shape

(31, 100)

In [44]:
crosstab

order_number,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
days_since_prior_order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0,2899,2915,2964,2634,2443,2236,2101,1972,1782,...,156,154,125,151,151,153,120,144,134,137
1.0,0,4822,4613,4690,4423,4047,4000,3744,3503,3358,...,397,396,373,359,366,330,356,360,339,321
2.0,0,6203,6209,6231,5789,5679,5274,5090,4914,4705,...,416,405,407,389,359,351,359,346,318,329
3.0,0,7411,7577,7414,7097,6770,6424,6078,5750,5663,...,345,287,330,294,291,292,254,221,240,223
4.0,0,8415,8476,8469,8029,7608,7092,6735,6534,6395,...,221,238,225,191,193,185,183,170,170,153
5.0,0,9216,9329,9220,8600,8228,7703,7270,6828,6563,...,142,128,105,119,119,120,100,95,82,81
6.0,0,11419,11701,11601,10771,10054,9521,8842,8527,7727,...,77,91,86,82,67,59,58,50,58,44
7.0,0,16788,16609,16341,15135,13834,13268,12452,11612,10980,...,60,56,48,56,36,37,37,41,32,35
8.0,0,10422,10304,10148,9378,8738,8068,7594,6894,6511,...,29,22,24,26,23,30,18,16,12,13
9.0,0,7053,7160,7013,6610,5993,5497,5082,4760,4404,...,12,8,14,12,8,6,7,8,7,11


Note that order_number = 1 never appears when days_since_prior_order is a non-null value. 

##### Create first_order_of_new_customer flag

In [45]:
# Creating a copy of the orders dataframe in which to insert the new column.
df_clean = df

In [46]:
# Assigning boolean values to new column
df_clean['first_order_of_new_customer'] = df_clean['days_since_prior_order'].isnull()

In [47]:
#Checking to see if first set of values were correctly labeled as True/False in the new column.
df_clean.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False


In [48]:
# Confirming that the new column is showing up as the correct datatype
df_clean['first_order_of_new_customer'].dtype

dtype('bool')

In [49]:
# Checking the shape of the orders_clean df to make sure it is the same number of rows (3421083)
# with one additional column (7 instead of 6).
df_clean.shape

(3421083, 7)

In [50]:
# Confirming that the number of 'True' values in the first_order_of_new_customer 
# matches the number of null values in days_since_prior_order (206,209)
df_clean['first_order_of_new_customer'].sum()

206209

### 0.7 Export clean df

In [51]:
# Confirming final shape and datetypes
df_clean.shape

(3421083, 7)

In [52]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   order_id                     int64  
 1   user_id                      int64  
 2   order_number                 int16  
 3   order_day_of_week            int8   
 4   order_hour_of_day            int8   
 5   days_since_prior_order       float16
 6   first_order_of_new_customer  bool   
dtypes: bool(1), float16(1), int16(1), int64(2), int8(2)
memory usage: 75.0 MB


In [53]:
# Export df as customers_clean
df_clean.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_clean.pkl'))