### Contents:
    01 Importing libraries and data
    02 Initial exploration
    03 Cleaning up
        a) Clarify necessary columns and column names
        b) Data types
        c) Identifying outliers
        d) Missing values
        e) Duplicates
    04 Export

# Cleaning orders df

## 01 Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# project root folder
# set the INSTACART_PROJECT_DIR environment variable to point the notebook at a
# different location without editing this cell; otherwise the original local path is used
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'/Users/Emily/Documents/CF Data Analysis Program/Immersion 4/Instacart Basket Analysis',
)

In [3]:
# read the raw orders file from the original data folder
# index_col = False tells pandas not to use the first CSV column as the index
orders_csv = os.path.join(path, '02 Data', 'original data', 'orders.csv')
df = pd.read_csv(orders_csv, index_col = False)

## 02 Initial exploration

In [4]:
# preview the first 5 rows together with every column name
df.head(n = 5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [5]:
# check out the shape of the df as a (rows, columns) tuple
df.shape

(3421083, 7)

In [6]:
# check out the descriptive stats of the whole df
# include = 'all' adds the non-numeric columns (count / unique / top / freq);
# a plain df.describe() would only have summarised the numeric columns
df.describe(include = 'all')

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083,3421083.0,3421083.0,3421083.0,3214874.0
unique,,,3,,,,
top,,,prior,,,,
freq,,,3214874,,,,
mean,1710542.0,102978.2,,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,,23.0,5.0,16.0,15.0


In [7]:
# check out the data type of each column, plus the index range and memory usage
# df.dtypes would show just the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int64  
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB


## 03 Cleaning up

### a) Clarify necessary columns and column names

In [8]:
# eval_set isn't necessary, so remove it
# and point the main df variable at the trimmed frame
df = df.drop('eval_set', axis = 1)

In [9]:
# give order_dow a self-explanatory name
# re-assigning the result (instead of inplace = True) makes the rename stick
# and follows the same df = df.<method>(...) pattern as the drop above
df = df.rename(columns = {'order_dow': 'order_day_of_week'})

In [10]:
# confirm the drop and the rename by previewing the first 5 rows again
df.head(n = 5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


### b) Data types

In [11]:
# order_number, order_day_of_week and order_hour_of_day hold small whole numbers
# (order_number is expected to top out at 100), so int8 (-128 to 127) will work just fine
# astype() does not complain when an integer is too big for the target type, it wraps
# around instead, so confirm every value fits before down-casting
int8_range = np.iinfo('int8')
for small_int_column in ['order_number', 'order_day_of_week', 'order_hour_of_day']:
    if not df[small_int_column].between(int8_range.min, int8_range.max).all():
        raise ValueError(f'{small_int_column} has values outside the int8 range')
    df[small_int_column] = df[small_int_column].astype('int8')

In [12]:
# the two id columns are too large for int8 but can be stored as int32 to reduce space
for id_column in ('user_id', 'order_id'):
    df[id_column] = df[id_column].astype('int32')

In [13]:
# days_since_prior_order includes NaN, so it has to be a float, but we can still reduce it to float16
# NOTE(review): float16 appears to have limited support in some pandas operations -
# consider float32 if a later step misbehaves on this column
df['days_since_prior_order'] = df['days_since_prior_order'].astype('float16')

In [14]:
# this looks better: reported memory drops from 182.7+ MB to 42.4 MB, roughly 4x smaller
# (part of the saving comes from dropping eval_set, the rest from the smaller dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   order_day_of_week       int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
dtypes: float16(1), int32(2), int8(3)
memory usage: 42.4 MB


### c) Identifying outliers

In [15]:
# count orders per weekday code, keeping NaN in the tally, then list them in code order
# only the seven codes 0-6 show up, so this looks pretty good
day_of_week_counts = df['order_day_of_week'].value_counts(dropna = False)
day_of_week_counts.sort_index()

0    600905
1    587478
2    467260
3    436972
4    426339
5    453368
6    448761
Name: order_day_of_week, dtype: int64

In [16]:
# same check for the hour of day: only hours 0-23 show up, so this also looks pretty good
hour_of_day_counts = df['order_hour_of_day'].value_counts(dropna = False)
hour_of_day_counts.sort_index()

0      22758
1      12398
2       7539
3       5474
4       5527
5       9569
6      30529
7      91868
8     178201
9     257812
10    288418
11    284728
12    272841
13    277999
14    283042
15    283639
16    272553
17    228795
18    182912
19    140569
20    104292
21     78109
22     61468
23     40043
Name: order_hour_of_day, dtype: int64

In [17]:
# always pass dropna = False to value_counts so NaN gets its own row
# NaN is counted separately from 0, so it seems to mean something different
# maybe it marks the first order a customer is making?
days_since_prior_counts = df['days_since_prior_order'].value_counts(dropna = False)
days_since_prior_counts.sort_index()

0.0      67755
1.0     145247
2.0     193206
3.0     217005
4.0     221696
5.0     214503
6.0     240013
7.0     320608
8.0     181717
9.0     118188
10.0     95186
11.0     80970
12.0     76146
13.0     83214
14.0    100230
15.0     66579
16.0     46941
17.0     39245
18.0     35881
19.0     34384
20.0     38527
21.0     45470
22.0     32012
23.0     23885
24.0     20712
25.0     19234
26.0     19016
27.0     22013
28.0     26777
29.0     19191
30.0    369323
NaN     206209
Name: days_since_prior_order, dtype: int64

### d) Missing values

In [18]:
# count the missing values in every column
# days_since_prior_order is already known to contain lots of NaN
df.isna().sum()

order_id                       0
user_id                        0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

### e) Duplicates

In [19]:
# flag every row that repeats an earlier row in all columns, then show the flagged rows
# the result is empty - no dups here!
duplicate_rows = df.duplicated()
df[duplicate_rows]

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


## 04 Export

In [20]:
# write the cleaned orders to the prepared data folder
# NOTE(review): index = True is the pandas default, spelled out here on purpose - it writes
# the row index as an extra unnamed first column. Switch to index = False only after
# confirming that no later notebook relies on that column.
clean_orders_csv = os.path.join(path, '02 Data', 'prepared data', 'orders_clean.csv')
df.to_csv(clean_orders_csv, index = True)