# 4.4 Data Wrangling & Subsetting

## Instacart Grocery Basket Analysis

#### -Errol Hinkamp

##### Table of Contents

1. Import libraries
2. Import data
3. Exercise walkthrough
   - 3.1 Drop columns
   - 3.2 Rename columns
   - 3.3 Change data types
   - 3.4 Transpose data
   - 3.5 Create data dictionary
   - 3.6 Create subset
4. Change column data type
5. Rename column
6. Determine busiest hour for orders
7. Find identity of value using data dictionary
8. Create subset of breakfast items
9. Create dinner party subset
10. Create subset of user_id=1
11. Export dataframes

# 1. Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 2. Import data

In [2]:
# Project root folder; the raw CSVs live under '02 Data/Original Data' inside it
# NOTE(review): this is an absolute, machine-specific path — adjust before running elsewhere
path=r'C:\Users\Errol\Documents\Data Analyst Work\Achievement 4\Instacart Basket Analysis'

# Build the file locations first, then load the orders and products dataframes
orders_csv = os.path.join(path, '02 Data', 'Original Data', 'orders.csv')
products_csv = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_ords = pd.read_csv(orders_csv, index_col=False)
df_prods = pd.read_csv(products_csv, index_col=False)

# 3. Exercise walkthrough

### 3.1 Drop columns

In [3]:
# Remove the 'eval_set' column from the orders dataframe
df_ords = df_ords.drop('eval_set', axis=1)

In [4]:
# Tally every value of 'days_since_prior_order'; dropna=False keeps NaN in the tally
# so missing values show up alongside the real ones
df_ords.loc[:, 'days_since_prior_order'].value_counts(dropna=False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

### 3.2 Rename columns

In [5]:
# Rename 'order_dow' to the more descriptive 'orders_day_of_week' in the orders dataframe.
# The result is reassigned rather than applied with inplace=True, so the cell reads as an
# explicit transformation of df_ords and stays chainable.
df_ords = df_ords.rename(columns={'order_dow': 'orders_day_of_week'})

In [6]:
# Preview the first five rows of the orders dataframe to confirm the renamed column
df_ords.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


### 3.3 Change data types

In [7]:
# Cast the 'order_id' column of the orders dataframe to string via a column -> dtype mapping
df_ords = df_ords.astype({'order_id': 'str'})

In [8]:
# Confirm the new dtype of 'order_id' after the cast
df_ords.dtypes['order_id']

dtype('O')

### 3.4 Transpose data

In [9]:
# Load the departments table as the third dataframe
departments_csv = os.path.join(path, '02 Data', 'Original Data', 'departments.csv')
df_dep = pd.read_csv(departments_csv, index_col=False)

In [10]:
# Swap rows and columns of the departments dataframe
df_dep_t = df_dep.transpose()

In [11]:
# Display the transposed departments dataframe; the original header now sits in the first row
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [12]:
# Grab the first row of the transposed departments dataframe to use as the new header
new_header = df_dep_t.iloc[0, :]

In [13]:
# Make a copy of the transposed departments dataframe without its first row (the old header).
# .iloc makes the positional slice explicit, and .copy() makes the result an independent
# frame, so relabelling its columns later cannot touch or warn about df_dep_t.
df_dep_t_new = df_dep_t.iloc[1:].copy()

In [14]:
# Display the new departments dataframe to confirm the former header row is gone
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [15]:
# Add new header to new dataframe: use the first row taken from the transposed frame as column labels
# NOTE(review): the displayed result suggests the columns axis also picks up the label
# 'department_id' from that row's name — confirm this is intended
df_dep_t_new.columns = new_header

In [16]:
# Display the departments dataframe again to confirm the new 'department' column label
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


### 3.5 Create data dictionary

In [17]:
# Build the data dictionary: one entry per index label, each mapping column name -> value
data_dict = df_dep_t_new.to_dict(orient='index')

In [18]:
# Display the data dictionary (department id -> {'department': name})
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [19]:
# Preview the first five rows of the products dataframe
df_prods.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [20]:
# Look up department_id 19 in the data dictionary (its keys are strings, hence '19')
dept_19 = data_dict.get('19')
print(dept_19)

{'department': 'snacks'}


### 3.6 Create subset

In [21]:
# Boolean mask: True for every product whose department_id is 19
df_prods['department_id'].eq(19)

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [22]:
# Show only the product rows where department_id is 19
df_prods.loc[df_prods['department_id'].eq(19)]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [23]:
# Keep only the snack products (department_id 19) as their own subset
is_snack = df_prods['department_id'] == 19
df_snacks = df_prods[is_snack]

In [24]:
# Preview the first five rows of the snacks subset
df_snacks.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# 4. Change column data type

In [25]:
# List the dtype of every column in the orders dataframe before changing any
df_ords.dtypes

order_id                   object
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [26]:
# Cast the 'user_id' column of the orders dataframe to string via a column -> dtype mapping
df_ords = df_ords.astype({'user_id': 'str'})

In [27]:
# List the column dtypes again to confirm 'user_id' is now stored as object (string)
df_ords.dtypes

order_id                   object
user_id                    object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

# 5. Rename column

In [28]:
# List the current column names of the orders dataframe
df_ords.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

In [29]:
# The column names are all self-explanatory, but for the purpose of this assignment I will rename one.
# Rename 'days_since_prior_order' to 'days_since_last_order' in the orders dataframe.
# The result is reassigned rather than applied with inplace=True, so the cell reads as an
# explicit transformation of df_ords and stays chainable.
df_ords = df_ords.rename(columns={'days_since_prior_order': 'days_since_last_order'})

In [30]:
# List the column names again to confirm the rename to 'days_since_last_order'
df_ords.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_last_order'],
      dtype='object')

# 6. Determine busiest hour for orders

In [31]:
# Count how many orders fall in each hour of the day (NaN included); the result is
# sorted by count, so the busiest hour comes first
df_ords.loc[:, 'order_hour_of_day'].value_counts(dropna=False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

##### The busiest hour is 10 AM (hour 10), with 288,418 orders

# 7. Find identity of value using data dictionary

In [32]:
# Look up department_id 4 in the data dictionary (its keys are strings, hence '4')
dept_4 = data_dict.get('4')
print(dept_4)

{'department': 'produce'}


# 8. Create subset of breakfast items

In [33]:
# The data dictionary lists 'breakfast' under department_id 14; keep only those products
df_breakfast = df_prods[df_prods['department_id'].eq(14)]

In [34]:
# Preview the first five rows of the breakfast subset
df_breakfast.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


# 9. Create dinner party subset

In [35]:
# Per the data dictionary: 5 = alcohol, 7 = beverages, 12 = meat seafood, 20 = deli
dinner_party_department_ids = [5, 7, 12, 20]
in_dinner_party_departments = df_prods['department_id'].isin(dinner_party_department_ids)
df_dinner_parties = df_prods[in_dinner_party_departments]

In [36]:
# Preview the first twenty rows of the dinner party subset
df_dinner_parties.head(n=20)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0
22,23,Organic Turkey Burgers,49,12,8.2
34,35,Italian Herb Porcini Mushrooms Chicken Sausage,106,12,15.1
38,39,Daily Tangerine Citrus Flavored Beverage,64,7,12.5
39,40,Beef Hot Links Beef Smoked Sausage With Chile ...,106,12,22.5


In [37]:
# Check shape of df_dinner_parties: (number of rows, number of columns)
df_dinner_parties.shape

(7650, 5)

# 10. Create subset of user_id=1

In [38]:
# Keep only the orders placed by user 1; 'user_id' was cast to string earlier,
# so the comparison is against the string '1'
df_user_id_1 = df_ords[df_ords['user_id'].eq('1')]

In [39]:
# Preview the first five orders of user 1
df_user_id_1.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [40]:
# Basic calculations: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of user 1's orders
df_user_id_1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [41]:
# Frequency of each 'order_number' value among user 1's orders (NaN included)
df_user_id_1.loc[:, 'order_number'].value_counts(dropna=False)

1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
Name: order_number, dtype: int64

In [42]:
# Frequency of each day of the week among user 1's orders (NaN included)
df_user_id_1.loc[:, 'orders_day_of_week'].value_counts(dropna=False)

4    4
1    3
2    2
3    2
Name: orders_day_of_week, dtype: int64

In [43]:
# Frequency of each hour of the day among user 1's orders (NaN included)
df_user_id_1.loc[:, 'order_hour_of_day'].value_counts(dropna=False)

7     3
8     3
16    1
9     1
12    1
14    1
15    1
Name: order_hour_of_day, dtype: int64

In [44]:
# Frequency of each gap between consecutive orders for user 1; dropna=False keeps
# the NaN entry in the tally
df_user_id_1.loc[:, 'days_since_last_order'].value_counts(dropna=False)

14.0    2
NaN     1
20.0    1
28.0    1
21.0    1
19.0    1
0.0     1
30.0    1
15.0    1
29.0    1
Name: days_since_last_order, dtype: int64

# 11. Export dataframes

In [45]:
# Write the wrangled orders dataframe to the 'Prepared Data' folder
# NOTE(review): index=False is not passed, so the row index is written as an unnamed
# first column — confirm downstream readers expect it
orders_out = os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv')
df_ords.to_csv(orders_out)

In [46]:
# Write the transposed departments dataframe to the 'Prepared Data' folder; its index
# (the department ids) is written as the first CSV column
departments_out = os.path.join(path, '02 Data','Prepared Data', 'departments_wrangled.csv')
df_dep_t_new.to_csv(departments_out)