# 01 Importing Libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# import order data
vars_list = ['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']

path = r'/Users/Dena/Instacart Basket Analysis'

df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), usecols = vars_list)

In [3]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [4]:
# import product data
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'))

In [5]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


# 02 Data Wrangling

In [6]:
# rename dow columm
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [7]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [8]:
# change data type
df_ords['order_id'] = df_ords['order_id'].astype('str')

df_ords['order_id'].dtype

dtype('O')

In [9]:
# import department data
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'))

df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [13]:
# transpose the department data
df_dep.T

# create a transposed dataframe
df_dep_t = df_dep.T

# add an index
df_dep_t.reset_index()

# take the 1st row for a new header
new_header = df_dep_t.iloc[0]

# take the data from row 1 and on as a new df
df_dep_t_new = df_dep_t[1:]

df_dep_t_new.columns = new_header

df_dep_t_new.head()

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol


In [11]:
# turn the department data into a data dictionary
data_dict = df_dep_t_new.to_dict('index')

In [12]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

# 03 Task 4.4 Questions

In [16]:
# Question 2 -- Change another variable into a suitable format
df_ords['orders_day_of_week'] = df_ords['orders_day_of_week'].astype('str')

df_ords['orders_day_of_week'].dtype

dtype('O')

In [21]:
# Question 3 -- Change a variable name
df_ords.rename(columns = {'order_number' : 'number_of_orders_by_user'}, inplace = True)

df_ords.head()

Unnamed: 0,order_id,user_id,number_of_orders_by_user,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [22]:
# Question 4 -- Find the busiest hour for placing orders
df_ords['order_hour_of_day'].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

In [23]:
# The busiest hour for placing orders is hour 10

In [24]:
# Question 5 -- Find the meaning of 4 in the department_id column
print(data_dict.get('4'))

{'department': 'produce'}


In [None]:
# The meaning of 4 is produce

In [25]:
# Question 6 -- Create a subset of breakfast item sales
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [None]:
# The department_id for "breakfast" is 14

In [26]:
df_breakfast =  df_prods[df_prods['department_id']==4]

df_breakfast.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
30,31,White Pearl Onions,123,4,7.5
42,43,Organic Clementines,123,4,11.5
44,45,European Cucumber,83,4,14.3
65,66,European Style Spring Mix,123,4,11.7
88,89,Yogurt Fruit Dip Sliced Apples,123,4,12.6
98,99,Local Living Butter Lettuce,83,4,10.0
119,120,Cauliflower Florettes,123,4,2.3
141,142,Arugula Salad,123,4,2.4
142,143,Organic Lemons,24,4,12.0
147,148,Nectarines,24,4,9.3


In [28]:
# Question 7 -- Create a subset including alcohol, deli, beverages, and meat/seafood
# i.e. departments 5, 20, 7, and 12
df_dinner_party = df_prods.loc[df_prods['department_id'].isin([5,7,12,20])]

df_dinner_party.head(25)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0
22,23,Organic Turkey Burgers,49,12,8.2
34,35,Italian Herb Porcini Mushrooms Chicken Sausage,106,12,15.1
38,39,Daily Tangerine Citrus Flavored Beverage,64,7,12.5
39,40,Beef Hot Links Beef Smoked Sausage With Chile ...,106,12,22.5


In [29]:
# Question 8 -- How many rows does the last dataframe have?
df_dinner_party.shape

(7650, 5)

In [30]:
# The dinner party data frame has 7650 rows

In [31]:
# Question 9 -- Extract information about user_id 1
df_user_1 =  df_ords[df_ords['user_id']==1]

df_user_1.head(25)

Unnamed: 0,order_id,user_id,number_of_orders_by_user,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [32]:
# Question 10 -- Basic stats about the behavior of User 1
df_user_1.describe()

Unnamed: 0,user_id,number_of_orders_by_user,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,1.0,6.0,10.090909,19.0
std,0.0,3.316625,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,1.0,3.5,7.5,14.25
50%,1.0,6.0,8.0,19.5
75%,1.0,8.5,13.0,26.25
max,1.0,11.0,16.0,30.0


# 04 Exports

In [35]:
# Question 10 -- Export your df_ords dataframe as “orders_wrangled.csv” in your “Prepared Data” folder

In [33]:
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv'))

In [37]:
# Question 11 -- Export the df_dep_t_new dataframe as “departments_wrangled.csv” in your “Prepared Data” folder

In [36]:
df_dep_t_new.to_csv(os.path.join(path, '02 Data','Prepared Data', 'departments_wrangled.csv'))