## Contents
    01 Import libraries and data
    02 Rename columns
    03 Wrangle/transpose dataframe
    04 Create a dictionary
        a filter using dictionary
    05 Wrangling with different df
    06 Reverse search in dictionary
    07 Export

# 4.4 Exercise

## 01 Import the libraries and csv files

In [1]:
import pandas as pd
import numpy as np
import os

In [3]:
path = r'/Users/Emily/Documents/CF Data Analysis Program/Immersion 4/Instacart Basket Analysis'

In [4]:
vars_list = ['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']

In [5]:
# read only the columns named in vars_list from the orders file
# the folder is spelled 'original data' so it matches the products and departments imports;
# mixed spellings cannot both resolve to one folder on a case-sensitive file system
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'original data', 'orders.csv'), usecols = vars_list)

In [6]:
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'original data', 'products.csv'), index_col = False)

In [6]:
# frequency of values in 'days_since_prior_order' column INCLUDING NaN
df_ords['days_since_prior_order'].value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

## 02 Rename columns inplace

In [7]:
# rename 'order_dow' column to something more intuitive
# inplace = True changes df_ords itself; without it, rename() returns a renamed copy and leaves df_ords as it was
df_ords.rename(columns = {'order_dow':'order_day_of_week'},inplace = True)

In [8]:
# check to see if the column was actually re-named
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [9]:
# changing a variable's data type
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [10]:
# check to see if it changed from integer to string/object
df_ords['order_id'].dtype

dtype('O')

## Importing more data

In [7]:
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'original data', 'departments.csv'), index_col = False)

In [12]:
# initial check to see what it looks like: very wide
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [8]:
# create a transpose and re-name it instead of overwriting
df_dep_t = df_dep.T

In [9]:
# check to see what it looks like now. Weird top line due to the transpose
df_dep_t.head()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce


In [10]:
# preview only: reset_index() returns a NEW df in which the old row labels become an 'index' column
# the result is displayed but not assigned, so df_dep_t itself is left unchanged
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [12]:
df_dep_t.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22 entries, department_id to 21
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       22 non-null     object
dtypes: object(1)
memory usage: 908.0+ bytes


## 03 Wrangle/transpose dataframe

In [14]:
# assign new_header variable to the actual header in the 0th row of the dt
new_header = df_dep_t.iloc[0]

In [15]:
# drop the first row (it holds the header text) and keep every remaining row under a new name
df_dep_t_new = df_dep_t.iloc[1:]

In [18]:
# check and see what it looks like now. Don't need .head() if you know how many records there are
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [16]:
# check to see what's stored in new_header
new_header

0    department
Name: department_id, dtype: object

In [17]:
# assign the values in 'new_header' to the column names for the df
# new_header is a Series named 'department_id'; that name appears to carry over as the label shown above the index
df_dep_t_new.columns = new_header

In [18]:
# check and see what it looks like now
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [19]:
df_dep_t_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21 entries, 1 to 21
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   department  21 non-null     object
dtypes: object(1)
memory usage: 336.0+ bytes


## 04 Create dictionary

In [22]:
# turn the cleaned-up departments df into a nested dictionary: {row label: {column name: value}}
data_dict = df_dep_t_new.to_dict(orient = 'index')

In [23]:
# see what it looks like now that it's a dictionary, not just a df
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [24]:
# check out the top of the products df
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


### a) filter using dictionary

In [25]:
# use the dictionary via .get function
print(data_dict.get('19'))

{'department': 'snacks'}


In [26]:
# identify all the records in the products df that are in beverages (department_id == 7)
df_prods['department_id'] == 7

0        False
1        False
2         True
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [27]:
# filter the products df by matching only the department_id == 7
df_prods[df_prods['department_id'] == 7]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0
...,...,...,...,...,...
49659,49655,Apple Cider,98,7,10.7
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5


In [28]:
# grab a slice of the data that is only the beverages and rename it
df_beverages = df_prods[df_prods['department_id'] == 7]

In [29]:
# check to see what it looks like
df_beverages.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0


## 05 More wrangling - Task 4.4

In [30]:
# Step 2: find another identifier variable in df_ords that doesn't need to be numeric
# user_id is going to be converted to a string/object
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [31]:
df_ords['user_id'].dtype

dtype('O')

In [32]:
# Step 3: look for a variable in df_ords with an unintuitive name and rename it WITHOUT overwriting the df
# 'days_since_prior_order' becomes 'days_between_orders' in a new df; rename() returns a copy by default
df_ords_2 = df_ords.rename(columns = {'days_since_prior_order' : 'days_between_orders'})

In [33]:
df_ords_2.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_between_orders
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [34]:
# Step 4: what is the busiest hour for placing orders?
# create a frequency table of the values in order_hour_of_day
# the most popular hour to place an order is 10am
df_ords['order_hour_of_day'].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

In [35]:
# Step 5: determine the meaning of department_id '4'
# to call an entry from a data dictionary, use .get function & print to make it appear
# department_id '4' is the produce department
print(data_dict.get('4'))

{'department': 'produce'}


## 06 Reverse search in dictionary

In [36]:
# Step 6: create a subset of just breakfast item sales
# the department names live inside a nested dictionary keyed by department_id,
# so the id for 'breakfast' has to be looked up by name first
dep_search = 'breakfast'

# trial run: print each inner dictionary to see how the nested values can be reached
for dict_info in data_dict.values():
    print(dict_info)

{'department': 'frozen'}
{'department': 'other'}
{'department': 'bakery'}
{'department': 'produce'}
{'department': 'alcohol'}
{'department': 'international'}
{'department': 'beverages'}
{'department': 'pets'}
{'department': 'dry goods pasta'}
{'department': 'bulk'}
{'department': 'personal care'}
{'department': 'meat seafood'}
{'department': 'pantry'}
{'department': 'breakfast'}
{'department': 'canned goods'}
{'department': 'dairy eggs'}
{'department': 'household'}
{'department': 'babies'}
{'department': 'snacks'}
{'department': 'deli'}
{'department': 'missing'}


In [37]:
# reverse lookup: walk every department entry and print the department_id whose name equals dep_search
for dep_id, dep_record in data_dict.items():
    for dep_name in dep_record.values():
        if dep_name == dep_search:
            print(dep_id)

14


In [38]:
# subset of the products that belong to the breakfast department (department_id 14)
is_breakfast = df_prods['department_id'] == 14
df_prods_breakfast = df_prods[is_breakfast]
df_prods_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


In [39]:
# Step 7: create a subset of products df with all from these departments: alcohol, deli, beverages, meat/seafood
# using the same nested lookup as before, collect the department_ids that are needed
dep_list = ['alcohol', 'deli', 'beverages', 'meat seafood']
dep_id_list = []

for dep_id, dep_record in data_dict.items():
    for dep_name in dep_record.values():
        # skip every department that is not on the wanted list
        if dep_name not in dep_list:
            continue
        print(dep_id)
        dep_id_list.append(dep_id)

5
7
12
20


In [40]:
# the dictionary keys are strings, so convert the collected ids to integers before filtering df_prods
dep_id_list = list(map(int, dep_id_list))
dep_id_list

[5, 7, 12, 20]

In [41]:
# create subset of all products used in dinner parties
# build the True/False mask first, then use it to pick the matching rows
in_party_deps = df_prods['department_id'].isin(dep_id_list)
party_prods = df_prods.loc[in_party_deps]
party_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


In [42]:
# Step 8: how many rows does the last dataframe have?
# 7,650 rows
party_prods.shape

(7650, 5)

In [43]:
# Step 9: extract all info about user_id == 1
# user_id is compared with the string '1' because the column was converted to str earlier
is_user_1 = df_ords['user_id'] == '1'
df_ords_user_1 = df_ords.loc[is_user_1]

In [44]:
# Step 10: provide some details about the user's behavior
# descriptive stats for user 1's orders
df_ords_user_1.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [45]:
# look at all records from user 1
df_ords_user_1

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [46]:
# there isn't much consistency for which day of the week or what time of day they place orders
# they made two orders in one day, about 2 hours apart
# one order has NaN listed as 'days_since_prior_order' -- it is order_number 1, the user's first order, so there is no prior order to count from

## 07 Export

In [47]:
# export the wrangled orders df
# NOTE: to_csv also writes the row index as the first column by default, and a csv does not keep the
# str dtype set on order_id / user_id, so that conversion has to be redone after re-importing
df_ords.to_csv(os.path.join(path, '02 Data', 'prepared data', 'orders_wrangled.csv'))

In [48]:
# export the transposed departments df; its index (the department ids) is written as the first column
df_dep_t_new.to_csv(os.path.join(path, '02 Data', 'prepared data', 'departments_wrangled.csv'))

In [49]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [50]:
df_ords.shape

(3421083, 6)