# Learning Goals - Wrangling Procedures

# 01. Importing Libraries

In [1]:
# Importing Pandas, Numpy, OS
import pandas as pd
import numpy as np
import os

# 02. Import Original Data with a path creation

In [2]:
# Project root folder; every import and export below is built from it with os.path.join.
# Set the INSTACART_PROJECT_PATH environment variable to run this notebook on another
# machine. When the variable is not set, the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'/Users/ladyarmanoid/Documents/Data Analytics/CareerFoundry/Course/2- Data Immersion/Achievement 4/Instacart Basket Analysis/',
)

## Orders Dataframe Import

In [11]:
# Load the original orders file into df_ords (the orders data frame)
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

## Products Dataframe Import

In [12]:
# Load the original products file into df_prods (the products data frame)
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

# 03. Identify missing values

In [5]:
# df_ords missing values: dropna = False keeps NaN as its own entry in the
# frequency table, so the number of missing days_since_prior_order values is visible
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

# 04. Rename a column

In [6]:
# Rename order_dow to orders_day_of_week for more clarity.
# The renamed frame is assigned back to df_ords instead of using inplace = True,
# which follows the same reassignment pattern as the drop() cell further down.
df_ords = df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'})

In [7]:
# Preview the first rows to check that the column now reads orders_day_of_week
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [8]:
# Drop the eval_set column and keep the result by reassigning df_ords
df_ords = df_ords.drop('eval_set', axis = 'columns')

In [9]:
# Preview the first rows to check that eval_set is no longer present
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# 05. Exclude non-numerical columns

In [13]:
# df_ords: order_id is an identifier, not a quantity, so store it as a string
df_ords = df_ords.astype({'order_id': 'str'})

In [14]:
# Confirm the conversion: the output dtype('O') means the column is stored as object
df_ords ['order_id'].dtype

dtype('O')

# 06. Department Dataframe Import

In [15]:
# Load the original departments file into df_dep (the department data frame)
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

In [16]:
# Preview df_dep: the file is in wide format, with one column per department id
# and a single row holding the department names
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [18]:
# Transposing df_dep: preview of the long format (one row per department id).
# The result is only displayed here; it is stored in a later cell.
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [19]:
# Store the transposed department data frame
df_dep_t = df_dep.transpose()

In [20]:
# Display the stored transposed data frame
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [23]:
# The first row of df_dep_t holds the label that will become the column header
new_header = df_dep_t.iloc[0, :]

In [24]:
# Display the header row that was picked up (a Series with the single value 'department')
new_header

0    department
Name: department_id, dtype: object

In [25]:
# Keep every row after the first one, which only holds the header labels
df_dep_t_new = df_dep_t.iloc[1:]

In [26]:
# Shows a copy of df_dep_t with the index moved into a regular column.
# The result is not assigned to anything, so df_dep_t itself is left unchanged.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [27]:
# Build the new data frame from everything below the header row
df_dep_t_new = df_dep_t.iloc[1:]

In [28]:
# Display the data rows; the column is still labelled 0 at this point
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [29]:
# Replace the placeholder column label 0 with the value held in new_header ('department').
# The name of the new_header Series ('department_id') is carried over as the label
# shown above the index in the output below.
df_dep_t_new.columns = new_header # set the header row as the df header

In [30]:
# Display the final department data frame with its new header
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 07. Data Dictionary for the Department Dataframe

In [31]:
# Data dictionary: maps each department_id (the index) to its row of values,
# giving the meaning of the ids used in the department_id column
data_dict = df_dep_t_new.to_dict(orient = 'index')

In [32]:
# Display the data dictionary; note that its keys are strings ('1', '2', ...)
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [33]:
# Preview df_prods to see which department_id values need to be looked up
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [34]:
# Look up department 19; the key is passed as the string '19' because the
# dictionary keys are strings, not integers
print(data_dict.get('19'))

{'department': 'snacks'}


# 08. Subsetting

In [35]:
# Subset of df_prods: keep only the rows whose department_id equals 19 (snacks)
df_snacks = df_prods[df_prods['department_id'].eq(19)]

In [36]:
# The condition on its own returns a True/False value per row (a boolean mask)
df_prods ['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [37]:
# Passing the boolean mask back into df_prods keeps only the rows marked True
df_prods[df_prods['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [38]:
# Store the snacks subset (department_id 19) under its own name
df_snacks = df_prods[df_prods['department_id'].eq(19)]

In [39]:
# Preview the first rows of the snacks subset
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [40]:
# Another way to get the snacks rows: select them with loc using the same condition,
# and store the result under a new name
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19, :]

In [41]:
# Display the loc-based subset (same rows as df_snacks)
df_snacks_2

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [42]:
# Preview the first rows of the loc-based subset
df_snacks_2.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [43]:
# A third option: select the rows with loc and isin(), which checks each
# department_id against a list of values.
# This is valuable because it allows you to look at multiple department_ids
# at the same time, i.e. isin([17,18,19])
df_snacks_3 = df_prods.loc[
    df_prods['department_id'].isin([19])
]

In [44]:
# Preview the first rows of the isin-based subset
df_snacks_3.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# Task Steps - Wrangling Procedures

# 2. Finding an identifier in df_ords that doesn't need to be in numeric format and changing it to a suitable format

In [45]:
# List the data type of every df_ords column; order_id shows as object (string)
# NOTE(review): the saved output still lists eval_set and order_dow, so df_ords appears
# to have been re-imported (cell [11]) after the rename and drop cells were run.
# Re-run the notebook from top to bottom to confirm the final state of df_ords.
df_ords.dtypes

order_id                   object
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

# 3. Changing a column name without overwriting the dataframe

In [None]:
# Returns a renamed copy for display only: there is no reassignment and no inplace = True,
# so df_ords keeps the order_hour_of_day column (the next cell still uses that name)
df_ords.rename(columns = {'order_hour_of_day' : 'hour_of_day_ordered'})

# 4. What is the busiest hour of placed orders?

In [46]:
# Count the orders per hour; value_counts() sorts by frequency, so the busiest hour is listed first
df_ords['order_hour_of_day'].value_counts()

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

# The busiest hour of the day for placed orders is 10am.

# 5. Determining the meaning behind the value 4 of the "department_id" column within df_prods using the data dictionary

In [47]:
# Look up department 4 in the data dictionary (keys are strings, hence '4')
print(data_dict.get('4'))

{'department': 'produce'}


# The department 4 is Produce

# 6. Subset about breakfast item sales

In [48]:
# df_dep data dictionary: displayed again to find the id of the breakfast department
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [49]:
# Breakfast data frame: products whose department_id is 14 ('breakfast' in data_dict)
df_breakfast = df_prods[df_prods['department_id'].eq(14)]

In [50]:
# Display the breakfast subset
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


# There are 1,116 breakfast items (or rows)

# 7. Details about product customers might use for dinner parties

In [51]:
# Dinner-party items: products from departments 5 (alcohol), 20 (deli),
# 7 (beverages) and 12 (meat seafood), as named in data_dict
df_parties = df_prods.loc[
    df_prods['department_id'].isin([5,20,7,12])
]

In [52]:
# Display the dinner-party subset
df_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


# 8. The last data frame I created called 'df_parties' has 7,650 rows

# 9. Compiling all orders from user_id '1'

In [53]:
# Show every order placed by the customer whose user_id is 1
df_ords[df_ords['user_id']==1]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


# 10. Basic stats on user_id '1'

In [54]:
# Data frame holding only the orders of user_id 1
df_user_id = df_ords[df_ords['user_id'].eq(1)]

In [55]:
# Basic descriptive statistics for user 1's orders.
# order_id does not appear in the output, presumably because it was converted to a string.
df_user_id.describe()

Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,11.0,10.0
mean,1.0,6.0,2.636364,10.090909,19.0
std,0.0,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,1.0,7.0,0.0
25%,1.0,3.5,1.5,7.5,14.25
50%,1.0,6.0,3.0,8.0,19.5
75%,1.0,8.5,4.0,13.0,26.25
max,1.0,11.0,4.0,16.0,30.0


# 12. Exporting df_ords dataframe

In [56]:
# Write df_ords to the Prepared Data folder as orders_wrangled.csv.
# The row index is written as the first column because index is left at its default.
# NOTE(review): the dtypes output in cell [45] still lists eval_set and order_dow,
# so confirm df_ords is in its final wrangled state (re-run top to bottom) before exporting.
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)

# 13. Exporting df_dep_t_new dataframe

In [57]:
# Write df_dep_t_new to the Prepared Data folder as departments_wrangled.csv.
# The index (the department ids) is written as the first column of the file.
# NOTE(review): that index appears to be unnamed, so the column will likely come back as
# 'Unnamed: 0' on re-import; consider index_label = 'department_id' if that matters.
df_dep_t_new.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
)

### Reminder on dropping columns: example 'eval_set' in df_ords
### In cell [8] you removed the "eval_set" column from df_ords permanently (not only visually) by using:
### df_ords = df_ords.drop(columns = ['eval_set'])

### If you only want to remove it visually, so that you do not overwrite the original data frame, you use:
### df.drop(columns = ['variable']), here becoming df_ords.drop(columns = ['eval_set'])

### Another alternative is to create a new version with a different name, such as:
### df_ords_2 = df_ords.drop(columns = ['eval_set'])
### This approach also has its disadvantages, mainly in the form of memory usage:
### If you create a new version of your data frame for each step of your process, you could end up with ten or twenty data frames held in memory at the same time