# 4.4 Data Wrangling

### 01. Importing Libraries and Data Frames

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Folder holding the original (raw) CSV files; the read_csv calls below join a file name onto it.
# NOTE(review): this is an absolute path on one machine - adjust it before running the notebook elsewhere.
path = r"C:\Users\Asus\Documents\DA CareerFoundry\Part II - Data Immersion\Python - Anaconda\August 2025 Instacart Basket Analysis\02 Data\Original Data"

In [3]:
# load the raw orders file into df_ords
orders_csv = os.path.join(path, 'orders.csv')
df_ords = pd.read_csv(orders_csv, index_col=False)

In [4]:
# load the raw products file into df_prods
products_csv = os.path.join(path, 'products.csv')
df_prods = pd.read_csv(products_csv, index_col=False)

### 02. Dropping Columns

In [5]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [6]:
# drop eval_set column from orders.csv
# drop() returns a new dataframe; here the result is only displayed, df_ords itself is unchanged.
df_ords.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [7]:
# the previous cell only displayed the result; it didn't change df_ords.
# To make the change stick, assign the returned dataframe back to the variable:
df_ords = df_ords.drop(columns = ['eval_set'])
# after this reassignment df_ords no longer has eval_set; re-import orders.csv to get the column back.

In [8]:
# counting empty/missing values
df_ords['days_since_prior_order'].value_counts(dropna = False)
# dropna = False keeps NaN as its own category, so missing values are counted instead of being left out.

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

In [9]:
# renaming columns
# function syntax: df.rename(columns = {'old_name' : 'new_name'}, inplace = True)
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)
# inplace = True modifies df_ords directly instead of returning a renamed copy.

In [10]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [11]:
# describe() only summarises the numeric columns of this dataframe, so identifier
# columns are cast to strings to keep them out of the summary statistics:
df_ords['order_id'] = df_ords['order_id'].astype(str)

In [12]:
# user_id is an identifier too, so it gets the same string cast:
df_ords['user_id'] = df_ords['user_id'].astype(str)

In [13]:
df_ords.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


In [14]:
# let's check the data types of these columns now.
# A notebook cell only displays the value of its last expression, so both dtypes
# are returned together as one tuple instead of on two separate lines
# (otherwise the order_id result would be silently discarded).
df_ords['order_id'].dtype, df_ords['user_id'].dtype

(dtype('O'), dtype('O'))

In [15]:
df_ords.dtypes

order_id                   object
user_id                    object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

### 03. Transposing Data

In [16]:
# load the raw departments file into df_dep
departments_csv = os.path.join(path, 'departments.csv')
df_dep = pd.read_csv(departments_csv, index_col=False)

In [17]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [18]:
# departments.csv is stored wide (one column per department id, a single data row),
# so let's transpose it to get one row per department; the result is only displayed here:
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [19]:
# keep the transposed departments table under its own name:
df_dep_t = df_dep.transpose()

In [20]:
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [21]:
# fixing the headers:
# reset_index() moves the current index into a regular column called 'index';
# the result is only displayed, df_dep_t is not modified.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [22]:
# fixing the headers, part 2:
new_header = df_dep_t.iloc[0]
# iloc[] selects by integer position; iloc[0] takes the first row, which holds the intended column name.

In [23]:
new_header

0    department
Name: department_id, dtype: object

In [24]:
# part 3: keep every row from position 1 onward, leaving out the first row (the header row)
df_dep_t_new = df_dep_t.iloc[1:]

In [25]:
# The 1 represents row 1 in the dataframe.
# Remember that since Python starts indexing from 0 rather than 1, “row 1” is actually the second row in the dataframe.
# The colon that follows the 1 represents onward until the end of the dataframe.
# Together, they tell Python to copy everything starting from row 1 until the end of the dataframe.
df_dep_t_new.reset_index()
# The output below shows the result; reset_index() returns a new dataframe, so df_dep_t_new is not changed.

Unnamed: 0,index,0
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


In [26]:
# part 4:
df_dep_t_new.columns = new_header
# Reading df.columns returns the column names; assigning to it replaces them.
# The command above says
# “take the values stored in the new_header variable and put them in the df_dep_t_new dataframe as the names of its columns.”
# let's see how it looks now:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


### 04. Data Dictionary

In [27]:
# turn df_dep_t_new into a dictionary keyed by its row index (the department ids):
data_dict = df_dep_t_new.to_dict(orient='index')

In [28]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

##### The to_dict() function you just used transformed your df_dep_t_new dataframe into dictionary format and saved it in a new variable, data_dict. The argument, index, tells Python to use the numbered rows as the key values for the entries in the dictionary.

In [29]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [30]:
print(data_dict.get('19'))

{'department': 'snacks'}


### 05. Subsetting

##### Subsetting refers to creating a smaller data set from a whole data set (in this case, df_prods) based on a particular filter. This is a common procedure when conducting analyses, as you often won’t need the entire data set you were provided with. 

In [31]:
# creating a subset that only contains data from the snacks department:
df_snacks = df_prods[df_prods['department_id']==19]
# Within the brackets on the right-hand side, you’re telling Python to look into the df_prods dataframe 
# and find a column called "department_id." And, within that "department_id" column, 
# it should look for a value of 19.

In [32]:
# Let's save these results:
# (same filter as the previous cell, where df_snacks was already assigned this subset)
df_snacks =  df_prods[df_prods['department_id']==19]

##### Another way of doing all of this:

In [33]:
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [34]:
# or, with isin(), which takes a list and can therefore match several ids at once:
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

## Task Submission

#### Question:
Find another identifier variable in the df_ords dataframe that doesn’t need to be included in your analysis as a numeric variable and change it to a suitable format.

In [35]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


#### Answer: 
'order_id' and 'user_id' are identifiers and do not need to be numeric variables. I changed both to the str (string) format while performing the exercise.

#### Question: 
Look for a variable in your df_ords dataframe with an unintuitive name and change its name without overwriting the dataframe.

#### Answer:
The following function returns a renamed copy of the dataframe for display only; df_ords itself is not modified.
To make the change permanent, pass inplace = True (or assign the result back to df_ords).

In [36]:
# inplace = False returns a renamed copy that is only displayed here;
# df_ords itself keeps the column name 'days_since_prior_order'.
df_ords.rename(columns = {'days_since_prior_order' : 'days_since_last_order'}, inplace = False)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


#### Question:
Your client wants to know what the busiest hour is for placing orders. Find the frequency of the corresponding variable and share your findings.

#### Answer:
The following function counts how often each value occurs in the 'order_hour_of_day' column.

The busiest hour is 10 (10 a.m.), with 288,418 orders.

In [37]:
# value_counts() sorts by frequency in descending order, so the first row is the busiest hour.
df_ords['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

#### Question:
Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.

#### Answer:
A dictionary has already been made with this function.

data_dict = df_dep_t_new.to_dict('index')

The answer is that a department_id of 4 stands for the 'produce' department.

In [38]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [39]:
print(data_dict.get('4'))

{'department': 'produce'}


#### Question:
The sales team in your client’s organization wants to know more about breakfast item sales. Create a subset containing only the required information.

#### Answer:
Printing the dictionary shows which department id stands for breakfast products.

In [40]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

14 is the dep_id for breakfast products.

Subsetting function, for filtering/selecting only the breakfast products:

In [41]:
df_prods[df_prods['department_id']==14]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [42]:
# department_id 14 is 'breakfast' in data_dict
is_breakfast = df_prods['department_id'] == 14
df_breakfast = df_prods[is_breakfast]

The subset is called 'df_breakfast'.

#### Question:
They’d also like to see details about products that customers might use to throw dinner parties. Your task is to find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

#### Answer:
To create a subset that matches several values of the same variable at once, isin([]) can be used with the list of values.

In [43]:
df_prods[df_prods['department_id'].isin([5, 7, 12, 20])]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


##### The subset is called 'df_dinner_parties'

In [44]:
# department ids from data_dict: 5 = alcohol, 7 = beverages, 12 = meat seafood, 20 = deli
dinner_party_dept_ids = [5, 7, 12, 20]
in_dinner_party_dept = df_prods['department_id'].isin(dinner_party_dept_ids)
df_dinner_parties = df_prods[in_dinner_party_dept]

#### Question:
It’s important that you keep track of total counts in your dataframes. How many rows does the last dataframe you created have?

#### Answer:
The last dataframe I created has 7650 rows and 5 columns.

#### Question:
Someone from the data engineers team in Instacart thinks they’ve spotted something strange about the customer with a "user_id" of “1.” Extract all the information you can about this user.

#### Answer:


In [45]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


There's info on user_id = 1. Let's do a subset:

In [46]:
# user_id was cast to strings earlier in this notebook, so comparing against the integer 1 matches no rows.
df_ords[df_ords['user_id']== 1]

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


No rows are returned. Let's treat '1' as a string instead of an integer:

In [47]:
df_ords[df_ords['user_id']== '1']

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


'1' is coded not as an integer but a string.

To code it as an integer, the following function works: df_ords['user_id'] = df_ords['user_id'].astype(int)

The subset referring to user_id = 1 is called 'df_userid_1'.


In [48]:
# user_id holds strings, so the id is compared as the string '1'
is_user_1 = df_ords['user_id'] == '1'
df_userid_1 = df_ords[is_user_1]

In [49]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [50]:
# how to check how many aisle_ids I have? Maybe the aisle_id connects to another key in df_ords?

Tutor said: "You can get the unique values of aisle_id as an array using:

df_prods['aisle_id'].unique()

If you want the total count of unique values, use:

df_prods['aisle_id'].nunique()"

In [51]:
df_prods['aisle_id'].unique()

array([ 61, 104,  94,  38,   5,  11,  98, 116, 120, 115,  31, 119,  74,
        56, 103,  35,  79,  63,  40,  20,  49,  47,   3,  41, 127, 121,
        81, 123, 107, 131, 106,  69, 100,  64,  78,  83,  58,  66,  87,
        14, 105,  22, 134,  23, 111, 128,  50,   9,  96,  92,  89,  88,
       130,  26,  77,  65,  12,  95,  19,   7,   6,  97,   2, 112,  10,
        51,  13,  75,  70,  93,  34,  62,  21,  29,  45, 118,   4, 108,
        37, 109,  91,  24,  67, 132,  25, 129,  46,  16,  52,  17,  73,
       122,  42,  59, 126,  33,   1,   8,  82, 114, 124, 117,  72, 110,
        85,  44,  80, 101,  84,  30,  27,  90,  48, 133,  53,  28,  68,
        43, 125,  57,  15,  55,  36,  54, 102,  60,  99,  18,  39,  71,
       113,  86,  32,  76])

In [52]:
# the array above lists the ids in order of first appearance in df_prods, not in sorted order.
# To see them in ascending order: np.sort(df_prods['aisle_id'].unique())

In [53]:
df_prods['aisle_id'].nunique()

134

In [54]:
# Ah, this helps most.

#### Question:
You also need to provide some details about this user’s behavior. What basic stats can you provide based on the information you have?

#### Answer:


In [55]:
df_userid_1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


This user places orders between Sunday (min day of week = 1) and Wednesday (max day of week = 4), between 7am and 4pm (order hours 7 to 16). The user has 11 orders so far; order_number runs from 1 to 11 and is the sequence number of each order, not a product count. On average, 19 days pass between one order and the next.

#### Exporting Data

In [56]:
# Folder the wrangled dataframes are exported to (used by the to_csv calls below).
# NOTE(review): this is an absolute path on one machine - adjust it before running the notebook elsewhere.
path2 = r"C:\Users\Asus\Documents\DA CareerFoundry\Part II - Data Immersion\Python - Anaconda\August 2025 Instacart Basket Analysis\02 Data\Prepared Data"

Export your df_ords dataframe as “orders_wrangled.csv” in your “Prepared Data” folder.

In [57]:
# index=False leaves the row index out of the file, which prevents an unwanted
# "Unnamed: 0" column when the CSV is imported again.
orders_out = os.path.join(path2, 'orders_wrangled.csv')
df_ords.to_csv(orders_out, index=False)

Export the df_dep_t_new dataframe as “departments_wrangled.csv” in your “Prepared Data” folder so that you have a “.csv” file of your departments data in the correct format.

In [58]:
# The department ids live only in the index of df_dep_t_new (they are the keys
# produced by to_dict('index') earlier), so the index must be written to the file.
# With index = False the CSV would contain the 'department' column alone and the
# ids would be lost. index_label gives that first column its header name.
df_dep_t_new.to_csv(os.path.join(path2, 'departments_wrangled.csv'), index = True, index_label = 'department_id')