# 4.4 Data Wrangling

### 01. Importing Libraries and Data Frames

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [3]:
# shortcuts for importing dataframes
path = r"C:\Users\Asus\Documents\DA CareerFoundry\Part II - Data Immersion\Python - Anaconda\August 2025 Instacart Basket Analysis\02 Data\Original Data"

In [4]:
# importing orders df
df_ords = pd.read_csv(os.path.join(path, 'orders.csv'), index_col = False)

In [5]:
# importing products df
df_prods = pd.read_csv(os.path.join(path, 'products.csv'), index_col = False)

### 02. Dropping Columns

In [7]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [9]:
# drop eval_set column from orders.csv
df_ords.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [10]:
# the latter created a visual; it didn't change the df. So, in order to do it, the following function serves:
df_ords = df_ords.drop(columns = ['eval_set'])
# this action cannot be undone.

In [11]:
# counting empty/missing values 
df_ords['days_since_prior_order'].value_counts(dropna = False)
# value_counts(dropna = False) serves to count the missing values without dropping/delete them.

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

In [12]:
# renaming columns
# function synthax : df.rename(columns = {'old_name' : 'new_name'}, inplace = True)
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)
# inplace = true means it will overwrite the original column name with the new one rather than creating a copy.

In [13]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [14]:
# to selectively apply describe() function to targeted columns, 
# it's needed to change the data type of the excluded columns into strings\str, like this:
df_ords['order_id'] = df_ords['order_id'].astype('str')
# a string data type column will be ignored by the describe() function.

In [16]:
# no need for descriptive statistics on user_id variable as well:
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [17]:
df_ords.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


In [19]:
# let's check the data types of these columns now:
df_ords['order_id'].dtype
df_ords['user_id'].dtype

dtype('O')

In [21]:
df_ords.dtypes

order_id                   object
user_id                    object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

### 03. Transposing Data

In [22]:
# importing departments.csv
df_dep = pd.read_csv(os.path.join(path, 'departments.csv'), index_col = False)

In [23]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [25]:
# let's transpose this strange df:
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [27]:
# let's save this in a new dataframe:
df_dep_t = df_dep.T

In [28]:
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [29]:
# fixing the headers:
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [30]:
# fixing the headers, part2:
new_header = df_dep_t.iloc[0]
# iloc[] function selects data in pandas and takes the index number of a row or column 

In [31]:
new_header

0    department
Name: department_id, dtype: object

In [32]:
# part 3
df_dep_t_new = df_dep_t[1:]

In [35]:
# The 1 represents row 1 in the dataframe. 
# Remember that since Python starts indexing from 0 rather than 1, “row 1” is actually the second row in the dataframe. 
# The colon that follows the 1 represents onward until the end of the dataframe. 
# Together, they tell Python to copy everything starting from row 1 until the end of the dataframe. 
df_dep_t_new.reset_index()
# Take a look at the picture below:

Unnamed: 0,index,0
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


In [38]:
# part 4:
df_dep_t_new.columns = new_header
# df.columns serves to show the name of columns but also to assign new names to them. 
#  The command above says 
# “take the values stored in the new_header variable and put them in the df_dep_t_new dataframe as the names of its columns.”
# let's see how it looks now:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


### 04. Data Dictionary

In [39]:
# making a dictionary by transforming df_dep_t_new into one:
data_dict = df_dep_t_new.to_dict('index')

In [41]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

##### The to_dict() function you just used transformed your df_dep_t_new dataframe into dictionary format and saved it in a new variable, data_dict. The argument, index, tells Python to use the numbered rows as the key values for the entries in the dictionary.

In [43]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [44]:
print(data_dict.get('19'))

{'department': 'snacks'}


##### Subsetting refers to creating a smaller data set from a whole data set (in this case, df_prods) based on a particular filter. This is a common procedure when conducting analyses, as you often won’t need the entire data set you were provided with. 

In [46]:
# creating a subset that only contains data from the snacks department:
df_snacks = df_prods[df_prods['department_id']==19]
# Within the brackets on the right-hand side, you’re telling Python to look into the df_prods dataframe 
# and find a column called "department_id." And, within that "department_id" column, 
# it should look for a value of 19.

In [47]:
# Let's save these results:
df_snacks =  df_prods[df_prods['department_id']==19]

##### Another way of doing all of this:

In [48]:
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [49]:
# or:
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]