# 1. Importing Libraries and Data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# create path
# The project root can be overridden through the INSTACART_PROJECT_PATH
# environment variable so the notebook can run on another machine; when the
# variable is not set, the original local folder is used.
path = os.environ.get(
    "INSTACART_PROJECT_PATH",
    "/Users/charlottelin/Documents/06-2025 Instacart Basket Analysis",
)

In [4]:
# Load the raw orders table from the project's Original Data folder
df_ords = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

In [5]:
# Load the raw products table from the project's Original Data folder
df_prods = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [6]:
# Load the raw departments table from the project's Original Data folder
df_dep = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

# 2. Data Wrangling

## 2.1 "Orders" Data Frame

In [8]:
# Remove the "eval_set" column from the orders frame
df_ords = df_ords.drop('eval_set', axis=1)

In [10]:
# Spot any NaN value: count every distinct value of "days_since_prior_order",
# passing dropna=False so that missing values get their own NaN row in the tally
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

In [12]:
# Rename "order_dow" to the more descriptive "orders_day_of_week"
# (reassigning the returned frame rather than mutating in place)
df_ords = df_ords.rename(columns={'order_dow': 'orders_day_of_week'})

In [13]:
# Preview the first rows to confirm the column now appears as "orders_day_of_week"
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [14]:
# Treat "order_id" and "user_id" as identifiers rather than quantities by
# converting both columns from numbers to strings in a single astype call
df_ords = df_ords.astype({'order_id': 'str', 'user_id': 'str'})

In [15]:
# Confirm the conversion: the column should now report the object dtype ('O')
df_ords['order_id'].dtype

dtype('O')

## 2.2 "Departments" Data Frame

In [17]:
# Preview df_dep: the file is laid out wide, with one column per department id
# and a single row holding the department names
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [18]:
# Flip rows and columns so that each department ends up on its own row (long format)
df_dep_t = df_dep.transpose()

In [19]:
# Inspect the transposed frame: the department ids now form the index and the
# original "department_id" header has become the first row
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [20]:
# Preview df_dep_t with a fresh integer index. reset_index() returns a new frame
# and the result is not assigned here, so df_dep_t itself is left unchanged and
# the following cells keep working with the department ids as the index.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [22]:
# Take the first row of df_dep_t (the one labelled "department_id") to use as the header.
# The result is a one-element Series whose only value is "department".
new_header = df_dep_t.iloc[0]

In [23]:
# Inspect the extracted header row: a single value, "department", in a Series named "department_id"
new_header

0    department
Name: department_id, dtype: object

In [24]:
# Keep every row after the header row (position 1 onward) as the data rows
df_dep_t_new = df_dep_t.iloc[1:]

In [25]:
# Inspect the data rows that remain once the header row has been sliced off;
# the single column is still labelled 0 at this point
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [26]:
# Set new_header as the new header for df_dep_t_new, so the single column is
# labelled "department" instead of 0. new_header is a Series named
# "department_id"; that name shows up as the corner label when the frame is displayed.
df_dep_t_new.columns = new_header

In [27]:
# Final result of df_dep_t_new: department ids as the index, one "department" column
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 3. Data Dictionary

In [28]:
# Build the data dictionary: one entry per department id (the frame's index),
# each mapping column name -> value, e.g. {'department': 'frozen'}
data_dict = df_dep_t_new.to_dict(orient='index')

In [29]:
# Display the dictionary: keys are the department ids as strings, values are {'department': name}
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [30]:
# Preview df_prods; its "department_id" column holds the ids that data_dict translates into names
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [31]:
# Look up department id '19' in the data dictionary (the keys are strings, so the id is quoted)
print(data_dict.get('19'))

{'department': 'snacks'}


# 4. Subsetting

In [32]:
# First step towards a snacks subset: build a boolean mask that is True for every
# product whose department_id is 19 (the snacks department per data_dict).
# This cell only displays the mask; the subset itself is created in the next cells.
df_prods['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [33]:
# Apply the mask to df_prods to preview only the rows belonging to department 19 (snacks)
df_prods[df_prods['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [34]:
# Store the snacks subset (department_id 19) under its own name
df_snacks = df_prods.loc[df_prods['department_id'] == 19]

In [35]:
# Preview the snacks subset; every row should show department_id 19
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# 5. Task

## Step 2 - Identify non-essential variable in df_ords

In [37]:
# View all variables (column names) currently in df_ords
df_ords.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

In [39]:
# Convert "order_number" to string so it is no longer treated as a numeric variable
df_ords = df_ords.astype({'order_number': 'str'})

## Step 3 - Revise unintuitive variable name in df_ords

In [42]:
# "order_dow" was already renamed to "orders_day_of_week"; another candidate is "order_hour_of_day".
# With inplace=False the renamed frame is only returned for display and is not stored,
# so df_ords keeps the "order_hour_of_day" column that later cells refer to.
df_ords.rename(columns={'order_hour_of_day': 'order_time'}, inplace = False)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


## Step 4. Busiest Hour

In [43]:
# Tally orders per hour of day; value_counts sorts by frequency (and skips NaN by
# default), so the busiest hour is the first row
df_ords['order_hour_of_day'].value_counts()

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

### Answer: The busiest hour for placing orders is 10 AM, followed closely by 11 AM and 3 PM.

## Step 5. The meaning of "4" in df_prods

In [44]:
# Look up department id '4' in the data dictionary (the keys are strings, so the id is quoted)
print(data_dict.get('4'))

{'department': 'produce'}


### Answer: A department_id of 4 corresponds to the <b>produce</b> department according to the data dictionary.

## Step 6. Subsetting for Breakfast Items

In [46]:
# Identify the department_id for breakfast by scanning the data dictionary
# (the entry with department 'breakfast' has key '14')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [48]:
# Subset of products that belong to the breakfast department (department_id 14)
df_breakfast = df_prods[df_prods['department_id'].eq(14)]

In [49]:
# Preview the breakfast subset; every row should show department_id 14
df_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


## Step 7 - Subsetting for Dinner Party Items

In [53]:
# Dinner party departments: 5 (alcohol), 7 (beverages), 12 (meat seafood) and 20 (deli).
# Keep every product whose department_id is one of those four ids.
df_dinner_party = df_prods[
    df_prods['department_id'].isin([5, 7, 12, 20])
]

### Note to client: the dinner party list could be analyzed for special promotions.

## Step 8 - Number of Rows in df_dinner_party

In [55]:
# Total count of rows in the dinner party dataframe: shape returns (rows, columns),
# so the first element is the row count
df_dinner_party.shape

(7650, 5)

## Step 9 - Info about Customer user_id 1

In [56]:
# Pull every order placed by the customer whose user_id is '1'
# (user_id was converted to string earlier, so the comparison value is quoted)
df_user1 = df_ords[df_ords['user_id'].eq('1')]
df_user1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


## Step 10 - Stats about the User's Behavior

In [57]:
# Summary statistics for user 1's orders. Only the numeric columns are summarized;
# the id columns and order_number were converted to strings earlier in the notebook.
df_user1.describe()

Unnamed: 0,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,10.0
mean,2.636364,10.090909,19.0
std,1.286291,3.477198,9.030811
min,1.0,7.0,0.0
25%,1.5,7.5,14.25
50%,3.0,8.0,19.5
75%,4.0,13.0,26.25
max,4.0,16.0,30.0


**User 1's Order Behavior Summary**

- **Order count:** 11  
- **Avg. days between orders:** 19  
- **Order time pattern:** Typically between **7 AM and 4 PM**, with a **mean of about 10:05 AM** (10.09 hours)  
- **Ordering days:** Always on **days 1–4** of the week (median day 3)  
- **Order spacing:** Irregular — ranging from same-day to 30-day gaps  

# 6. Export Data

In [58]:
# Write the wrangled orders frame to the Prepared Data folder.
# NOTE(review): the row index is written as an unnamed first column (to_csv writes
# the index by default) — confirm that downstream notebooks expect that column.
df_ords.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)

In [59]:
# Write the wrangled departments frame to the Prepared Data folder.
# The index holds the department ids, so it has to be written along with the data.
# NOTE(review): the index has no name, so the id column gets an empty header in the
# CSV — confirm how downstream notebooks label it after re-import.
df_dep_t_new.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
)