# 4.4 Data Wrangling and Subsetting

## Step 1: Importing Libraries and Reading Data
This section imports necessary libraries and loads the data files into pandas DataFrames.

In [123]:
import pandas as pd
import numpy as np
import os

In [124]:
path = r'/Users/canancengel/A4_Instacart Basket Analysis/02_Data/Original Data'

## Step 2: Data Overview
Preview the first few rows and review the structure of the orders and products datasets.

In [125]:
import pandas as pd
import numpy as np
import os

path = '/Users/canancengel/4.3_orders_products'

orders_path = os.path.join(path, 'orders.csv')
products_path = os.path.join(path, 'products.csv')

df_ords = pd.read_csv(orders_path)
df_prods = pd.read_csv(products_path)

#Preview the first few rows and review the structure of the orders and products datasets.
    
print(df_ords.head())
df_ords.head()
df_ords.tail()
df_ords.info()
df_prods.head()
df_prods.tail()
df_prods.info()

   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0   2539329        1    prior             1          2                  8   
1   2398795        1    prior             2          3                  7   
2    473747        1    prior             3          3                 12   
3   2254736        1    prior             4          4                  7   
4    431534        1    prior             5          4                 15   

   days_since_prior_order  
0                     NaN  
1                    15.0  
2                    21.0  
3                    29.0  
4                    28.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int64  
 4   order_dow               int64 

## Step 3: Dropping Unnecessary Columns
Remove columns that are not required for further analysis.

In [126]:
df_ords = df_ords.drop(columns=['eval_set'])

## Step 4: Converting Numeric Variables to Suitable Format
Convert identifier variables to an appropriate data type for analysis.

In [127]:
df_ords.rename(columns={
    'order_id': 'OrderID',
    'user_id': 'UserID',
    'order_dow': 'Order_DayOfWeek',
    'order_hour_of_day': 'Order_HourOfDay',
    'days_since_prior_order': 'Days_SincePriorOrder'
}, inplace=True)

In [128]:
df_prods.rename(columns={
    'product_id': 'ProductID',
    'product_name': 'ProductName',
    'aisle_id': 'AisleID',
    'department_id': 'DepartmentID',
    'prices': 'Prices'
}, inplace=True)

## Step 5: Renaming Variables
Rename columns to have more intuitive names and save the results in a new DataFrame and a CSV file.

In [129]:
df_ords_renamed = df_ords.rename(columns={'order_dow': 'Order_DayOfWeek'})
df_ords_renamed.to_csv(os.path.join(path, "Prepared Data", "orders_renamed.csv"), index=False)

In [130]:
print(df_ords.dtypes)
print(df_prods.dtypes)

OrderID                   int64
UserID                    int64
order_number              int64
Order_DayOfWeek           int64
Order_HourOfDay           int64
Days_SincePriorOrder    float64
dtype: object
ProductID         int64
ProductName      object
AisleID           int64
DepartmentID      int64
Prices          float64
dtype: object


## Step 6: Descriptive Statistics for Order Hour of Day
Show descriptive statistics and frequency counts for the order_hour_of_day variable.

In [131]:
print(df_ords['Order_HourOfDay'].describe())
print(df_ords['Order_HourOfDay'].value_counts())

count    3.421083e+06
mean     1.345202e+01
std      4.226088e+00
min      0.000000e+00
25%      1.000000e+01
50%      1.300000e+01
75%      1.600000e+01
max      2.300000e+01
Name: Order_HourOfDay, dtype: float64
Order_HourOfDay
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64


In [132]:
df_ords.head()

Unnamed: 0,OrderID,UserID,order_number,Order_DayOfWeek,Order_HourOfDay,Days_SincePriorOrder
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [133]:
df_ords['OrderID'] = df_ords['OrderID'].astype('str')

## Step 7: Rename Columns

In [134]:
df_ords.rename(columns={'order_dow': 'order_day_of_week'}, inplace=True)

## Step 8: Convert another identifier to string

In [135]:
df_ords['user_id'] = df_ords['UserID'].astype('str')

In [136]:
df_ords['OrderID'] = df_ords['OrderID'].astype(str)

## Step 9: Rename unintuitive column (without overwriting)

In [137]:
df_ords_renamed = df_ords.rename(columns={'days_since_prior_order': 'days_waited_before_next_order'})

In [138]:
df_ords.rename(columns={'order_dow': 'Order_DayOfWeek'}, inplace=True)

## Step 10: Busiest order hour

In [139]:
print(df_ords.columns)
print(df_prods.columns)

Index(['OrderID', 'UserID', 'order_number', 'Order_DayOfWeek',
       'Order_HourOfDay', 'Days_SincePriorOrder', 'user_id'],
      dtype='object')
Index(['ProductID', 'ProductName', 'AisleID', 'DepartmentID', 'Prices'], dtype='object')


In [140]:
print(df_ords['Order_HourOfDay'].describe())
print(df_ords['Order_HourOfDay'].value_counts())

count    3.421083e+06
mean     1.345202e+01
std      4.226088e+00
min      0.000000e+00
25%      1.000000e+01
50%      1.300000e+01
75%      1.600000e+01
max      2.300000e+01
Name: Order_HourOfDay, dtype: float64
Order_HourOfDay
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64


**Result:** The busiest order time is 10:00.

In [141]:
user1_orders = df_ords[df_ords['user_id'] == 1]
print(user1_orders.head())

Empty DataFrame
Columns: [OrderID, UserID, order_number, Order_DayOfWeek, Order_HourOfDay, Days_SincePriorOrder, user_id]
Index: []


In [144]:
orders_after_10 = df_ords[df_ords['order_number'] > 10]
print(orders_after_10.head())

    OrderID  UserID  order_number  Order_DayOfWeek  Order_HourOfDay  \
10  1187899       1            11                4                8   
21  1402090       2            11                1               10   
22  3186735       2            12                1                9   
23  3268552       2            13                4               11   
24   839880       2            14                3               10   

    Days_SincePriorOrder user_id  
10                  14.0       1  
21                  30.0       2  
22                  28.0       2  
23                  30.0       2  
24                  13.0       2  


## Step 11: Meaning of department_id = 4
Identify what department_id 4 represents by checking the data dictionary.

In [145]:
department_4_products = df_prods[df_prods['DepartmentID'] == 4]

**Department ID 4:** This ID means “produce” (vegetables & fruits).

In [146]:
print(department_4_products['ProductName'].unique())

['White Pearl Onions' 'Organic Clementines' 'European Cucumber' ...
 'Baby Food Blueberry, Parsnip & Buckwheat Stage 2' 'Cabernet Tomatoes'
 'Cucumber Kirby']


## Step 12: Breakfast items subset
Subset the products data to include only breakfast-related departments and save the result as a new DataFrame and CSV file.

In [147]:
breakfast_department_ids = [14, 19, 8]
breakfast_items = df_prods[df_prods['DepartmentID'].isin(breakfast_department_ids)]
breakfast_items.to_csv(os.path.join(path, 'Prepared Data', 'breakfast_items.csv'), index=False)
print("Number of breakfast items:", breakfast_items.shape[0])

Number of breakfast items: 8352


## Step 13: Dinner party departments subset
Create a subset for dinner party departments and save as a new DataFrame and CSV file.

In [148]:
party_department_ids = [5, 7, 11, 12]
party_items = df_prods[df_prods['DepartmentID'].isin(party_department_ids)]
party_items.to_csv(os.path.join(path, 'Prepared Data', 'dinner_party_items.csv'), index=False)
print("Number of dinner party items:", party_items.shape[0])

Number of dinner party items: 12893


## Step 14: Number of Rows in Dinner Party Subset
Display the total number of rows in the dinner party subset.

In [149]:
dinner_party_department_ids = [5, 7, 12, 20] 

In [150]:
dinner_party = df_prods[df_prods['DepartmentID'].isin(dinner_party_department_ids)]

In [151]:
dinner_party.shape

(7650, 5)

## Step 15: Extract Data for user_id = 1
Subset the orders DataFrame to show all records for user_id = 1 and export the results.

In [152]:
user1_orders = df_ords[df_ords['user_id'] == 1]

In [153]:
user1_orders.to_csv(os.path.join(path, 'Prepared Data', 'user1_orders.csv'), index=False)

In [154]:
print(user1_orders.describe())
print("Total orders by user 1:", user1_orders.shape[0])

       UserID  order_number  Order_DayOfWeek  Order_HourOfDay  \
count     0.0           0.0              0.0              0.0   
mean      NaN           NaN              NaN              NaN   
std       NaN           NaN              NaN              NaN   
min       NaN           NaN              NaN              NaN   
25%       NaN           NaN              NaN              NaN   
50%       NaN           NaN              NaN              NaN   
75%       NaN           NaN              NaN              NaN   
max       NaN           NaN              NaN              NaN   

       Days_SincePriorOrder  
count                   0.0  
mean                    NaN  
std                     NaN  
min                     NaN  
25%                     NaN  
50%                     NaN  
75%                     NaN  
max                     NaN  
Total orders by user 1: 0


In [155]:
print(user1_orders.columns)

Index(['OrderID', 'UserID', 'order_number', 'Order_DayOfWeek',
       'Order_HourOfDay', 'Days_SincePriorOrder', 'user_id'],
      dtype='object')


In [156]:
mode_hour = user1_orders['Order_HourOfDay'].mode()
if not mode_hour.empty:
    print("Most common order hour:", mode_hour.iloc[0])
else:
    print("No common order hour found.")

No common order hour found.


In [157]:
mode_day = user1_orders['Order_DayOfWeek'].mode()
if not mode_day.empty:
    print("Most common order day:", mode_day.iloc[0])
else:
    print("No common order day found.")

No common order day found.


## Step 16: Summary Statistics for user_id = 1
Print descriptive statistics and the most common order hour and day for user 1.

In [158]:
user_1 = df_ords[df_ords['user_id'] == 1]
user_1.describe()

Unnamed: 0,UserID,order_number,Order_DayOfWeek,Order_HourOfDay,Days_SincePriorOrder
count,0.0,0.0,0.0,0.0,0.0
mean,,,,,
std,,,,,
min,,,,,
25%,,,,,
50%,,,,,
75%,,,,,
max,,,,,


## Step 17: Exporting Cleaned DataFrames
Save all wrangled and subsetted DataFrames as CSV files for future use.

In [159]:
os.makedirs(os.path.join(path, 'Prepared Data'), exist_ok=True)

In [160]:
path = r"/Users/canancengel/4.3_orders_products"

In [161]:
df_dep = pd.read_csv(os.path.join(path, 'departments.csv'))

In [162]:
df_dep_t_new = df_dep.T

In [163]:
prep_path = os.path.join(path, "Prepared Data")

In [164]:
df_dep_t_new.to_csv(os.path.join(prep_path, 'departments_wrangled.csv'), index=False)