# Orders, Products and Department - Merging:
1. Importing libraries and datasets
2. Reducing memory usage and merging orders with orders_prior df
3. Reducing memory usage and merging orders_prior_merged with products df
3. Merging orders_prods_merged with dept df
4. Exporting ords_prods_dept merged df

## 1. Importing libraries and datasets

In [1]:
# Importing libraries

import pandas as pd
import os

In [2]:
# Accessing EnvFile for path

%run EnvFile.ipynb

Stored 'path' (str)


In [3]:
# Importing orders_checked.csv

df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'))

In [4]:
# Importing orders_products_prior.csv

df_ords_prior = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv'))

In [21]:
# Importing products_checked.csv

df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'))

In [5]:
df_dept = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'), index_col = False)

## 2. Reducing memory usage and merging orders with orders_prior df

In [7]:
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   order_day_of_week       int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB


In [8]:
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB


In [9]:
# Change datatypes for ords dataset to reduce memory usage

df_ords['order_id'] = df_ords['order_id'].astype('int32')
df_ords['user_id'] = df_ords['user_id'].astype('int32')
df_ords['order_number'] = df_ords['order_number'].astype('int8')
df_ords['order_day_of_week'] = df_ords['order_day_of_week'].astype('int8')
df_ords['order_hour_of_day'] = df_ords['order_hour_of_day'].astype('int8')
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].astype('float16')

In [10]:
# Change datatypes for ords_prior data set 

df_ords_prior['order_id'] = df_ords_prior['order_id'].astype('int32')
df_ords_prior['product_id'] = df_ords_prior['product_id'].astype('int32')
df_ords_prior['add_to_cart_order'] = df_ords_prior['add_to_cart_order'].astype('int32')
df_ords_prior['reordered'] = df_ords_prior['reordered'].astype('int8')

In [11]:
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   order_day_of_week       int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
dtypes: float16(1), int32(2), int8(3)
memory usage: 42.4 MB


In [12]:
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int32
 1   product_id         int32
 2   add_to_cart_order  int32
 3   reordered          int8 
dtypes: int32(3), int8(1)
memory usage: 402.1 MB


In [13]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [14]:
df_ords_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


##### In ords_prior dataset, there can be multiple rows for each order_id as this table includes the list of different products bought during each order.

In [15]:
# Merging df_ords and df_ords_prior based on 'order_id' using .merge() function

df_ords_prior_merged = df_ords.merge(df_ords_prior, on = 'order_id', indicator = True)

In [16]:
df_ords_prior_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [17]:
# Using .value_counts() for _merge to check if there is full match or not.

df_ords_prior_merged['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

In [18]:
# Dropping _merge column

df_ords_prior_merged = df_ords_prior_merged.drop(columns = ['_merge'])

In [19]:
# Checking the shape of merged df

df_ords_prior_merged.shape

(32434489, 9)

In [20]:
# Export in pickle format

df_ords_prior_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_prior_merged.pkl'))

## 3. Reducing memory usage and merging orders_prior with products df

In [22]:
df_prods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49672 entries, 0 to 49671
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49672 non-null  int64  
 1   product_name   49672 non-null  object 
 2   aisle_id       49672 non-null  int64  
 3   department_id  49672 non-null  int64  
 4   prices         49672 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.9+ MB


In [23]:
# Changing datatype to reduce memory usage

df_prods['product_id'] = df_prods['product_id'].astype('int32')
df_prods['aisle_id'] = df_prods['aisle_id'].astype('int8')
df_prods['department_id'] = df_prods['department_id'].astype('int8')

In [24]:
df_prods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49672 entries, 0 to 49671
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49672 non-null  int32  
 1   product_name   49672 non-null  object 
 2   aisle_id       49672 non-null  int8   
 3   department_id  49672 non-null  int8   
 4   prices         49672 non-null  float64
dtypes: float64(1), int32(1), int8(2), object(1)
memory usage: 1.0+ MB


In [25]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


#### The prods dataset includes reference for corresponding product name and prices for each product id. So there cannot be multiple entries for each product_id.

In [26]:
# Checking for duplicates for product_id in prods dataset

df_prods.drop_duplicates(subset = ['product_id'])

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49667,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49668,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49669,49686,Artisan Baguette,112,3,7.8
49670,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


#### There are 2 duplicates as we can notice from the shape of prods dataset reduced from 49672 to 49670. So, let's remove the duplicates and reassign the unique values to prods dataset.

In [27]:
df_prods = df_prods.drop_duplicates(subset = ['product_id'])

In [28]:
# Checking the shape of df_prods

df_prods.shape

(49670, 5)

In [29]:
# Merging ords_prior_merged and prods dataset

df_ords_prods_merged = df_ords_prior_merged.merge(df_prods, on = 'product_id', how = 'inner')

In [30]:
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0


In [32]:
# Checking shape of merged df

df_ords_prods_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404289 entries, 0 to 32404288
Data columns (total 13 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   order_day_of_week       int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
 6   product_id              int32  
 7   add_to_cart_order       int32  
 8   reordered               int8   
 9   product_name            object 
 10  aisle_id                int8   
 11  department_id           int8   
 12  prices                  float64
dtypes: float16(1), float64(1), int32(4), int8(6), object(1)
memory usage: 1.4+ GB


## 3. Merging orders_prods_merged with dept df

In [33]:
# Merging prepared department df with current df

df_ords_prods_dept = df_ords_prods_merged.merge(df_dept, on = 'department_id')

In [34]:
df_ords_prods_dept.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,department
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,beverages
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,beverages
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,beverages
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,beverages
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,beverages


In [35]:
# Checking memory usage after reducing

df_ords_prods_dept.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404289 entries, 0 to 32404288
Data columns (total 14 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   order_day_of_week       int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
 6   product_id              int32  
 7   add_to_cart_order       int32  
 8   reordered               int8   
 9   product_name            object 
 10  aisle_id                int8   
 11  department_id           int8   
 12  prices                  float64
 13  department              object 
dtypes: float16(1), float64(1), int32(4), int8(6), object(2)
memory usage: 1.7+ GB


## 4. Exporting ords_prods_dept merged df

In [36]:
# Exporting orders, orders_prior and products merged df as orders_products_merged.pkl

df_ords_prods_dept.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_dept.pkl'))