# Merge 'order_products_prior.csv' with 'orders_clean.csv'

### 1. Import datasets 'orders_clean.csv' and 'order_products_prior.csv'

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Root data folder; every read_csv / to_pickle call in this notebook joins
# 'Prepared data' and a file name onto it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
fpath = r'C:\Users\Mei\Instacart Basket Analysis\02 Data'

In [3]:
# Load the cleaned orders table. index_col=False tells pandas not to
# promote the first CSV column to the index.
ords_csv = os.path.join(fpath, 'Prepared data', 'orders_clean.csv')
df_ords = pd.read_csv(ords_csv, index_col=False)

In [4]:
# Load the order/product line items. index_col=False tells pandas not to
# promote the first CSV column to the index.
prior_csv = os.path.join(fpath, 'Prepared data', 'order_products_prior.csv')
df_order_products_prior = pd.read_csv(prior_csv, index_col=False)

# Wrangle and Consistency check on 'order_products_prior'

## 1. wrangle the data

In [5]:
df_order_products_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB


In [6]:
df_order_products_prior.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1


In [7]:
# order_id is read as int64 (see info() above); cast it to string.
# df_ords['order_id'] receives the same cast before the merge on 'order_id',
# so both join keys end up with the same dtype.
df_order_products_prior['order_id'] = df_order_products_prior['order_id'].astype('str')

In [8]:
# product_id is read as int64 (see info() above); cast it to string.
# df_prods['product_id'] receives the same cast before the merge on
# 'product_id', so both join keys end up with the same dtype.
df_order_products_prior['product_id'] = df_order_products_prior['product_id'].astype('str')

## 2. Consistency check

### 2.1 Mixed type check

In [9]:
# Print the name of every column whose values are not all of one Python type.
# A column is "mixed" exactly when mapping each value to its type yields more
# than one distinct type. Series.map is used because DataFrame.applymap is
# deprecated since pandas 2.1; it also avoids building a filtered copy of the
# frame just to test for existence, and it does not fail on an empty frame.
for col in df_order_products_prior.columns:
    distinct_types = df_order_products_prior[col].map(type).nunique()
    if distinct_types > 1:
        print(col)

### no mixed types

### 2.2 Missing values check

In [10]:
# Count missing values per column (isna is the same method as isnull).
df_order_products_prior.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

### no missing values

### 2.3  Duplicates check

In [11]:
# Show every row that repeats an earlier row across all columns.
dup_mask = df_order_products_prior.duplicated()
df_order_products_prior[dup_mask]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


### no duplicates

# Merge the two files

In [12]:
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_setting            object 
 3   order_number            int64  
 4   oder_day_of_week        int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB


In [13]:
# order_id is int64 here (see info() above); cast it to string so it has the
# same dtype as df_order_products_prior['order_id'], the other side of the
# merge on 'order_id'.
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [16]:
# Columns intended to be kept from df_ords (everything except eval_setting).
# NOTE(review): 'order_day_of_week' is not a column of df_ords - info() above
# lists it as 'oder_day_of_week' - so selecting with this list would raise a
# KeyError until the two spellings are reconciled.
list_1 = ['order_id','user_id','order_number','order_day_of_week','order_hour_of_day','days_since_prior_order']

In [17]:
# NOTE(review): this indexes with the builtin `list`, not `list_1`, so the
# column lookup fails and df_ords_sub is never created. The recorded shape
# (3421083, 7) and the merge output further down still contain eval_setting,
# i.e. the subset was never applied. A later cell also drops 'eval_setting'
# from the merged frame, so decide which of the two steps should own that
# before fixing this line.
df_ords_sub = df_ords[list]

In [19]:
# Rebinds df_ords to the subset; only valid if the cell above succeeded.
df_ords = df_ords_sub

In [22]:
# Columns intended to be kept from df_order_products_prior.
list_2 = ['order_id','product_id']

In [23]:
# NOTE(review): this indexes with the builtin `list`, not `list_2`, so the
# column lookup fails and df_order_products_prior_sub is never created. The
# recorded shape (32434489, 4) and the merged output further down still
# contain add_to_cart_order and reordered, i.e. the subset was never applied.
# Confirm whether those two columns are needed downstream before fixing this.
df_order_products_prior_sub = df_order_products_prior[list]

In [24]:
# Rebinds df_order_products_prior to the subset; only valid if the cell above
# succeeded.
df_order_products_prior = df_order_products_prior_sub

In [25]:
df_ords.shape

(3421083, 7)

In [26]:
df_order_products_prior.shape

(32434489, 4)

### 2. Merge these two files 

In [27]:
# Attach each order's attributes to its product rows, keyed on order_id.
# No `how` is passed, so this is pandas' default inner join: only order_ids
# present in both frames are kept, which means the `_merge` indicator column
# can only ever contain 'both'. Compare row counts before and after (or use
# how='outer') to actually detect unmatched keys.
df_merge = df_ords.merge(df_order_products_prior, on='order_id', indicator=True)

In [28]:
df_merge.head(3)

Unnamed: 0,order_id,user_id,eval_setting,order_number,oder_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,prior,1,2,8,7.0,196,1,0,both
1,2539329,1,prior,1,2,8,7.0,14084,2,0,both
2,2539329,1,prior,1,2,8,7.0,12427,3,0,both


In [29]:
df_merge['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

In [30]:
df_merge.shape

(32434489, 11)

### 3. Export the merged file as .pkl

In [31]:
# Persist the orders x products frame (the `_merge` indicator column is
# still part of it at this point).
combined_pkl = os.path.join(fpath, 'Prepared data', 'orders_products_combined.pkl')
df_merge.to_pickle(combined_pkl)

# Merge the file 'orders_products_combined.pkl' with file 'products.csv'

### 1. Import dataset 'products_clean.csv'

In [32]:
# Load the cleaned products table. index_col=False tells pandas not to
# promote the first CSV column to the index.
prods_csv = os.path.join(fpath, 'Prepared data', 'products_clean.csv')
df_prods = pd.read_csv(prods_csv, index_col=False)

In [33]:
df_prods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49677 entries, 0 to 49676
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49677 non-null  int64  
 1   product_name   49677 non-null  object 
 2   aisle_id       49677 non-null  int64  
 3   department_id  49677 non-null  int64  
 4   prices         49677 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.9+ MB


In [34]:
# product_id is int64 here (see info() above); cast it to string so it has the
# same dtype as the product_id column of the merged orders frame, the other
# side of the merge on 'product_id'.
df_prods['product_id'] = df_prods['product_id'].astype('str')

In [35]:
df_prods.shape

(49677, 5)

### 3. Merge these two dataframes 

In [37]:
df_merge.head(2)

Unnamed: 0,order_id,user_id,eval_setting,order_number,oder_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,prior,1,2,8,7.0,196,1,0,both
1,2539329,1,prior,1,2,8,7.0,14084,2,0,both


In [38]:
# Working copy of the first merge without eval_setting and without the old
# indicator column. `_merge` has to be removed because the next merge also
# asks for indicator=True, and pandas refuses to add an indicator when a
# column named `_merge` already exists.
m = df_merge.drop(['eval_setting', '_merge'], axis=1)

In [40]:
# Attach product attributes to every order line, keyed on product_id.
# No `how` is passed, so this is pandas' default inner join and the `_merge`
# indicator can only ever contain 'both'.
# NOTE(review): the recorded shapes go from 32,434,489 rows before this merge
# to 32,406,041 after it - a net loss of 28,448 order lines whose product_id
# has no match in df_prods. The indicator check cannot reveal this.
df_merge_2 = df_prods.merge(m, on='product_id', indicator=True)

In [42]:
df_merge_2.shape

(32406041, 14)

In [43]:
df_merge_2['_merge'].value_counts()

both          32406041
left_only            0
right_only           0
Name: _merge, dtype: int64

### 4. Export the merged file as .pkl

In [45]:
# Persist the final orders x products x product-details frame (the `_merge`
# indicator column from the last merge is still part of it).
merged_pkl = os.path.join(fpath, 'Prepared data', 'orders_products_merged.pkl')
df_merge_2.to_pickle(merged_pkl)