# 3. Importing libraries and orders_products_combined pickle file and confirming the head/shape/describe results align with expectations

In [None]:
import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder. Set the INSTACART_PROJECT_PATH environment variable to run this notebook
# on another machine; when it is unset or empty, the original local folder is used.
path = os.environ.get('INSTACART_PROJECT_PATH') or (
    r'/Users/danielmccormick/Desktop/CareerFoundry/Python/08-23 Instacart Basket Analysis'
)

In [3]:
# Load the previously combined orders/products dataframe from the prepared-data folder.
# NOTE(review): loading a pickle can execute arbitrary code — only read files you created yourself.
df_ords_prods = pd.read_pickle(
    os.path.join(path, 'IC_Data', 'IC_prepared_data', 'orders_products_combined.pkl')
)

In [4]:
# Preview the first rows to confirm the expected columns loaded.
df_ords_prods.head()

Unnamed: 0,order_id,user_id,user_order_count,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [5]:
# 4. Confirming that the imported pickle file dataframe has the expected shape of (32434489, 10).
df_ords_prods.shape

(32434489, 10)

In [6]:
# Summary statistics of the numeric columns, used as a sanity check on counts and value ranges.
df_ords_prods.describe()

Unnamed: 0,order_id,user_id,user_order_count,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0,32434490.0,30356420.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,102937.2,17.14205,2.738818,13.42498,11.10407,25576.34,8.351076,0.5896975
std,987300.7,59466.48,17.53504,2.090049,4.246365,8.778914,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,855943.0,51421.0,5.0,1.0,10.0,5.0,13530.0,3.0,0.0
50%,1711048.0,102611.0,11.0,3.0,13.0,8.0,25256.0,6.0,1.0
75%,2565514.0,154391.0,24.0,5.0,16.0,15.0,37935.0,11.0,1.0
max,3421083.0,206209.0,99.0,6.0,23.0,30.0,49688.0,145.0,1.0


# 5. Merging orders_products_combined with products data set

In [7]:
# Load the cleaned products dataframe; the first CSV column is used as the index.
# The head/shape/describe cells that follow confirm the result aligns with expectations.
df_prods = pd.read_csv(
    os.path.join(path, 'IC_Data', 'IC_prepared_data', 'products_clean.csv'),
    index_col=[0],
)

In [8]:
# Preview the first rows of the products dataframe.
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [9]:
# Row and column count of the products dataframe.
df_prods.shape

(49672, 5)

In [10]:
# Summary statistics of the numeric product columns.
# NOTE(review): the output below shows a max price of 99999.0 against a 75th percentile of 11.1,
# which looks like a placeholder or outlier value — confirm it is handled in a later step.
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [11]:
# Preparing df_ords_prods for the merge with df_prods.
# Running
#     df_ords_prods.merge(df_prods, on = ['product_id'], indicator = True, how = 'outer')
# directly raises "ValueError: Cannot use name of an existing column for indicator column".
# The df_ords_prods dataframe created earlier in the exercise already has a _merge column
# (every row says "both"), and indicator = True needs to create a column with that same name.
# For that reason the old _merge column is dropped here before running the merge with
# indicator = True.
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a second run is a
# no-op instead of raising a KeyError.
df_ords_prods = df_ords_prods.drop(columns='_merge', errors='ignore')

In [12]:
# Confirm that the _merge column is no longer present.
df_ords_prods.head()

Unnamed: 0,order_id,user_id,user_order_count,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,196,1,0
1,2539329,1,1,2,8,,14084,2,0
2,2539329,1,1,2,8,,12427,3,0
3,2539329,1,1,2,8,,26088,4,0
4,2539329,1,1,2,8,,26405,5,0


In [13]:
# Full outer join on product_id. indicator=True adds a _merge column recording whether each row
# came from the left dataframe only, the right dataframe only, or both.
df_merged_total = pd.merge(
    df_ords_prods,
    df_prods,
    how='outer',
    on=['product_id'],
    indicator=True,
)

# 6. Running checks to confirm that the merge was fully successful

In [14]:
# Checking the results of the full outer merge shows that not every row found a match. About
# 30k rows were only in the left dataframe (df_ords_prods), and 11 rows were only in the
# right dataframe (df_prods).
# NOTE(review): df_ords_prods has 32,434,489 rows, so if product_id were unique in df_prods,
# "both" would be 32,434,489 - 30,200 = 32,404,289. The reported 32,404,859 is 570 higher,
# which suggests some product_id values appear more than once in df_prods — TODO confirm.
df_merged_total['_merge'].value_counts()

both          32404859
left_only        30200
right_only          11
Name: _merge, dtype: int64

In [15]:
# Because of the unmatched rows found above, the merge is redone as an inner join so that the
# new dataframe only keeps rows whose product_id is present in both dataframes.
df_merged_total = pd.merge(
    df_ords_prods,
    df_prods,
    how='inner',
    on=['product_id'],
    indicator=True,
)

In [16]:
# Re-check the indicator column: after the inner join only "both" rows should remain.
df_merged_total['_merge'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

In [17]:
# Now we have a single dataframe that contains only the rows that matched across the two dataframes.
# This data is now ready to export and use in future analyses.

# 7. Exporting the data frame as a pickle file. 

In [18]:
# Save the merged dataframe in the prepared-data folder for use in future analyses.
# NOTE(review): the exported dataframe still contains the _merge indicator column, so a later
# merge with indicator=True would hit the same "existing column" ValueError described earlier —
# consider whether it should be dropped before export.
df_merged_total.to_pickle(
    os.path.join(path, 'IC_Data', 'IC_prepared_data', 'orders_products_merged.pkl')
)