# Combining & Exporting Data Part 1 - w/ orders_products_prior cleaned

### This script contains the following points:

#### 01. Importing libraries
#### 02. Importing orders_products_prior.csv (original) and orders_checked.pkl (cleaned) data
#### 03. Examining orders_products_prior.csv (original) and orders_checked.pkl (cleaned) data
#### 04. Data Wrangling
* Value Counts for the orders_products_prior dataframe columns
* Changing the orders_products_prior dataframe order_id, product_id and reordered column data type

#### 05. Data Consistency Checks
#### 06. Duplicates in the df_ords_prior dataframe
#### 07. Merging the df_ords_prior and df_ords_clean dataframes 
* Merging The Instacart Data

#### 08. Exporting the df_ords_prior and df_ords_clean merged dataframe as pkl file

## 01. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 02. Importing orders_products_prior.csv (original) and orders_checked.pkl (cleaned) data

In [2]:
# Store the main project folder in the variable path (an absolute, machine-specific location) and display it
# The import and export cells build their file paths from this value with os.path.join()
path = r'/Users/elsaekevall/Jupyter_Notebook/Career_Foundry/06_2022_Instacart_Basket_Analysis/'
path

'/Users/elsaekevall/Jupyter_Notebook/Career_Foundry/06_2022_Instacart_Basket_Analysis/'

In [3]:
# Build both file locations with os.path.join(), then load them as pandas dataframes:
# orders_products_prior.csv (original data) is read with read_csv and orders_checked.pkl (prepared data) with read_pickle
prior_csv_path = os.path.join(path, '02_Data', '02_1_Original_Data', 'orders_products_prior.csv')
clean_pkl_path = os.path.join(path, '02_Data', '02_2_Prepared_Data', 'orders_checked.pkl')
df_ords_prior = pd.read_csv(prior_csv_path)
df_ords_clean = pd.read_pickle(clean_pkl_path)

## 03. Examining orders_products_prior.csv (original) and orders_checked.pkl (cleaned) data

In [4]:
# Display df_ords_prior; the notebook truncates a large dataframe to its first and last rows
df_ords_prior

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [5]:
# View the shape of df_ords_prior as (number of rows, number of columns)
df_ords_prior.shape

(32434489, 4)

In [6]:
# Display df_ords_clean; the notebook truncates a large dataframe to its first and last rows
df_ords_clean

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False
...,...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0,False
3421079,1854736,206209,11,4,10,30.0,False
3421080,626363,206209,12,1,12,18.0,False
3421081,2977660,206209,13,1,12,7.0,False


In [7]:
# Print the df_ords_clean summary: index range, column names, data types and memory usage
df_ords_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 object 
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
 6   first_order             bool   
dtypes: bool(1), float64(1), int64(3), object(2)
memory usage: 159.9+ MB


In [8]:
# View the shape of df_ords_clean as (number of rows, number of columns)
df_ords_clean.shape

(3421083, 7)

## 04. Data Wrangling
### Value Counts for the orders_products_prior dataframe columns

In [9]:
# View value counts for the order_id column (how many rows each order has); dropna = False also counts missing values
df_ords_prior['order_id'].value_counts(dropna = False)

1564244    145
790903     137
61355      127
2970392    121
2069920    116
          ... 
188978       1
1341758      1
1522006      1
1600231      1
2343880      1
Name: order_id, Length: 3214874, dtype: int64

In [10]:
# View value counts for the product_id column (how many rows each product has); dropna = False also counts missing values
df_ords_prior['product_id'].value_counts(dropna = False)

24852    472565
13176    379450
21137    264683
21903    241921
47209    213584
          ...  
37660         1
42235         1
31333         1
3117          1
10806         1
Name: product_id, Length: 49677, dtype: int64

In [11]:
# View value counts for the add_to_cart_order column; dropna = False also counts missing values
df_ords_prior['add_to_cart_order'].value_counts(dropna = False)

1      3214874
2      3058126
3      2871133
4      2664106
5      2442025
        ...   
141          1
142          1
143          1
144          1
145          1
Name: add_to_cart_order, Length: 145, dtype: int64

In [12]:
# View value counts for the reordered column (still stored as 0/1 integers at this point); dropna = False also counts missing values
df_ords_prior['reordered'].value_counts(dropna = False)

1    19126536
0    13307953
Name: reordered, dtype: int64

### Changing the orders_products_prior dataframe order_id, product_id and reordered column data type

In [13]:
# Print the df_ords_prior summary (column data types and memory usage) before any data types are changed
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB


In [14]:
# Change the order_id column to string (it is an identifier, not a quantity) and view the data types
df_ords_prior['order_id'] = df_ords_prior['order_id'].astype('str')
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   order_id           object
 1   product_id         int64 
 2   add_to_cart_order  int64 
 3   reordered          int64 
dtypes: int64(3), object(1)
memory usage: 989.8+ MB


In [15]:
# Change the product_id column to string (it is an identifier, not a quantity) and view the data types
df_ords_prior['product_id'] = df_ords_prior['product_id'].astype('str')
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   order_id           object
 1   product_id         object
 2   add_to_cart_order  int64 
 3   reordered          int64 
dtypes: int64(2), object(2)
memory usage: 989.8+ MB


In [16]:
# Change the reordered column from 0/1 integers to boolean (0 -> False, 1 -> True) and view the data types
df_ords_prior['reordered'] = df_ords_prior['reordered'].astype(bool)
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   order_id           object
 1   product_id         object
 2   add_to_cart_order  int64 
 3   reordered          bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 773.3+ MB


In [17]:
# View value counts for the reordered column again to confirm the True/False counts match the earlier 1/0 counts
df_ords_prior['reordered'].value_counts(dropna = False)

True     19126536
False    13307953
Name: reordered, dtype: int64

* **The order_id and product_id columns were converted to strings and the reordered column to boolean; add_to_cart_order was left as an integer**

## 05. Data Consistency Checks

In [18]:
# Investigate the accuracy of the columns in the df_ords_prior dataframe with descriptive statistics
# Only add_to_cart_order is summarised because it is the only numeric column left after the data type changes
df_ords_prior.describe()

Unnamed: 0,add_to_cart_order
count,32434490.0
mean,8.351076
std,7.126671
min,1.0
25%,3.0
50%,6.0
75%,11.0
max,145.0


#### Mixed Data Type

In [19]:
# Check every column of df_ords_prior for mixed data types and print the name of any column that has them
# Series.map(type) returns the Python type of each value, so more than one distinct type means mixed data
# (this replaces DataFrame.applymap, which is deprecated in pandas 2.1+, and also works on an empty dataframe)
for col in df_ords_prior.columns:
    if df_ords_prior[col].map(type).nunique() > 1:
        print(col)

* **There is no mixed-type data in the df_ords_prior dataframe**

In [20]:
# Check for null values in df_ords_prior by counting the missing values in each column
df_ords_prior.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

* **There are no null values in the df_ords_prior dataframe**

## 06. Duplicates in the df_ords_prior dataframe

In [21]:
# Look for fully duplicated rows within the df_ords_prior dataframe and view them
# duplicated() marks every row that repeats an earlier row across all columns
dup_mask = df_ords_prior.duplicated()
df_dups_ords_prior = df_ords_prior[dup_mask]
df_dups_ords_prior

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


* **There are no duplicates in the df_ords_prior dataframe**

## 07. Merging the df_ords_prior and df_ords_clean dataframes 

Questions to ask before choosing a combination method include:

* How many rows and columns do the dataframes to be combined contain? Do they have the same number of columns?
* Do the dataframes contain information on the same subject? Do they share a common column or columns?
* Should the combined dataframe be long format or wide format?
* Is a full match expected after combining the dataframes?
* If not, what type of join should you use? Which part of the data should you keep in the final dataframe?

### Merging The Instacart Data

In [26]:
# Create the df_merged_large dataframe by joining df_ords_clean (left) with df_ords_prior (right) on order_id
# An "inner" join (the pandas default) keeps only order_ids present in both dataframes; indicator = True adds a _merge column
# View the first five rows
df_merged_large = pd.merge(df_ords_clean, df_ords_prior, how = 'inner', on = 'order_id', indicator = True)
df_merged_large.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,True,196,1,False,both
1,2539329,1,1,2,8,,True,14084,2,False,both
2,2539329,1,1,2,8,,True,12427,3,False,both
3,2539329,1,1,2,8,,True,26088,4,False,both
4,2539329,1,1,2,8,,True,26405,5,False,both


In [27]:
# View the shape of df_merged_large as (number of rows, number of columns)
df_merged_large.shape

(32434489, 11)

In [28]:
# Use value_counts() function to sum up all the values in the "_merge" column
# Note: because df_merged_large comes from an inner join, every row is necessarily "both"; the outer join is what shows whether there is a full match
df_merged_large['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

In [29]:
# Create the df_merged_large_outer dataframe by repeating the merge as an "outer" join, which keeps all observations
# from both dataframes (including order_ids found in only one of them), and view the first five rows
df_merged_large_outer = pd.merge(df_ords_clean, df_ords_prior, how = 'outer', on = 'order_id', indicator = True)
df_merged_large_outer.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,True,196,1.0,False,both
1,2539329,1,1,2,8,,True,14084,2.0,False,both
2,2539329,1,1,2,8,,True,12427,3.0,False,both
3,2539329,1,1,2,8,,True,26088,4.0,False,both
4,2539329,1,1,2,8,,True,26405,5.0,False,both


In [30]:
# Use value_counts() function to sum up all the values in the "_merge" column to see if there is a full match
# left_only rows are orders in df_ords_clean with no product rows in df_ords_prior; right_only would be product rows with no matching order
df_merged_large_outer['_merge'].value_counts()

both          32434489
left_only       206209
right_only           0
Name: _merge, dtype: int64

## 08. Exporting the df_ords_prior and df_ords_clean merged dataframe as pkl file

In [31]:
# Export the inner-merged df_merged_large dataframe to the prepared data folder as a pkl file
# NOTE(review): as far as this notebook shows, the _merge indicator column is still part of df_merged_large and is
# saved with it - confirm whether later scripts expect it or whether it should be dropped before exporting
export_path = os.path.join(path, '02_Data', '02_2_Prepared_Data', 'orders_products_combined.pkl')
df_merged_large.to_pickle(export_path)