# 3 IC Merging - customers with merged_orders_products_products

### This script contains the following points
#### 01 Import Libraries
#### 02 Import Data
#### 03 First Look at Data and Downsizing of Data Types
#### 04 Merge
#### 05 Export Data

# 01 Import Libraries

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import os

# 02 Import Data

In [2]:
# Root folder of the project; every data file below is addressed relative to it.
# The raw-string pieces are concatenated at compile time into one Windows path.

path = (
    r'C:\Users\Tina\Desktop\CareerFoundry'
    r'\Data Analytics Immersion\Instacart Basket Analysis'
)

In [3]:
# Load the cleaned customer table ("checked_customers.csv") into "cust".
# index_col=False tells pandas not to use the first CSV column as the index.

cust = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'checked_customers.csv'),
    index_col=False,
)

In [5]:
# Load the previously merged orders/products table from its pickle file.
# NOTE: only unpickle files produced by this project's own earlier steps.

ords_prods_prods = pd.read_pickle(
    os.path.join(
        path, '02 Data', 'Prepared Data', 'merged_orders_products_products.pkl'
    )
)

# 03 First Look at Data and Downsizing of Data Types

##### "cust"

In [6]:
# Shape of "cust": (number of rows, number of columns) right after import

cust.shape

(206209, 8)

In [7]:
# First few rows of "cust" (head() shows 5 rows by default);
# used here to spot leftover columns such as "Unnamed: 0"

cust.head()

Unnamed: 0.1,Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income
0,0,26711,Female,Missouri,48,3,married,165665
1,1,33890,Female,New Mexico,36,0,single,59285
2,2,65803,Male,Idaho,35,2,married,99568
3,3,125935,Female,Iowa,40,0,single,42049
4,4,130797,Female,Maryland,26,1,married,40374


In [8]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV

cust = cust.drop('Unnamed: 0', axis='columns')

In [9]:
# Confirm that "Unnamed: 0" is gone from "cust"
cust.head()

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665
1,33890,Female,New Mexico,36,0,single,59285
2,65803,Male,Idaho,35,2,married,99568
3,125935,Female,Iowa,40,0,single,42049
4,130797,Female,Maryland,26,1,married,40374


In [10]:
# Data types of "cust" before downsizing

cust.dtypes

user_id          int64
gender          object
state           object
age              int64
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [11]:
# Downsize the integer columns of "cust" to smaller dtypes before merging.
# "user_id" becomes int32; "age" and "n_dependants" become int8.

cust = cust.astype({
    'user_id': 'int32',
    'age': 'int8',
    'n_dependants': 'int8',
})

In [12]:
# Check the data types: "user_id" should now be int32,
# "age" and "n_dependants" int8

cust.dtypes

user_id          int32
gender          object
state           object
age               int8
n_dependants      int8
fam_status      object
income           int64
dtype: object

In [13]:
# Check for irregularities due to downsizing: min/max of each numeric
# column should still be plausible (no wrapped-around values)

cust.describe()

Unnamed: 0,user_id,age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


##### "ords_prods_prods"

In [14]:
# Shape of "ords_prods_prods": (number of rows, number of columns)

ords_prods_prods.shape

(32404859, 14)

In [15]:
# First few rows of "ords_prods_prods" (head() shows 5 rows by default)

ords_prods_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both


In [16]:
# Data types of "ords_prods_prods"; the join key "user_id" should have
# the same dtype as "user_id" in "cust"

ords_prods_prods.dtypes

product_id                   int32
product_name                object
aisle_id                      int8
department_id                 int8
prices                     float64
order_id                     int32
user_id                      int32
order_number                  int8
orders_day_of_week            int8
order_hour_of_day             int8
days_since_prior_order     float64
add_to_cart_order            int32
reordered                     int8
_merge                    category
dtype: object

In [17]:
# Remove the "_merge" indicator column that is already present in the
# loaded frame, so the upcoming merge can add a fresh indicator column

ords_prods_prods = ords_prods_prods.drop('_merge', axis='columns')

In [18]:
# Confirm that "_merge" is gone from "ords_prods_prods"
ords_prods_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1


# 04 Merge

In [19]:
# Join customer attributes onto every order/product row via "user_id".
# NOTE: the join type is the default (inner), so only user_ids found in
# both frames survive and the "_merge" indicator can only read "both".

merged = pd.merge(
    cust,
    ords_prods_prods,
    on='user_id',
    indicator=True,
)

In [20]:
# Have a look at the new dataframe: the row count should be compared
# with the row count of "ords_prods_prods" to see whether rows were lost

merged.shape

(32404859, 20)

In [21]:
# First few rows of the merged dataframe (customer columns come first)
merged.head()

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income,product_id,product_name,aisle_id,department_id,prices,order_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,26711,Female,Missouri,48,3,married,165665,196,Soda,77,7,9.0,2543867,5,1,9,30.0,2,0,both
1,26711,Female,Missouri,48,3,married,165665,196,Soda,77,7,9.0,1285508,7,5,15,11.0,1,1,both
2,26711,Female,Missouri,48,3,married,165665,196,Soda,77,7,9.0,2578584,8,1,15,10.0,2,1,both
3,26711,Female,Missouri,48,3,married,165665,6184,Clementines,32,4,4.3,518967,1,2,9,,1,0,both
4,26711,Female,Missouri,48,3,married,165665,6184,Clementines,32,4,4.3,2524893,3,3,11,30.0,2,1,both


# 05 Export Data

In [22]:
# Save the merged dataframe as a pickle in the "Prepared Data" folder
merged.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'merged_all.pkl')
)