# 4.9 Merging the clean customer dataset with orders

### This script contains the following points:

#### 1. Complete finetuning of the ords_prods_merge dataset
#### 2. Drop unnecessary columns
#### 3. Merge the dataframes together
#### 4. Export the dataframes

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
path = r'/Users/nekow/Documents/Instacart Basket Analysis'

In [7]:
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_flagged.pkl'))

## 1. Complete finetuning of the ords_prods_merge dataset

In [8]:
# Dropping unnecessary columns

ords_prods_merge.drop(columns = ['_merge', 'eval_set', 'reordered'])

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,add_to_cart_order,price_range,Busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_prices,spending_flag,median_order_frequency,order_frequency_flag
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,5,Mid-range product,Average Days,Most orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,1,Mid-range product,Average Days,Average orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,...,20,Mid-range product,Busiest days,Average orders,5,New customer,7.930208,Low spender,8.0,Frequent customer
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,10,Mid-range product,Slowest days,Most orders,3,New customer,4.972414,Low spender,9.0,Frequent customer
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,...,11,Mid-range product,Slowest days,Average orders,3,New customer,4.972414,Low spender,9.0,Frequent customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,49688,Fresh Foaming Cleanser,73,11,13.5,1788356,200215,2,0,9,...,27,,Busiest days,Most orders,6,New customer,8.220313,Low spender,7.0,Frequent customer
32404855,49688,Fresh Foaming Cleanser,73,11,13.5,3401313,200377,1,4,11,...,5,,Slowest days,Most orders,4,New customer,7.364516,Low spender,30.0,Non-frequent customer
32404856,49688,Fresh Foaming Cleanser,73,11,13.5,809510,200873,5,3,8,...,12,,Slowest days,Average orders,20,Regular customer,8.500344,Low spender,6.0,Frequent customer
32404857,49688,Fresh Foaming Cleanser,73,11,13.5,2359893,200873,9,3,15,...,11,,Slowest days,Most orders,20,Regular customer,8.500344,Low spender,6.0,Frequent customer


In [9]:
# Filling the missing values in order frequency flag column

ords_prods_merge['order_frequency_flag'].fillna('One-time customers', inplace=True)

In [10]:
ords_prods_merge['user_id'].max()

206209

## 2. Drop unnecessary columns

In [11]:
# Importing wrangled customer dataframe

cust_cleaned = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'cust_cleaned.csv'))

In [12]:
# Dropping unnecessary columns

cust_cleaned.drop(columns = ['Unnamed: 0', 'first_name'])

Unnamed: 0,user_id,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...,...
206204,168073,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rollins,Female,California,27,4/1/2020,1,married,99799


In [13]:
# Merge the columns on the user_id variable

ords_prods_cust = cust_cleaned.merge(ords_prods_merge, on = ['user_id'])

In [14]:
ords_prods_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 34 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   Unnamed: 0              int64   
 1   user_id                 int64   
 2   first_name              object  
 3   last_name               object  
 4   gender                  object  
 5   state                   object  
 6   age                     int64   
 7   date_joined             object  
 8   n_dependants            int64   
 9   fam_status              object  
 10  income                  int64   
 11  product_id              int64   
 12  product_name            object  
 13  aisle_id                int64   
 14  department_id           int64   
 15  prices                  float64 
 16  order_id                int64   
 17  eval_set                object  
 18  order_number            int64   
 19  orders_day_of_week      int64   
 20  order_hour_of_day       int64   
 21  days_s

In [16]:
ords_prods_cust.shape

(32404859, 34)

In [11]:
# Drop unecessary merge columns

insta_clean = ords_prods_cust.drop(columns = ['_merge', 'Unnamed: 0'])

In [12]:
insta_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 32 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   user_id                 int64  
 1   first_name              object 
 2   last_name               object 
 3   gender                  object 
 4   state                   object 
 5   age                     int64  
 6   date_joined             object 
 7   n_dependants            int64  
 8   fam_status              object 
 9   income                  int64  
 10  product_id              int64  
 11  product_name            object 
 12  aisle_id                int64  
 13  department_id           int64  
 14  prices                  float64
 15  order_id                int64  
 16  eval_set                object 
 17  order_number            int64  
 18  orders_day_of_week      int64  
 19  order_hour_of_day       int64  
 20  days_since_prior_order  float64
 21  add_to_cart_order       int64

## 4. Export the dataframe

In [14]:
# Exporting the full dataframe

insta_clean.to_pickle(os.path.join(path,'02 Data','Prepared Data','ords_prods_all.pkl'))