##### Contents:
###### 1. Import Libraries
###### 2. Import customers dataframe
###### 3. Data wrangling
###### 4. Consistency checks
###### 5. Combine dataframes
###### 6. Drop Unnecessary columns
###### 7. Merge dataframe
###### 8. Export dataframe

##### 1. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

##### 2. Import customers dataframe:

In [3]:
path = r'C:\Users\Daniella\ACH4 - Instacart Basket Analysis'

In [7]:
df_customers = pd.read_csv(os.path.join(path, '02. Data', 'Original Data', 'customers.csv'))

In [7]:
# Import ords_prods_merg dataframe
ords_prods_merge = pd.read_pickle(os.path.join(path, '02. Data', 'Prepared Data', 'ords_prods_grouped.pkl'))

##### 3. Data wrangling

In [19]:
df_customers.head()

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,Nº of dependants,Family status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [21]:
df_customers.shape

(206209, 10)

In [11]:
# Renaming columns:
df_customers.rename(columns={'Surnam': 'Surname'}, inplace=True)

In [13]:
df_customers.rename(columns={'STATE': 'State'}, inplace=True)

In [15]:
df_customers.rename(columns={'n_dependants': 'Nº of dependants'}, inplace=True)

In [17]:
df_customers.rename(columns={'fam_status': 'Family status'}, inplace=True)

##### 4. Consistency checks

In [23]:
# Finding missing values:
missing_values = df_customers.isnull().sum()

In [25]:
missing_values

user_id                 0
First Name          11259
Surname                 0
Gender                  0
State                   0
Age                     0
date_joined             0
Nº of dependants        0
Family status           0
income                  0
dtype: int64

In [27]:
# Finding duplicates:
duplicates = df_customers.duplicated()

In [29]:
duplicates

0         False
1         False
2         False
3         False
4         False
          ...  
206204    False
206205    False
206206    False
206207    False
206208    False
Length: 206209, dtype: bool

In [31]:
#  Find the total number of duplicates:
num_duplicates = duplicates.sum()

In [33]:
num_duplicates

0

##### No duplicates found

In [73]:
# Check column types:
df_customers.dtypes

user_id              int32
Gender              object
State               object
Age                  int32
Nº of dependants     int64
Family status       object
income               int32
dtype: object

In [44]:
# Convert df_customers_dask dataframe to int32 and float32
df_customers['user_id'] = df_customers['user_id'].astype('int32')

In [46]:
df_customers['Age'] = df_customers['Age'].astype('int32')

In [48]:
df_customers['income'] = df_customers['income'].astype('int32')

##### 5. Combine dataframes

In [62]:
# Checki data type:
ords_prods_merge.dtypes

order_id                    int32
user_id                     int32
order_number                int32
orders_day_of_the_week      int32
order_hour_of_day           int32
days_since_prior_order    float32
product_id                  int32
add_to_cart_order           int32
reordered                   int32
product_name               object
aisle_id                    int32
department_id               int32
prices                    float32
price_range_loc            object
busiest_day                object
busiest_days               object
busiest_period_of_day      object
max_order                   int32
loyalty_flag               object
average_spend             float32
spender_flag               object
Customer_frequency        float32
frequency_flag             object
dtype: object

##### The main column "user_id" has the same data type in both dataframes

##### 6. Drop Unnecessary columns:

In [65]:
# Drop the 'First Name' column from the previous dataframe:
df_customers = df_customers.drop(columns=['First Name'])

In [67]:
# Drop the 'Surname' column from the previous dataframe:
df_customers = df_customers.drop(columns=['Surname'])

In [71]:
# Drop the 'date_joined' column from the previous dataframe:
df_customers = df_customers.drop(columns=['date_joined'])

In [60]:
# Drop the '_merge' column from the previous dataframe:
ords_prods_merge = ords_prods_merge.drop(columns=['_merge'])

KeyError: "['_merge'] not found in axis"

##### 7. Merge dataframe:

In [77]:
# Perform merge:
ords_prods_customers_merge = ords_prods_merge.merge(df_customers, on ='user_id')

##### 8. Export new dataframe:

In [81]:
ords_prods_customers_merge.to_pickle(os.path.join(path, '02. Data','Prepared Data', 'ords_prods_customers_merged.pkl'))