# Import libraries

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib as plt
import scipy

In [2]:
fpath = r'C:\Users\Mei\Instacart Basket Analysis\02 Data'

In [3]:
df_cust = pd.read_csv(os.path.join(fpath,'Original data','customers.csv'), index_col = False)

In [4]:
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


## 1. Wrangle the dataframe

In [7]:
df_cust.rename(columns = {'Surnam': 'surname'}, inplace = True)
df_cust.rename(columns = {'First Name': 'first_name'}, inplace = True)
df_cust.rename(columns = {'Gender':'gender'}, inplace = True)
df_cust.rename(columns = {'STATE': 'state'}, inplace = True)
df_cust.rename(columns = {'Age':'age'}, inplace = True)
df_cust.rename(columns = {'fam_status':'family_status'}, inplace = True)

In [8]:
df_cust.head(5)

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


## 2. Consistency check

In [9]:
for col in df_cust.columns.tolist():
    ck = (df_cust[[col]].applymap(type) != df_cust[[col]].iloc[0].apply(type)).any(axis =1)
    if len(df_cust[ck]) >0:
        print(ck)

0         False
1         False
2         False
3         False
4         False
          ...  
206204    False
206205    False
206206    False
206207    False
206208    False
Length: 206209, dtype: bool


### Conclusions: no mixed types

## 3. Missing records check

In [10]:
df_cust_nan = df_cust.isnull().sum()

In [11]:
df_cust_nan

user_id              0
first_name       11259
surname              0
gender               0
state                0
age                  0
date_joined          0
n_dependants         0
family_status        0
income               0
dtype: int64

In [12]:
df_cust.shape

(206209, 10)

In [13]:
11259/206209

0.054599944716282996

In [14]:
# delete the null values as the percentage of missing data is about 5% 
df_cust.dropna(inplace = True)

In [15]:
df_cust.shape

(194950, 10)

### Conclusions: the null data have been deleted 

## 4. Duplicate records check

In [16]:
df_cust_dup = df_cust[df_cust.duplicated()]

In [17]:
df_cust_dup

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,family_status,income


### Conclusions: no duplicated records in the file

In [18]:
df_cust.to_csv(os.path.join(fpath,'Prepared data','customers_clean.csv'), index = False)

## 5. Combine the dataset of customer with the merged file

In [19]:
df_merge = pd.read_pickle(os.path.join(fpath,'Prepared data','Entire_updated_merged_upd1.pkl'))

In [20]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32406041 entries, 0 to 32406040
Data columns (total 20 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   product_id              int64  
 1   product_name            object 
 2   aisle_id                int64  
 3   department_id           int64  
 4   prices                  float64
 5   order_id                int64  
 6   user_id                 int64  
 7   eval_setting            object 
 8   order_number            int64  
 9   order_day_of_week       int64  
 10  order_hour_of_day       int64  
 11  days_since_prior_order  float64
 12  add_to_cart_order       int64  
 13  reordered               int64  
 14  price_range             object 
 15  busy days               object 
 16  busiest_period_of_day   object 
 17  loyalty_flag            object 
 18  spender_flag            object 
 19  order_frequency         object 
dtypes: float64(2), int64(10), object(8)
memory usage: 5.1+ GB


### Due to the super big size of the merged file, i will only take the necessary columns which are needed for the next calculations in this exercise. 

#### 1) Subset of merged file

In [7]:
list_merge = ['product_id','department_id','prices','user_id','order_number','order_day_of_week','order_hour_of_day','loyalty_flag','spender_flag']

In [10]:
df_merge_sub = df_merge[list_merge]

#### 2) Subset of the df_cust

In [11]:
list_cust = ['user_id','gender','age','n_dependants','family_status','income','state']

In [14]:
df_cust_sub = df_cust[list_cust]

#### 3) Merge the 'customer' and merged file

In [15]:
df_merged_cust_merge = pd.merge(df_merge_sub,df_cust_sub,on = 'user_id')

In [16]:
df_merged_cust_merge.to_pickle(os.path.join(fpath,'Prepared data','merged_cusotmer.pkl'))