# Customer Data for Instacart

### Contents: 
#### Clean Customer Data
#### Drop unnecessary columns
#### Check for duplicates
#### Merge customer data with main dataframe

## 1.0 Import Libraries

In [1]:
#Import standard libraries
import pandas as pd
import numpy as np
import os

In [3]:
#Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 2.0 Import Data

In [4]:
#Create path for importing data
project_path = r'C:\Users\Owner\Documents\Career Foundry\Instacart Basket Analysis'
#Import customer data
df_customers = pd.read_csv(os.path.join(project_path, '02 Data', '02 01 Originals', 'customers.csv'))

In [5]:
df_orders_products = pd.read_pickle(os.path.join(project_path, '02 Data', '02 02 Prepared Data', 'orders_products_flagged.pkl'),)

## 3.0 Wrangle and Clean Data

### 3.1 Wrangle Data

In [10]:
df_customers.head()

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


#### 3.1.1 Frequency of values

In [11]:
df_customers['fam_status'].value_counts(dropna = False)

married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: fam_status, dtype: int64

In [12]:
df_customers['Gender'].value_counts(dropna = False)

Male      104067
Female    102142
Name: Gender, dtype: int64

In [13]:
df_customers['STATE'].value_counts(dropna = False)

Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts           4043
Michigan                4043
New Jersey              4043
Kansas                  4043
South Dakota            4043
Minnesota               4043
Tennessee               4043
New York                4043
Washington              4043
Louisiana               4043
Montana       

In [14]:
df_customers['Age'].value_counts(dropna = False )

19    3329
55    3317
51    3317
56    3306
32    3305
      ... 
65    3145
25    3127
66    3114
50    3102
36    3101
Name: Age, Length: 64, dtype: int64

In [15]:
df_customers['n_dependants'].value_counts(dropna = False)

0    51602
3    51594
1    51531
2    51482
Name: n_dependants, dtype: int64

In [16]:
df_customers['income'].value_counts(dropna = False)

57192     10
95891     10
95710     10
97532      9
98675      9
          ..
73141      1
71524      1
74408      1
44780      1
148828     1
Name: income, Length: 108012, dtype: int64

In [17]:
df_customers['date_joined'].value_counts(dropna = False)

9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
             ... 
9/1/2018      141
1/22/2018     140
11/24/2017    139
7/18/2019     138
8/6/2018      128
Name: date_joined, Length: 1187, dtype: int64

#### 3.1.2 Change Column Names

In [18]:
#change Age to match cases of all columns
df_customers.rename(columns={'Age':'age'}, inplace=True)

In [19]:
#change Gender to match cases of all columns
df_customers.rename(columns={'Gender':'gender'}, inplace=True)

In [20]:
#change STATE to match case
df_customers.rename(columns={'STATE':'state'}, inplace=True)

In [21]:
#change n_dependants to logical name
df_customers.rename(columns={'n_dependants':'num_dependants'}, inplace=True)

In [22]:
#Review column name changes
df_customers.head(10)

Unnamed: 0,user_id,gender,state,age,date_joined,num_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374
5,133128,Female,Kentucky,43,1/1/2017,2,married,49643
6,152052,Male,Montana,20,1/1/2017,0,single,61746
7,168851,Male,South Carolina,30,1/1/2017,0,single,63712
8,69965,Male,Texas,47,1/1/2017,1,married,162432
9,82820,Male,Virginia,26,1/1/2017,2,married,32072


#### 3.1.3 Drop Columns Not Needed

In [None]:
#Drop First Name column
df_customers = df_customers.drop('First Name', axis=1)

In [8]:
#Drop Surnam column
df_customers = df_customers.drop('Surnam', axis=1)

In [24]:
df_customers.head(10)

Unnamed: 0,user_id,gender,state,age,date_joined,num_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374
5,133128,Female,Kentucky,43,1/1/2017,2,married,49643
6,152052,Male,Montana,20,1/1/2017,0,single,61746
7,168851,Male,South Carolina,30,1/1/2017,0,single,63712
8,69965,Male,Texas,47,1/1/2017,1,married,162432
9,82820,Male,Virginia,26,1/1/2017,2,married,32072


### 3.2 Clean Data

#### 3.2.1 Check for nulls

Based on frequency tables already ran, we know that there are no null values in gender, state, num_dependants or fam_status.

In [26]:
#Check for nulls in customers
df_customers.isnull().sum()

user_id           0
gender            0
state             0
age               0
date_joined       0
num_dependants    0
fam_status        0
income            0
dtype: int64

#### 3.2.2 Check for duplicate records

In [28]:
#Check for duplicates
df_dups = df_customers[df_customers.duplicated()]
df_dups

Unnamed: 0,user_id,gender,state,age,date_joined,num_dependants,fam_status,income


No values in the df_dups data frame shows that there are no duplicate values.

#### 3.2.3 Check for mixed data types

In [25]:
for col in df_customers.columns.tolist():
  weird = (df_customers[[col]].applymap(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_customers[weird]) > 0:
    print (col)
    
#No result shows that there are no columns with mixed data types

### 4.0 Merge Data

In [None]:
#verify id column is available

In [30]:
df_orders_products.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range', 'busiest_day', 'grouped_days',
       'busiest_period_of_day', 'max_orders', 'loyalty_flag', 'mean_price',
       'spending_flag', 'median_days_since', 'frequency_flag'],
      dtype='object')

In [31]:
#check data type of user_id
type(df_customers['user_id'])

pandas.core.series.Series

In [32]:
type(df_orders_products['user_id'])

pandas.core.series.Series

In [33]:
#merge customer and orders_products data frame
df_cust_orders_prods = df_orders_products.merge(df_customers, on = 'user_id', indicator = True)

In [34]:
df_cust_orders_prods.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,median_days_since,frequency_flag,gender,state,age,date_joined,num_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,11.10408,196,1,0,Soda,...,20.0,Regular Customer,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,20.0,Regular Customer,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,20.0,Regular Customer,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,20.0,Regular Customer,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,20.0,Regular Customer,Female,Alabama,31,2/17/2019,3,married,40423,both


## 5.0 Export Data

In [36]:
#Export merged data as pickle
df_cust_orders_prods.to_pickle(os.path.join(project_path, '02 Data','02 02 Prepared Data', 'orders_products_customers.pkl'))