# 01 Import Libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

# 02 Import Data

In [2]:
# Set the project root folder.
# The location can be overridden with the INSTACART_PROJECT_DIR environment
# variable so the notebook can run on another machine; when the variable is
# not set, the original local folder is used.

path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\Tina\Desktop\CareerFoundry\Data Analytics Immersion\Instacart Basket Analysis',
)

In [3]:
# Load the raw "customers" table from the original-data folder.
# index_col=False tells pandas not to use the first column as the index.

customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
customers = pd.read_csv(customers_csv, index_col=False)

# 03 First look at data

In [4]:
# Shape of "customers"

customers.shape

(206209, 10)

In [5]:
# First few rows of "customers"

customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
# Data types of "customers"

customers.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

# 04 Data Wrangling

### 01 Drop Columns

In [4]:
# Remove "First Name", "Surnam" and "date_joined" from "customers"

columns_to_drop = ['First Name', 'Surnam', 'date_joined']
customers = customers.drop(columns=columns_to_drop)

In [5]:
# Show the first row to confirm the three columns are gone
customers.head(1)

Unnamed: 0,user_id,Gender,STATE,Age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665


### 02 Rename Columns

In [9]:
# Rename columns from "customers" to follow the same naming convention

In [5]:
# Create a variable that maps the old column names to their lower-case replacements

old_names = ['Gender', 'STATE', 'Age']
new_names = ['gender', 'state', 'age']
column_names = dict(zip(old_names, new_names))

In [6]:
# Apply the "column_names" mapping; reassigning keeps the same style as the
# drop step instead of mutating the dataframe in place

customers = customers.rename(columns=column_names)

In [7]:
# Show the first row to confirm the renamed columns
customers.head(1)

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665


### 03 Change Data Types

In [8]:
# Cast "user_id" to "int32" so it matches the "user_id" data type in the merged "orders_products" file

customers = customers.astype({'user_id': 'int32'})

In [9]:
# Confirm the new data type of "user_id"
customers['user_id'].dtype

dtype('int32')

In [10]:
# Change data types of "age" and "n_dependants" for less memory uptake.
# pd.to_numeric(..., downcast='integer') picks the smallest signed integer
# type that can hold every value (int8 for the value ranges in this file),
# so the values can never wrap around the way a hard astype('int8') can.
# "income" is deliberately NOT downcast to int8: its values (e.g. 165665)
# are far outside the int8 range of -128..127 and would overflow.

customers['age'] = pd.to_numeric(customers['age'], downcast='integer')
customers['n_dependants'] = pd.to_numeric(customers['n_dependants'], downcast='integer')

# 05 Consistency Checks

In [16]:
# Descriptive statistics of "customers"

customers.describe()

Unnamed: 0,user_id,age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,-0.591696
std,59527.555167,18.480962,1.118433,73.974701
min,1.0,18.0,0.0,-128.0
25%,51553.0,33.0,0.0,-65.0
50%,103105.0,49.0,1.0,-1.0
75%,154657.0,66.0,3.0,64.0
max,206209.0,81.0,3.0,127.0


In [17]:
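# NOTE: the "income" statistics displayed above (min -128, max 127, mean close to 0) span exactly the int8 range.
# They come from an earlier run in which "income" was cast to int8 and overflowed; re-run the notebook on a fresh kernel before relying on them.
# The minimum, maximum and mean values of "user_id", "age" and "n_dependants" seem fine.
# All columns have the same count of values.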

### 01 Missing Values

In [12]:
# Check for missing values in "customers"

customers.isnull().sum()

user_id         0
gender          0
state           0
age             0
n_dependants    0
fam_status      0
income          0
dtype: int64

In [19]:
# There are no missing values.

### 02 Mixed type data

In [20]:
# Check for mixed type data in "customers".
# For every column, map each value to its Python type and count the distinct
# types; more than one distinct type means the column holds mixed data and
# its name is printed. This avoids building a filtered copy of the whole
# dataframe per column and only needs Series.map, which is available in
# older pandas versions as well.

for col in customers.columns:
    value_types = customers[col].map(type)
    if value_types.nunique() > 1:
        print(col)

In [21]:
# There is no mixed type data.

### 03 Duplicate Data

In [22]:
# Collect rows that are exact duplicates of an earlier row

is_duplicate_row = customers.duplicated()
customers_dups = customers[is_duplicate_row]

In [23]:
# Display the duplicated rows (an empty dataframe means none were found)
customers_dups

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income


In [24]:
# There are no duplicate rows.

In [11]:
# Number of rows after consistency check

customers.shape

(206209, 7)

# 06 Merge Data

In [12]:
# Load the merged "orders_products" file that will be joined to "customers"

ords_prods_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_clean.pkl')
ords_prods = pd.read_pickle(ords_prods_pkl)

In [13]:
# Only these columns are needed for the next task; a full merge isn't possible due to memory issues

columns = [
    'order_id',
    'user_id',
    'orders_day_of_week',
    'order_hour_of_day',
    'prices',
    'loyalty_flag',
]

In [14]:
# Keep only the selected columns in a smaller dataframe that will be merged with "customers"

ords_prods_small = ords_prods.loc[:, columns]

In [15]:
# Drop the reference to the big dataframe so its memory can be reclaimed

del ords_prods

In [16]:
# Merge "ords_prods_small" and "customers" on "user_id".
# how='inner' is the pandas default and is spelled out so the join type is
# visible: only users present in both dataframes are kept.
# validate='one_to_many' makes pandas raise a MergeError if a "user_id"
# appears more than once in "customers" (expected to hold one row per user);
# without it a duplicated key would silently multiply the order rows.
# Because this is an inner join, the "_merge" indicator column is "both" on every row.

ords_prods_all = customers.merge(
    ords_prods_small,
    on = 'user_id',
    how = 'inner',
    validate = 'one_to_many',
    indicator = True,
)

In [17]:
# Preview the first rows of the merged dataframe
ords_prods_all.head()

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income,order_id,orders_day_of_week,order_hour_of_day,prices,loyalty_flag,_merge
0,26711,Female,Missouri,48,3,married,165665,2543867,1,9,9.0,New customer,both
1,26711,Female,Missouri,48,3,married,165665,1285508,5,15,9.0,New customer,both
2,26711,Female,Missouri,48,3,married,165665,2578584,1,15,9.0,New customer,both
3,26711,Female,Missouri,48,3,married,165665,423547,2,9,12.6,New customer,both
4,26711,Female,Missouri,48,3,married,165665,2524893,3,11,12.6,New customer,both


# 07 Export dataframe

In [18]:
# Write "ords_prods_all" to the prepared-data folder as a pickle file

export_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_selected.pkl')
ords_prods_all.to_pickle(export_pkl)