# 2 IC Data Consistency Checks - customers

### This notebook contains the following sections
#### 01 Import Libraries
#### 02 Import Data
#### 03 First Look at Data
#### 04 Consistency Checks
#### 05 Export Data

# 01 Import Libraries

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import os

# 02 Import Data

In [2]:
# Set the data path
# The project folder defaults to the original local location; set the
# INSTACART_PROJECT_DIR environment variable to run the notebook on another machine.

path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\Tina\Desktop\CareerFoundry\Data Analytics Immersion\Instacart Basket Analysis')

In [3]:
# Load "wrangled_customers.csv" from the "Prepared Data" folder into "cust_wr"

customers_file = os.path.join(path, '02 Data', 'Prepared Data', 'wrangled_customers.csv')
cust_wr = pd.read_csv(customers_file, index_col = False)

# 03 First Look at Data

In [4]:
# Shape of "cust_wr" as (rows, columns)

cust_wr.shape

(206209, 8)

In [5]:
# Preview the first five rows of "cust_wr" (the default for head())

cust_wr.head()

Unnamed: 0.1,Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income
0,0,26711,Female,Missouri,48,3,married,165665
1,1,33890,Female,New Mexico,36,0,single,59285
2,2,65803,Male,Idaho,35,2,married,99568
3,3,125935,Female,Iowa,40,0,single,42049
4,4,130797,Female,Maryland,26,1,married,40374


In [6]:
# Drop the "Unnamed: 0" column (apparently a row index written by an earlier export)
# errors = 'ignore' lets this cell be re-run without a KeyError once the column is gone.

cust_wr = cust_wr.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [7]:
# Preview "cust_wr" again to confirm that the "Unnamed: 0" column is gone

cust_wr.head()

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income
0,26711,Female,Missouri,48,3,married,165665
1,33890,Female,New Mexico,36,0,single,59285
2,65803,Male,Idaho,35,2,married,99568
3,125935,Female,Iowa,40,0,single,42049
4,130797,Female,Maryland,26,1,married,40374


In [8]:
# Data type of each column in "cust_wr"

cust_wr.dtypes

user_id          int64
gender          object
state           object
age              int64
n_dependants     int64
fam_status      object
income           int64
dtype: object

# 04 Consistency Checks

In [9]:
# Descriptive statistics of the numeric columns in "cust_wr"

cust_wr.describe()

Unnamed: 0,user_id,age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


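No obvious problems were detected: ages range from 18 to 81, the number of dependants from 0 to 3, and all incomes are positive. The maximum income (593,901) is far above the 75th percentile (124,244) and may be worth a closer look in later analysis.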

### 01 Mixed-Type Data

In [10]:
# Check for mixed-type data
# For every column, compare each value's Python type with the type of the
# value in the first row and print the column name if any value differs.

for column_name in cust_wr.columns:
    column_frame = cust_wr[[column_name]]
    first_row_types = column_frame.iloc[0].apply(type)
    has_other_type = (column_frame.map(type) != first_row_types).any(axis = 1)
    if has_other_type.any():
        print(column_name)

No mixed-type data could be found.

### 02 Missing Values

In [11]:
# Count the missing values in each column of "cust_wr"

cust_wr.isnull().sum()

user_id         0
gender          0
state           0
age             0
n_dependants    0
fam_status      0
income          0
dtype: int64

No missing values could be found.

### 03 Duplicate Data

In [13]:
# Flag rows that fully repeat an earlier row, then keep only those rows

is_duplicate_row = cust_wr.duplicated()
cust_wr_dups = cust_wr.loc[is_duplicate_row]

In [14]:
# Display the duplicate rows (an empty result means there are no duplicates)

cust_wr_dups

Unnamed: 0,user_id,gender,state,age,n_dependants,fam_status,income


There is no duplicate data in this data set.

In [15]:
# Shape of "cust_wr" as (rows, columns) after the consistency checks

cust_wr.shape

(206209, 7)

# 05 Export Data

In [16]:
# Export "cust_wr"
# index = False keeps the row index out of the file; writing it would add an
# unnamed column on re-import, like the "Unnamed: 0" column dropped earlier in this notebook.

cust_wr.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'checked_customers.csv'), index = False)