# Data Checks on: customers.csv

#### Contents
- Missing Data Check
- Duplicate Check
- Mixed-type Data Check
- Other inconsistency checks, if needed
- Summary of changes to the original dataframe

## Importing libraries and data

In [1]:
#importing the libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Read the raw customers file from the project's 'Original Data' folder.
path = r'C:\Users\chris\Documents\Instacart Basket Analysis'
customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_cust = pd.read_csv(customers_csv)

## Viewing the dataframe

In [3]:
# Dimensions of the raw customers frame as (rows, columns).
df_cust.shape

(206209, 10)

In [4]:
# Column labels as they arrive from the CSV.
# Note the inconsistent naming ('First Name', 'Surnam', 'STATE' vs. snake_case).
df_cust.columns

Index(['user_id', 'First Name', 'Surnam', 'Gender', 'STATE', 'Age',
       'date_joined', 'n_dependants', 'fam_status', 'income'],
      dtype='object')

In [5]:
# Preview of the raw frame (pandas truncates the display of large frames).
# NOTE(review): the rendered output contains customer names; clear it before sharing.
df_cust

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...,...,...
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799


In [6]:
# Data type of each column; 'date_joined' is read in as object (text), not datetime.
df_cust.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [7]:
# Summary statistics of the numeric columns, rounded to one decimal.
# user_id is still an integer here, so it is summarised too; those figures carry no meaning.
df_cust.describe().round(1)

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.5,1.5,94632.9
std,59527.6,18.5,1.1,42473.8
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [8]:
# Number of customers per 'Gender' value.
df_cust.value_counts('Gender')

Gender
Male      104067
Female    102142
dtype: int64

In [9]:
# Number of customers per 'fam_status' value.
df_cust.value_counts('fam_status')

fam_status
married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
dtype: int64

### Missing Data Check

In [10]:
# Count of missing (NaN) values in each column.
df_cust.isna().sum()

user_id             0
First Name      11259
Surnam              0
Gender              0
STATE               0
Age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

##### Missing values in first names are ok. We actually don't need the column, since user_id is enough to tell all users apart.

### Duplicated Data Check

In [11]:
# Flag rows that repeat an earlier row in every column, then list them.
is_repeat = df_cust.duplicated()
df_dups = df_cust.loc[is_repeat]
df_dups

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income


##### No duplicates.

### Mixed-type Data Check

In [12]:
# Checking for mixed-type data: a column is flagged when its cells hold more
# than one Python type (for example str values alongside float NaN).
# Series.map(type) is used instead of DataFrame.applymap, which pandas 2.1+
# deprecates; it also avoids building a one-column frame for every column and
# does not fail on an empty frame.
for col in df_cust.columns:
    if df_cust[col].map(type).nunique() > 1:
        print(col)

First Name


In [13]:
# Cross-check: column dtypes. 'First Name' is object, a dtype that can hold values of different Python types.
df_cust.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

##### The mixed data type comes from the missing values in 'First Name'. Since we don't need the column, no steps are taken here.

## Summary: What needs to be addressed:

##### Dropping the 'First Name' and 'Surnam' columns for data privacy
##### changing id columns into strings
##### changing data type to datetime in column 'date_joined'

## Changing and Exporting the dataframe >> customers_wrangled.pkl

In [14]:
# Store user_id as text: it is an identifier, not a quantity to do arithmetic on.
user_id_text = df_cust['user_id'].astype(str)
df_cust['user_id'] = user_id_text

In [15]:
# Changing 'date_joined' from object to datetime.
# The format is given explicitly so ambiguous strings such as '4/1/2020' are
# always read month-first (the same reading pandas applies by default) and any
# value that does not fit month/day/year raises instead of being guessed.
df_cust['date_joined'] = pd.to_datetime(df_cust['date_joined'], format='%m/%d/%Y')

In [16]:
# Confirm the conversions: user_id should now be object and date_joined datetime64[ns].
df_cust.dtypes

user_id                 object
First Name              object
Surnam                  object
Gender                  object
STATE                   object
Age                      int64
date_joined     datetime64[ns]
n_dependants             int64
fam_status              object
income                   int64
dtype: object

In [17]:
# Build the wrangled frame without the two name columns; df_cust itself is left untouched.
name_columns = ['First Name', 'Surnam']
df_cust_clean = df_cust.drop(name_columns, axis=1)
df_cust_clean

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Female,Maryland,26,2017-01-01,1,married,40374
...,...,...,...,...,...,...,...,...
206204,168073,Female,North Carolina,44,2020-04-01,1,married,148828
206205,49635,Male,Hawaii,62,2020-04-01,3,married,168639
206206,135902,Female,Missouri,66,2020-04-01,2,married,53374
206207,81095,Female,California,27,2020-04-01,1,married,99799


### Exporting the dataframe

In [18]:
# Write the wrangled customers frame to the 'Prepared Data' folder as a pickle.
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'customers_wrangled.pkl')
df_cust_clean.to_pickle(export_file)