# 2 IC Data Consistency Checks - products

### This script contains the following points
#### 01 Import Libraries
#### 02 Import Data
#### 03 First Look at Data
#### 04 Consistency Checks
#### 05 Export Data

# 01 Import Libraries

In [9]:
# Import Libraries

import pandas as pd
import numpy as np
import os

# 02 Import Data

In [10]:
# Set the data path
# The project folder can be overridden with the INSTACART_PROJECT_PATH
# environment variable so the notebook can run on another machine without
# editing this cell; when the variable is not set, the original local folder
# is used.

path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\Tina\Desktop\CareerFoundry\Data Analytics Immersion\Instacart Basket Analysis',
)

In [11]:
# Read "wrangled_products.csv" from the "Prepared Data" folder into "prods_wr"
# (index_col = False: no column of the file is used as the dataframe index)

prods_wr = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'wrangled_products.csv'),
    index_col = False,
)

# 03 First Look at Data

In [12]:
# Shape of "prods_wr": (number of rows, number of columns)

prods_wr.shape

(49693, 6)

In [13]:
# First few rows of "prods_wr" (head() without an argument shows the first 5 rows)

prods_wr.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3


In [14]:
# Drop "Unnamed: 0" column
# (presumably the row index written out by an earlier export - it carries no information)
# errors = 'ignore' keeps this cell safe to re-run: once the column is gone,
# dropping it again does nothing instead of raising a KeyError.

prods_wr = prods_wr.drop(columns =['Unnamed: 0'], errors = 'ignore')

In [15]:
# Confirm that the "Unnamed: 0" column is no longer part of "prods_wr"
prods_wr.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [16]:
# Data type of each column in "prods_wr"

prods_wr.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

# 04 Consistency Checks

In [17]:
# Descriptive statistics of the numeric columns of "prods_wr"
# (used here to spot implausible minimum and maximum values)

prods_wr.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


The maximum value for "prices" is $99,999, which is clearly too high.

##### Fix "prices" values

In [18]:
# Show every product whose price is above $100

prods_wr[prods_wr['prices'] > 100]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


In [19]:
# Replace the implausible prices (above $100) with NaN so that they are
# treated as missing values from here on

too_expensive = prods_wr['prices'] > 100
prods_wr.loc[too_expensive, 'prices'] = np.nan

In [20]:
# Check for success by looking for max price value
# (max() skips missing values by default, so the NaN entries are ignored)

prods_wr['prices'].max()

25.0

In [21]:
# Summary statistics of "prices" after the outliers were marked as missing
prods_wr['prices'].describe()

count    49691.000000
mean         7.682268
std          4.200242
min          1.000000
25%          4.100000
50%          7.100000
75%         11.200000
max         25.000000
Name: prices, dtype: float64

### 01 Mixed-Type Data

In [22]:
# Check for mixed-type data
# A column is reported when at least one of its values has a different
# Python type than the value in its first row.

for col in prods_wr.columns:
    value_types = prods_wr[[col]].map(type)
    first_row_type = prods_wr[[col]].iloc[0].apply(type)
    weird = (value_types != first_row_type).any(axis = 1)
    if weird.any():
        print(col)

product_name


In [23]:
# Data type of the "product_name" column flagged by the mixed-type check
prods_wr['product_name'].dtype

dtype('O')

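The column "product_name" seems to have mixed-type data. Checking its data type returns "object" (dtype('O')), the general-purpose pandas dtype: it is used for strings but can also hold other Python objects, so the dtype alone does not rule out mixed types.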

### 02 Missing Values

In [24]:
# Check for missing values in "prods_wr"
# (isnull() flags every missing cell, sum() counts the flags per column)

prods_wr.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            2
dtype: int64

There are 16 missing values in the "product_name" column and 2 missing values in the "prices" column. The latter is due to the fact that I marked the outlier values as missing.

Since the column "product_name" has missing data, it is possible that that is the reason why the column showed up as mixed-type data.

In [25]:
# Keep only the rows of "prods_wr" whose product name is missing

nan = prods_wr.loc[prods_wr['product_name'].isnull()]

In [26]:
# Display the rows with a missing product name
nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [27]:
# Create a subset of the data with no missing values

In [28]:
# Keep only the rows of "prods_wr" whose product name is present

prods_wr_clean = prods_wr.loc[~prods_wr['product_name'].isnull()]

In [36]:
# Check again for mixed-type data, this time in "prods_wr_clean"
# A column is reported when at least one of its values has a different
# Python type than the value in its first row.

for col in prods_wr_clean.columns:
    value_types = prods_wr_clean[[col]].map(type)
    first_row_type = prods_wr_clean[[col]].iloc[0].apply(type)
    weird = (value_types != first_row_type).any(axis = 1)
    if weird.any():
        print(col)

After creating a new dataframe without the rows with missing data, there is no more mixed-type data.

In [29]:
# Check the number of rows (and columns) of "prods_wr_clean"
prods_wr_clean.shape

(49677, 5)

This new dataframe has 16 fewer rows, exactly the number of rows with missing values in the "product_name" column.

### 03 Duplicate Data

In [30]:
# Create a new subset with only duplicates
# The search runs on "prods_wr_clean", the frame that is de-duplicated later in
# this section, so the rows inspected here are the same rows that get removed.
# duplicated() flags every repeat of an earlier, fully identical row.

prods_wr_dups = prods_wr_clean[prods_wr_clean.duplicated()]

In [31]:
# Display the rows flagged as duplicates
prods_wr_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [32]:
# Build the dataframe without duplicates: keep every row of "prods_wr_clean"
# that is not a repeat of an earlier, fully identical row

prods_wr_no_dups = prods_wr_clean.loc[~prods_wr_clean.duplicated()]

In [33]:
# Number of rows (and columns) left after the consistency checks

prods_wr_no_dups.shape

(49672, 5)

# 05 Export Data

In [34]:
# Export the checked dataframe "prods_wr_no_dups" as "checked_products.csv"
# index = False stops pandas from writing the row index as an extra, unnamed
# first column; such a column shows up as "Unnamed: 0" when the file is read
# back in and then has to be dropped again.

prods_wr_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'checked_products.csv'), index = False)