# Products EDA

This file contains the names of the products with their corresponding `product_id`. Furthermore, the aisle ID, department ID and price of each product are included.

## 01 Import

In [1]:
# Import libraries
import numpy as np
import pandas as pd 
import os

In [2]:
# Assign folder path
# The project root can be overridden through the INSTACART_PATH environment variable so
# the notebook also runs on other machines; the original location stays the default.
path = os.environ.get('INSTACART_PATH', r'/Users/peanutcookie/instacart-book/')

In [3]:
# Import .csv file
# Reads products.csv from the '_csv-raw' folder under `path`.
# index_col = False tells pandas not to use the first column of the file as the index.
df = pd.read_csv(os.path.join(path, '_csv-raw', 'products.csv'), index_col = False)

## 02 Dataframe exploration

In [4]:
# Dataframe shape as (number of rows, number of columns)
df.shape

(49693, 5)

In [5]:
# Return dataframe head (first 5 rows)
df.head(5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [6]:
# Return dataframe tail (last 5 rows)
df.tail(5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7
49692,49688,Fresh Foaming Cleanser,73,11,13.5


In [7]:
# Print the dataframe's column names, non-null counts and the data types they store
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49693 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49693 non-null  int64  
 1   product_name   49677 non-null  object 
 2   aisle_id       49693 non-null  int64  
 3   department_id  49693 non-null  int64  
 4   prices         49693 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.9+ MB


## 03 Dataframe Cleansing

### Mixed values


In [8]:
# Check for mixed values
# A column is reported as mixed when at least one of its values has a different
# Python type than the value in the column's first row.
print("Mixed data")
for col in df.columns.tolist():
    # Series.map is used instead of DataFrame.applymap, which is deprecated since pandas 2.1
    value_types = df[col].map(type)
    # Guard against an empty dataframe, where there is no first row to compare against
    is_mixed = len(df) > 0 and bool((value_types != value_types.iloc[0]).any())
    if is_mixed:
        print(col + ": True")
    else:
        print(col + ": False")

Mixed data
product_id: False
product_name: True
aisle_id: False
department_id: False
prices: False


`product_name` contains mixed values. This is addressed during the cleansing steps below (see Data types and Missing Values).

### Data types

In [9]:
# Return the data type of each dataframe column
df.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

In [10]:
# Correcting data types - assigning the str data type to the identifier columns
# product_id, aisle_id and department_id to receive correct statistics records
# (identifiers should not be summarised as numeric quantities)
df = df.astype({"product_id":'str', "aisle_id":'str', "department_id":'str'})
# Re-inspect the column data types after the cast
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49693 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49693 non-null  object 
 1   product_name   49677 non-null  object 
 2   aisle_id       49693 non-null  object 
 3   department_id  49693 non-null  object 
 4   prices         49693 non-null  float64
dtypes: float64(1), object(4)
memory usage: 1.9+ MB


### Duplicates

In [11]:
# Collect every row that is a full repeat of an earlier row
# (with the default settings the first occurrence itself is not flagged)
duplicated_rows = df.loc[df.duplicated()]

In [12]:
# Display the duplicated rows collected in the previous cell
duplicated_rows

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [13]:
# Remove fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()

In [14]:
# Dataframe check for duplicates - the result should contain 0 rows after the drop
df[df.duplicated()].shape

(0, 5)

### Missing Values

In [15]:
# Search for missing values
# Select the rows whose product_name is missing and record how many there are
nan_product_name = df.loc[df['product_name'].isna()]
nan_shape = nan_product_name.shape

print("Data Frame dimensions", nan_shape)
print("Table with missing product names:")
# Show the affected rows ordered by index (ascending is the default)
nan_product_name.sort_index()

Data Frame dimensions (16, 5)
Table with missing product names:


Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [16]:
# Dropping missing values
# Rows without a product_name are removed. The result is reassigned rather than using
# inplace=True, which matches the other cleansing cells (drop_duplicates, drop).
df = df.dropna(subset = ['product_name'])

In [17]:
# Dataframe check for missing values - the result should contain 0 rows after the drop
df[pd.isnull(df.product_name)].shape

(0, 5)

### Dropping columns

In [18]:
# Remove the aisle_id column from the dataframe
df = df.drop(columns='aisle_id')

## 04 Export check

In [19]:
# Final dataframe shape (rows, columns) before export
df.shape

(49672, 4)

In [20]:
# Final column data types before export
df.dtypes

product_id        object
product_name      object
department_id     object
prices           float64
dtype: object

## 05 Export

In [21]:
# Save the cleaned dataframe as products.pkl in the '_database' folder under `path`
df.to_pickle(os.path.join(path, '_database', 'products.pkl'))