# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os

# Importing Data

In [13]:
# Project root for this analysis; every data path in the notebook is built from it.
# NOTE(review): this is an absolute, machine-specific location.
path = r'/Users/docopeland/04 Instacart Basket Analysis'

# Assemble the location of the raw products file, then load it into a DataFrame.
products_csv = os.path.join(path, '02 Data', 'original data', 'products.csv')
products = pd.read_csv(products_csv)

# Data Observations

In [14]:
# preview the first rows to confirm the columns loaded as expected
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [15]:
# column dtypes and non-null counts; in the output below product_name is the only
# column with fewer non-null values (49677) than there are rows (49693)
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49693 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49693 non-null  int64  
 1   product_name   49677 non-null  object 
 2   aisle_id       49693 non-null  int64  
 3   department_id  49693 non-null  int64  
 4   prices         49693 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.9+ MB


In [16]:
# summary statistics for the numeric columns
# NOTE(review): the prices maximum (99999.0) is far above the 75th percentile (11.2) in the
# output below -- looks like an outlier or placeholder value; confirm before relying on prices
products.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


# Data Wrangling

In [17]:
# convert the three identifier columns (product_id, aisle_id, department_id) to strings
for id_col in ('product_id', 'aisle_id', 'department_id'):
  products[id_col] = products[id_col].astype('str')

In [18]:
#there's no need to rename or delete any columns

# Consistency Checks

In [8]:
# checking for mixed data types: print the name of every column that holds values of
# more than one Python type (for example str mixed with the float used for NaN)
# Series.map is used instead of DataFrame.applymap, which newer pandas releases deprecate;
# counting distinct types also avoids indexing the first row, so an empty frame is safe
for col in products.columns.tolist():
  value_types = products[col].map(type)
  if value_types.nunique() > 1:
    print(col)

product_name


In [20]:
# product_name is stored with the generic object dtype ('O' in the output below)
products['product_name'].dtype

dtype('O')

In [21]:
# the mixed types seem to come from the NaN values; counting with NaN included
# (dropna = False) shows 16 of them in the output below
products['product_name'].value_counts(dropna = False)

NaN                                                     16
Black House Coffee Roasty Stout Beer                     2
Adore Forever Body Wash                                  2
Gluten Free Organic Peanut Butter & Chocolate Cereal     2
Fiber 4g Gummy Dietary Supplement                        2
                                                        ..
Kosher Dill Mini Pickles                                 1
Chili Rice Scooter                                       1
All Natural Chicken Apple Sausage                        1
Quartered Artichoke Hearts                               1
Fresh Foaming Cleanser                                   1
Name: product_name, Length: 49673, dtype: int64

In [36]:
# count missing values per column; only product_name has any -- the same 16 NaNs seen before
products.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [41]:
# show the rows whose product_name is missing
missing_name = products['product_name'].isna()
products[missing_name]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [42]:
#removing the rows where product_name is NaN
# (a stray trailing backtick previously made this line a syntax error)
products = products[products['product_name'].notna()]

In [43]:
# confirm that no null values are left in any column; none remain
products.isna().sum()

product_id       0
product_name     0
aisle_id         0
department_id    0
prices           0
dtype: int64

In [44]:
# rechecking for mixed data types since the values flagged before were the NaNs;
# a column is printed only if it still holds values of more than one Python type
# Series.map is used instead of DataFrame.applymap, which newer pandas releases deprecate;
# counting distinct types also avoids indexing the first row, so an empty frame is safe
for col in products.columns.tolist():
  value_types = products[col].map(type)
  if value_types.nunique() > 1:
    print(col)

In [45]:
# count the rows flagged by duplicated() across all columns; there are 5
products.duplicated().sum()

5

In [51]:
# keep only the rows that duplicated() does not flag as repeats
is_repeat = products.duplicated()
products = products[~is_repeat]

# Checking Data Again

In [52]:
# re-inspect the structure after cleaning: the output below shows 49672 rows with no nulls,
# and the three id columns now have object dtype
products.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49672 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49672 non-null  object 
 1   product_name   49672 non-null  object 
 2   aisle_id       49672 non-null  object 
 3   department_id  49672 non-null  object 
 4   prices         49672 non-null  float64
dtypes: float64(1), object(4)
memory usage: 2.3+ MB


In [53]:
# summary statistics after cleaning; prices is the only numeric column left
# NOTE(review): the prices maximum is still 99999.0 in the output below -- the cleaning
# steps in this notebook did not address it; confirm whether that value is valid
products.describe()

Unnamed: 0,prices
count,49672.0
mean,9.993282
std,453.615536
min,1.0
25%,4.1
50%,7.1
75%,11.1
max,99999.0


# Exporting Data

In [54]:
# write the cleaned table to the prepared-data folder
# (no index argument is passed, so pandas' default of also writing the row index applies)
clean_csv = os.path.join(path, '02 Data', 'prepared data', 'products_clean.csv')
products.to_csv(clean_csv)