# CONTENT LIST
01. Importing Libraries
02. Importing Data
03. Data Consistency Checks
04. Beginning of task 5

# 1. Importing Libraries

In [46]:
# Importing libraries
import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [47]:
# Creating common folder path: project root that the read_csv / to_csv calls in this notebook join with 'Data' subfolders
# NOTE(review): absolute path to one user's Windows desktop — the notebook will not run on another machine without editing this line
path = r'C:\Users\Mukund\Desktop\Career Foundry\Instacart_Grocery_Basket_Analysis_May_2021'

In [48]:
path

'C:\\Users\\Mukund\\Desktop\\Career Foundry\\Instacart_Grocery_Basket_Analysis_May_2021'

In [49]:
# Load the wrangled orders table from the Prepared_data folder under the project root
df_ords = pd.read_csv(
    os.path.join(path, 'Data', 'Prepared_data', 'orders_wrangled.csv'),
    index_col=False,
)

In [50]:
# Load the raw products table from the Original_data folder under the project root
df_prods = pd.read_csv(
    os.path.join(path, 'Data', 'Original_data', 'products.csv'),
    index_col=False,
)

In [10]:
# Creating test dataframe
df_test = pd.DataFrame()

In [11]:
# Creating mixed type column
df_test['mix'] = ['a', 'b', 1, True]

In [12]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [13]:
# Checking for mixed-type data in the test dataframe.
# A column is reported when at least one value has a different Python type
# than the first value of that column.
for col in df_test.columns.tolist():
    # Series.map(type) gives the Python type of every value; it replaces
    # DataFrame.applymap, which is deprecated since pandas 2.1.
    weird = df_test[col].map(type) != type(df_test[col].iloc[0])
    # .any() answers "is there a mismatch?" without building a filtered copy of the frame.
    if weird.any():
        print(col)

mix


# 3. Data Consistency Checks

In [14]:
# Checking for number of missing values in products
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [15]:
# Rows of the products table whose product_name is missing
df_nan = df_prods.loc[df_prods['product_name'].isna()]

In [15]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [16]:
df_prods.shape

(49693, 5)

In [17]:
# Keep only the products that have a product_name
df_prods_clean = df_prods.loc[df_prods['product_name'].notna()]

In [18]:
df_prods_clean.shape

(49677, 5)

In [19]:
# Rows that repeat an earlier row in every column
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [20]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [21]:
# Dropping rows that are full duplicates of another row (pandas keeps the first occurrence by default)
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [22]:
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [74]:
# Write the cleaned, de-duplicated products table to the Prepared_data folder (row index omitted)
df_prods_clean_no_dups.to_csv(
    os.path.join(path, 'Data', 'Prepared_data', 'Products_Checked.csv'),
    index=False,
)

# 4. Beginning of task 5

Step 1) If you haven’t performed the consistency checks covered in this Exercise on your df_prods dataframe, do so now. - Done above

Step 2) Run the df.describe() function on your df_prods dataframe. Using your new knowledge about how to interpret the output of this function, share in a markdown cell whether anything about the data looks off or should be investigated further.

In [32]:
# Checking for the data using describe function
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


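The maximum of the prices column is 99999, which is far above the 75th percentile (11.1) and is most likely a mistake or human error made while entering the data; it should be investigated further. Also, the count of the product_id column (49672) does not match the maximum of product_id (49688).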

Step 3) Check for mixed-type data in your df_ords dataframe.

In [51]:
# Checking for mixed-type data in orders.
# A column is reported when at least one value has a different Python type
# than the first value of that column.
for col in df_ords.columns.tolist():
    # Series.map(type) gives the Python type of every value; it replaces
    # DataFrame.applymap, which is deprecated since pandas 2.1.
    mixed_type = df_ords[col].map(type) != type(df_ords[col].iloc[0])
    # .any() avoids materialising a filtered copy of the orders frame just to test for emptiness.
    if mixed_type.any():
        print(col)

No mixed-type data found

Step 5) Run a check for missing values in your df_ords dataframe

In [64]:
# Checking for missing values in orders data
df_ords.isnull().sum()

order_id                  0
user_id                   0
order_number              0
orders_day_of_week        0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

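Only one column, 'days_since_prior_order', contains missing data. This is expected when a customer is ordering for the first time, because there is no prior order to count days from. (Note: the output above shows 0 because this cell was re-run after the missing values were filled in Step 6.)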

Step 6) Address the missing values using an appropriate method.
In a markdown cell, explain why you used your method of choice.

In [57]:
# Getting the number of records for orders data
df_ords.shape

(3421083, 6)

First of all, the missing values are not errors: a customer's first order has no prior order, so no number of days can be recorded in that column. I would first confirm with the client whether to remove these rows or to impute the mean, the median, or some other value. For now, I am going to impute 0 for the empty records.

In [58]:
# Getting the rows where the 'days_since_prior_order' column IS missing
# (the isnull() mask is already boolean, so no comparison with True is needed)
df_nan_ords = df_ords[df_ords['days_since_prior_order'].isnull()]

In [59]:
df_nan_ords

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


In [60]:
# Updating empty records with 0 value.
# The filled column is assigned back explicitly: calling fillna(..., inplace=True)
# on a selected column is chained assignment, which raises a FutureWarning in
# pandas 2.2 and no longer modifies df_ords once Copy-on-Write is enabled (pandas 3.0).
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(0)

In [65]:
df_ords.shape

(3421083, 6)

In [67]:
df_ords.isnull().sum()

order_id                  0
user_id                   0
order_number              0
orders_day_of_week        0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

In [None]:
# No missing data found after updating

Step 7) Run a check for duplicate values in your df_ords data.

In [68]:
# Collect the order rows that repeat an earlier row in every column
df_ords_dups = df_ords.loc[df_ords.duplicated()]

In [69]:
df_ords_dups

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


No duplicates were found in the orders data.

In [72]:
# Write the checked orders table to the Prepared_data folder (row index omitted)
df_ords.to_csv(
    os.path.join(path, 'Data', 'Prepared_data', 'Orders_Checked.csv'),
    index=False,
)

In [73]:
df_ords

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,0.0
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0
