# 4.5 Data Consistency

### 01. Importing libraries

In [43]:
# import libraries
import pandas as pd
import numpy as np
import os


In [44]:
# shortcuts for importing dataframes
path = r"C:\Users\Asus\Documents\DA CareerFoundry\Part II - Data Immersion\Python - Anaconda\August 2025 Instacart Basket Analysis\02 Data"

In [45]:
# import products.csv
df_prods = pd.read_csv(os.path.join(path, 'Original Data', 'products.csv'), index_col = False)

In [46]:
# import orders_wrangled.csv
df_ords = pd.read_csv(os.path.join(path, 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

### 02. Consistency Checks

In [47]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [48]:
# create a dataframe
df_test = pd.DataFrame()

In [49]:
# create a mixed type column
df_test['mix'] = ['a', 'b', 1, True]

The first command, df_test = pd.DataFrame(), creates a new dataframe called df_test. The second command, df_test['mix'] = ['a', 'b', 1, True], creates a new column, mix, within df_test and fills it with numeric, string, and boolean values

In [50]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [51]:
# The function for checking whether a dataframe contains any mixed-type columns is as follows:
for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)


In [52]:
# with chatgpt help:
for col in df_test.columns:
    weird = (df_test[col].map(type) != type(df_test[col].iloc[0]))
    if weird.any():
        print(col)

mix


In [53]:
# converting a column type into another:
df_test['mix'] = df_test['mix'].astype('str')

Note that this will convert a column’s data type from numeric to string, but there may be times where you need to go in the opposite direction, as well—from string to numeric. To change this, simply update the str within the astype() function to int64 or whichever numeric data type you want to use.

With your mixed-type data addressed, you’re one step closer to completing your consistency checks. Next up—missing values!

### 03. Missing Values

It isn’t rare to find missing values in your data. As you learned in previous Achievements, missing values can occur for two reasons: 1) data corruption, or 2) they were never recorded in the first place. It’s important that you investigate and address any missing values in your data when conducting an analysis in Python. Similar to mixed-type columns, they can break your functions and throw errors in your analytical procedures.

In [54]:
# finding missing values
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

isnull() searches and finds empty values. sum() sums them. If the latter was not used, it would show up as boolean (true, false).

There's 16 missing values in product name.

In [55]:
# creating a data frame to see where those 16 missing values are:
df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [56]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


#### Ways of handling missing data

    1. Create a new variable that acts like a flag based on the missing value.
    2. Impute the value with the mean or median of the column (if the variable is numeric).
    3. Remove or filter out the missing data.


We'll remove the missing data. Here's how:

In [57]:
# number of rows and columns
df_prods.shape

(49693, 5)

In [58]:
# 16 rows in almost 50 000 is less than 1%. Let's remove them by creating a new dataframe. This time it's used " == False" because what's wanted is the non missing values.
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [59]:
df_prods_clean.shape

(49677, 5)

In [60]:
# another way of dropping the values would be: df_prods.dropna(subset = ['product_name'], inplace = True)

### 04. Finding Duplicates

In [61]:
# The following command will look for full duplicates within your dataframe:

df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [62]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


#### Addressing duplicates

In [63]:
df_prods_clean.shape

(49677, 5)

In [64]:
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [65]:
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [66]:
# less five rows.

### 05. Exporting dataframe

In [67]:
df_prods_clean_no_dups.to_csv(os.path.join(path, 'Prepared Data', 'cleaner_df_prods.csv'), index = False)

## Task Submission

#### Question:
Run the df.describe() function on your df_ords dataframe. Using your new knowledge about how to interpret the output of this function, share in a markdown cell whether anything about the data looks off or should be investigated further.

    Tip: Keep an eye on min and max values!


In [68]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### Answer:
The max of 'orders_day_of_week' is 6 and min is zero, that is equivalent to 7 days of the week.
nothing more to add. The most popular day for purchase is Tuesday (2,77 ~ 3).
Users purchase 2 itens at 1:30pm, on average. 

#### Question:
Check for mixed-type data in your df_ords dataframe.

If you find mixed-type data, fix it. The column in question should contain observations of a single data type.

In [69]:
df_ords.dtypes

order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [70]:
# checking for mixed data type columns:
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

# for - loop -> for these elements, do this. weird is a new variable, working as a text, that checks wheter the dara types within the column are consistent.


  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)


According to tutor, the box "is just a warning that some updates will be implemented in pandas and that .applymap will not be supported in the future. It should be replaced by .map". 
It's not an error, it's a warning.

The result of the function was none/Null, therefore there are no mixed type columns.

In [71]:
# another way of checking it:
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB


#### Questions:
Run a check for missing values in your df_ords dataframe.

    In a markdown cell, report your findings and propose an explanation for any missing values you find.

Address the missing values using an appropriate method.

    In a markdown cell, explain why you used your method of choice.


In [72]:
df_ords.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

#### Answer:
There are 206209 empty values in days_since_prior_order. Deleting them or imputing are not good resolutions, since these might be connected with first time purchases. Creating a new variable that acts like a flag based on the missing value is the best resolution. In the meantime, let's do a subset dataframe with the missing values:

In [73]:
df_ords_missing_values = df_ords[df_ords['days_since_prior_order'].isnull() == True]

In [74]:
df_ords_missing_values

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


I decided to leave these values as they are, since they stand for first time orders. 

#### Questions:
Run a check for duplicate values in your df_ords data.

    In a markdown cell, report your findings and propose an explanation for any duplicate values you find.

Address the duplicates using an appropriate method.

    In a markdown cell, explain why you used your method of choice.


In [75]:
# creating a data set for possible duplicates:
df_ords_dups = df_ords[df_ords.duplicated()]

In [76]:
# printing them:
df_ords_dups

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [77]:
# or, doing it otherway:
# this does not create a new row or dataframe, it's an index.
# it won't be saved unless I command it to.
df_ords[df_ords.duplicated()]

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


#### Answer:
No duplicates exist, apparently.

#### Exporting csv files

In [78]:
# exporting clean_df_prods; done it already above, before task submission
# df_prods_clean_no_dups.to_csv(os.path.join(path, 'Prepared Data', 'cleaner_df_prods.csv'), index = False)

Extra column "Unnamed: 0" Why it happens

When you save a DataFrame with to_csv() (or to_excel()), by default pandas also saves the index as the first column.

If you later reload that file with pd.read_csv() without telling pandas that the first column is the index, it imports the index as a normal column, named "Unnamed: 0".

So "Unnamed: 0" is basically just your old index.

In [79]:
# exporting clean_df_ords
df_ords.to_csv(os.path.join(path, 'Prepared Data', 'clean_df_ords.csv'), index = False)