# Project - "What's on the Menu?" pandas Data Cleaning

In [1]:
import numpy as np
import pandas as pd

# Load Data

Read the csv file that has been cleaned using OpenRefine (OR).

In [2]:
# Location of the OpenRefine-cleaned dish export, relative to the working directory.
dish_csv_path = "./Dish_OR-clean.csv"
dish = pd.read_csv(dish_csv_path)

In [3]:
# Preview the first five rows to sanity-check the load.
dish.head(n=5)

Unnamed: 0,id,name,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
0,1,"Consomme, Printaniere Royal",8,8,1897,1927,0.2,0.4
1,2,Chicken Gumbo,111,117,1895,1960,0.1,0.8
2,3,Tomato Aux Croutons,13,13,1893,1917,0.25,0.4
3,4,Onion Au Gratin,41,41,1900,1971,0.25,1.0
4,5,St Emilion,66,68,1881,1981,0.0,18.0


# Data Summary

In [4]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() appends a stray "None" line.
dish.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423397 entries, 0 to 423396
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              423397 non-null  int64  
 1   name            423397 non-null  object 
 2   menus_appeared  423397 non-null  int64  
 3   times_appeared  423397 non-null  int64  
 4   first_appeared  423397 non-null  int64  
 5   last_appeared   423397 non-null  int64  
 6   lowest_price    394297 non-null  float64
 7   highest_price   394297 non-null  float64
dtypes: float64(2), int64(5), object(1)
memory usage: 25.8+ MB
None


# Data Profiling

Perform data profiling.

## Profile Dish ID

In [5]:
# Profile IDs: summary statistics for the dish identifier column.
# (`include` is ignored by Series.describe, so it is not passed.)
id_column = dish["id"]
profile_id = id_column.describe()
print(f"IDs:\n{profile_id}")

IDs:
count    423397.000000
mean     264456.594900
std      150489.070889
min           1.000000
25%      132374.000000
50%      269636.000000
75%      397135.000000
max      515677.000000
Name: id, dtype: float64


In [6]:
# Tally missing vs. present values in the id column.
id_is_missing = dish['id'].isna()
num_null_id = id_is_missing.sum()
num_non_null_id = (~id_is_missing).sum()
print(f"Number of NULL IDs: {num_null_id}")
print(f"Number of NON-NULL IDs: {num_non_null_id}")

Number of NULL IDs: 0
Number of NON-NULL IDs: 423397


## Profile Dish Name

In [7]:
# Profile Dish Name: count, number of unique names, most frequent name and
# how often it occurs.
# (`include` is ignored by Series.describe, so it is not passed.)
profile_name = dish["name"].describe()
# Label the output with the column actually being profiled.
print(f"Names:\n{profile_name}")

IDs:
count                   423397
unique                  343863
top       Fried Sweet Potatoes
freq                        48
Name: name, dtype: object


In [8]:
# Tally missing vs. present values in the name column.
name_is_missing = dish['name'].isna()
num_null_name = name_is_missing.sum()
num_non_null_name = (~name_is_missing).sum()
print(f"Number of NULL Name: {num_null_name}")
print(f"Number of NON-NULL Name: {num_non_null_name}")

Number of NULL Name: 0
Number of NON-NULL Name: 423397


## Profile Dish times_appeared

In [9]:
# Summary statistics for the times_appeared column.
# NOTE: the variable `times_appeared` holds the describe() summary, not the column.
# (`include` is ignored by Series.describe, so it is not passed.)
times_appeared = dish["times_appeared"].describe()
# Label the output with the column actually being profiled.
print(f"times_appeared:\n{times_appeared}")

IDs:
count    423397.000000
mean          3.146794
std          29.962122
min          -6.000000
25%           1.000000
50%           1.000000
75%           1.000000
max        8484.000000
Name: times_appeared, dtype: float64


In [10]:
# Tally missing vs. present values in times_appeared.
times_appeared_missing = dish['times_appeared'].isna()
num_null = times_appeared_missing.sum()
num_non_null = (~times_appeared_missing).sum()
print(f"Number of NULL: {num_null}")
print(f"Number of NON-NULL: {num_non_null}")

Number of NULL: 0
Number of NON-NULL: 423397


## Profile Dish first_appeared

In [11]:
# Summary statistics for the first_appeared column.
# NOTE: the variable `first_appeared` holds the describe() summary, not the column.
# (`include` is ignored by Series.describe, so it is not passed.)
first_appeared = dish["first_appeared"].describe()
# Label the output with the column actually being profiled.
print(f"first_appeared:\n{first_appeared}")

IDs:
count    423397.000000
mean       1675.514555
std         651.321461
min           0.000000
25%        1900.000000
50%        1914.000000
75%        1949.000000
max        2928.000000
Name: first_appeared, dtype: float64


In [12]:
# Tally missing vs. present values in first_appeared.
first_appeared_missing = dish['first_appeared'].isna()
num_null = first_appeared_missing.sum()
num_non_null = (~first_appeared_missing).sum()
print(f"Number of NULL: {num_null}")
print(f"Number of NON-NULL: {num_non_null}")

Number of NULL: 0
Number of NON-NULL: 423397


## Profile Dish last_appeared

In [13]:
# Summary statistics for the last_appeared column.
# NOTE: the variable `last_appeared` holds the describe() summary, not the column.
# (`include` is ignored by Series.describe, so it is not passed.)
last_appeared = dish["last_appeared"].describe()
# Label the output with the column actually being profiled.
print(f"last_appeared:\n{last_appeared}")

IDs:
count    423397.000000
mean       1679.299738
std         651.934580
min           0.000000
25%        1900.000000
50%        1917.000000
75%        1955.000000
max        2928.000000
Name: last_appeared, dtype: float64


In [14]:
# Tally missing vs. present values in last_appeared.
last_appeared_missing = dish['last_appeared'].isna()
num_null = last_appeared_missing.sum()
num_non_null = (~last_appeared_missing).sum()
print(f"Number of NULL: {num_null}")
print(f"Number of NON-NULL: {num_non_null}")

Number of NULL: 0
Number of NON-NULL: 423397


# Remove Duplicates

In [15]:
# Build a boolean mask marking rows whose id repeats an earlier row's id,
# then keep only those rows.
is_repeat_id = dish['id'].duplicated()
duplicate_rows = dish[is_repeat_id]

# Show the repeated-id rows (an empty frame means every id is unique).
print(duplicate_rows)

Empty DataFrame
Columns: [id, name, menus_appeared, times_appeared, first_appeared, last_appeared, lowest_price, highest_price]
Index: []


In [16]:
# Remove duplicates based on the id column. The .copy() yields an independent
# frame so the column assignments in later cells do not operate on a filtered
# selection of the original (which can raise SettingWithCopyWarning).
no_dup_dish = dish.drop_duplicates(subset='id').copy()
# Carry the de-duplicated frame forward: all later cleaning cells and the CSV
# export operate on `dish`, so without this rebinding the de-duplication
# would never reach the written file.
dish = no_dup_dish

In [17]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() appends a stray "None" line.
no_dup_dish.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 423397 entries, 0 to 423396
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              423397 non-null  int64  
 1   name            423397 non-null  object 
 2   menus_appeared  423397 non-null  int64  
 3   times_appeared  423397 non-null  int64  
 4   first_appeared  423397 non-null  int64  
 5   last_appeared   423397 non-null  int64  
 6   lowest_price    394297 non-null  float64
 7   highest_price   394297 non-null  float64
dtypes: float64(2), int64(5), object(1)
memory usage: 29.1+ MB
None


# Fix times_appeared negative values

In [18]:
# Keep every times_appeared value that is >= 0 and substitute 0 for the rest.
appearance_counts = dish['times_appeared']
count_is_valid = appearance_counts.ge(0)
dish['times_appeared'] = appearance_counts.where(count_is_valid, other=0)

In [19]:
# Re-profile times_appeared to confirm the minimum is no longer negative.
# NOTE: the variable `times_appeared` holds the describe() summary, not the column.
# (`include` is ignored by Series.describe, so it is not passed.)
times_appeared = dish["times_appeared"].describe()
# Label the output with the column actually being profiled.
print(f"times_appeared:\n{times_appeared}")

IDs:
count    423397.000000
mean          3.146872
std          29.962110
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max        8484.000000
Name: times_appeared, dtype: float64


# Fix first_appeared dates

In [20]:
# Force first_appeared into the closed range [1840, 2008]: values below 1840
# become 1840 and values above 2008 become 2008.
# NOTE(review): 1840/2008 are presumably the earliest/latest plausible menu
# years — confirm. Any 0 entries are also rewritten to 1840 by this step.
first_appeared_years = dish['first_appeared']
dish['first_appeared'] = first_appeared_years.clip(1840, 2008)

In [21]:
# Re-profile first_appeared to confirm min/max now sit inside the clipped range.
# NOTE: the variable `first_appeared` holds the describe() summary, not the column.
# (`include` is ignored by Series.describe, so it is not passed.)
first_appeared = dish["first_appeared"].describe()
# Label the output with the column actually being profiled.
print(f"first_appeared:\n{first_appeared}")

IDs:
count    423397.000000
mean       1916.638954
std          40.496916
min        1840.000000
25%        1900.000000
50%        1914.000000
75%        1949.000000
max        2008.000000
Name: first_appeared, dtype: float64


# Fix last_appeared dates

In [22]:
# Force last_appeared into the closed range [1840, 2008]: values below 1840
# become 1840 and values above 2008 become 2008.
# NOTE(review): 1840/2008 are presumably the earliest/latest plausible menu
# years — confirm. Any 0 entries are also rewritten to 1840 by this step.
last_appeared_years = dish['last_appeared']
dish['last_appeared'] = last_appeared_years.clip(1840, 2008)

In [23]:
# Re-profile last_appeared to confirm min/max now sit inside the clipped range.
# NOTE: the variable `last_appeared` holds the describe() summary, not the column.
# (`include` is ignored by Series.describe, so it is not passed.)
last_appeared = dish["last_appeared"].describe()
# Label the output with the column actually being profiled.
print(f"last_appeared:\n{last_appeared}")

IDs:
count    423397.000000
mean       1919.315399
std          41.204832
min        1840.000000
25%        1900.000000
50%        1917.000000
75%        1955.000000
max        2008.000000
Name: last_appeared, dtype: float64


# Write the clean csv file

In [24]:
# Write the cleaned dataframe to a CSV file in the working directory;
# index=False leaves the row index out of the file.
# NOTE(review): the file name previously read 'Dish_clearn.csv' (typo for
# "clean") — update any downstream reader that still expects the old name.
dish.to_csv('Dish_clean.csv', index=False)