In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the raw recipe sample from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="data/raw/recipes_sample.csv")

In [3]:
# Print every column with its non-null count and dtype to spot columns that have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    5000 non-null   int64  
 1   Name                        5000 non-null   object 
 2   AuthorId                    5000 non-null   int64  
 3   AuthorName                  5000 non-null   object 
 4   CookTime                    4183 non-null   object 
 5   PrepTime                    5000 non-null   object 
 6   TotalTime                   5000 non-null   object 
 7   DatePublished               5000 non-null   object 
 8   Description                 5000 non-null   object 
 9   Images                      5000 non-null   object 
 10  RecipeCategory              4998 non-null   object 
 11  Keywords                    4846 non-null   object 
 12  RecipeIngredientQuantities  5000 non-null   object 
 13  RecipeIngredientParts       5000 

In [4]:
# Drop the columns that are not required for the analysis
clean_df = df.drop(
    columns=["AuthorName", "AuthorId", "DatePublished", "CookTime", "PrepTime"]
)

In [5]:
# Fill missing ratings and review counts with 0 (i.e. treat them as "0 reviews").
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form raises a FutureWarning on
# pandas 2.x and leaves clean_df unchanged once Copy-on-Write is enabled.
clean_df = clean_df.fillna({'AggregatedRating': 0, 'ReviewCount': 0})

In [6]:
# Take the first space-separated token of 'RecipeYield' (e.g. "12" from
# "12 cookies"). Only this token is needed, so no temporary 'Unit' column is
# created; this also avoids the two-column assignment failing when no yield
# contains a space.
yield_first_token = clean_df['RecipeYield'].str.split(' ', n=1).str[0]

# Pull the first run of digits out of that token and store it as float
# (float, not int, because rows without a numeric yield stay NaN).
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
clean_df['Serving'] = yield_first_token.str.extract(r'(\d+)', expand=False).astype(float)

# The original 'RecipeYield' column is no longer needed
clean_df = clean_df.drop(columns=['RecipeYield'])

In [7]:
# Re-inspect columns, non-null counts and dtypes after the cleaning steps above
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    5000 non-null   int64  
 1   Name                        5000 non-null   object 
 2   TotalTime                   5000 non-null   object 
 3   Description                 5000 non-null   object 
 4   Images                      5000 non-null   object 
 5   RecipeCategory              4998 non-null   object 
 6   Keywords                    4846 non-null   object 
 7   RecipeIngredientQuantities  5000 non-null   object 
 8   RecipeIngredientParts       5000 non-null   object 
 9   AggregatedRating            5000 non-null   float64
 10  ReviewCount                 5000 non-null   float64
 11  Calories                    5000 non-null   float64
 12  FatContent                  5000 non-null   float64
 13  SaturatedFatContent         5000 

In [8]:
# Where "RecipeServings" is missing, fall back to the row's "Serving" value
# (rows where both are missing stay NA), then store the column as float.
clean_df['RecipeServings'] = (
    clean_df['RecipeServings']
    .fillna(clean_df['Serving'])
    .astype(float)
)

In [9]:
# Fill missing "Keywords" with the same row's "RecipeIngredientParts" value.
# Assigned back rather than using fillna(inplace=True) on the column
# selection, which warns on pandas 2.x and has no effect on clean_df under
# Copy-on-Write.
clean_df['Keywords'] = clean_df['Keywords'].fillna(clean_df['RecipeIngredientParts'])

In [10]:
# Remove the helper "Serving" column from the frame
clean_df = clean_df.drop(columns="Serving")

In [11]:
# Default any still-missing "RecipeServings" to 1.
# Assigned back rather than using fillna(inplace=True) on the column
# selection, which warns on pandas 2.x and has no effect on clean_df under
# Copy-on-Write.
clean_df['RecipeServings'] = clean_df['RecipeServings'].fillna(1)

In [12]:
# Keep only the rows that have a "RecipeCategory", then print the summary again
clean_df = clean_df.loc[clean_df['RecipeCategory'].notna()]
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4998 entries, 0 to 4999
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    4998 non-null   int64  
 1   Name                        4998 non-null   object 
 2   TotalTime                   4998 non-null   object 
 3   Description                 4998 non-null   object 
 4   Images                      4998 non-null   object 
 5   RecipeCategory              4998 non-null   object 
 6   Keywords                    4998 non-null   object 
 7   RecipeIngredientQuantities  4998 non-null   object 
 8   RecipeIngredientParts       4998 non-null   object 
 9   AggregatedRating            4998 non-null   float64
 10  ReviewCount                 4998 non-null   float64
 11  Calories                    4998 non-null   float64
 12  FatContent                  4998 non-null   float64
 13  SaturatedFatContent         4998 non-n

In [14]:
# Save the cleaned data to a new CSV file, without writing the index column.
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise the save fails with OSError when data/clean/ is absent.
clean_csv_path = Path("data/clean/recipes.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(clean_csv_path, index=False)