In [1]:
import pandas as pd

In [2]:
# Read the CSV file into a DataFrame
df = pd.read_csv("data/raw/recipes_sample.csv")

In [3]:
# print all columns and verify if any missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    5000 non-null   int64  
 1   Name                        5000 non-null   object 
 2   AuthorId                    5000 non-null   int64  
 3   AuthorName                  5000 non-null   object 
 4   CookTime                    4183 non-null   object 
 5   PrepTime                    5000 non-null   object 
 6   TotalTime                   5000 non-null   object 
 7   DatePublished               5000 non-null   object 
 8   Description                 5000 non-null   object 
 9   Images                      5000 non-null   object 
 10  RecipeCategory              4998 non-null   object 
 11  Keywords                    4846 non-null   object 
 12  RecipeIngredientQuantities  5000 non-null   object 
 13  RecipeIngredientParts       5000 

In [4]:
# Drop all columns that is not required for analysis
clean_df = df.drop(["RecipeId", "AuthorName", "AuthorId", "DatePublished", "CookTime", "PrepTime"], axis=1)

In [5]:
# fill NA in rating with 0 for 0 review
clean_df['AggregatedRating'].fillna(0, inplace=True)
clean_df['ReviewCount'].fillna(0, inplace=True)

In [6]:
# Splitting the 'RecipeYield' column into two new columns
clean_df[['Serving', 'Unit']] = clean_df['RecipeYield'].str.split(' ', n=1, expand=True)

# Converting 'Serving' column to integer
clean_df['Serving'] = clean_df['Serving'].str.extract('(\d+)').astype(float)

# Dropping the original 'RecipeYield' column and the 'Unit' column
clean_df.drop(['RecipeYield', 'Unit'], axis=1, inplace=True)

In [7]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Name                        5000 non-null   object 
 1   TotalTime                   5000 non-null   object 
 2   Description                 5000 non-null   object 
 3   Images                      5000 non-null   object 
 4   RecipeCategory              4998 non-null   object 
 5   Keywords                    4846 non-null   object 
 6   RecipeIngredientQuantities  5000 non-null   object 
 7   RecipeIngredientParts       5000 non-null   object 
 8   AggregatedRating            5000 non-null   float64
 9   ReviewCount                 5000 non-null   float64
 10  Calories                    5000 non-null   float64
 11  FatContent                  5000 non-null   float64
 12  SaturatedFatContent         5000 non-null   float64
 13  CholesterolContent          5000 

In [8]:
# Fill NA in "RecipeServings" with "Serving" value where "RecipeServings" is NA and "Serving" is not null
clean_df.loc[clean_df['RecipeServings'].isna() & clean_df['Serving'].notna(), 'RecipeServings'] = clean_df['Serving']

# Convert 'RecipeServings' column to integer type if necessary
clean_df['RecipeServings'] = clean_df['RecipeServings'].astype(float)

In [9]:
# Fill NA in "Keywords" with values from "RecipeIngredientParts"
clean_df['Keywords'].fillna(clean_df['RecipeIngredientParts'], inplace=True)

In [10]:
clean_df = clean_df.drop(["Serving"], axis=1)

In [11]:
# Fill NA in "RecipeServings" with 1
clean_df['RecipeServings'].fillna(1, inplace=True)

In [12]:
# Remove rows with NA in "RecipeCategory"
clean_df = clean_df.dropna(subset=['RecipeCategory', 'Description', 'RecipeIngredientQuantities'])
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4998 entries, 0 to 4999
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Name                        4998 non-null   object 
 1   TotalTime                   4998 non-null   object 
 2   Description                 4998 non-null   object 
 3   Images                      4998 non-null   object 
 4   RecipeCategory              4998 non-null   object 
 5   Keywords                    4998 non-null   object 
 6   RecipeIngredientQuantities  4998 non-null   object 
 7   RecipeIngredientParts       4998 non-null   object 
 8   AggregatedRating            4998 non-null   float64
 9   ReviewCount                 4998 non-null   float64
 10  Calories                    4998 non-null   float64
 11  FatContent                  4998 non-null   float64
 12  SaturatedFatContent         4998 non-null   float64
 13  CholesterolContent          4998 non-n

In [13]:
# clean_df.head()

# Function to extract hours and minutes from a duration string
def extract_hours_minutes(duration_str):
    # Remove "PT" prefix
    duration_str = duration_str.replace("PT", "")

    # Initialize hours and minutes
    hours = 0
    minutes = 0

    # Split into hours and minutes if present
    if 'H' in duration_str:
        hours, duration_str = duration_str.split('H')
        hours = int(hours)
    if 'M' in duration_str:
        minutes = duration_str.replace('M', '')
        minutes = int(minutes)

    return hours, minutes

# Apply the function to the "TotalTime" column
clean_df['TotalTime_hours'], clean_df['TotalTime_minutes'] = zip(*clean_df['TotalTime'].apply(extract_hours_minutes))
clean_df.head()

In [14]:
# Remove "PT" prefix
clean_df['TotalTime'] = clean_df['TotalTime'].str.replace("PT", "")
clean_df.head()

Unnamed: 0,Name,TotalTime,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,ReviewCount,...,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeInstructions
0,Easy Chicken and Biscuits,30M,This is an easy recipe that can be done in 30 ...,character(0),One Dish Meal,"""< 30 Mins""","c(""1"", ""1"", ""1"", ""1/4"", ""1/4"", ""4"", ""2"", ""1"")","c(""milk"", ""dried thyme leaves"", ""pepper"", ""ham"")",0.0,0.0,...,17.0,5.3,58.6,1644.3,37.0,1.5,6.5,20.8,5.0,"c(""In a 3-quart shallow baking dish mix soups,..."
1,Spring Gnocchi With Asparagus and Shrimp,25M,A perfect blend of the heartiest of winter pas...,character(0),Weeknight,"c(""< 30 Mins"", ""Easy"")","c(""32 -36"", ""4"", ""1"", ""1"", ""1"", ""2"", ""1"", ""4"",...","c(""butter"", ""lemon, juice and zest of"", ""salt""...",0.0,0.0,...,11.0,6.4,121.2,1018.0,8.5,3.4,2.4,16.3,6.0,"c(""Boil a large pot of salted water and add gn..."
2,Romano Grits,20M,"This recipe can be multiplied by 2, 3, 4. The...",character(0),Breakfast,"c(""Very Low Carbs"", ""Low Protein"", ""Low Choles...","c(""2"", ""1/2"", ""1/4"", ""1"")","c(""water"", ""yellow corn grits"", ""salt"", ""pecor...",5.0,1.0,...,3.9,2.4,14.7,466.0,3.7,0.1,0.1,4.9,2.0,"c(""Place the water in a medium sauce pan over ..."
3,Lemon Chess Pie,50M,Make and share this Lemon Chess Pie recipe fro...,character(0),Pie,"c(""Dessert"", ""< 60 Mins"")","c(""1 1/2"", ""2"", ""4"", ""1"", ""1/2"", ""1"")","c(""sugar"", ""butter"", ""eggs"", ""fresh lemon rind"")",4.0,1.0,...,12.8,4.5,113.4,172.4,48.5,0.9,37.8,4.6,8.0,"c(""Pre heat oven to 400."", ""Cream together but..."
4,Slow Cooked Chicken and Dressing,4H25M,This is so delicious! If you want an easy com...,character(0),One Dish Meal,"c(""Chicken"", ""Poultry"", ""Meat"", ""Kid Friendly""...","c(""1"", ""6"", ""8"", ""2"", ""2"", ""1"", ""3"", ""4"", ""2"",...","c(""chicken broth"", ""onion"", ""celery ribs"", ""eg...",0.0,0.0,...,27.6,11.8,191.3,1275.0,22.5,1.2,3.0,24.8,1.0,"c(""Combine first 11 ingredients in a large bow..."


In [15]:
import re

# Remove "c" at the beginning and parentheses from each value in the "RecipeIngredientParts" column
#clean_df['RecipeIngredientParts'] = clean_df['RecipeIngredientParts'].str.replace(r'^c|[()]', '', regex=True)

# Remove "c" at the beginning and parentheses from each value in the "RecipeIngredientParts" column
clean_df['Keywords'] = clean_df['Keywords'].str.replace(r'^c|[()]', '', regex=True)
clean_df['RecipeIngredientQuantities'] = clean_df['RecipeIngredientQuantities'].str.replace(r'^c|[()]', '', regex=True)
clean_df['RecipeIngredientParts'] = clean_df['RecipeIngredientParts'].str.replace(r'^c|[()]', '', regex=True)
clean_df['RecipeInstructions'] = clean_df['RecipeInstructions'].str.replace(r'^c|[()]', '', regex=True)
clean_df['Images'] = clean_df['Images'].str.replace(r'^c|[()]', '', regex=True)

In [16]:
# Split the strings by comma, explode the resulting lists, and get unique items
keyword_item = clean_df['Keywords'].str.split(',').explode().str.strip().unique()

# Convert to list
keyword_item_list = keyword_item.tolist()

# Print the length of the list
print("Number of unique items:", len(keyword_item_list))

Number of unique items: 574


In [17]:
# Clean up each string (remove leading/trailing spaces, convert to lowercase, and strip double quotes) before converting to set
unique_keyword_item_set = set(map(lambda x: x.strip().lower().strip('"'), keyword_item_list))



# Convert the set back to a list if needed
unique_keyword_item_list = list(unique_keyword_item_set)

# Print the length of the list
print("Number of unique items:", len(unique_keyword_item_list))

Number of unique items: 560


In [18]:
# Print the entire list
unique_keyword_item_list

['red onion',
 'fresh parsley leaves',
 'cuban',
 'chili powder',
 'indonesian',
 'white rice',
 'black pepper',
 'buttermilk',
 'tempeh',
 'orange marmalade',
 'basil leaves',
 'chestnuts',
 'whole wheat flour',
 'dehydrator',
 'turkey',
 'easy',
 'pasta shells',
 'oregano leaves',
 'garlic clove',
 'corn',
 'ground ginger',
 'graham crackers',
 'white sugar',
 'fresh ginger',
 'pumpkin',
 'cooked rice',
 'evaporated milk',
 'granulated sugar',
 'fresh corn',
 'kiwi',
 'green bell pepper',
 'red bell pepper',
 'peanut butter',
 'mango',
 'salted butter',
 'tex mex',
 'apple',
 'bar cookie',
 'cracked pepper',
 'heavy whipping cream',
 'green onion tops',
 'german',
 'broil/grill',
 'okra',
 'freezer',
 'zucchini',
 'linguine',
 'cold water',
 'green onion',
 'japanese',
 'southwest asia middle east',
 'pakistani',
 'long grain rice',
 'large shrimp',
 '2% low-fat milk',
 'fresh kale',
 'italian-style tomatoes',
 'oregano',
 'fresh parsley',
 'healthy',
 'breads',
 'strawberries',
 'eg

In [19]:
# Iterate through the list and print keywords containing "low", "high", or "quick"
for keyword in unique_keyword_item_list:
    if "vegan" in keyword or "low sodium" in keyword or "low carbs" in keyword or "low cholesterol" in keyword or "high protein" in keyword or "low protein" in keyword:
        print(keyword)

low cholesterol
low sodium chicken broth
vegan
low protein
very low carbs
high protein


In [20]:
# Define the substrings of interest
substrings_of_interest = ["vegan", "low sodium", "low carbs", "low cholesterol", "high protein", "low protein"]

# Initialize a dictionary to store the counts
substring_counts = {substring: 0 for substring in substrings_of_interest}

# Iterate through the list and count keywords containing the specified substrings
for keyword in unique_keyword_item_list:
    for substring in substrings_of_interest:
        if substring in keyword:
            substring_counts[substring] += 1

# Print the counts for each substring
for substring, count in substring_counts.items():
    print(f"Count of '{substring}': {count}")

Count of 'vegan': 1
Count of 'low sodium': 1
Count of 'low carbs': 1
Count of 'low cholesterol': 1
Count of 'high protein': 1
Count of 'low protein': 1


In [21]:
# Define the substrings of interest
substrings_of_interest = ["pizza", "beef", "chicken", "turkey", "pork", "ribs", "rib", "steak", "salmon", "cod", "mahi mahi"]

# Initialize a dictionary to store the counts
substring_counts = {substring: 0 for substring in substrings_of_interest}

# Iterate through the list and count keywords containing the specified substrings
for keyword in unique_keyword_item_list:
    for substring in substrings_of_interest:
        if substring in keyword:
            substring_counts[substring] += 1

# Print the counts for each substring
for substring, count in substring_counts.items():
    print(f"Count of '{substring}': {count}")

Count of 'pizza': 0
Count of 'beef': 7
Count of 'chicken': 13
Count of 'turkey': 3
Count of 'pork': 1
Count of 'ribs': 1
Count of 'rib': 2
Count of 'steak': 1
Count of 'salmon': 3
Count of 'cod': 0
Count of 'mahi mahi': 1


In [22]:
# print("Number of unique keywords:", num_unique_keywords)
# # Print the first 500 unique keywords
# print("Unique keywords:")
# for idx, keyword in enumerate(unique_keywords):
#     if idx < 500:
#         print(keyword)
#     else:
#         break

In [23]:
clean_df.head()


Unnamed: 0,Name,TotalTime,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,ReviewCount,...,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeInstructions
0,Easy Chicken and Biscuits,30M,This is an easy recipe that can be done in 30 ...,haracter0,One Dish Meal,"""< 30 Mins""","""1"", ""1"", ""1"", ""1/4"", ""1/4"", ""4"", ""2"", ""1""","""milk"", ""dried thyme leaves"", ""pepper"", ""ham""",0.0,0.0,...,17.0,5.3,58.6,1644.3,37.0,1.5,6.5,20.8,5.0,"""In a 3-quart shallow baking dish mix soups, m..."
1,Spring Gnocchi With Asparagus and Shrimp,25M,A perfect blend of the heartiest of winter pas...,haracter0,Weeknight,"""< 30 Mins"", ""Easy""","""32 -36"", ""4"", ""1"", ""1"", ""1"", ""2"", ""1"", ""4"", ""...","""butter"", ""lemon, juice and zest of"", ""salt"", ...",0.0,0.0,...,11.0,6.4,121.2,1018.0,8.5,3.4,2.4,16.3,6.0,"""Boil a large pot of salted water and add gnoc..."
2,Romano Grits,20M,"This recipe can be multiplied by 2, 3, 4. The...",haracter0,Breakfast,"""Very Low Carbs"", ""Low Protein"", ""Low Choleste...","""2"", ""1/2"", ""1/4"", ""1""","""water"", ""yellow corn grits"", ""salt"", ""pecorin...",5.0,1.0,...,3.9,2.4,14.7,466.0,3.7,0.1,0.1,4.9,2.0,"""Place the water in a medium sauce pan over hi..."
3,Lemon Chess Pie,50M,Make and share this Lemon Chess Pie recipe fro...,haracter0,Pie,"""Dessert"", ""< 60 Mins""","""1 1/2"", ""2"", ""4"", ""1"", ""1/2"", ""1""","""sugar"", ""butter"", ""eggs"", ""fresh lemon rind""",4.0,1.0,...,12.8,4.5,113.4,172.4,48.5,0.9,37.8,4.6,8.0,"""Pre heat oven to 400."", ""Cream together butte..."
4,Slow Cooked Chicken and Dressing,4H25M,This is so delicious! If you want an easy com...,haracter0,One Dish Meal,"""Chicken"", ""Poultry"", ""Meat"", ""Kid Friendly"", ...","""1"", ""6"", ""8"", ""2"", ""2"", ""1"", ""3"", ""4"", ""2"", ""...","""chicken broth"", ""onion"", ""celery ribs"", ""eggs...",0.0,0.0,...,27.6,11.8,191.3,1275.0,22.5,1.2,3.0,24.8,1.0,"""Combine first 11 ingredients in a large bowl...."


In [24]:
# Save clean data to a new JSON file
clean_df.to_json("data/clean/cleanSamplerecipes.json", orient="records")

In [25]:
# Save clean data to a new CSV file
clean_df.to_csv("data/clean/cleanSamplerecipes.csv", index=False)

In [26]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define features (X) and target variable (y)
X = clean_df[['RecipeServings', 'AggregatedRating', 'ReviewCount']]  # Features
y = clean_df['RecipeCategory']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing and training the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Making predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Calculating the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Random Forest classifier:", accuracy)

Accuracy of Random Forest classifier: 0.194
