### Import packages

In [1]:
import os
import json
import pandas as pd
import re
import nltk
nltk.download('wordnet')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import networkx as nx
from collections import defaultdict
import re

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DCL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Aggrigate and Load Data

In [2]:
folder = "data"
all_data = []

for file in os.listdir(folder):
    if file.endswith(".json"):
        with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
            data = json.load(f)
            all_data.extend(data)

In [3]:
df = pd.json_normalize(all_data)
df.head()

Unnamed: 0,Country,Cuisine,Link,Total_Time,Servings,Ingredients,Total_Rating,Rating_Count,Nutrition_Facts.Calories,Nutrition_Facts.Fat,Nutrition_Facts.Carbs,Nutrition_Facts.Protein
0,Amish and Mennonite,Amish Beef and Noodles,https://www.allrecipes.com/amish-beef-and-nood...,2 hrs 35 mins,6,"[2 1/2poundsbeef chuck, cut into 2-inch cubes,...",4.6,29,518,17g,21g,69g
1,Amish and Mennonite,Baked Oatmeal,https://www.allrecipes.com/recipe/51013/baked-...,50 mins,8,"[3cupsrolled oats, 1cupbrown sugar, or to tast...",4.6,1048,393,15g,59g,7g
2,Amish and Mennonite,Amish White Bread,https://www.allrecipes.com/recipe/6788/amish-w...,2 hrs 25 mins,24,"[2cupswarm water (110 degrees F/45 degrees C),...",4.8,6640,168,3g,31g,4g
3,Amish and Mennonite,Cheesy Amish Breakfast Casserole,https://www.allrecipes.com/recipe/229150/chees...,1 hr 15 mins,12,"[1poundsliced bacon, diced, 1mediumsweet onion...",4.8,998,314,23g,12g,22g
4,Amish and Mennonite,Best Vinegar Coleslaw,https://www.allrecipes.com/recipe/59318/amish-...,20 mins,8,"[1large headcabbage, cored and finely shredded...",4.8,534,224,9g,35g,2g


### Data preprocessing

In [4]:
df = df.replace("N/A", pd.NA)
# Total number of Null values
print(f"Null vlaues before treating them: {df.isnull().sum()}")
# Filling the null values of calories with -1 to indicate unavilable value
print(df.shape)
df.dropna(inplace=True)
print(f"\nNull vlaues after treating them: {df.isnull().sum()}")
print(df.shape)

Null vlaues before treating them: Country                       0
Cuisine                       0
Link                          0
Total_Time                  183
Servings                    132
Ingredients                   0
Total_Rating                229
Rating_Count                229
Nutrition_Facts.Calories    168
Nutrition_Facts.Fat         193
Nutrition_Facts.Carbs       171
Nutrition_Facts.Protein     175
dtype: int64
(2459, 12)

Null vlaues after treating them: Country                     0
Cuisine                     0
Link                        0
Total_Time                  0
Servings                    0
Ingredients                 0
Total_Rating                0
Rating_Count                0
Nutrition_Facts.Calories    0
Nutrition_Facts.Fat         0
Nutrition_Facts.Carbs       0
Nutrition_Facts.Protein     0
dtype: int64
(2118, 12)


In [5]:
# Data types of the columns before treating them
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2118 entries, 0 to 2458
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Country                   2118 non-null   object
 1   Cuisine                   2118 non-null   object
 2   Link                      2118 non-null   object
 3   Total_Time                2118 non-null   object
 4   Servings                  2118 non-null   object
 5   Ingredients               2118 non-null   object
 6   Total_Rating              2118 non-null   object
 7   Rating_Count              2118 non-null   object
 8   Nutrition_Facts.Calories  2118 non-null   object
 9   Nutrition_Facts.Fat       2118 non-null   object
 10  Nutrition_Facts.Carbs     2118 non-null   object
 11  Nutrition_Facts.Protein   2118 non-null   object
dtypes: object(12)
memory usage: 215.1+ KB


In [6]:
#Treating the neutrition columns
df["Nutrition_Facts.Protein"] = df["Nutrition_Facts.Protein"].str.replace("g", "", regex=False).astype(int)
df["Nutrition_Facts.Carbs"] = df["Nutrition_Facts.Carbs"].str.replace("g", "", regex=False).astype(int)
df["Nutrition_Facts.Fat"] = df["Nutrition_Facts.Fat"].str.replace("g", "", regex=False).astype(int)

In [7]:
df["Nutrition_Facts.Calories"] = df["Nutrition_Facts.Calories"].astype(int)
df["Rating_Count"] = pd.to_numeric(
    df["Rating_Count"].astype(str).str.replace(",", "", regex=False),
    errors="coerce"
)

df["Total_Rating"] = df["Total_Rating"].astype(float)
df["Servings"] = (
    df["Servings"]
    .astype(str)                         # ensure string
    .str.extract(r"(\d+)")               # extract the first number only
    .astype(int)                       # convert to float
)

In [8]:
# processing time

def parse_time(time_str):
    if pd.isna(time_str):
        return None
    time_str = str(time_str).lower()
    
    hours = 0
    minutes = 0
    
    # find hours
    hr_match = re.search(r'(\d+)\s*hr', time_str)
    if hr_match:
        hours = int(hr_match.group(1))
    
    # find minutes
    min_match = re.search(r'(\d+)\s*min', time_str)
    if min_match:
        minutes = int(min_match.group(1))
    
    return hours * 60 + minutes

df["Total_Time"] = df["Total_Time"].apply(parse_time)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2118 entries, 0 to 2458
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Country                   2118 non-null   object 
 1   Cuisine                   2118 non-null   object 
 2   Link                      2118 non-null   object 
 3   Total_Time                2118 non-null   int64  
 4   Servings                  2118 non-null   int64  
 5   Ingredients               2118 non-null   object 
 6   Total_Rating              2118 non-null   float64
 7   Rating_Count              2118 non-null   int64  
 8   Nutrition_Facts.Calories  2118 non-null   int64  
 9   Nutrition_Facts.Fat       2118 non-null   int64  
 10  Nutrition_Facts.Carbs     2118 non-null   int64  
 11  Nutrition_Facts.Protein   2118 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 215.1+ KB


In [10]:
import re
import pandas as pd
from nltk.stem import WordNetLemmatizer
from rapidfuzz import process, fuzz

lemmatizer = WordNetLemmatizer()

# Units & descriptors
UNITS = ["teaspoon","teaspoons","tablespoon","tablespoons","cup","cups","pound","pounds",
         "ounce","ounces","gram","grams","kg","ml","liter","pinch","dash","quart","quarts",
         "clove","cloves","package","packages","can","cans","jar","jars","slice","slices",
         "sprig","sprigs","stick","sticks"]

DESCRIPTORS = ["chopped","sliced","diced","minced","optional","divided","garnish","fresh",
               "large","small","rounded","inch","pieces","cube","cubes","peeled","crushed",
               "ground","seeded","halved","thinly","finely","boneless","skinless","drained",
               "cooked","uncooked","frozen","shredded","powder","style","hot","ripe","medium"]

# Ingredient mapping (common variations -> canonical)
INGREDIENT_MAP = {
    "yellow onions":"onion","onions":"onion","green onions":"spring onion","scallions":"spring onion",
    "bell pepper":"pepper","peppers":"pepper",
    "beef chuck":"beef","ground beef":"beef","poundbeef":"beef","poundsbeef":"beef",
    "beef broth":"broth","chicken broth":"broth","vegetable broth":"broth","stock":"broth",
    "kosher salt":"salt","pinchsalt":"salt","salt":"salt",
    "black pepper":"pepper","peppercorns":"pepper","chili":"chili","jalapeno":"chili",
    "vegetable oil":"oil","olive oil":"oil","tablespoonolive":"oil","canola":"oil","butter":"butter",
    "all purpose flour":"flour","purpose flour":"flour","flour":"flour",
    "garlic powder":"garlic","clovesgarlic":"garlic","teaspoongarlic":"garlic",
    "tablespoongarlic":"garlic","garlic":"garlic",
    "beef bouillon paste":"bouillon","bouillon":"bouillon"
}

# Expanded Food Groups
FOOD_GROUPS = {
    "meat": [
        "beef","chicken","pork","lamb","turkey","duck","bacon","sausage","ham","veal","goat","mutton"
    ],
    "seafood": [
        "fish","salmon","tuna","shrimp","crab","lobster","oyster","clam","scallop","squid"
    ],
    "dairy": [
        "milk","butter","cheese","cream","yogurt","egg","buttermilk","ghee"
    ],
    "vegetable": [
        "onion","garlic","tomato","carrot","pepper","bell pepper","broccoli","spinach",
        "cabbage","celery","potato","corn","zucchini","cucumber","lettuce","mushroom",
        "cauliflower","ginger","peas","bean","chili","okra","beet","radish","eggplant"
    ],
    "fruit": [
        "apple","banana","orange","lemon","lime","grape","pineapple","pear","peach",
        "plum","mango","berry","strawberry","blueberry","raspberry","cherry",
        "pomegranate","apricot","fig","date"
    ],
    "grain": [
        "rice","flour","bread","pasta","oats","noodles","lentil","quinoa","barley",
        "couscous","cornmeal","tortilla","taco","saffron"
    ],
    "spice": [
        "pepper","cumin","coriander","cinnamon","turmeric","ginger","garlic","paprika",
        "clove","chili","nutmeg","mustard","cardamom","anise","allspice","fennel","bay leaf"
    ],
    "herb": [
        "parsley","cilantro","basil","thyme","rosemary","dill","mint","oregano","sage","tarragon"
    ]
}
PRIORITY_ORDER = ["meat","seafood","vegetable","grain","dairy","spice","herb","fruit"]

# Flatten canonical list for fuzzy matching
CANONICAL_INGREDIENTS = [item for group in FOOD_GROUPS.values() for item in group]

# -------- Cleaning Function --------
def clean_ingredient(text):
    text = text.lower()
    
    # fix glued numbers and letters (2cups -> 2 cups)
    text = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", text)
    text = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", text)

    # fix glued unit+word (cupsbeef -> cups beef)
    text = re.sub(rf"({'|'.join(UNITS)})(?=[a-z])", r"\1 ", text)

    # remove numbers/fractions
    text = re.sub(r"\d+/?\d*", "", text)

    # tokenize
    tokens = re.findall(r"[a-z]+", text)

    cleaned_tokens = []
    for tok in tokens:
        if tok in UNITS: continue
        if tok in DESCRIPTORS: continue
        tok = lemmatizer.lemmatize(tok)  # singularize
        if tok in INGREDIENT_MAP:
            tok = INGREDIENT_MAP[tok]
        cleaned_tokens.append(tok)

    return " ".join(cleaned_tokens).strip()

# -------- Food Group Detection (from raw text) --------
def detect_food_groups_from_raw(row):
    groups_present = {g: False for g in FOOD_GROUPS.keys()}
    joined = " ".join(row).lower()
    for group, items in FOOD_GROUPS.items():
        for item in items:
            if re.search(rf"{item}", joined):  # partial match
                groups_present[group] = True
                break
    return groups_present

# -------- Primary Ingredient (with fuzzy matching) --------
def get_primary_ingredient(ingredients, cutoff=80):
    for group in PRIORITY_ORDER:
        for item in FOOD_GROUPS[group]:
            # direct match
            if item in ingredients:
                return item
            # fuzzy match
            match = process.extractOne(item, ingredients, scorer=fuzz.partial_ratio)
            if match and match[1] >= cutoff:
                return item
    return ingredients[0] if ingredients else None

# -------- Main Processing Function --------
def process_ingredient_column(df, col="Ingredients"):
    processed = []
    group_flags = {g: [] for g in FOOD_GROUPS.keys()}
    primary_list = []

    for row in df[col]:
        if not isinstance(row, list):
            try:
                row = eval(row)   # handle stringified lists
            except:
                row = []

        # ---- Cleaned tokens
        cleaned = [clean_ingredient(i) for i in row]
        cleaned = [c for c in cleaned if c]
        cleaned = list(set(cleaned))
        processed.append(cleaned)

        # ---- Food group detection (from raw)
        groups = detect_food_groups_from_raw(row)
        for g, val in groups.items():
            group_flags[g].append(val)

        # ---- Primary ingredient (from cleaned + fuzzy)
        primary_list.append(get_primary_ingredient(cleaned))

    # ---- Add columns to df
    df["Processed_Ingredients"] = processed
    df["num_ingredients"] = df["Processed_Ingredients"].apply(len)
    df["primary_ingredient"] = primary_list
    for g in FOOD_GROUPS.keys():
        df[f"has_{g}"] = group_flags[g]

    return df


In [11]:
df = process_ingredient_column(df, "Ingredients")
df = df[df['primary_ingredient'] != 'soil for frying']
df = df[df['primary_ingredient'] != 'scold water']
df.drop(['Ingredients','Processed_Ingredients'],axis=1,inplace=True)

In [12]:
df.head()

Unnamed: 0,Country,Cuisine,Link,Total_Time,Servings,Total_Rating,Rating_Count,Nutrition_Facts.Calories,Nutrition_Facts.Fat,Nutrition_Facts.Carbs,...,num_ingredients,primary_ingredient,has_meat,has_seafood,has_dairy,has_vegetable,has_fruit,has_grain,has_spice,has_herb
0,Amish and Mennonite,Amish Beef and Noodles,https://www.allrecipes.com/amish-beef-and-nood...,155,6,4.6,29,518,17,21,...,11,beef,True,False,True,True,False,True,True,False
1,Amish and Mennonite,Baked Oatmeal,https://www.allrecipes.com/recipe/51013/baked-...,50,8,4.6,1048,393,15,59,...,10,goat,False,False,True,False,False,True,True,False
2,Amish and Mennonite,Amish White Bread,https://www.allrecipes.com/recipe/6788/amish-w...,145,24,4.8,6640,168,3,31,...,6,flour,False,False,False,False,False,True,False,False
3,Amish and Mennonite,Cheesy Amish Breakfast Casserole,https://www.allrecipes.com/recipe/229150/chees...,75,12,4.8,998,314,23,12,...,7,bacon,True,False,True,True,False,False,False,False
4,Amish and Mennonite,Best Vinegar Coleslaw,https://www.allrecipes.com/recipe/59318/amish-...,20,8,4.8,534,224,9,35,...,9,salmon,False,False,False,True,False,False,True,False


In [13]:
#Mapping countries
def map_cuisine_to_country(cuisine):

    mapping = {
        'Amish and Mennonite': 'United States',
        'Argentinian': 'Argentina',
        'Australian and New Zealander': 'Australia', # Primary point
        'Austrian': 'Austria',
        'Bangladeshi': 'Bangladesh',
        'Belgian': 'Belgium',
        'Brazilian': 'Brazil',
        'Cajun and Creole': 'United States',
        'Canadian': 'Canada',
        'Chilean': 'Chile',
        'Chinese': 'China',
        'Colombian': 'Colombia',
        'Cuban': 'Cuba',
        'Danish': 'Denmark',
        'Dutch': 'Netherlands',
        'Filipino': 'Philippines',
        'Finnish': 'Finland',
        'French': 'France',
        'German': 'Germany',
        'Greek': 'Greece',
        'Indian': 'India',
        'Indonesian': 'Indonesia',
        'Israeli': 'Israel',
        'Italian': 'Italy',
        'Jamaican': 'Jamaica',
        'Japanese': 'Japan',
        'Jewish': 'Israel', # Represents the modern nation-state
        'Korean': 'South Korea',
        'Lebanese': 'Lebanon',
        'Malaysian': 'Malaysia',
        'Norwegian': 'Norway',
        'Pakistani': 'Pakistan',
        'Persian': 'Iran',
        'Peruvian': 'Peru',
        'Polish': 'Poland',
        'Portuguese': 'Portugal',
        'Puerto Rican': 'United States', # US Territory
        'Russian': 'Russia',
        'Scandinavian': 'Sweden', # Primary point for the region
        'Soul Food': 'United States',
        'South African': 'South Africa',
        'Southern Recipes': 'United States',
        'Spanish': 'Spain',
        'Swedish': 'Sweden',
        'Swiss': 'Switzerland',
        'Tex-Mex': 'United States',
        'Thai': 'Thailand',
        'Turkish': 'Turkey',
        'Vietnamese': 'Vietnam'
    }
    # Return the mapped country, or the original cuisine if not found (shouldn't happen with this list)
    return mapping.get(cuisine, cuisine)

# Apply the mapping to create a new column
df['Country'] = df['Country'].apply(map_cuisine_to_country)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2117 entries, 0 to 2458
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Country                   2117 non-null   object 
 1   Cuisine                   2117 non-null   object 
 2   Link                      2117 non-null   object 
 3   Total_Time                2117 non-null   int64  
 4   Servings                  2117 non-null   int64  
 5   Total_Rating              2117 non-null   float64
 6   Rating_Count              2117 non-null   int64  
 7   Nutrition_Facts.Calories  2117 non-null   int64  
 8   Nutrition_Facts.Fat       2117 non-null   int64  
 9   Nutrition_Facts.Carbs     2117 non-null   int64  
 10  Nutrition_Facts.Protein   2117 non-null   int64  
 11  num_ingredients           2117 non-null   int64  
 12  primary_ingredient        2117 non-null   object 
 13  has_meat                  2117 non-null   bool   
 14  has_seafood  

### Saved final Dataset

In [15]:
df.to_csv("final_dataset.csv",index=False)
df.to_excel("final_dataset.xlsx",index=False)