# Dataset Analysis

In [13]:
import json
import numpy as np

### Loading the dataset

In [None]:
def load_dataset(path):
    """Read a JSON dataset file and return its parsed content.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's
        top-level JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's default encoding, which can corrupt or reject non-ASCII
    # recipe text on systems whose default is not UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Partition name -> JSON file it is read from (loaded in this order).
partition_files = {
    'english_N1000': 'dataset/english_N1000_2025-02-11.json',
    'italian_from_eng_N1000': 'dataset/italian_from_eng_N1000_2025-02-11.json',
    'italian_N50': 'dataset/italian_N50_2024-11-29.json',
    'italian_N1000': 'dataset/italian_N1000_2025-02-11.json',
}

# All partitions keyed by name, so later cells can iterate over them.
dataset = {
    partition_name: load_dataset(file_path)
    for partition_name, file_path in partition_files.items()
}

# Keep the individual partitions available under their own names as well.
english_N1000 = dataset['english_N1000']
italian_from_eng_N1000 = dataset['italian_from_eng_N1000']
italian_N50 = dataset['italian_N50']
italian_N1000 = dataset['italian_N1000']

### Number of recipes in each partition

In [None]:
# Report the size of every partition, one line per partition.
for name, partition in dataset.items():
    recipe_count = len(partition)
    print(f"Number of recipes in {name} partition: {recipe_count}")

Number of recipes in english_N1000: 671
Number of recipes in italian_from_eng_N1000: 686
Number of recipes in italian_N50: 32
Number of recipes in italian_N1000: 637


### Min, max, mean and standard deviation of the number of steps per recipe in each partition

In [None]:
def min_max_mean_std_steps(dataset):
    """Summarise the number of steps per recipe in one partition.

    Args:
        dataset: Iterable of recipe records. Each record is indexed with
            ``el["steps"]`` and the resulting value must support ``len()``.

    Returns:
        A tuple ``(min_length, max_length, mean_length, standard_deviation)``
        computed over the per-recipe step counts. The standard deviation is
        NumPy's default one (``np.std`` with ``ddof=0``).

    Raises:
        ValueError: If ``dataset`` yields no records.
        KeyError: If a record has no ``"steps"`` entry.
    """
    # Count the steps of every recipe exactly once; this avoids four passes
    # over the data and also makes one-shot iterables (generators) work.
    step_counts = np.array([len(el["steps"]) for el in dataset])

    # NumPy's min/max reject empty input with an opaque reduction error, so
    # fail early with a message that names the actual problem.
    if step_counts.size == 0:
        raise ValueError("Cannot compute step statistics for an empty dataset")

    min_length = np.min(step_counts)
    max_length = np.max(step_counts)
    mean_length = np.mean(step_counts)
    standard_deviation = np.std(step_counts)
    return min_length, max_length, mean_length, standard_deviation

# Print the step-count summary of every partition, one line per partition.
for name in dataset:
    partition = dataset[name]
    (min_length,
     max_length,
     mean_length,
     std_length) = min_max_mean_std_steps(partition)
    print(f"{name} partition: min: {min_length}, max: {max_length}, mean: {mean_length}, std: {std_length} of steps")


english_N1000 partition: min: 0, max: 45, mean: 19.466467958271238, std: 6.996592264952894 of steps
italian_from_eng_N1000 partition: min: 0, max: 48, mean: 19.513119533527696, std: 7.121333072476808 of steps
italian_N50 partition: min: 6, max: 39, mean: 21.0, std: 8.147085368400163 of steps
italian_N1000 partition: min: 0, max: 42, mean: 18.970172684458397, std: 7.68803360578713 of steps


### Number of input examples generated (Sj+1)

In [24]:
# List every recipe whose "steps" entry is empty, together with the name of
# the partition it belongs to.
for name, partition in dataset.items():
    for el in partition:
        if len(el["steps"]) != 0:
            # Recipe has at least one step: nothing to report.
            continue
        print(f"Empty steps in {name} partition", el)
            

Empty steps in english_N1000 partition {'title': 'Soft polenta cake', 'recipe_id': 'Gz_29106', 'raw_text': 'Ingredients for a 24cm mold cups 3 cup Sugar 1 cup Eggs medium 4 cups 1.5 cup Butter 8 tbsp Powdered yeast for sweets 2.5 tsp Lemon peel 1 Fine salt to taste Presentation There are those who prefer polenta in a classic style or concia style, and we should remember that it\'s not always pure but also a great companion for sausages or beans . Today, however, we present you with the soft polenta cake, a delicious sweet version perfect for breakfast and snacks. Not everyone knows that cornmeal, as strange as it may seem, works wonderfully in non-salty versions, as in this case or when we talked about the Lombard amor polenta or pan meino . In the version we propose today, the cake with cornmeal will be soft and lemon-scented, but its texture will have a rough consistency, thanks to the coarse ground cornmeal. This type of coarse milling flour results in a more rustic finish. Exactly 