# Giallo Zafferano Dataset Analysis

In [1]:
import json
import numpy as np

### Loading the dataset

In [2]:
def load_dataset(path):
    """Read a JSON dataset file and return its parsed content.

    Args:
        path: Location of the JSON file (anything accepted by ``open``).

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification; pinning the encoding keeps accented
    # characters intact regardless of the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load every partition once, keyed by partition name (files are read in the
# order the entries are listed).
dataset = {
    'english_N1000': load_dataset('dataset/english_N1000_2025-02-11.json'),
    'italian_from_eng_N1000': load_dataset('dataset/italian_from_eng_N1000_2025-02-11.json'),
    'italian_N50': load_dataset('dataset/italian_N50_2024-11-29.json'),
    'italian_N1000': load_dataset('dataset/italian_N1000_2025-02-11.json'),
}

# Keep a direct name for each partition as well, for convenient access.
english_N1000 = dataset['english_N1000']
italian_from_eng_N1000 = dataset['italian_from_eng_N1000']
italian_N50 = dataset['italian_N50']
italian_N1000 = dataset['italian_N1000']

### Number of recipes in each partition

In [3]:
# Report how many recipes each partition contains.
for name, partition in dataset.items():
    recipe_count = len(partition)
    print(f"Number of recipes in {name} partition: {recipe_count}")

Number of recipes in english_N1000 partition: 671
Number of recipes in italian_from_eng_N1000 partition: 686
Number of recipes in italian_N50 partition: 32
Number of recipes in italian_N1000 partition: 637


### Min, max, mean and standard deviation of the number of steps per recipe in each partition

In [4]:
def min_max_mean_std_steps(dataset):
    """Summarize how many steps the recipes of one partition have.

    Args:
        dataset: Iterable of recipes; each recipe must support
            ``recipe["steps"]`` and that value must support ``len``.

    Returns:
        Tuple ``(min_length, max_length, mean_length, standard_deviation)``
        of the per-recipe step counts. The standard deviation is the
        population one (``np.std`` with its default ``ddof=0``).

    Raises:
        ValueError: If ``dataset`` contains no recipes.
    """
    # Build the step counts once: avoids four identical passes over the data
    # and also makes one-shot iterables (e.g. generators) work.
    step_counts = [len(el["steps"]) for el in dataset]
    if not step_counts:
        # numpy would fail here anyway, but with an opaque reduction error
        # preceded by a RuntimeWarning from np.mean.
        raise ValueError("cannot compute step statistics of an empty partition")

    min_length = np.min(step_counts)
    max_length = np.max(step_counts)
    mean_length = np.mean(step_counts)
    standard_deviation = np.std(step_counts)
    return min_length, max_length, mean_length, standard_deviation

# Print the step-count statistics of every partition.
for name, partition in dataset.items():
    step_stats = min_max_mean_std_steps(partition)
    min_length, max_length, mean_length, std_length = step_stats
    print(f"{name} partition: min: {min_length}, max: {max_length}, mean: {mean_length}, std: {std_length} of steps")


english_N1000 partition: min: 0, max: 45, mean: 19.466467958271238, std: 6.996592264952894 of steps
italian_from_eng_N1000 partition: min: 0, max: 48, mean: 19.513119533527696, std: 7.121333072476808 of steps
italian_N50 partition: min: 6, max: 39, mean: 21.0, std: 8.147085368400163 of steps
italian_N1000 partition: min: 0, max: 42, mean: 18.970172684458397, std: 7.68803360578713 of steps


### Number of input examples (Sj+1) (Sj+2) (Sj+5) for each partition

In [5]:
# For each partition, total the per-recipe example counts for offsets 1, 2, 5.
for name, partition in dataset.items():
    sj1 = sj2 = sj5 = 0
    for el in partition:
        n_steps = len(el["steps"])
        # A recipe with n steps contributes n - k for offset k, and nothing
        # when it has k steps or fewer.
        sj1 += max(n_steps - 1, 0)
        sj2 += max(n_steps - 2, 0)
        sj5 += max(n_steps - 5, 0)

    print(f"{name} partition: sj1: {sj1}, sj2: {sj2}, sj5: {sj5}")
            

english_N1000 partition: sj1: 12392, sj2: 11722, sj5: 9712
italian_from_eng_N1000 partition: sj1: 12701, sj2: 12016, sj5: 9961
italian_N50 partition: sj1: 640, sj2: 608, sj5: 512
italian_N1000 partition: sj1: 11455, sj2: 10826, sj5: 8943
