In [7]:
import json
import re
import numpy as np
# Load the cleaned recipe records from disk into `data`.
with open("clean_data.json", "r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Sanity check: the number of records, then the field names of the first one
# (indexing data[0] requires at least one record to be present).
print(len(data))
print(data[0].keys())


849
dict_keys(['_id', 'name', 'description', 'imageURL', 'cuisine', 'preTimeMinutes', 'cookTimeMinutes', 'totalTimeMinutes', 'servings', 'ingredients'])


In [8]:
# Tidy the text fields in place before any statistics are computed:
#   - recipe titles lose their leading/trailing whitespace;
#   - ingredient names are lower-cased and then stripped the same way.
for recipe in data:
    recipe["name"] = recipe["name"].strip()
    for ingredient in recipe["ingredients"]:
        normalized = ingredient["name"].lower().strip()
        ingredient["name"] = normalized


In [9]:
# Number of whitespace-separated words in every recipe title, in dataset order.
name_lengths = [len(title.split()) for title in (entry["name"] for entry in data)]

# Recipe whose title has the most words. list.index() returns the first
# position of the maximum, so ties resolve to the earliest recipe.
max_name = data[name_lengths.index(max(name_lengths))]

# Flat list of every title word in dataset order. Joining the titles with a
# single space and splitting on whitespace yields the same tokens as splitting
# each title separately.
all_words = " ".join(entry["name"] for entry in data).split()


In [42]:
from collections import Counter

# Number of ingredient entries listed for each recipe, in dataset order.
ingredient_counts = [len(recipe["ingredients"]) for recipe in data]

# Occurrences of each ingredient name across all recipes. Every ingredient
# entry counts once, so a name listed twice in one recipe adds two.
ingredient_counter = Counter(
    ing["name"] for r in data for ing in r["ingredients"]
)

# For every field, count the recipes whose value is None, "", [] or {}.
# Other falsy values such as 0 are not counted as missing.
missing_counts = {}
for recipe in data:
    for key, value in recipe.items():
        if value in [None, "", [], {}]:
            missing_counts[key] = missing_counts.get(key, 0) + 1

# Gather the steps of every recipe that stores a list under "steps"; recipes
# without that key, or with a non-list value there, are skipped.
all_steps = []
for r in data:
    if isinstance(r.get("steps"), list):
        all_steps.extend(r["steps"])

# The 20 most frequent lower-cased word tokens across all collected steps.
# Assumes every step is a string -- TODO confirm against the data.
words = [w for step in all_steps for w in re.findall(r'\w+', step.lower())]
step_word_counts = Counter(words).most_common(20)




In [41]:
# Summary statistics of the title word counts. np.std is called with its
# default ddof=0, i.e. the population standard deviation.
mean_len = np.mean(name_lengths)
median_len = np.median(name_lengths)
std_len = np.std(name_lengths)

# Outlier rule, coefficient = 2: a value is an outlier when it is greater than
# mean + 2 * std or smaller than mean - 2 * std.
title_spread = 2 * std_len
title_low, title_high = mean_len - title_spread, mean_len + title_spread

# name_lengths is positionally aligned with data, so the two are walked together.
outlier_titles = [
    recipe
    for recipe, n_words in zip(data, name_lengths)
    if n_words < title_low or n_words > title_high
]

# Same rule applied to the number of ingredients per recipe.
mean_ing = np.mean(ingredient_counts)
std_ing = np.std(ingredient_counts)

ing_spread = 2 * std_ing
ing_low, ing_high = mean_ing - ing_spread, mean_ing + ing_spread

# ingredient_counts is positionally aligned with data as well.
outlier_ingredients = [
    recipe
    for recipe, n_ingredients in zip(data, ingredient_counts)
    if n_ingredients > ing_high or n_ingredients < ing_low
]


# Compact records for the report: title plus its word count.
outlier_titles_info = [
    {"ten": recipe["name"], "so_tu": len(recipe["name"].split())}
    for recipe in outlier_titles
]


# Compact records for the report: title plus its ingredient count.
outlier_ingredients_info = [
    {"ten": recipe["name"], "so_nguyen_lieu": len(recipe["ingredients"])}
    for recipe in outlier_ingredients
]


In [43]:
# Final summary report. The keys are kept exactly as they were because they
# are part of the printed output. NumPy scalars are converted with float()
# after rounding, so the report shows plain numbers (5.27) instead of
# np.float64(5.27) wrappers; the numeric values themselves are unchanged.
result = {
    "tong_so_mon_an": len(data),

    # Recipe title with the most words
    "ten_mon_dai_nhat": max_name.get("name"),

    # Word-count statistics for recipe titles
    "trung_binh_so_chu_trong_ten": float(round(mean_len, 2)),
    "trung_vi_so_chu_trong_ten": float(median_len),
    "do_lech_chuan_so_chu_trong_ten": float(round(std_len, 2)),

    # Ingredient-count statistics
    "so_nguyen_lieu_trung_binh": float(round(mean_ing, 2)),
    "do_lech_chuan_nguyen_lieu": float(round(std_ing, 2)),

    # Top 10 most common ingredients
    "top_nguyen_lieu_pho_bien": ingredient_counter.most_common(10),

    # Number of recipes with an empty value, per field
    "so_truong_bi_trong": dict(missing_counts),

    # NOTE(review): both lists below come from two-sided mean +/- 2*std
    # filters, so they can also contain unusually short titles and recipes
    # with unusually few ingredients, not only the long/many cases the key
    # names suggest -- confirm the intended labels.
    "ten_qua_dai": outlier_titles_info,
    "mon_an_qua_nhieu_nguyen_lieu": outlier_ingredients_info,
}

# Print the report, preserving the key order defined above.
import pprint
pprint.pprint(result, sort_dicts=False)


{'tong_so_mon_an': 849,
 'ten_mon_dai_nhat': 'Nấm Rơm Kho Quẹt, Rau Củ Luộc Và Cơm Cháy Áp Chảo',
 'trung_binh_so_chu_trong_ten': np.float64(5.27),
 'trung_vi_so_chu_trong_ten': np.float64(5.0),
 'do_lech_chuan_so_chu_trong_ten': np.float64(1.53),
 'so_nguyen_lieu_trung_binh': np.float64(8.43),
 'do_lech_chuan_nguyen_lieu': np.float64(3.08),
 'top_nguyen_lieu_pho_bien': [('đường', 307),
                              ('muối', 270),
                              ('dầu ăn', 230),
                              ('đường trắng', 225),
                              ('xốt nêm thịt heo knorr', 173),
                              ('tỏi', 166),
                              ('hành lá', 163),
                              ('nước tương', 161),
                              ('hạt nêm chay knorr nấm hương', 140),
                              ('cà rốt', 138)],
 'so_truong_bi_trong': {'preTimeMinutes': 849},
 'ten_qua_dai': [{'ten': 'Nấm Bào Ngư Chiên Giòn Xốt Bơ Tỏi Mật Ong',
                  'so_tu'