In [32]:
import json
import re
import numpy as np
from collections import Counter
with open("recipes_cleaned_title.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(len(data))
print(data[0].keys())


4090
dict_keys(['_id', 'name', 'description', 'imageURL', 'cuisine', 'preTimeMinutes', 'cookTimeMinutes', 'totalTimeMinutes', 'servings', 'ingredients', 'instructions', 'nutritionInfo', 'tags', 'createdAt', 'updatedAt'])


In [33]:
for recipe in data:
    recipe["name"] = recipe["name"].strip()
    for ing in recipe["ingredients"]:
        ing["name"] = ing["name"].lower().strip()


In [34]:
name_lengths = [len(recipe["name"].split()) for recipe in data]
max_name = max(data, key=lambda r: len(r["name"].split()))
all_words = [word for r in data for word in r["name"].split()]


In [35]:
ingredient_counts = [len(recipe["ingredients"]) for recipe in data]


ingredient_counter = Counter()
for r in data:
    for ing in r["ingredients"]:
        ingredient_counter[ing["name"]] += 1
ingredient_counter.most_common(10)

missing_counts = {}
for recipe in data:
    for key, value in recipe.items():
        if value in [None, "", [], {}]:
            missing_counts[key] = missing_counts.get(key, 0) + 1

all_steps = []
for r in data:
    if "steps" in r and isinstance(r["steps"], list):
        all_steps.extend(r["steps"])

words = [w for step in all_steps for w in re.findall(r'\w+', step.lower())]
step_word_counts = Counter(words).most_common(20)

In [36]:
mean_len = np.mean(name_lengths)
median_len = np.median(name_lengths)
std_len = np.std(name_lengths)

# he so = 2
k = 2
# cong thuc tinh outlier > mean + he_so. do_lech_chuan hoac < mean - he_so. do_lech_chuan
outlier_titles = [
    r for r in data
    if len(r["name"].split()) < mean_len - k * std_len or len(r["name"].split()) > mean_len + 2 * std_len
]

mean_ing = np.mean(ingredient_counts)
std_ing = np.std(ingredient_counts)

outlier_ingredients = [
    r for r in data
    if len(r["ingredients"]) > mean_ing + k * std_ing or len(r["ingredients"]) < mean_ing - 2 * std_ing
]


outlier_titles_info = [
    {"ten": r["name"], "so_tu": len(r["name"].split())}
    for r in outlier_titles
]


outlier_ingredients_info = [
    {"ten": r["name"], "so_nguyen_lieu": len(r["ingredients"])}
    for r in outlier_ingredients
]


In [37]:
result = {
    "tong_so_mon_an": len(data),

    # T√™n m√≥n d√†i nh·∫•t theo s·ªë ch·ªØ
    "ten_mon_dai_nhat": max_name.get("name"),

    # Th·ªëng k√™ s·ªë ch·ªØ trong t√™n m√≥n
    "trung_binh_so_chu_trong_ten": round(mean_len, 2),
    "trung_vi_so_chu_trong_ten": median_len,
    "do_lech_chuan_so_chu_trong_ten": round(std_len, 2),

    # Th·ªëng k√™ nguy√™n li·ªáu
    "so_nguyen_lieu_trung_binh": round(mean_ing, 2),
    "do_lech_chuan_nguyen_lieu": round(std_ing, 2),

    # Top 10 nguy√™n li·ªáu ph·ªï bi·∫øn
    "top_nguyen_lieu_pho_bien": ingredient_counter.most_common(10),

    # Th·ªëng k√™ s·ªë tr∆∞·ªùng b·ªã tr·ªëng
    "so_truong_bi_trong": dict(missing_counts),
    "ten_qua_dai": outlier_titles_info,
    "so luong ten qua dai":  len(outlier_titles_info),
    "mon_an_qua_nhieu_nguyen_lieu": outlier_ingredients_info,
    "so luong mon qua nhieu nguyen lieu": len(outlier_ingredients_info),
}

# In ra k·∫øt qu·∫£
import pprint
pprint.pprint(result, sort_dicts=False)


{'tong_so_mon_an': 4090,
 'ten_mon_dai_nhat': 'Salad Tr√°i C√¢y: Chu·ªëi + D∆∞a Leo + X√† L√°ch + Ph√¥ Mai Con '
                     'B√≤ C∆∞·ªùi V·ªã D√¢u + 1 √çt S·ªØa ƒê·∫∑c',
 'trung_binh_so_chu_trong_ten': np.float64(5.74),
 'trung_vi_so_chu_trong_ten': np.float64(5.0),
 'do_lech_chuan_so_chu_trong_ten': np.float64(2.45),
 'so_nguyen_lieu_trung_binh': np.float64(6.63),
 'do_lech_chuan_nguyen_lieu': np.float64(3.65),
 'top_nguyen_lieu_pho_bien': [('ƒë∆∞·ªùng', 590),
                              ('gia v·ªã', 473),
                              ('h√†nh l√°', 416),
                              ('mu·ªëi', 382),
                              ('d·∫ßu ƒÉn', 299),
                              ('t·ªèi', 295),
                              ('c√† r·ªët', 290),
                              ('h√†nh t√¢y', 232),
                              ('n∆∞·ªõc m·∫Øm', 229),
                              ('c√† chua', 210)],
 'so_truong_bi_trong': {'preTimeMinutes': 4089,
                        'tags

In [38]:
from collections import Counter
import re

# T√°ch t·∫•t c·∫£ c√°c t·ª´ trong t√™n m√≥n
all_words = []
for recipe in data:
    words = re.findall(r'\b\w+\b', recipe["name"].lower())
    all_words.extend(words)

word_freq = Counter(all_words)
common_words = word_freq.most_common(100)  # top 100 t·ª´ ph·ªï bi·∫øn
pprint.pprint(common_words, sort_dicts=False)

[('th·ªãt', 498),
 ('g√†', 484),
 ('chi√™n', 481),
 ('c∆°m', 418),
 ('t√¥m', 367),
 ('m√¨', 307),
 ('c√°', 305),
 ('canh', 291),
 ('ch√°o', 290),
 ('x√†o', 287),
 ('rau', 276),
 ('ƒë·∫≠u', 275),
 ('tr·ªôn', 251),
 ('n∆∞·ªõc', 250),
 ('tr·ª©ng', 245),
 ('ch√®', 237),
 ('n·∫•u', 235),
 ('b√°nh', 214),
 ('chua', 209),
 ('n·∫•m', 202),
 ('b√≤', 186),
 ('s·ªët', 180),
 ('s·ªØa', 180),
 ('c√†', 179),
 ('heo', 178),
 ('c·∫£i', 175),
 ('v√†', 160),
 ('rang', 155),
 ('h·∫°t', 146),
 ('x√¥i', 144),
 ('l√°', 143),
 ('m·∫Øm', 136),
 ('kho', 129),
 ('lu·ªôc', 129),
 ('xanh', 129),
 ('chay', 127),
 ('c·ªß', 124),
 ('n∆∞·ªõng', 120),
 ('nem', 120),
 ('h·∫ßm', 118),
 ('gi√≤n', 116),
 ('khoai', 115),
 ('ch·∫£', 113),
 ('tr√†', 111),
 ('ƒë·ªè', 108),
 ('t√¢y', 97),
 ('b·∫Øp', 94),
 ('ƒÉn', 88),
 ('b√≠', 88),
 ('bi·ªÉn', 86),
 ('t∆∞∆°i', 83),
 ('b√∫n', 82),
 ('ng·ªçt', 81),
 ('rong', 77),
 ('b√¨nh', 77),
 ('v·ªõi', 73),
 ('ti√™u', 73),
 ('xay', 71),
 ('m·ª±c', 69),
 ('tr√°i', 69),
 ('chu·ªëi', 68),
 ('ho

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

titles = [r["name"].lower() for r in data]

vectorizer = CountVectorizer(ngram_range=(2, 4), max_features=200)

X = vectorizer.fit_transform(titles)

ngrams_freq = zip(vectorizer.get_feature_names_out(), X.sum(axis=0).tolist()[0])

sorted_ngrams = sorted(ngrams_freq, key=lambda x: x[1], reverse=True)

for ngram, freq in sorted_ngrams:
    print(f"{ngram}: {freq}")

chi√™n gi√≤n: 95
g√† chi√™n: 90
c∆°m chi√™n: 89
rong bi·ªÉn: 65
rau c·ªß: 63
th·ªãt heo: 63
c√† chua: 61
c√† r·ªët: 61
th·ªãt b√≤: 59
m√¨ l√°: 57
n·∫•u t√¥m: 57
ch√°o th·ªãt: 56
t√¥m th·ªãt: 56
n∆∞·ªõc m·∫Øm: 54
ph√¥ mai: 52
ƒë√πi g√†: 52
th·∫≠p c·∫©m: 51
tr√°i c√¢y: 51
·ª©c g√†: 49
l√° t∆∞∆°i: 48
m√¨ l√° t∆∞∆°i: 48
b√≤ x√†o: 47
khoai t√¢y: 47
tr·ª©ng chi√™n: 44
y·∫øn m·∫°ch: 43
kho ti√™u: 42
canh chua: 40
h·∫°t sen: 40
chi√™n n∆∞·ªõc: 39
c√°nh g√†: 39
s·ªØa chua: 39
ƒë∆°n gi·∫£n: 38
chi√™n n∆∞·ªõc m·∫Øm: 37
c∆°m nh√†: 37
khoai lang: 37
rau c√¢u: 37
ch·∫£ c√°: 36
ƒë·∫≠u xanh: 36
c√° h·ªìi: 35
n∆∞·ªõc t∆∞∆°ng: 35
ƒë·∫≠u h≈©: 35
b√≠ ƒë·ªè: 34
b√°nh m√¨: 33
b√°nh tr√°ng: 33
n∆∞·ªõc √©p: 33
tr√† s·ªØa: 33
canh rau: 32
th·ªãt rang: 32
·ªõt chu√¥ng: 32
c√°nh g√† chi√™n: 31
th·ªãt g√†: 31
th·ªãt vai: 31
b·∫Øp c·∫£i: 30
h√†n qu·ªëc: 30
x√†o t√¥m: 30
ch√® d∆∞·ª°ng: 29
ch√® d∆∞·ª°ng nhan: 29
d∆∞·ª°ng nhan: 29
g·∫°o l·ª©t: 29
tr·ª©ng g√†: 29
ch√® tr√¥i: 28
g√† chi√™n n∆∞·ªõc: 28
g√† chi√™n n∆∞·ªõ

In [43]:
import random
black_list = [
    "c√°ch l√†m", "c·ªßa annie", "c·ªßa annie vo", "c·ªßa pham", "c·ªßa pham huyen", "c·ªßa b·∫øp", "c·ªßa rose", "c·ªßa rose truong",
    "b√≤n bon", "c·ªßa b√≤n bon", "c·ªßa minh", "c·ªßa b·∫£o", "c·ªßa huyen", "c·ªßa dory", "c·ªßa huyen le", "c·ªßa huyen le tran", "c·ªßa b·∫£o b√¨nh", "c·ªßa b·∫øp nh√†", "c·ªßa qu√¢n nguy·ªÖn", "c·ªßa b·∫øp nh√† c·ªçp", "c·ªßa nh√† v√¢n", "c·ªßa s∆°n panda", "c·ªßa ho√†ng", "l√†m m√≥n sp", "c·ªßa phan", "c·ªßa phan bao van", "c·ªßa minh hayes", "c·ªßa kim", "c·ªßa hu·ª≥nh", "c·ªßa hu·ª≥nh ph√°t", "c·ªßa kim dung", "c·ªßa b·∫øp c·ªßa", "c·ªßa qu·ª≥nh", "c·ªßa tr·∫ßn", "c·ªßa ho√†ng th·ªã", "c·ªßa b·∫øp c·ªßa qu·ª≥nh", "c·ªßa ho√†ng th·ªã t·ªë", "ho√†ng th·ªã t·ªë h√†", "c·ªßa b·∫øp hien dang", "th·ªã t·ªë h√†"
    , "thu m√®o", "c·ªßa h√¢n h√¢n", "cho b√©", "m√≥n", "nh√† c·ªçp", "eat clean", "c·ªßa", "le tran", "huyen", "vo", "truong", "hayes", "le tran", "bao van", "d·ªÖ l√†m", "?", "!", "th∆°m ph·ª©c", "ƒÉn d·∫∑m", ":"

]

def clean_title(title, blacklist):
    title = title.lower()

    # B·∫Øt t·∫•t c·∫£ c·ª•m 'c·ªßa <t√™n>'
    title = re.sub(r"c·ªßa [a-z√†-·ªπ\s]{2,30}", "", title)

    for phrase in blacklist:
        title = re.sub(r"\b" + re.escape(phrase) + r"\b", "", title)

    title = re.sub(r"\s+", " ", title).strip()
    return title.title()



for r in data:
    r["name"] = clean_title(r["name"], black_list)
with open("recipes_cleaned_title.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

    # Ki·ªÉm tra l·∫°i 5 d√≤ng ƒë·∫ßu sau khi clean


for r in random.sample(data, 50):
    print(r["name"])




Ch√°o H·∫°t V·ª° Th·ªãt G√† üçó N·∫•m ƒê√πi G√†
M·ª±c Mai X√†o ·ªöt Chu√¥ng .
üå± N∆∞·ªõc M√°t M·ªß Tr√¥m Ph·ªï Tai K·ª∑ T·ª≠
Ch√® Kh√∫c B·∫°ch Hoa Qu·∫£ üçíü•£
C√° Khoai N·∫•u Ng√≥t
Salad ƒê·∫≠u G√† ƒê·∫≠u ƒê·ªè Rong Nho
M·∫Øm T√©p Rang T√¥m Kh√¥, Th·ªãt
C√° M·∫Øm S·ªët C√† Chua (Chua Ng·ªçt) 19T (Luong Thu Hoai)
Tr√† T√°o Xanh B·∫°c H√†
C∆°m Tr·ª©ng M√¢y Ch·∫£ G√† X√∫c X√≠ch
C∆°m N·∫Øm S·ªët Choco
Ch√°o Th·ªãt B·∫±m C·∫£i Ng·ªçt
B√°nh Nh√¢n Ch·∫£ Gi√≤ Chi√™n Gi√≤n
Ng·ªìng Su H√†o X√†o Th·ªãt B√≤
·ª®c G√† S·ªët Cam
M·ª±c S·ªØa Rim N∆∞·ªõc M·∫Øm
G√† Chi√™n B·∫±ng Nckd
G√† √Åp Ch·∫£o S·ªët Chanh Leo, K√®m Salad
Rau Ti·∫øn Vua X√†o Heo R·ª´ng
Ch√°o Y·∫øn M·∫°ch B√≠ Ng√¥ (Ng√†y 8)
N∆∞·ªõc √âp C·ªß D·ªÅn V√† D∆∞a H·∫•u B·ªï M√°u ƒê·∫πp Da
Custard (Kem H·ªôt G√† Tr·ª©ng S·ªØa)
S·ªØa ƒê·∫≠u Xanh Mix K·ª∑ T·ª≠, T√°o ƒê·ªè
C√° Tr√™ Kho G·ª´ng .
X√¥i C·ªëm H·∫°t Sen
B·∫ßu Um H·ªôt V·ªãt L·ªôn
B√∫n Ch·∫£ Nem N∆∞·ªõng - Nhanh G·ªçn
Rau M·∫ßm ƒê√° Lu·ªôc
N∆∞·ªõc Cam Qu√Ωt V·∫Øt
B√°nh M√¨ X√° X