In [178]:
import os
import pandas as pd

# Directory that holds the first-pass cleaned recipe CSV files.
base_path = './first_clean_recipe/'

# Base names of the recipe tables. Every table comes as two files:
# "<stem>_eng.csv" and its counterpart "<stem>.csv" without the suffix.
recipe_stems = (
    'v1_JuanDiErShiLiaoZhuBing',
    'v1_JuanDiErZhuBanTangJian',
    'v1_JuanDiYiJuZhenYiZhuan_1',
    'v1_JuanDiYiJuZhenYiZhuan_2',
    'v1_JuanDiYiJuZhenYiZhuan_3',
)

# List of ("_eng" file, plain file) tuples, one per recipe table.
files = [(stem + '_eng.csv', stem + '.csv') for stem in recipe_stems]

In [162]:
import re

# Matches one "ingredient (quantity)" item inside an English ingredient string.
#   ingredient: one or more letters, whitespace, ’, [, ], “, ”, "." or "-".
#   quantity:   optional; a parenthesised run of digits, letters, whitespace,
#               ";", ",", “, ”, "." or "-". The captured text includes the
#               surrounding parentheses.
# Any amount of whitespace may separate the two groups.
pattern = re.compile(r'(?P<ingredient>[A-Za-z\s’\[\]“”\.-]+)\s*(?P<quantity>\([0-9A-Za-z\s;,“”\.-]+\))?')

#### Build a combined DataFrame of all recipes (English and Chinese)

In [179]:
# For every (english_file, other_file) pair, read both CSVs and place their
# columns side by side. The "_eng" file is "|"-separated, its counterpart is
# ","-separated.
dfs = [
    pd.concat(
        [
            pd.read_csv(os.path.join(base_path, eng_name), sep='|'),
            pd.read_csv(os.path.join(base_path, other_name), sep=','),
        ],
        axis=1,
    )
    for eng_name, other_name in files
]

# Stack all recipe tables into one frame with a fresh 0..n-1 index and
# export it as a "|"-separated CSV with a UTF-8 BOM.
full = pd.concat(dfs, ignore_index=True)
full.to_csv('./full_recipe.csv', sep='|', index=False, encoding='utf-8-sig')

#### Extracting ingredients from recipes

In [181]:
# Build one row per (recipe, ingredient) pair by aligning the English
# ingredient string (parsed with `pattern`) with the space-separated
# `Ingredients` column of the same recipe.
ingredient_columns = ['Food_Name', 'Food_Name_en', 'Ingredient', 'Ingredient_en', 'Amount']

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame with pd.concat inside the loop copies all previous
# rows on every iteration.
ingredient_records = []

for index, row in full.iterrows():
    # Skip recipes where either ingredient cell is missing (NaN/None);
    # a missing `Ingredients` cell would otherwise fail on `.split`.
    if pd.isna(row['Ingredients_en']) or pd.isna(row['Ingredients']):
        continue

    ingredient_en = row['Ingredients_en']
    # Drop one trailing full stop. endswith() is safe on an empty string,
    # unlike indexing with [-1].
    if ingredient_en.endswith('.'):
        ingredient_en = ingredient_en[:-1]

    # English side: collect (name, quantity) tuples; quantity is None when
    # the item has no parenthesised part.
    ingreds_en = []
    for match in pattern.finditer(ingredient_en):
        ingredient = match.group('ingredient').strip()
        if ingredient == '':
            # Whitespace-only match, nothing to record.
            continue
        if ingredient[-1] == '.':
            ingredient = ingredient[:-1]

        quantity = match.group('quantity')
        if quantity is not None:
            quantity = quantity.strip().replace('(', '').replace(')', '')

        ingreds_en.append((ingredient, quantity))

    # Other side: keep each non-empty token and skip the token that directly
    # follows it (presumably the amount -- verify against the data). Empty
    # tokens produced by repeated spaces are stepped over one at a time.
    ingredient_list = row['Ingredients'].split(' ')
    ingreds = []
    i = 0
    while i < len(ingredient_list):
        ingredient = ingredient_list[i]
        if ingredient == '':
            i = i + 1
            continue
        i = i + 2
        ingreds.append(ingredient)

    # Pair the two sides positionally; zip() stops at the shorter list.
    for ingredient, (ingredient_name_en, amount) in zip(ingreds, ingreds_en):
        ingredient_records.append(
            [row['Food_Name'], row['Food_Name_en'], ingredient, ingredient_name_en, amount]
        )

# The previous implementation prepended every new row, so the exported file
# lists the most recently processed ingredient first. Reverse the collected
# rows to keep that order.
ingredients = pd.DataFrame(ingredient_records[::-1], columns=ingredient_columns)

ingredients.to_csv('./ingredient/all_ingredient.csv', sep='|', index=False, encoding='utf-8-sig')

#### Extracting effects from recipes

In [182]:
# Build one row per (recipe, effect) pair by aligning the comma-separated
# English effect phrases with the "，"-separated `Effect` phrases.
effect_columns = ['Effect_en', 'Food_Name_en', 'Effect', 'Food_Name']

# Whole-word patterns for the filler words removed from the English text.
# Plain substring replacement of 'it ' / 'they' / leading 'and' also hit the
# inside of longer words (e.g. "benefit the" -> "benefthe").
pronoun_it = re.compile(r'\bit ')
pronoun_they = re.compile(r'\bthey\b')
leading_and = re.compile(r'^(?:\[and\]|and\b)')

# Rows are collected in a list and converted once; concatenating inside the
# loop copies all previous rows on every iteration.
effect_records = []

for index, row in full.iterrows():
    # Skip recipes where either effect cell is missing (NaN/None); a missing
    # `Effect` cell would otherwise fail on `.split`.
    if pd.isna(row['Effect_en']) or pd.isna(row['Effect']):
        continue

    # Normalise the English text: lower-case, drop the pronouns and all full
    # stops, then split into phrases on ", ".
    text_en = row['Effect_en'].lower()
    text_en = pronoun_it.sub('', text_en)
    text_en = pronoun_they.sub('', text_en)
    text_en = text_en.replace('.', '')

    eng_effects = []
    for e in text_en.split(', '):
        if e == '':
            continue
        # Remove a leading "and" / "[and]" conjunction, then trim whitespace.
        e = leading_and.sub('', e, count=1).strip()
        eng_effects.append((e, row['Food_Name_en']))

    zh_effects = [(e, row['Food_Name']) for e in row['Effect'].split('，')]

    # Pair the phrases positionally; zip() stops at the shorter list.
    for effect_en, effect_zh in zip(eng_effects, zh_effects):
        effect_records.append([effect_en[0], effect_en[1], effect_zh[0], effect_zh[1]])

# The previous implementation prepended every new row, so the exported file
# lists the most recently processed effect first. Reverse to keep that order.
effects = pd.DataFrame(effect_records[::-1], columns=effect_columns)
effects.to_csv('./effect.csv', index=False, sep='|', encoding='utf-8-sig')

#### Generating cooking method mapping

In [186]:
# Build a table linking every recipe in ./categorize/methods.csv to the
# English name of its cooking method and to the recipe's English name.
method_columns = ['method_en', 'Food_Name_en', 'method', 'Food_Name']

methods_chinese = pd.read_csv('./categorize/methods.csv')

method_translated = ['Boil', 'Simmer', 'Steam', 'Pan-fry', 'Bake', 'Braise', 'Broil', 'Stir-fry', 'Roast', 'Deep-fry']

# The i-th distinct value of the `method` column (in order of first
# appearance) is paired with the i-th entry of `method_translated`.
# NOTE(review): this depends on the row order of methods.csv and raises
# IndexError if the file has more distinct methods than translations --
# confirm the order against the data.
method_mapping = {}
for i, e in enumerate(list(methods_chinese['method'].unique())):
    method_mapping[e] = method_translated[i]

# Rows are collected in a list and converted once; concatenating inside the
# loop copies all previous rows on every iteration.
method_records = []

for index, row in methods_chinese.iterrows():
    name = row['Food_Name']
    method = row['method']
    method_en = method_mapping[method]

    # Look the recipe up once (the mask used to be evaluated twice per row)
    # and skip recipes that are not part of `full`.
    name_matches = full.loc[full['Food_Name'] == name, 'Food_Name_en']
    if name_matches.empty:
        continue
    # When several rows share the name, the first match is used.
    name_en = name_matches.iloc[0]

    method_records.append([method_en, name_en, method, name])

# The previous implementation prepended every new row, so the exported file
# lists the most recently processed recipe first. Reverse to keep that order.
methods = pd.DataFrame(method_records[::-1], columns=method_columns)

methods.to_csv('./method.csv', index=False, sep='|', encoding='utf-8-sig')

In [None]:
# Maps an effect phrase (exact string match) to an English category label.
# NOTE(review): several keys appear more than once in this literal (e.g.
# "治言语蹇涩", "治燥热", "治肺痿"), sometimes with different categories.
# Python keeps the value of the LAST occurrence, so e.g. "治言语蹇涩"
# resolves to "Neurological and Mental Health", not "Speech-related".
# Confirm which category is intended before removing the duplicates.
effects_categories = {
    "治语言蹇涩": "Speech-related",
    "治言语蹇涩": "Speech-related",
    "治言语错谬": "Speech-related",
    "治燥热": "Heat-clearing",
    "治五脏风热": "Heat-clearing",
    "治心脾风热": "Heat-clearing",
    "治烦热": "Heat-clearing",
    "治诸风": "Heat-clearing",
    "治惊风": "Heat-clearing",
    "治风狂": "Heat-clearing",
    "去风湿": "Heat-clearing",
    "治五藏邪气": "Heat-clearing",
    "治肾虚劳损": "Heat-clearing",
    "治阳气衰败": "Heat-clearing",
    "治肺痿": "Heat-clearing",
    "治风痹不仁": "Heat-clearing",
    "治中热": "Heat-clearing",
    "治五心烦躁": "Heat-clearing",
    "治元藏虚弱": "Heat-clearing",
    "治久痔": "Gastrointestinal Issues",
    "治肠风": "Gastrointestinal Issues",
    "治大便常有血": "Gastrointestinal Issues",
    "治下血不止": "Gastrointestinal Issues",
    "治野鸡病": "Gastrointestinal Issues",
    "治肛门肿满": "Gastrointestinal Issues",
    "治大肠滞涩": "Gastrointestinal Issues",
    "治下痢": "Gastrointestinal Issues",
    "治赤白泄痢": "Gastrointestinal Issues",
    "治泄痢": "Gastrointestinal Issues",
    "治肠胃不固": "Gastrointestinal Issues",
    "治大肠滞涩": "Gastrointestinal Issues",
    "治胃脾虚弱": "Gastrointestinal Issues",
    "治下痢": "Gastrointestinal Issues",
    "治赤白泄痢": "Gastrointestinal Issues",
    "治泄痢": "Gastrointestinal Issues",
    "治肠胃不固": "Gastrointestinal Issues",
    "治消渴": "Gastrointestinal Issues",
    "治脾胃久冷，不思饮食": "Gastrointestinal Issues",
    "治心腹冷气冲胁肋痛": "Gastrointestinal Issues",
    "心腹冷气痛": "Gastrointestinal Issues",
    "治久患冷气": "Gastrointestinal Issues",
    "治心腹结痛": "Gastrointestinal Issues",
    "治骨蒸": "Musculoskeletal Issues",
    "治筋骨烦痛": "Musculoskeletal Issues",
    "治腰膝痛": "Musculoskeletal Issues",
    "治腰脊酸疼": "Musculoskeletal Issues",
    "益气力": "Musculoskeletal Issues",
    "补虚羸": "Musculoskeletal Issues",
    "益元气": "Musculoskeletal Issues",
    "补益五脏": "Musculoskeletal Issues",
    "壮筋骨": "Musculoskeletal Issues",
    "治中风": "Neurological and Mental Health",
    "治头眩": "Neurological and Mental Health",
    "治羸瘦": "Neurological and Mental Health",
    "治精神昏愦": "Neurological and Mental Health",
    "治言语蹇涩": "Neurological and Mental Health",
    "治心燥": "Neurological and Mental Health",
    "治燥热": "Neurological and Mental Health",
    "治心志不宁": "Neurological and Mental Health",
    "治忧愁不乐": "Neurological and Mental Health",
    "治言语错谬": "Neurological and Mental Health",
    "治风狂": "Neurological and Mental Health",
    "治癫痫": "Neurological and Mental Health",
    "治骨蒸": "Musculoskeletal Issues",
    "治背强": "Musculoskeletal Issues",
    "治筋骨烦痛": "Musculoskeletal Issues",
    "治腰膝痛": "Musculoskeletal Issues",
    "治骨髓伤败": "Musculoskeletal Issues",
    "治腰背疼痛": "Musculoskeletal Issues",
    "治咳嗽唾血": "Respiratory Issues",
    "治上气咳嗽": "Respiratory Issues",
    "治肺痿": "Respiratory Issues",
    "治咳嗽唾血": "Respiratory Issues",
    "治上气咳嗽": "Respiratory Issues",
    "治喘急": "Respiratory Issues",
    "调顺肺气": "Respiratory Issues",
    "治消小便数": "Genitourinary Issues",
    "治小便淋涩": "Genitourinary Issues",
    "治水肿": "Genitourinary Issues",
    "治小便癃闭不通": "Genitourinary Issues",
    "治小便不通": "Genitourinary Issues",
    "治小便频数": "Genitourinary Issues",
    "治小便淋涩": "Genitourinary Issues",
    "治小便癃闭不通": "Genitourinary Issues",
    "治小便不通": "Genitourinary Issues",
    "治小便频数": "Genitourinary Issues",
    "治小便涩少": "Genitourinary Issues",
    "治小便数": "Genitourinary Issues",
    "治十种水病不瘥": "Genitourinary Issues",
    "治腹中水癖": "Genitourinary Issues",
    "益精气": "General Health and Wellness",
    "强心志": "General Health and Wellness",
    "耳目聪明": "General Health and Wellness",
    "补中益气": "General Health and Wellness",
    "补脾胃": "General Health and Wellness",
    "补益肾气": "General Health and Wellness",
    "益气力": "General Health and Wellness",
    "补虚羸": "General Health and Wellness",
    "益元气": "General Health and Wellness",
    "补益五脏": "General Health and Wellness",
    "延年益寿": "General Health and Wellness",
    "补精髓": "General Health and Wellness",
    "补中": "General Health and Wellness",
    "健脾胃": "General Health and Wellness",
    "补下元": "General Health and Wellness",
    "补中益气": "General Health and Wellness",
    "补益": "General Health and Wellness",
    "补虚益气": "General Health and Wellness",
    "治皮肤热疮": "General Health and Wellness",
    "治精神昏愦": "General Health and Wellness",
    "治心志不宁": "General Health and Wellness",
    "治心气惊悸，郁结不乐": "General Health and Wellness",
    "治虚弱骨蒸": "General Health and Wellness",
    "治四肢无力": "General Health and Wellness",
    "治心烦不得睡卧": "General Health and Wellness",
    "治心烦": "General Health and Wellness",
    "治久冷": "General Health and Wellness",
    "治诸虚": "General Health and Wellness",
    "治五劳七伤": "General Health and Wellness",
    "治骨髓伤败": "General Health and Wellness",
    "治虚弱": "General Health and Wellness",
    "治劳伤": "General Health and Wellness",
    "治心腹邪气": "General Health and Wellness",
    "治五藏邪气": "General Health and Wellness",
    "治肾虚弱": "General Health and Wellness",
    "治骨伤败": "General Health and Wellness",
    "治瘦弱无力": "General Health and Wellness",
    "治卒患腰痛": "General Health and Wellness",
    "治腰脚疼痛": "General Health and Wellness",
    "治肾虚衰弱": "General Health and Wellness",
    "治腰脚无力": "General Health and Wellness",
    "治肾虚": "General Health and Wellness",
    "治不能久立": "General Health and Wellness",
    "治身重气乏": "General Health and Wellness",
    "治盗汗": "General Health and Wellness",
    "治少食，时复吐利": "General Health and Wellness",
    "治卒患腰眼疼痛者": "General Health and Wellness",
    "治元脏虚冷": "General Health and Wellness",
    "治腹内冷痛": "General Health and Wellness",
    "治腰脊酸疼": "General Health and Wellness",
    "益精气": "General Health and Wellness",
    "强志": "General Health and Wellness",
    "益气和中": "General Health and Wellness",
    "主伤中": "Others",
    "去一切不正之气": "Others",
    "辟瘟疫": "Others",
    "除寒湿": "Others",
    "解化酒毒": "Others",
    "治湿痹": "Others",
}

In [None]:
# Load the effect table that is going to be categorised, using pandas'
# default CSV settings (comma separator).
df = pd.read_csv('./categorize/effect.csv')

In [None]:
def super_contains(s, l):
    """Tell whether ``s`` contains at least one of the elements of ``l``.

    Membership is tested with the ``in`` operator, so for a string ``s``
    each element is looked up as a substring. An empty ``l`` yields False.
    """
    return any(candidate in s for candidate in l)

In [None]:
# Keyword rules evaluated in the listed order. Every rule whose keywords occur
# in the effect text overrides the category chosen so far, so the LAST
# matching rule decides the final category.
keyword_rules = [
    (['腰', '手', '足'], 'Musculoskeletal Issues'),
    (['渴', '舌焦', '生津', '乾'], 'Gastrointestinal Issues'),
    (['风', '治歌笑无度', '治神情恍惚'], 'Neurological and Mental Health'),
    (['虚', '劳', '瘦', '弱', '温中', '顺气', '聪明耳目'], 'General Health and Wellness'),
    (['嗽', '胸', '膈', '肺'], 'Respiratory Issues'),
    (['胃', '脾', '肠', '肝', '肾', '腹', '便', '痢疾'], 'Gastrointestinal Issues'),
]

for row_label, record in df.iterrows():
    effect_text = record['Effect']
    # NaN is the only value that differs from itself: skip rows without text.
    if effect_text != effect_text:
        continue

    # Start from the exact-match lookup, falling back to 'Others'.
    category = effects_categories.get(effect_text, 'Others')
    for keywords, label in keyword_rules:
        if super_contains(effect_text, keywords):
            category = label

    df.loc[row_label, 'Category'] = category

In [None]:
# Write the table back to './categorize/effect.csv' without the index
# column, encoded as UTF-8 with a BOM.
df.to_csv('./categorize/effect.csv', index=False, encoding='utf-8-sig')