#### Extracting ingredients and effects, and creating English-to-Chinese category mappings

In [11]:
import os
import pandas as pd

# Directory that holds the first-pass cleaned recipe CSVs.
base_path = './first_clean_recipe/'

# English translations and their Chinese originals, listed in the same order
# so that position k in one tuple corresponds to position k in the other.
_english_files = (
    'v1_JuanDiErShiLiaoZhuBing_eng.csv',
    'v1_JuanDiErZhuBanTangJian_eng.csv',
    'v1_JuanDiYiJuZhenYiZhuan_1_eng.csv',
    'v1_JuanDiYiJuZhenYiZhuan_2_eng.csv',
    'v1_JuanDiYiJuZhenYiZhuan_3_eng.csv',
)
_chinese_files = (
    'v1_JuanDiErShiLiaoZhuBing.csv',
    'v1_JuanDiErZhuBanTangJian.csv',
    'v1_JuanDiYiJuZhenYiZhuan_1.csv',
    'v1_JuanDiYiJuZhenYiZhuan_2.csv',
    'v1_JuanDiYiJuZhenYiZhuan_3.csv',
)

# `files` is a list of (english_file, chinese_file) tuples.
files = list(zip(_english_files, _chinese_files))

In [12]:
import re

# Matches one entry of an English ingredient string.
#   ingredient: one or more letters, whitespace, the typographic apostrophe,
#               square brackets, curly double quotes, periods or hyphens.
#   quantity:   optional; a parenthesised run of digits, letters, whitespace,
#               ';', ',', curly double quotes, periods or hyphens. The captured
#               text still includes the surrounding parentheses.
pattern = re.compile(r'(?P<ingredient>[A-Za-z\s’\[\]“”\.-]+)\s*(?P<quantity>\([0-9A-Za-z\s;,“”\.-]+\))?')

#### Construct a DataFrame of all recipes (English and Chinese)

In [13]:
# For every (English, Chinese) file pair: read both CSVs (the English file is
# '|'-separated, the Chinese one ','-separated) and join them column-wise.
# The per-pair frames are then stacked into `full` and written to disk.
dfs = []
for english_name, chinese_name in files:
    english_part = pd.read_csv(os.path.join(base_path, english_name), sep='|')
    chinese_part = pd.read_csv(os.path.join(base_path, chinese_name), sep=',')
    dfs.append(pd.concat([english_part, chinese_part], axis=1))

full = pd.concat(dfs, ignore_index=True)
full.to_csv('./full_recipe.csv', sep='|', index=False, encoding='utf-8-sig')

#### Extracting ingredients from recipes

In [181]:
# Build one row per (recipe, ingredient) holding the Chinese name, the English
# name and the English amount, then write the table to
# ./ingredient/all_ingredient.csv.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end. The list is reversed before building the frame so that the most
# recently extracted row comes first, which is the row order the file has
# always had.
ingredient_columns = ['Food_Name', 'Food_Name_en', 'Ingredient', 'Ingredient_en', 'Amount']
ingredient_rows = []

for index, row in full.iterrows():
    # Skip recipes that lack either the English or the Chinese ingredient text;
    # there is nothing to pair up in that case.
    if pd.isna(row['Ingredients_en']) or pd.isna(row['Ingredients']):
        continue

    ingredient_list = row['Ingredients'].split(' ')
    ingredient_en = row['Ingredients_en']
    # Drop a single trailing period so it does not end up in the last name.
    if ingredient_en.endswith('.'):
        ingredient_en = ingredient_en[:-1]

    # English side: (name, quantity) tuples in order of appearance.
    ingreds_en = []
    for match in pattern.finditer(ingredient_en):
        ingredient = match.group('ingredient').strip()
        if ingredient == '':
            # The match consisted of whitespace only.
            continue
        if ingredient.endswith('.'):
            ingredient = ingredient[:-1]

        quantity = match.group('quantity')
        if quantity is not None:
            quantity = quantity.strip().replace('(', '').replace(')', '')

        ingreds_en.append((ingredient, quantity))

    # Chinese side: skip empty tokens one at a time; after a non-empty token is
    # taken, the token right after it is skipped as well.
    # NOTE(review): presumably the skipped token is the amount that follows
    # each ingredient name -- confirm against the data.
    ingreds = []
    i = 0
    while i < len(ingredient_list):
        ingredient = ingredient_list[i]
        if ingredient == '':
            i = i + 1
            continue
        i = i + 2
        ingreds.append(ingredient)

    # Pair Chinese and English entries by position; zip() drops any surplus
    # entries on the longer side.
    for ingredient, (ingredient_en, amount) in zip(ingreds, ingreds_en):
        ingredient_rows.append([row['Food_Name'], row['Food_Name_en'], ingredient, ingredient_en, amount])

ingredients = pd.DataFrame(ingredient_rows[::-1], columns=ingredient_columns)
ingredients.to_csv('./ingredient/all_ingredient.csv', sep='|', index=False, encoding='utf-8-sig')

#### Extracting effects from recipes

In [182]:
# Build one row per (recipe, effect) pairing each English effect phrase with
# the Chinese effect phrase at the same position, then write ./effect.csv.
#
# Rows are collected in a list and reversed before the DataFrame is built so
# that the most recently extracted row comes first, which is the row order the
# file has always had.
effect_columns = ['Effect_en', 'Food_Name_en', 'Effect', 'Food_Name']
effect_rows = []

for index, row in full.iterrows():
    # Skip recipes that lack either the English or the Chinese effect text.
    if pd.isna(row['Effect_en']) or pd.isna(row['Effect']):
        continue

    text = row['Effect_en'].lower()
    # Remove the pronoun "it " only when "it" is a whole word; a plain
    # substring replace would also eat the end of words such as "benefit ".
    text = re.sub(r'\bit ', '', text)
    text = text.replace('they', '').replace('.', '')

    eng_effects = []
    for e in text.split(', '):
        if e == '':
            continue
        # Strip a leading conjunction, either bracketed or as a whole word
        # (a word that merely begins with "and" is left intact).
        if e.startswith('[and]'):
            e = e[5:]
        elif re.match(r'and\b', e):
            e = e[3:]
        eng_effects.append(e.strip())

    # Chinese phrases are separated by the full-width comma.
    zh_effects = row['Effect'].split('，')

    # Pair by position; zip() drops any surplus entries on the longer side.
    for effect_en, effect_zh in zip(eng_effects, zh_effects):
        effect_rows.append([effect_en, row['Food_Name_en'], effect_zh, row['Food_Name']])

effects = pd.DataFrame(effect_rows[::-1], columns=effect_columns)
effects.to_csv('./effect.csv', index=False, sep='|', encoding='utf-8-sig')

#### Generating cooking method mapping

In [186]:
# Translate each recipe's cooking method to English and write ./method.csv
# with the columns method_en, Food_Name_en, method, Food_Name.
method_columns = ['method_en', 'Food_Name_en', 'method', 'Food_Name']

methods_chinese = pd.read_csv('./categorize/methods.csv')

# The k-th translation belongs to the k-th distinct value of the 'method'
# column in order of first appearance, so this list is tied to the row order
# of methods.csv.
method_translated = ['Boil', 'Simmer', 'Steam', 'Pan-fry', 'Bake', 'Braise', 'Broil', 'Stir-fry', 'Roast', 'Deep-fry']
unique_methods = list(methods_chinese['method'].unique())
if len(unique_methods) > len(method_translated):
    raise ValueError(
        f'methods.csv has {len(unique_methods)} distinct methods but only '
        f'{len(method_translated)} translations are listed'
    )
method_mapping = dict(zip(unique_methods, method_translated))

# Collect rows in a list and build the frame once; the list is reversed so the
# last processed recipe comes first, which is the row order the file has
# always had.
method_rows = []
for index, row in methods_chinese.iterrows():
    name = row['Food_Name']
    method = row['method']
    method_en = method_mapping[method]
    # English names of all recipes in `full` with this Chinese name.
    english_names = full.loc[full['Food_Name'] == name, 'Food_Name_en']
    if english_names.empty:
        # The recipe is not present in `full`; nothing to map.
        continue
    method_rows.append([method_en, english_names.values[0], method, name])

methods = pd.DataFrame(method_rows[::-1], columns=method_columns)
methods.to_csv('./method.csv', index=False, sep='|', encoding='utf-8-sig')

#### Generating category mapping

In [14]:
# Translate each recipe's category to English and write ./category.csv with
# the columns category_en, Food_Name_en, category, Food_Name.
category_columns = ['category_en', 'Food_Name_en', 'category', 'Food_Name']

category_chinese = pd.read_csv('./categorize/categories.csv')

# The k-th translation belongs to the k-th distinct value of the 'category'
# column in order of first appearance, so this list is tied to the row order
# of categories.csv.
# NOTE(review): 'Conge' looks like a misspelling of 'Congee'; it is left
# unchanged because the value is written to category.csv -- confirm before
# renaming.
category_translated = ['Paste', 'Pan-fry', 'Dish', 'Thick soup', 'Conge', 'Meat', 'Soup', 'Noodles', 'Rice noodles', 'Pancake', 'Thick liquid', 'Oil', 'Tea', 'Wonton', 'Steamed bun']
unique_categories = list(category_chinese['category'].unique())
if len(unique_categories) > len(category_translated):
    raise ValueError(
        f'categories.csv has {len(unique_categories)} distinct categories but only '
        f'{len(category_translated)} translations are listed'
    )
category_mapping = dict(zip(unique_categories, category_translated))

# Collect rows in a list and build the frame once; the list is reversed so the
# last processed recipe comes first, which is the row order the file has
# always had.
category_rows = []
for index, row in category_chinese.iterrows():
    name = row['Food_Name']
    category = row['category']
    category_en = category_mapping[category]
    # English names of all recipes in `full` with this Chinese name.
    english_names = full.loc[full['Food_Name'] == name, 'Food_Name_en']
    if english_names.empty:
        # The recipe is not present in `full`; nothing to map.
        continue
    category_rows.append([category_en, english_names.values[0], category, name])

# The variable keeps its original (misspelled) name so that any later cell
# referring to it still works.
cateogry = pd.DataFrame(category_rows[::-1], columns=category_columns)
cateogry.to_csv('./category.csv', index=False, sep='|', encoding='utf-8-sig')

#### Generating ingredient mapping

In [21]:
# Attach an English name to every categorised Chinese ingredient and write
# ./ingredient_category.csv with the columns Ingredient_en, category,
# Ingredient.
ingredient_category_columns = ['Ingredient_en', 'category', 'Ingredient']
ingredients = pd.read_csv('./ingredient/all_ingredient.csv', sep='|')

ingredient_category_chinese = pd.read_csv('./categorize/ingredient_category.csv')

# Collect rows in a list and build the frame once; the list is reversed so the
# last processed ingredient comes first, which is the row order the file has
# always had.
ingredient_category_rows = []
for index, row in ingredient_category_chinese.iterrows():
    name = row['Ingredient']
    category = row['category']
    # English names recorded for this Chinese ingredient; the first one in
    # all_ingredient.csv is used.
    english_names = ingredients.loc[ingredients['Ingredient'] == name, 'Ingredient_en']
    if english_names.empty:
        # The ingredient never occurs in the extracted recipes.
        continue
    ingredient_category_rows.append([english_names.values[0], category, name])

# The variable keeps its original (misspelled) name so that any later cell
# referring to it still works.
ingredient_cateogry = pd.DataFrame(ingredient_category_rows[::-1], columns=ingredient_category_columns)
ingredient_cateogry.to_csv('./ingredient_category.csv', index=False, sep='|', encoding='utf-8-sig')

#### Generating effect mapping

In [23]:
# Attach an English phrase to every categorised Chinese effect and write
# ./effect_category.csv with the columns Effect_en, category, Effect.
effect_category_columns = ['Effect_en', 'category', 'Effect']
effects = pd.read_csv('./effect.csv', sep='|')

effect_category_chinese = pd.read_csv('./categorize/effect.csv')

# Collect rows in a list and build the frame once; the list is reversed so the
# last processed effect comes first, which is the row order the file has
# always had.
effect_category_rows = []
for index, row in effect_category_chinese.iterrows():
    name = row['Effect']
    # This CSV spells its column 'Category' with a capital C.
    category = row['Category']
    # English phrases recorded for this Chinese effect; the first one in
    # effect.csv is used.
    english_names = effects.loc[effects['Effect'] == name, 'Effect_en']
    if english_names.empty:
        # The effect never occurs in the extracted recipes.
        continue
    effect_category_rows.append([english_names.values[0], category, name])

effect_category = pd.DataFrame(effect_category_rows[::-1], columns=effect_category_columns)
effect_category.to_csv('./effect_category.csv', index=False, sep='|', encoding='utf-8-sig')