In [None]:
from recipe_tagging.db import get_serverless_spark_session
from pydantic import BaseModel
from typing import Literal
import pandas as pd
from fuzzywuzzy import fuzz

In [2]:
class Args(BaseModel):
    """Run configuration for this notebook: market language and environment."""

    language: Literal["danish", "norwegian", "swedish"]
    env: Literal["dev", "test", "prod"]

    @property
    def language_id(self) -> int:
        """Return the numeric id for ``language`` (norwegian=1, swedish=5, danish=6).

        Raises:
            ValueError: If ``language`` is not one of the three supported values.
        """
        # Lookup table instead of an if/elif chain; kept local to the property.
        ids_by_language = {"norwegian": 1, "swedish": 5, "danish": 6}
        if self.language not in ids_by_language:
            raise ValueError("Invalid language")
        return ids_by_language[self.language]

In [3]:
# Notebook-wide run configuration; `args.language` is read further down for the
# NLTK stop-word list and the output model file names.
args = Args(language="norwegian", env="prod")

In [4]:
# `args` is created in the cell above. The id comes from Args.language_id so the
# language -> id mapping is defined in exactly one place.
language_id = args.language_id
spark = get_serverless_spark_session()

In [113]:
import pickle
import numpy as np

In [115]:
from recipe_tagging.predict.mappings import cuisine_mapping

# Model training

In [None]:
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.svm import SVC
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from recipe_tagging.predict.mappings import cuisine_mapping

import nltk
# 'wordnet' backs WordNetLemmatizer; 'stopwords' backs stopwords.words(...), which the
# TF-IDF vectorizer below calls. Without it that call raises LookupError on a machine
# that has never downloaded the corpus.
nltk.download('wordnet')
nltk.download('stopwords')


In [35]:

# Show full cell contents and do not truncate rows when a frame is displayed.
# NOTE(review): with max_rows=None any bare DataFrame/Series output prints every row.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

## New model training


In [5]:
# Pull up to 15,000 recipes (highest recipe_id first) for the selected `language_id`,
# restricted to rows where is_in_recipe_universe = false. Each row carries:
#   - taxonomy_name_list: comma-separated taxonomy names (taxonomy types matching
#     'seo_%' are excluded),
#   - generic_ingredient_name_list: comma-separated generic ingredient names in the
#     recipe's language,
#   - preference_name_combinations,
#   - is_low_calorie / is_high_fiber / is_low_fat / is_low_sugar flags computed from the
#     nutritional facts for portion_size = 4.
# NOTE(review): recipe_allergens is not aggregated per recipe, so a recipe linked to
# several preference combinations presumably produces several output rows — confirm.
# NOTE(review): taxonomy_list groups by recipe_id without filtering on language — confirm
# that names from other languages cannot end up in taxonomy_name_list.
# The Python variable `language_id` is interpolated into the SQL text, so it must be
# defined before this cell runs.
df = spark.sql(
    f"""
    with recipes as ( 
    select
        pk_dim_recipes
        , main_recipe_id
        , recipe_id
        , recipe_name
        , recipe_main_ingredient_id
        , recipe_main_ingredient_name_local
        , recipe_difficulty_level_id
        , cooking_time_from
        , cooking_time_to
        , language_id
        , is_in_recipe_universe
    from gold.dim_recipes dr
    ),

    taxonomies as (
        select
            pk_dim_taxonomies
            , taxonomy_id
            , language_id
            , taxonomy_name_local
        from gold.dim_taxonomies
        where taxonomy_type_name not like 'seo_%'
    ),

    taxonomy_list as (
        select
            recipes.recipe_id
            , concat_ws(', ', collect_list(taxonomies.taxonomy_name_local)) as taxonomy_name_list
            , concat_ws(', ', collect_list(taxonomies.taxonomy_id)) as taxonomy_id_list
            , size(collect_set(taxonomies.taxonomy_name_local)) as number_of_taxonomies
        from recipes
        left join gold.bridge_dim_recipes_dim_taxonomies as bridge
            on recipes.pk_dim_recipes = bridge.fk_dim_recipes
        left join taxonomies
            on bridge.fk_dim_taxonomies = taxonomies.pk_dim_taxonomies
        group by 1
    ),

    generic_ingredients as (
        select
            portions.recipe_id
            , ingredients.generic_ingredient_id
            , ingredient_translations.generic_ingredient_name
            , ingredient_translations.language_id
        from silver.pim__recipe_portions as portions
        left join silver.pim__chef_ingredient_sections as sections
            on portions.recipe_portion_id = sections.recipe_portion_id
        left join silver.pim__chef_ingredients as ingredients
            on
                sections.chef_ingredient_section_id
                = ingredients.chef_ingredient_section_id
        left join silver.pim__generic_ingredient_translations as ingredient_translations
            on ingredients.generic_ingredient_id = ingredient_translations.generic_ingredient_id
        where ingredients.generic_ingredient_id is not null
    ),

    generic_ingredient_list as (
        select
        recipes.recipe_id
        , concat_ws(', ', collect_set(generic_ingredients.generic_ingredient_name)) as generic_ingredient_name_list
        , concat_ws(', ', collect_set(generic_ingredients.generic_ingredient_id)) as generic_ingredient_id_list
        , size(collect_set(generic_ingredients.generic_ingredient_id)) as number_of_ingredients
        from recipes
        left join generic_ingredients
            on recipes.recipe_id = generic_ingredients.recipe_id
            and recipes.language_id = generic_ingredients.language_id
            group by 1
    ),

    recipe_allergens as (
        select
            recipes.recipe_id
            , dap.preference_name_combinations
            , dap.allergen_name_combinations
            , dap.taste_name_combinations_excluding_allergens
        from recipes
        left join intermediate.int_recipe_preferences_unioned irp
            on recipes.recipe_id = irp.recipe_id
        left join gold.dim_all_preference_combinations dap
            on irp.preference_combination_id = dap.pk_preference_combination_id
    ),

    nutritional_info as (
        select
            recipe_id
            , recipe_portion_id
            , portion_size
            , protein_gram_per_portion
            , carbs_gram_per_portion
            , fat_gram_per_portion
            , sat_fat_gram_per_portion
            , sugar_gram_per_portion
            , sugar_added_gram_per_portion
            , fiber_gram_per_portion
            , salt_gram_per_portion
            , fg_fresh_gram_per_portion
            , fg_proc_gram_per_portion
            , total_kcal_per_portion
        from mlgold.ml_recipe_nutritional_facts
        where portion_size = 4
    )

    select
        recipes.recipe_id
        , recipes.recipe_name
        , recipes.recipe_main_ingredient_name_local
        , recipes.language_id
        , recipes.cooking_time_to
        , taxonomy_list.taxonomy_name_list
        , generic_ingredient_list.generic_ingredient_name_list
        , recipe_allergens.preference_name_combinations
        , case 
        when nutritional_info.total_kcal_per_portion < 
        case 
            when recipes.language_id = 1 then 750 --norway
            when recipes.language_id = 5 then 550 -- sweden
            when recipes.language_id = 6 then 600 --denmark
        end
        and (nutritional_info.fg_fresh_gram_per_portion+nutritional_info.fg_proc_gram_per_portion) > 150 then true 
        else false
        end as is_low_calorie
        , case when nutritional_info.fiber_gram_per_portion > 10 then true else false end as is_high_fiber
        , case when nutritional_info.fat_gram_per_portion < (total_kcal_per_portion*0.3/9) then true else false end as is_low_fat
        , case when nutritional_info.sugar_gram_per_portion < (total_kcal_per_portion*0.07/4) then true else false end as is_low_sugar
    from recipes
    left join taxonomy_list
        on recipes.recipe_id = taxonomy_list.recipe_id
    left join generic_ingredient_list
        on recipes.recipe_id = generic_ingredient_list.recipe_id
    left join recipe_allergens
        on recipes.recipe_id = recipe_allergens.recipe_id
    left join nutritional_info
        on recipes.recipe_id = nutritional_info.recipe_id
    where recipes.is_in_recipe_universe = false
    and recipes.language_id = {language_id}
    order by recipes.recipe_id desc
    limit 15000
    """
)

In [6]:
# Convert the Spark result to pandas; from here on `df` is a pandas DataFrame
# (the name is reused, so the Spark DataFrame is no longer reachable).
df = df.toPandas()

In [8]:
# Norwegian keywords per cuisine label. They are concatenated with the keywords from
# `cuisine_mapping` and searched for (case-insensitively, as substrings) in recipe_name
# and taxonomy_name_list. A cuisine with an empty list contributes no extra keywords.
cuisine_keywords_norwegian = {
    'African': ['Afrika'],
    'American': ['Amerikansk Bbq', 'Amerika'],
    'Arabian': ['Marocco'],
    'Asian': ['Asia'],
    'British': ['England'],
    'Chinese': ['Kina'],
    'Danish': ['Tyskland'],  # NOTE(review): "Tyskland" is Germany — confirm it should map to Danish
    'French': ['Frankrike'],
    'Fusion Food': ['Fusion'],
    'Greek': ['Hellas'],
    'Hawaiian': [],
    'Indian': ['India'],
    'Italian': ['Italia'],
    'Japanese': ['Japan'],
    'Korean': ['Korea'],
    'Lebanese': ['Libanon'],
    'Mediterranean': ['Middelhavet'],
    'Mexican': ['Mexico'],
    'Middle Eastern': ['Midtøsten'],
    'Norwegian': ['Nordisk', 'Husmannskost'],
    'Persian': [],
    'South American': ['Sør- Amerika', 'Karibien', 'Karibia', 'Latin- Amerika'],
    'Spanish': ['Paella', 'Spania', 'Portugal'],
    'Swedish': [],
    'Tex-mex': ['Tex Mex '],  # NOTE(review): the trailing space is part of the keyword — confirm it is intended
    'Thai': ['Thai', 'Thailand'],
    'Vietnamese': ['Vietnam']
}

In [9]:
# Swedish keywords per cuisine label, with the same keys as the Norwegian and Danish
# dictionaries. A cuisine with an empty list contributes no extra keywords.
# NOTE(review): no cell visible in this notebook reads this dictionary — presumably it is
# swapped in by hand when running for Swedish; confirm.
cuisine_keywords_swedish = {
    'African': ['Afrika'],
    'American': ['American Bbq', 'Amerika', 'Kalifornien'],
    'Arabian': ['Marocko'],
    'Asian': ['Asiatisk', 'Asian', 'Asien'],
    'British': ['England'],
    'Chinese': ['Kina'],
    'Danish': ['Danska'],
    'French': ['Frankrike'],
    'Fusion Food': ['Fusion'],
    'Greek': ['Grekisk', 'Grekland', 'Grekiskt'],
    'Hawaiian': [],
    'Indian': ['Indian', 'Indien'],
    'Italian': ['Italien'],
    'Japanese': ['Japan'],
    'Korean': ['Korea'],
    'Lebanese': ['Libanon'],
    'Mediterranean': ['Mediterranean', 'Medelhavet', 'Medelhav'],
    'Mexican': ['Mexiko', 'Mexikanskt'],
    'Middle Eastern': ['Middle Eastern', 'Mellanöstern'],
    'Norwegian': ['Norska'],
    'Persian': ['Persisk', 'Persiska'],
    'South American': ['Sydamerika', 'Latinamerika', 'Karibien'],
    'Spanish': ['Spanien', 'Paella'],
    'Swedish': ['Husmanskost', 'Nordisk'],
    'Tex-mex': ['Tex Mex', 'Tex-mex'],
    'Thai': ['Thailand', 'Thai'],
    'Vietnamese': ['Vietnam']
}

In [8]:
# Danish keywords per cuisine label. They are concatenated with the keywords from
# `cuisine_mapping` in the exact-match cells further down. A cuisine with an empty list
# contributes no extra keywords.
cuisine_keywords_danish = {
    'African': ['Afrika'],
    'American': ['Usa', 'American Bbq', 'Californien'],
    'Arabian': ['Marokko'],
    'Asian': ['Asiatisk Mad', 'Asien', 'Asian'],
    'British': ['England'],
    'Chinese': ['Kina'],
    'Danish': ['Husmandskost', 'Nordisk', 'Tyskland'],  # NOTE(review): "Tyskland" is Germany — confirm
    'French': ['Frankrig'],
    'Fusion Food': ['Fusion'],
    'Greek': ['Grækenland'],
    'Hawaiian': [],
    'Indian': ['Indisk Mad', 'Indien'],
    'Italian': ['Italien'],
    'Japanese': ['Japan'],
    'Korean': ['Korea'],
    'Lebanese': ['Libanon'],
    'Mediterranean': ['Middelhavet', 'Middelhav', 'Mediterranean'],
    'Mexican': ['Mexicansk Mad', 'Mexico'],
    'Middle Eastern': ['Mellemøsten'],
    'Norwegian': [],
    'Persian': [],
    'South American': ['Sydamerika', 'Caribien'],
    'Spanish': ['Spanien', 'Portugal', 'Paella'],
    'Swedish': [],
    'Tex-mex': ['Tex Mex'],
    'Thai': ['Thailand'],
    'Vietnamese': ['Vietnam']
}

### Snorkel weak labels

In [None]:
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel

In [16]:
# Value a labeling function returns when none of its keywords match.
ABSTAIN = -1

In [15]:
# Give every cuisine in cuisine_mapping a consecutive integer label (0, 1, 2, ...)
# following the mapping's key order.
CUISINE_LABELS = {
    cuisine: label for label, cuisine in enumerate(cuisine_mapping.keys())
}


In [19]:
def keyword_lookup(x, keywords):
    """Return True if any keyword occurs in ``x`` as a substring, ignoring case.

    Args:
        x: Text to search. Non-string values (for example None or NaN coming from
            a missing column value) are treated as containing no keyword.
        keywords: Iterable of keyword strings. Empty keywords are skipped, because
            "" is a substring of every string and would match everything.

    Returns:
        bool: True when at least one keyword is found in ``x``.
    """
    if not isinstance(x, str):
        return False
    x_lower = x.lower()
    return any(bool(keyword) and keyword.lower() in x_lower for keyword in keywords)

def make_keyword_lf(label, keywords, fields=('recipe_name', 'taxonomy_name_list')):
    """Build a LabelingFunction that votes ``label`` when a keyword is found.

    Args:
        label: Value returned on a keyword hit.
        keywords: Keywords handed to ``keyword_lookup``.
        fields: Attribute names read from each data point, checked in order.

    Returns:
        A ``LabelingFunction`` named ``lf_<label>`` that returns ``label`` if any of
        ``fields`` contains a keyword, and ``ABSTAIN`` otherwise.
    """
    def lf(x):
        hit = any(keyword_lookup(getattr(x, name), keywords) for name in fields)
        return label if hit else ABSTAIN

    return LabelingFunction(name=f"lf_{label}", f=lf)

In [20]:
# Use the hand-curated keywords that belong to the configured language rather than a
# fixed market.
keywords_by_language = {
    "norwegian": cuisine_keywords_norwegian,
    "swedish": cuisine_keywords_swedish,
    "danish": cuisine_keywords_danish,
}
cuisine_keywords = keywords_by_language[args.language]

# One labeling function per cuisine: curated keywords plus the language-specific
# keywords stored in cuisine_mapping.
lfs = [
    make_keyword_lf(
        CUISINE_LABELS[key],
        cuisine_keywords[key] + cuisine_mapping[key][language_id],
    )
    for key in cuisine_mapping
]

In [None]:
# Run every labeling function over the rows of df to get the label matrix L_train,
# then fit a LabelModel with one class per entry in CUISINE_LABELS (fixed seed).
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df)
label_model = LabelModel(cardinality=len(CUISINE_LABELS), verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=100, seed=45)

In [23]:
# Label-model prediction for every row of L_train.
# NOTE(review): the next cell calls predict() again instead of reusing this variable.
preds_train = label_model.predict(L_train)

In [24]:
# Store the label-model prediction per recipe as an integer label.
df["weak_label"] = label_model.predict(L_train)

In [25]:
# Map integer labels back to cuisine names. Any weak_label that is not a value of
# CUISINE_LABELS (such as ABSTAIN = -1) has no entry and becomes "Unknown".
CUISINE_LABELS_REVERSED = {v: k for k, v in CUISINE_LABELS.items()}
df["cuisine_name"] = df["weak_label"].map(CUISINE_LABELS_REVERSED).fillna("Unknown")

In [None]:
# Number of recipes per assigned cuisine name.
df.cuisine_name.value_counts()

In [28]:
# Drop rows that repeat the same (recipe_name, cuisine_name) pair. A recipe name that
# appears with different cuisine names is kept once per cuisine.
df = df.drop_duplicates(subset=['recipe_name', 'cuisine_name'])

In [None]:
# Spot-check: up to 60 recipes whose cuisine_name contains "Swedish".
df[df['cuisine_name'].str.contains('Swedish')][['recipe_name', 'cuisine_name']].head(60)

### Exact keyword-matched labels (alternative to the Snorkel labels; overwrites `cuisine_name`)

In [None]:
# Print, for each cuisine, the lower-cased keywords used for exact matching in the
# configured language.
keywords_by_language = {
    "norwegian": cuisine_keywords_norwegian,
    "swedish": cuisine_keywords_swedish,
    "danish": cuisine_keywords_danish,
}
cuisine_keywords = keywords_by_language[args.language]

for key in cuisine_mapping:
    print(key)
    print([kw.lower() for kw in cuisine_keywords[key] + cuisine_mapping[key][language_id]])

In [13]:
# Start every recipe with an empty label. This overwrites any cuisine_name assigned
# earlier in the notebook.
df['cuisine_name'] = ''

In [None]:
# Keywords for the configured language rather than a fixed market.
keywords_by_language = {
    "norwegian": cuisine_keywords_norwegian,
    "swedish": cuisine_keywords_swedish,
    "danish": cuisine_keywords_danish,
}
cuisine_keywords = keywords_by_language[args.language]

# Lower-case each cuisine's keywords once up front instead of once per row.
keywords_by_cuisine = {
    key: [k.lower() for k in cuisine_keywords[key] + cuisine_mapping[key][language_id]]
    for key in cuisine_mapping
}

cuisine_names = []
for recipe_name, taxonomy_list, current_name in zip(
    df['recipe_name'], df['taxonomy_name_list'], df['cuisine_name']
):
    # Missing values (None/NaN) are not searched.
    texts = [value.lower() for value in (recipe_name, taxonomy_list) if isinstance(value, str)]

    # Collect matches in cuisine_mapping order so the joined label is deterministic.
    # Joining a set can give "A, B" in one session and "B, A" in another.
    matched_cuisines = [
        key
        for key, keywords in keywords_by_cuisine.items()
        if any(keyword in text for text in texts for keyword in keywords)
    ]

    # Keep the existing value when nothing matched.
    cuisine_names.append(', '.join(matched_cuisines) if matched_cuisines else current_name)

df['cuisine_name'] = cuisine_names


In [27]:
# NOTE(review): repeats the pandas import and the display options already set near the
# top of the notebook; likely only needed when cells are run out of order.
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [19]:
# Work on a copy so the label clean-up below does not modify df.
tst = df.copy()

In [None]:
# Number of recipes per (possibly multi-cuisine) label before clean-up.
tst.cuisine_name.value_counts()

In [None]:
# Spot-check: up to 60 recipes whose label contains the exact text "British, Norwegian".
# NOTE(review): this only matches that order of the two names; labels joined in the
# opposite order are not shown.
tst[tst['cuisine_name'].str.contains('British, Norwegian')][['recipe_name', 'cuisine_name']].head(60)

In [28]:
def clean_cuisine(cuisine_str):
    """Collapse a comma-separated cuisine label into a single, more specific label.

    Rules, applied in order:
      1. Four or more cuisines -> 'Fusion Food'.
      2. Otherwise the first hit in the priority groups below wins: Lebanese/Persian,
         then 'Fusion Food', then the specific Asian cuisines, then the specific
         Mediterranean cuisines.
      3. If nothing matches, the cuisines are returned re-joined with ', ' (each
         name stripped of surrounding whitespace).

    The groups are ordered tuples rather than sets, so when a label contains several
    cuisines from the same group (e.g. 'Indian, Thai') the same one is returned on
    every run instead of depending on set iteration order.

    Args:
        cuisine_str: Comma-separated cuisine names; may be the empty string.

    Returns:
        str: The cleaned label.
    """
    priority_groups = (
        ('Lebanese', 'Persian'),
        ('Fusion Food',),
        ('Indian', 'Japanese', 'Chinese', 'Vietnamese', 'Korean', 'Thai'),
        ('Greek', 'Spanish', 'French', 'Italian'),
    )
    cuisines = [c.strip() for c in cuisine_str.split(',')]

    if len(cuisines) >= 4:
        return 'Fusion Food'

    for group in priority_groups:
        for cuisine in group:
            if cuisine in cuisines:
                return cuisine

    return ', '.join(cuisines)

In [29]:
# Reduce each multi-cuisine label with clean_cuisine.
tst['cuisine_name'] = tst['cuisine_name'].apply(clean_cuisine)

In [32]:
# Remove recipes that received no label at all.
tst = tst.loc[tst['cuisine_name'].ne('')]
# Keep only labels that occur at least 10 times among the remaining recipes.
cuisine_counts = tst['cuisine_name'].value_counts()
frequent_labels = cuisine_counts.loc[cuisine_counts.ge(10)].index
tst = tst.loc[tst['cuisine_name'].isin(frequent_labels)]


In [33]:
# Drop rows that repeat the same (recipe_name, cuisine_name) pair.
tst = tst.drop_duplicates(subset=['recipe_name', 'cuisine_name'])

In [34]:
# Labels that still hold several cuisines ("A, B") are split on ", " and exploded, so
# such a recipe appears once per cuisine.
tst['cuisine_name'] = tst['cuisine_name'].str.split(', ')
tst = tst.explode('cuisine_name')


### train model

In [36]:
# From here on df is the cleaned, exploded label frame; the previous df is replaced.
df = tst.copy()

In [8]:
import re
from sklearn.svm import SVC
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [16]:
# drop all rows where cuisine_name is Unknown
df = df[df['cuisine_name'] != 'Unknown']

In [9]:
# Turn the comma-separated ingredient string into a list of ingredient names.
df['ingredients'] = df['generic_ingredient_name_list'].str.split(', ')

In [7]:
# NOTE(review): this keeps only the first 5 rows of df, so everything below (TF-IDF,
# hyper-parameter search, final fit) runs on 5 recipes. It looks like debugging
# residue — confirm and remove before training a real model.
df = df.head()

In [11]:
def process_string(x):
    """Turn a list of ingredient names into one normalised, space-separated string.

    Each ingredient is lemmatised word by word with ``WordNetLemmatizer``, every
    character outside a-z/A-Z and æøåäö/ÆØÅÄÖ is replaced by a space, the
    ingredients are joined with single spaces, and the result is lower-cased.

    Args:
        x: Iterable of ingredient strings. ``None`` or a float (how pandas
            represents a missing value) yields an empty string instead of raising.

    Returns:
        str: The cleaned ingredient document.
    """
    if x is None or isinstance(x, float):
        return ""
    # One lemmatizer and one compiled pattern per call instead of one per word.
    # NOTE(review): WordNetLemmatizer is applied to Scandinavian text here — confirm
    # this is intended; changing it would also affect any already-saved vectorizer.
    lemmatizer = WordNetLemmatizer()
    non_letters = re.compile("[^a-zA-ZæøåäöÆØÅÄÖ]")
    cleaned = []
    for ingredient in x:
        lemmatised = " ".join(lemmatizer.lemmatize(word) for word in ingredient.split())
        cleaned.append(non_letters.sub(" ", lemmatised))
    return " ".join(cleaned).lower()

In [12]:
# Replace each ingredient list with its cleaned, space-separated text.
# NOTE(review): this overwrites the list column in place, so re-running the cell feeds
# strings (not lists) into process_string.
df['ingredients'] = df['ingredients'].apply(process_string)

In [None]:
# Features are the cleaned ingredient texts; the target is the cuisine name.
X = df['ingredients']
y = df['cuisine_name']

In [43]:
def tfidf_vectorizer(train, test=None):
    """Fit a word-level TF-IDF vectorizer on ``train`` and optionally transform ``test``.

    Stop words are taken from ``stopwords.words(args.language)``, where ``args`` is
    the notebook-level configuration object.

    Args:
        train: Documents to fit on and transform.
        test: Optional documents to transform with the fitted vectorizer.

    Returns:
        ``(train_matrix, vectorizer)`` when ``test`` is None, otherwise
        ``(train_matrix, test_matrix, vectorizer)``.
    """
    vectorizer = TfidfVectorizer(
        stop_words=stopwords.words(args.language),
        ngram_range=(1, 1),
        analyzer="word",
        max_df=.57,
        binary=False,
        token_pattern=r'\w+',
        sublinear_tf=False,
    )
    train_matrix = vectorizer.fit_transform(train)
    if test is None:
        return train_matrix, vectorizer
    test_matrix = vectorizer.transform(test)
    return train_matrix, test_matrix, vectorizer

In [44]:
# Fit the vectorizer on all texts. `train` is the matrix read by evalfn and the final
# fit below; `tfidf` is the fitted vectorizer that gets pickled at the end.
train_tfidf, tfidf = tfidf_vectorizer(X)
train = train_tfidf

In [None]:
def evalfn(C, gamma):
    """Objective for the hyper-parameter search: mean 3-fold micro-F1 of an RBF SVC.

    Args:
        C: SVC regularisation parameter proposed by the optimiser.
        gamma: RBF kernel coefficient proposed by the optimiser.

    Returns:
        float: Mean of the per-fold ``f1_micro`` scores, computed on the
        notebook-level ``train`` matrix and ``y`` labels.
    """
    # probability=True is left off here: scoring uses predict(), and probability
    # estimates only add an extra internal cross-validation to every fit.
    s = SVC(C=float(C), gamma=float(gamma), kernel='rbf', class_weight='balanced')
    f = cross_val_score(s, train, y, cv=3, scoring='f1_micro')
    # Average over the folds. Returning the best fold would reward a lucky split
    # rather than settings that generalise.
    return f.mean()

# Search C and gamma: 3 random starting points followed by 20 guided iterations.
# A fixed random_state makes the search repeatable between runs.
opt = BayesianOptimization(
    evalfn,
    {'C': (0.1, 1000), 'gamma': (0.0001, 1)},
    random_state=45,
)
opt.maximize(n_iter=20, init_points=3)

In [None]:
# Best score found and the parameters that produced it.
opt.max

In [None]:
# Refit on all data with the best hyper-parameters found. class_weight='balanced'
# matches the estimator scored in evalfn, so the tuned C/gamma apply to the same model.
# probability=True is kept so the saved model can return class probabilities.
clf = SVC(
    C=float(opt.max['params']['C']),
    gamma=float(opt.max['params']['gamma']),
    kernel='rbf',
    class_weight='balanced',
    probability=True,
)
clf.fit(train, y)

In [None]:
# Sanity check: this language name is used in the model file names written below.
args.language

In [49]:
os.makedirs('models', exist_ok=True)

# Context managers make sure each file is flushed and closed once its dump finishes.
with open(f'models/{args.language}_model_keyword.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)
with open(f'models/{args.language}_vectorizer_keyword.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)