In [1]:
import pandas as pd
import re

In [2]:
recipes_df = pd.read_csv('./resource/recipes.csv')
comment_df = pd.read_csv('./resource/reviews.csv')

In [3]:
comment_df.dropna(inplace=True)
recipes_df.dropna(inplace=True)

In [4]:
indexImage = recipes_df[( recipes_df['Images'] == 'character(0)')].index
recipes_df.drop(indexImage, inplace=True)

indexImage = recipes_df[(recipes_df['Images'] == 'c("")' )].index
recipes_df.drop(indexImage, inplace=True)


In [5]:
recipes_df.to_csv('./out/recipes.csv', sep=',', encoding='utf-8', index=False)

### Insert to database

In [2]:
import pymongo 

In [29]:
import re

def convert_to_array(string):
    # Remove leading and trailing quotation marks if present
    string = string.strip('"\'')

    # Check if the input string is empty
    if not string:
        return []  # Return an empty list if the input is empty

    # Check if the input string contains "c(" and ")"
    if 'c(' in string and ')' in string:
        # Use regular expression to split the string at commas, considering quotes and spaces
        array = re.findall(r'"[^"]*"|[^",\s]+', string)

        array[:] = (value for value in array if value != '')        
        array[:] = (value for value in array if value != 'c(')        
        array[:] = (value for value in array if value != ')')        
        array[:] = (value for value in array if value != ' ')        
        # Remove any leading or trailing whitespace from each element
        array = [s.strip('"') for s in array]
        return list(set(array))
    else:
        return list(set([string]))  # Return the input string as a single-element list

In [31]:
client = pymongo.MongoClient("mongodb://root:123456@localhost:27017/?authMechanism=DEFAULT")
db = client["IR"] 
col = db["recipe"] 

cleaned_recipe_collection = db['cleaned_recipe']

for recipe in col.find({}): 
    cleaned_recipe = {
        'RecipeId': recipe['RecipeId'] ,
        'Name': recipe['Name']  ,
        'AuthorId': recipe['AuthorId'] ,
        'AuthorName': recipe['AuthorName'] ,
        'CookTime':  recipe['CookTime'] ,
        'PrepTime':  recipe['PrepTime'],
        'TotalTime': recipe['TotalTime'] ,
        'DatePublished': recipe['DatePublished'] ,
        'Description': recipe['Description'],
        'Images': convert_to_array(recipe['Images']),
        'RecipeCategory': recipe['RecipeCategory'],
        'Keywords': convert_to_array(recipe['Keywords']),
        'RecipeIngredientQuantities': convert_to_array(recipe['RecipeIngredientQuantities']),
        'RecipeIngredientParts':  convert_to_array(recipe['RecipeIngredientParts']),
        'AggregatedRating': recipe['AggregatedRating'],
        'Calories': recipe['Calories'],
        'FatContent': recipe['FatContent'],
        'SaturatedFatContent': recipe['SaturatedFatContent'],
        'CholesterolContent': recipe['CholesterolContent'],
        'SodiumContent': recipe['SodiumContent'],
        'CarbohydrateContent': recipe['CarbohydrateContent'],
        'FiberContent': recipe['FiberContent'],
        'SugarContent': recipe['SugarContent'],
        'ProteinContent': recipe['ProteinContent'],
        'RecipeServings': recipe['RecipeServings'],
        'RecipeYield': recipe['RecipeYield'],
        'RecipeInstructions':  convert_to_array(recipe['RecipeInstructions']),
    }

    # print(cleaned_recipe)
    cleaned_recipe_collection.insert_one(cleaned_recipe)


In [3]:
client = pymongo.MongoClient("mongodb://root:123456@localhost:27017/?authMechanism=DEFAULT")
db = client["IR"] 
col = db["recipe"] 
cleaned_recipe_collection = db['cleaned_recipe']

### Try Query

In [4]:
myquery = { "RecipeCategory": "Quick Breads" }
results = cleaned_recipe_collection.find(myquery).limit(5)

for match in results:
    print(match)

{'_id': ObjectId('65d5e48d8598535be43eb8f5'), 'RecipeId': '220', 'Name': 'Chocolate Zucchini Bread', 'AuthorId': '1604', 'AuthorName': 'itsjustme', 'CookTime': 'PT1H', 'PrepTime': 'PT25M', 'TotalTime': 'PT1H25M', 'DatePublished': '1999-08-07T09:18:00Z', 'Description': 'Make and share this Chocolate Zucchini Bread recipe from Food.com.', 'Images': ['https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/22/0/2rp90sY9Q96l93ATXIsR_zucchinnibread1%20(1%20of%201).jpg', 'https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/22/0/pic6dBbjd.jpg', 'https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/22/0/zrkZ5Dc3RdyIotK1LwV7_IMG_4718.JPG', 'https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/22/0/NiKUyiO5Tt2T3IQzUpRi_IMG_4717.JPG', 'https://img.sndimg.com/food/image/upload/w_555,h_416,c_fit,fl_progressive,q_95/v1/img/recipes/22/0/picV

### Put the recipe into elasticsearch

In [5]:
from elasticsearch import Elasticsearch
import json
from bson import ObjectId
from elasticsearch import helpers


In [6]:
class ElasticIndexer:
    def __init__(self):
        self.es_client = Elasticsearch('https://localhost:9200', basic_auth=("elastic", "6E0GWL_MEddnKJWCnk*M"),
                    ca_certs="./http_ca.crt")
        self.mongo_client = pymongo.MongoClient("mongodb://root:123456@localhost:27017/?authMechanism=DEFAULT")
        self.mongo_collection = self.mongo_client['IR']['cleaned_recipe']

    def run_indexer(self):
        self.es_client.indices.create(index='simple', ignore=400)
        self.es_client.indices.delete(index='simple', ignore=[400, 404])
        actions = []
        
        for doc in self.mongo_collection.find({}):
            _id = str(doc['_id'])
            del doc["_id"]
            action = {
                "_index": 'recipe',
                "_id": _id,
                "_source": doc
            }
            actions.append(action)

        response = helpers.bulk(self.es_client, actions)
        return response


In [7]:
es = ElasticIndexer()
# es.run_indexer()

In [8]:
es_client = Elasticsearch('https://localhost:9200', basic_auth=("elastic", "6E0GWL_MEddnKJWCnk*M"),
                    ca_certs="./http_ca.crt")

In [9]:
query = {
    "query_string": {
        "query": "Sweet Rolls Easy"
    }
}

results = es_client.search(index='recipe', query=query)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score'])
results_df

Unnamed: 0,Name,Keywords,Ingredient,score
0,Breakfast Egg Rolls,"[Brunch, < 60 Mins]","[onion, green pepper, eggs, cheese, water, mil...",13.042132
1,Honey Orange Butter,"[Beginner Cook, Inexpensive, < 15 Mins, Low Pr...","[honey, butter, margarine]",12.088933
2,Easy Crunchy Toffee,"[Easy, Beginner Cook, For Large Groups]","[butter, chocolate chips, brown sugar]",11.833335
3,The Best Sweet Potato Casserole,"[Beginner Cook, Inexpensive, Potato, Kid Frien...","[vanilla, butter, salt, brown sugar, eggs, wal...",10.93132
4,Sweet Kielbasa,"[Potluck, For Large Groups, Meat]","[kielbasa, brown sugar, water, apple cider, on...",10.792521
5,Hawaiian Bread Ham &amp; Cheese Rolls,[For Large Groups],"[butter, prepared yellow mustard, deli ham, sw...",10.75276
6,Sweet Potato Pancakes With Caramel Sauce,"[< 30 Mins, Potato, Yam/Sweet Potato, Vegetabl...","[butter, canned sweet potatoes, ground allspic...",10.670794
7,Sweet Rolls,"[Sweet, Kid Friendly, Thanksgiving, For Large ...","[active dry yeast, butter, ground cinnamon, sa...",10.655209
8,Mini Cajun Burgers With Easy R&eacute;moulade,"[Cajun, < 30 Mins]","[creole mustard, sausage, green leaf lettuce, ...",10.290481
9,Coconut Shrimp With Guava Sweet and Sour Sauce,"[< 30 Mins, Summer, Polynesian, Fruit, Weeknight]","[coconut flakes, panko breadcrumbs, white vine...",10.101153


### Recommendations

In [10]:
# Get top 5 similar to the selected dish

query = { 
   "more_like_this":{
    # Breakfast Eggcake ID:'65d5e4928598535be43ec668' 
    "fields":["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],"like":[{"_id": '65d5e4928598535be43ec668'}],"min_term_freq":1,"min_doc_freq":5,"max_query_terms":20
    }
}

results = es_client.search(index='recipe', query=query, size=5)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score'])
results_df

Unnamed: 0,Name,Keywords,Ingredient,score
0,Breakfast Sandwich,"[Beginner Cook, Kid Friendly, < 15 Mins, Stove...","[butter, bagel, ham, sausages, cream cheese, c...",24.269608
1,Breakfast Brownies,"[Kid Friendly, < 60 Mins, Toddler Friendly, Ea...","[skim milk, applesauce, egg, Bisquick, sugar]",22.52652
2,On-The-Go Breakfast Sandwich,"[Beginner Cook, Microwave, Pork, Meat, < 15 Mi...","[cheese, English muffin, milk, egg, bacon]",22.49757
3,Twiced Baked Potato Skins,"[Potato, Kid Friendly, < 15 Mins, Vegetable, C...","[monterey jack and cheddar cheese blend, parsl...",22.332865
4,Simplified Banana Oat Pancakes,"[Beginner Cook, Inexpensive, Kid Friendly, < 1...","[nutmeg, banana, all-purpose flour, canola oil...",22.148333


### Personalized by interestedCategory

In [11]:
users = db['users']

user = users.find({'username': 'KanK'})
user_df = pd.DataFrame(user)

In [12]:
user_df['interestedCategory'][0]

['Beginner Cook', 'Kid Friendly', 'Easy']

In [13]:
query = {
    "match": {
        "Keywords": {
        "query": ' '.join(user_df['interestedCategory'][0])
        }
    }
}

results = es_client.search(index='recipe', query=query, size=15)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score'])
results_df

Unnamed: 0,Name,Keywords,Ingredient,score
0,--V's Kicked up Baked Beans (Slow Cooker),"[Easy, Beginner Cook, Kid Friendly]","[barbecue sauce, pinto beans, ketchup, honey, ...",9.920821
1,Easy Grands Cheese Pizzas,"[Easy, Beginner Cook, Kid Friendly, < 30 Mins]",[mozzarella cheese],8.867212
2,Seasoned Goldfish Crackers,"[Easy, Beginner Cook, Kid Friendly, < 30 Mins]","[garlic powder, cayenne pepper, lemon-pepper s...",8.867212
3,Mama Mac's Bundt Pound Cake,"[Easy, Beginner Cook, Kid Friendly, < 60 Mins]","[vanilla, powdered sugar, eggs, instant lemon ...",8.867212
4,Loco Hot Cocoa,"[Easy, Beginner Cook, Kid Friendly, < 15 Mins]","[instant coffee, milk]",8.867212
5,Annie's Melt-In-Your-Mouth Mahi (Fish),"[Easy, Beginner Cook, Kid Friendly, < 30 Mins]","[dried dill weed, fresh rosemary, dried rosema...",8.867212
6,Garlic Shrimp and Orzo Salad,"[Easy, Beginner Cook, Kid Friendly, < 15 Mins]","[cucumber, green pepper, red onion, raw shrimp...",8.867212
7,2bleu's 2easy Boboli,"[Easy, Beginner Cook, Kid Friendly, < 30 Mins]","[active dry yeast, instant minced garlic, salt...",8.867212
8,Bolognese Style Pan Pizza,"[Easy, Beginner Cook, Kid Friendly, < 60 Mins]","[Italian sausage, ricotta cheese, lean ground ...",8.867212
9,Delicious Walking Tacos,"[Easy, Beginner Cook, Kid Friendly, < 15 Mins]","[lettuce, sour cream, ground beef, cheddar che...",8.867212


### Score by weight

In [73]:
query = {
    "dis_max": {
        "queries": [
            {
                "more_like_this": {
                    "fields": ["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],
                    # Anzac Biscuits
                    "like": [{"_id": "65d5e48d8598535be43eb8f3"}],
                    "min_term_freq": 1,
                    "min_doc_freq": 5,
                    "max_query_terms": 20,
                    "boost": 1.5
                }
            },
            {
                "more_like_this": {
                    "fields": ["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],
                    # TOFU KEBAB
                    "like": [{"_id": "65d5e48d8598535be43eb8f1"}],
                    "min_term_freq": 1,
                    "min_doc_freq": 5,
                    "max_query_terms": 20,
                    "boost": 1
                }
            }
            #   More Recipe in bookmarks 
            #   Does boost make sense or not?
        ]
    }
}


results = es_client.search(index='recipe', query=query, size=15)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score'])
results_df

Unnamed: 0,Name,Keywords,Ingredient,score
0,Anzac Biscuits,"[Beginner Cook, Inexpensive, < 60 Mins, For La...","[butter, plain flour, coconut, rolled oats, go...",74.87733
1,Anzac Biscuits,"[< 30 Mins, Cookie & Brownie, Australian, Grai...","[butter, plain flour, lemon rind, rolled oats,...",71.968285
2,Anzac Biscuits (Cookies),"[Inexpensive, Free Of..., Kid Friendly, Egg Fr...","[butter, plain flour, brown sugar, rolled oats...",71.25061
3,Chocolate Anzac Biscuits,"[Beginner Cook, < 30 Mins, Cookie & Brownie, F...","[butter, plain flour, rolled oats, dark cookin...",68.098434
4,Anzac Biscuits,"[< 30 Mins, Cookie & Brownie, Australian, Dess...","[butter, coconut, golden syrup, boiling water,...",65.13977
5,Anzac Biscuits With Macadamias (Australian),"[< 30 Mins, Coconut, Free Of..., Egg Free, Swe...","[butter, plain flour, white sugar, rolled oats...",56.711666
6,Healthy Vegetable Tofu Kebabs,"[< 30 Mins, Peppers, Vegetable, Beans]","[firm tofu, salt, green pepper, button mushroo...",51.347355
7,Crunchy Ginger Snaps (Similar to Arnotts Ginge...,"[Easy, For Large Groups]","[butter, self raising flour, eggs, golden syru...",44.89489
8,Whole Grain Cookies,"[Brunch, < 30 Mins, Summer, Cookie & Brownie, ...","[Weetabix, brown sugar, rolled oats, golden sy...",39.848045
9,Tofu and Bok Choy Stir Fry,"[Vegan, < 15 Mins, Chinese, Vegetable, Low Cho...","[bok choy, low sodium soy sauce, extra firm to...",39.541008


### Not interested topic?

In [97]:
prefer = ['Beginner Cook', 'Kid Friendly', 'Easy'] # boost up

un_prefer = ['Australian', 'For Large Groups', 'Oven'] # boost down

query = {
    "dis_max": {
        "queries": [
            {
                "query_string": {
                    "query": ' '.join(prefer),
                    "boost": 1.2
                }
            },
            {
                "query_string": {
                    "query": ' '.join(un_prefer),
                    "boost": 0.8
                }                
            }
        ]
    }
}

results = es_client.search(index='recipe', query=query, size=100)
results_df = pd.DataFrame([[hit["_source"]['Name'], hit["_source"]['Keywords'], hit["_source"]
['RecipeIngredientParts'], hit["_score"]] for hit in results['hits']['hits']], columns=['Name', 'Keywords', 'Ingredient', 
'score']).sort_values(by=['score'], ascending=True)

print('Unprefer recipe score')
print(results_df.head().to_markdown())
print('Prefer recipe score')
print(results_df.tail().to_markdown())

Unprefer recipe score
|    | Name                                  | Keywords                             | Ingredient                                                                                                                                                                                                                                            |   score |
|---:|:--------------------------------------|:-------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------:|
| 99 | Fresh Pumpkin (Canned, Puree, Frozen) | ['Easy', 'Inexpensive', '< 4 Hours'] | ['pumpkin']                                                                                                                                                                                                 

### Ranking evaluation 

In [49]:
query = {
   "more_like_this":{
        "fields":["Name", "Keywords", "RecipeIngredientParts", "RecipeCategory"],"like":[{"_id": "65d5e4928598535be43ec8fc"}],"min_term_freq":1,"min_doc_freq":5,"max_query_terms":20
    }
}

ratings = [
     { "_index": "recipe", "_id": "65d5e4928598535be43ec668", "rating": 0 },
     { "_index": "recipe", "_id": "65d5e48e8598535be43eba08", "rating": 3 },
     { "_index": "recipe", "_id": "65d5e4918598535be43ec3eb", "rating": 0 },
     { "_index": "recipe", "_id": "65d5e4928598535be43ec8fc", "rating": 5 },
     { "_index": "recipe", "_id": "65d5e49a8598535be43ee0c9", "rating": 2 },
     { "_index": "recipe", "_id": "65d5e49a8598535be43ee26f", "rating": 1 },
     { "_index": "recipe", "_id": "65d5e49b8598535be43ee65e", "rating": 4 },
     { "_index": "recipe", "_id": "65d5e49b8598535be43ee374", "rating": 5 },
]

requests = [
    { "id": "Query_1", "request": {"query": query} , 'ratings': ratings}
]

metric =  {
    "dcg": {
      "k": 20,
      "normalize": False
    }
}

results = es_client.rank_eval(index='recipe', requests=requests, metric= metric)
# mertic_score_df = pd.DataFrame()

# mertic_score_df
results['metric_score']

0.7879486051115807

In [50]:
results_df = pd.DataFrame([[hit['hit']["_id"], hit['hit']["_score"], hit["rating"]] for hit in results['details']['Query_1']['hits']], columns=['ID', 'Score', 'Rating'])

results_df

Unnamed: 0,ID,Score,Rating
0,65d5e4998598535be43edcb7,28.482512,
1,65d5e4988598535be43edaa9,24.64364,
2,65d5e4958598535be43ed1b5,22.636015,
3,65d5e49c8598535be43ee9b2,20.70703,
4,65d5e48e8598535be43ebb1d,20.245958,
5,65d5e48f8598535be43ebd8e,19.682268,
6,65d5e49c8598535be43ee9bd,19.554749,
7,65d5e49b8598535be43ee5bc,19.25882,
8,65d5e4968598535be43ed5fa,18.582226,
9,65d5e4968598535be43ed422,18.58067,
