In [30]:
import os
import uuid
from random import randint
from typing import List, Any

import bm25s
import numpy as np
import pandas as pd

In [10]:
class BM25Index:
    def __init__(self) -> None:
        self.bm25 = bm25s.BM25()

    def index(self, documents: List[str]) -> None:
        tokenized_docs = bm25s.tokenize(documents)
        self.bm25.index(tokenized_docs)

    def retrieve(self, queries: List[str], corpus: List[Any] | None = None, k=100):
        tokenized_queries = bm25s.tokenize(queries)
        documents, scores = self.bm25.retrieve(tokenized_queries, corpus=corpus, k=k)
        return documents, scores


def dict_to_slug(slug_components: dict) -> str:
    """Serialize a dict as a hyphen-separated ``key-value-key-value`` slug.

    Entries whose value is ``None`` are left out; the remaining entries keep
    the dict's iteration order.
    """
    pairs = []
    for key, value in slug_components.items():
        if value is None:
            continue
        pairs.append(f"{key}-{value}")
    return "-".join(pairs)


def slug_to_dict(slug: str) -> dict:
    """Parse a hyphen-separated ``key-value-key-value`` slug into a dict.

    Pieces at even positions become keys and the piece right after each one
    becomes its value (always a string). A trailing key without a value is
    dropped, and a repeated key keeps its last value.
    """
    pieces = slug.split("-")
    # zip stops at the shorter slice, which discards an unpaired trailing key.
    return dict(zip(pieces[0::2], pieces[1::2]))


In [12]:
import mysql.connector
import pandas as pd

# MySQL connection details.
# Each setting can be overridden through an environment variable so that real
# credentials never need to be written into the notebook. The fallbacks are the
# values this notebook originally hardcoded, so it keeps working unchanged when
# none of the variables is set.
mysql_host = os.environ.get("WORKSHOP_DB_HOST", "mysql")
mysql_user = os.environ.get("WORKSHOP_DB_USER", "root")
mysql_password = os.environ.get("WORKSHOP_DB_PASSWORD", "rootpassword")
mysql_database = os.environ.get("WORKSHOP_DB_NAME", "workshop_db")

# Create a connection to the MySQL database
conn = mysql.connector.connect(
    host=mysql_host,
    user=mysql_user,
    password=mysql_password,
    database=mysql_database,
)

In [16]:
# Build one free-text description per beer (`to_vectorize`) from the beer, its
# brewery and its specs.
# MySQL's CONCAT returns NULL as soon as any argument is NULL, and the LEFT
# JOINs / optional columns can yield NULLs, so every argument is wrapped in
# COALESCE(..., '') to always get a usable string. Rows without NULLs produce
# exactly the same text as before.
q = """
WITH data AS (
    SELECT
        beers.id, beers.name, beers.abv, beers.ibu, beers.srm, beers.descript as beer_descr,
        brew.descript as brewer_descript, brew.name as brewery,
        styles.style_name
    FROM beers
    LEFT JOIN breweries as brew on brew.id = beers.brewery_id
    LEFT JOIN styles on styles.id = beers.style_id
), descriptions AS (
    SELECT
        id,
        CONCAT(
            'the beer ', COALESCE(name, ''),
            ' from brewery ', COALESCE(brewery, ''),
            ' (', COALESCE(brewer_descript, ''), ') crafts the beer ', COALESCE(name, ''),
            ' defined as ', COALESCE(beer_descr, ''),
            '. Spec of the beer are: ABV=', COALESCE(abv, ''),
            ', IBU=', COALESCE(ibu, ''),
            ', SRM=', COALESCE(srm, '')
        ) as to_vectorize,
        name, brewery, style_name
    FROM data
)
SELECT
    id, name, brewery, style_name, to_vectorize
FROM descriptions
WHERE True
;"""
df = pd.read_sql_query(q, con=conn)

  df = pd.read_sql_query(q, con=conn)


In [15]:
# Index the generated description text of every row of `df`.
index = BM25Index()
index.index(df["to_vectorize"])

In [23]:
# Hand-written feedback examples, one (query, beer id, click flag) per entry,
# expanded into dicts with the keys "query", "id" and "click".
feedbacks = [
    {"query": query_text, "id": beer_id, "click": was_clicked}
    for query_text, beer_id, was_clicked in [
        ("high abv belgian style ale", 288, True),
        ("winter ale with malty flavor", 3921, True),
        ("coffee porter under 6 abv", 4091, True),
        ("amber ale with caramel flavor", 5321, True),
        ("fruity christmas ale", 4445, True),
        ("IPA low alcohol", 5889, True),
        ("deep amber ale", 4463, False),
        ("strong bitter ale", 416, True),
        ("award winning scotch ale", 4104, True),
        ("porter imperial", 5119, True),
        ("high abv IPA", 5889, False),
        ("malty scotch ale", 4104, False),
        ("chocolate porter", 4091, False),
        ("light beer under 4 abv", 4729, False),
        ("pale ale under 5%", 4834, False),
        ("non-alcoholic christmas ale", 4445, False),
        ("American lager with fruity taste", 3845, False),
        ("pale ale with heavy hops", 4038, False),
        ("barley wine with caramel flavor", 4162, False),
        ("porter with spices", 5119, False),
        ("coffee stout with espresso", 5504, True),
        ("high abv barleywine", 4754, True),
        ("Belgian style tripel with yeast flavors", 4209, True),
        ("chocolate stout with low abv", 4831, True),
        ("amber bock with malty taste", 4385, True),
        ("light lager with fruity taste", 4383, False),
        ("bitter IPA with high abv", 5777, False),
        ("brown ale with roasted flavors", 4549, False),
        ("crisp pilsner", 1244, False),
        ("strong bock for winter", 4385, False),
    ]
]

# Simulate clicks on a beer search engine

In [45]:
# Full query set: the queries of the labelled feedback entries first, followed
# by two batches of extra hand-written queries.
queries = [
    *(entry["query"] for entry in feedbacks),
    # short extra queries
    "dry stout",
    "milky stout",
    "light IPA",
    "low alcool pils",
    "bitter sour",
    "fruity sour",
    # longer, more descriptive extra queries
    "high ABV imperial stout",
    "low alcohol session IPA",
    "fruity Belgian tripel",
    "malty amber ale",
    "coffee porter with chocolate notes",
    "sour beer with wild yeast",
    "barrel-aged barleywine",
    "crisp pilsner under 5% ABV",
    "hoppy double IPA",
    "light wheat beer with citrus",
    "strong bock for winter",
    "spiced holiday ale",
    "smooth brown ale",
    "smoky rauchbier",
    "refreshing summer lager",
    "dark beer with caramel flavor",
    "milk stout with vanilla",
    "bitter pale ale with high IBU",
    "floral saison",
    "dry Irish stout with roasted malt",
    "non-alcoholic craft beer",
    "hazy New England IPA",
    "strong sour with fruit",
    "traditional German-style bock",
    "Belgian dubbel with rich malt",
    "imperial porter with coffee",
    "citrusy West Coast IPA",
    "light beer under 100 calories",
    "tropical flavored IPA",
    "peach-infused wheat beer",
    "amber ale with caramel notes",
    "porter with espresso",
    "dry hopped pale ale",
    "chocolate stout under 6% ABV",
    "herbal farmhouse ale",
    "triple IPA over 10% ABV",
    "oatmeal stout with smooth finish",
    "German-style hefeweizen with banana notes",
    "rye IPA with spicy finish",
    "refreshing blonde ale",
    "Belgian witbier with coriander and orange peel",
]


In [76]:
# Number of results requested for each query.
k_max = 20
# `df["id"]` is passed as the corpus, so each result row holds ids rather than
# document text; the scores returned alongside are not used.
serps, _ = index.retrieve(queries, corpus=df["id"], k=k_max)

In [97]:
# Simulate a click log: every query gets between 5 and 15 simulated visits, and
# each visit clicks one result whose rank is drawn from a normal distribution
# centred on index 3 (std 2), clipped to the valid range of the result page.
clic_log = []
for serp, query in zip(serps, queries):
    if len(serp) == 0:
        # Nothing to click on for this query.
        continue
    # The result page is identical for every visit of a query, so build it once.
    ranked_results = [{"id": _id, "position": i + 1} for i, _id in enumerate(serp)]
    # Highest valid index into `serp`. Clipping to `k_max` (the page length)
    # would allow an index one past the end and raise IndexError.
    last_index = len(serp) - 1
    for _ in range(randint(5, 15)):
        clicked_index = int(np.clip(np.random.normal(loc=3, scale=2), a_min=0, a_max=last_index))
        clic_log.append(
            {
                "query": query,
                "serp": ranked_results,
                "clicked_id": serp[clicked_index],
            }
        )
# Shuffle the visits, then give each one its own user id.
cliclog = pd.DataFrame(clic_log).sample(frac=1.0)
cliclog = cliclog.assign(user_id=[str(uuid.uuid1()) for _ in range(len(cliclog))])

In [98]:
# Flatten the log to one row per (visit, result): explode the "serp" list, then
# expand each {"id", "position"} dict into its own two columns.
# Note: this rebinds `cliclog` and removes its "serp" column, so the cell cannot
# be re-run without first re-running the cell that builds the click log.
cliclog = (
    cliclog.explode("serp")
    .reset_index(drop=True)
    .pipe(
        lambda exploded: pd.concat(
            [
                exploded.drop(columns="serp"),
                pd.json_normalize(exploded["serp"]).reset_index(drop=True),
            ],
            axis=1,
        )
    )
    .rename(columns={"id": "id_in_serp", "position": "pos_in_serp"})
)
cliclog

Unnamed: 0,query,clicked_id,user_id,id_in_serp,pos_in_serp
0,fruity sour,4442,ecfce536-7fc5-11ef-9d1e-0242ac120005,4442,1
1,fruity sour,4442,ecfce536-7fc5-11ef-9d1e-0242ac120005,475,2
2,fruity sour,4442,ecfce536-7fc5-11ef-9d1e-0242ac120005,481,3
3,fruity sour,4442,ecfce536-7fc5-11ef-9d1e-0242ac120005,5861,4
4,fruity sour,4442,ecfce536-7fc5-11ef-9d1e-0242ac120005,5697,5
...,...,...,...,...,...
15515,strong bitter ale,1603,ecfebaaa-7fc5-11ef-9d1e-0242ac120005,4917,16
15516,strong bitter ale,1603,ecfebaaa-7fc5-11ef-9d1e-0242ac120005,184,17
15517,strong bitter ale,1603,ecfebaaa-7fc5-11ef-9d1e-0242ac120005,5730,18
15518,strong bitter ale,1603,ecfebaaa-7fc5-11ef-9d1e-0242ac120005,5202,19


In [99]:
# Write the flattened click log to disk, without the DataFrame index.
cliclog.to_csv(path_or_buf="./beers_feedback.csv", index=False)

In [62]:
# Simulate buy/click feedback from the hand-labelled examples.
# The labelled "click" flag is reinterpreted as "buy", and every labelled row is
# then marked as clicked.
_df = pd.DataFrame(feedbacks).rename(columns={"click": "buy"})
_df["click"] = True
# Because "click" is forced to True just above, the "bought but not clicked" and
# "neither bought nor clicked" cases cannot occur in the seed rows.
assert _df["click"].all(), "seed feedback rows are expected to be clicked"

dfs_simulated = []
for record in _df.to_dict(orient="records"):
    # Repeat the observed interaction 3 to 10 times.
    dfs_simulated += [dict(record) for _ in range(randint(3, 10))]
    if record["buy"]:
        # Bought item: also add 2 to 5 "clicked but not bought" rows.
        contrast = {**record, "buy": False}
        n_contrast = randint(2, 5)
    else:
        # Clicked-only item: also add 10 to 20 "not clicked" rows.
        contrast = {**record, "click": False}
        n_contrast = randint(10, 20)
    dfs_simulated += [dict(contrast) for _ in range(n_contrast)]

# Shuffle the simulated rows.
# Note: this rebinds `df`, which earlier cells use for the table loaded from
# MySQL; re-run the query cell before re-running those cells.
df = pd.DataFrame(dfs_simulated).reset_index(drop=True).sample(frac=1.0)

In [67]:
# Write the simulated buy/click rows to disk; the (shuffled) DataFrame index is
# written too, as the first column.
# NOTE(review): this is the same path the click-log export cell writes to, so on
# a top-to-bottom run this file replaces the click log - confirm which of the
# two exports is meant to be kept, or give them distinct file names.
df.to_csv(path_or_buf="./beers_feedback.csv")