In [None]:
import os

import mysql.connector
import pandas as pd

# MySQL connection details.
# Each setting is read from an environment variable so credentials do not have
# to live in the notebook; the literal fallback is used when the variable is unset.
mysql_host = os.environ.get('MYSQL_HOST', 'mysql')
mysql_user = os.environ.get('MYSQL_USER', 'student')
mysql_password = os.environ.get('MYSQL_PASSWORD', 'student')
mysql_database = os.environ.get('MYSQL_DATABASE', 'workshop_db')

# Create a connection to the MySQL database.
# `conn` is a notebook-level global reused by every query cell below.
conn = mysql.connector.connect(
    host=mysql_host,
    user=mysql_user,
    password=mysql_password,
    database=mysql_database
)

# UC-1

In [None]:
# Q1: count the rows of the `beers` table.
# NOTE: despite its name, `full_join_query` holds a plain COUNT(*) with no join.
# `WHERE TRUE` is a no-op predicate (every row passes).
full_join_query = """SELECT count(*)
FROM beers
WHERE TRUE
;"""
# Run the query over the notebook-level `conn` and load the result as a DataFrame.
df = pd.read_sql_query(full_join_query, con=conn)
df  # last expression of the cell, so the notebook renders it

In [None]:
# Q2: number of beers per brewery name, keeping the 10 largest counts.
# `ORDER BY 2 DESC` sorts on the second selected column, i.e. the count(*).
# NOTE: the variable is called `full_join_query` but the SQL uses an inner JOIN.
full_join_query = """SELECT brew.name as brewery, count(*)
FROM beers
JOIN breweries as brew on brew.id = beers.brewery_id
WHERE TRUE
GROUP BY brew.name
ORDER BY 2 DESC
LIMIT 10
;"""
# Run the query over the notebook-level `conn` and load the result as a DataFrame.
df = pd.read_sql_query(full_join_query, con=conn)
df  # last expression of the cell, so the notebook renders it

In [None]:
# Q3: the 10 beers with the highest ABV among breweries whose country is 'France'.
# `ORDER BY 3 DESC` sorts on the third selected column (abv).
# The two name columns are aliased so the resulting DataFrame does not end up
# with two columns both labelled `name`.
full_join_query = """SELECT beers.name AS beer, brew.name AS brewery, abv
FROM beers
JOIN breweries as brew on brew.id = beers.brewery_id
WHERE TRUE
AND brew.country = 'France'
ORDER BY 3 DESC
LIMIT 10
;"""
# Run the query over the notebook-level `conn` and load the result as a DataFrame.
df = pd.read_sql_query(full_join_query, con=conn)
df

In [None]:
# Q4: for beers whose style_name is 'Porter', count them and average their ABV
# per brewery country, ordered by the count (second column) descending.
full_join_query = """SELECT brew.country, count(*) as nb_porter, AVG(ABV) as abv_mean
FROM beers
JOIN breweries as brew on brew.id = beers.brewery_id
JOIN styles on styles.id = beers.style_id
WHERE TRUE
AND styles.style_name = 'Porter'
GROUP BY brew.country
ORDER BY 2 DESC
;"""
# Run the query over the notebook-level `conn` and load the result as a DataFrame.
df = pd.read_sql_query(full_join_query, con=conn)
df  # last expression of the cell, so the notebook renders it

In [None]:
# Q5 - observation: some "simple" operations are somewhat complicated to do in SQL.
#
# Picks the country whose beer count sits at the middle rank of the per-country
# counts (a median computed by hand):
#   1. country_cnt      - number of beers per brewery country
#   2. ranked_countries - rank the countries by that count (ties broken by
#                         country so the ranking is deterministic)
#   3. nlines           - number of countries (single row)
#   4. proxy_median     - squared distance between each row's mid-rank quantile
#                         (rnk - 0.5) / nn and 0.5
# The quantile uses (rnk - 0.5) / nn rather than rnk / nn: with rnk / nn and an
# odd number of countries, the two ranks around the middle are equally distant
# from 0.5 and the true median row is not guaranteed to win.
# The ORDER BY that used to sit inside country_cnt was dropped: ordering inside
# a CTE does not constrain the outer query.
q = """
WITH country_cnt AS (
    SELECT 
        brew.country AS country,
        COUNT(*) AS cnt
    FROM beers
    JOIN breweries AS brew ON brew.id = beers.brewery_id
    GROUP BY brew.country
), ranked_countries AS (
    SELECT
        country, cnt, ROW_NUMBER() OVER (ORDER BY cnt, country) as rnk
    FROM country_cnt
), nlines AS (
    SELECT count(*) as nn
    FROM country_cnt
), proxy_median AS (
    SELECT 
        country, cnt, POWER(((rnk - 0.5) / nn) - 1/2, 2) as proxmed
    FROM ranked_countries
    LEFT JOIN nlines ON TRUE
)
SELECT * 
FROM proxy_median
ORDER BY proxmed ASC, cnt ASC, country ASC
LIMIT 1
;"""
# Run the query over the notebook-level `conn` and load the result as a DataFrame.
df = pd.read_sql_query(q, con=conn)
df

# UC-2
Exercice difficile.

In [None]:
# Click probabilities per (query, result), counting only results the user is
# considered to have seen:
#   1. clicked_pos             - position of the clicked result for each
#                                (query, user_id)
#   2. db_and_clicked_and_seen - flag each feedback row as `seen` (at or above
#                                the clicked position) and `clicked`
#   3. cascade_probas          - clicks / views per (query, id_in_serp)
# The final SELECT attaches the beer description and keeps rows whose
# description has more than one character.
#
# clicked_pos is joined on BOTH user_id and query: joining on user_id alone
# would pair a user's click on one query with the result rows of their other
# queries. NOTE(review): assumes `query` identifies the search within a user's
# feedback rows - confirm against the beers_feedback schema.
# The ordering is applied on the final SELECT, since an ORDER BY inside a CTE
# does not order the outer result. The string has no placeholders, so it is a
# plain string rather than an f-string.
q = """
WITH clicked_pos AS (
    SELECT query, user_id, pos_in_serp as clicked_pos_in_serp
    FROM `beers_feedback` 
    WHERE TRUE
    AND clicked_id = id_in_serp
), 
db_and_clicked_and_seen AS (
    SELECT 
        beers_feedback.*, 
        CASE WHEN pos_in_serp <= clicked_pos_in_serp THEN 1 ELSE 0 END as seen,
        CASE WHEN pos_in_serp = clicked_pos_in_serp THEN 1 ELSE 0 END as clicked
    FROM beers_feedback
    LEFT JOIN clicked_pos
        ON clicked_pos.user_id = beers_feedback.user_id
        AND clicked_pos.query = beers_feedback.query
),
cascade_probas AS (
    SELECT
        query, id_in_serp, SUM(seen) as n_seen, SUM(clicked) as n_clicked, SUM(clicked)/SUM(seen) as click_proba_cascade
        FROM db_and_clicked_and_seen
        WHERE TRUE
        AND 
            seen = 1
        GROUP BY query, id_in_serp
)
SELECT 
    query, descript, click_proba_cascade
    FROM cascade_probas
    JOIN beers on beers.id = cascade_probas.id_in_serp
    WHERE TRUE
    AND length(descript) > 1
    ORDER BY query, click_proba_cascade
;"""
# Run the query over the notebook-level `conn` and load the result as a DataFrame.
df = pd.read_sql_query(q, con=conn)
df

# UC-3 search from a query

**Observation :** On ne pourra pas aller bien loin en termes de souplesse dans la requête.

In [None]:
# Free-text search term; matched as a substring of the concatenated
# beer + brewery descriptions.
QUERY = "stout"

# The search term is sent as a bound parameter (`%s`) instead of being
# formatted into the SQL text, so quotes or SQL fragments in QUERY cannot alter
# the statement. `%` and `_` inside QUERY still act as LIKE wildcards.
# The length filter sums the beer and the brewery description lengths, i.e.
# the two columns that are concatenated into `descr`.
q = """
WITH descriptions AS (
    SELECT 
        brew.name as brewery, beers.name as name, CONCAT(beers.descript, brew.descript) as descr
    FROM beers
    JOIN breweries as brew on brew.id = beers.brewery_id
    WHERE TRUE
    AND LENGTH(beers.descript) + LENGTH(brew.descript) > 2
)
SELECT *
FROM descriptions
WHERE True
AND descriptions.descr LIKE %s
;"""
# The surrounding % wildcards are part of the bound value, not of the SQL text.
pd.read_sql_query(q, con=conn, params=(f"%{QUERY}%",))

# UC-4 vectorize items

On est obligé de sortir de SQL pour faire la vectorisation.

In [None]:
# Build one text per beer to send to the vectorizer:
#   1. data         - beer attributes plus brewery name/description and style
#                     (LEFT JOINs, so brewery/style columns may be NULL)
#   2. descriptions - a single sentence per beer assembled with CONCAT
# MySQL's CONCAT returns NULL as soon as one argument is NULL, which would blank
# the whole sentence for any beer with a missing field; every argument is
# therefore wrapped in COALESCE (empty string for texts, 'unknown' for the
# numeric specs). Rows without NULLs produce the same sentence as before.
# `id % 12 = 3` keeps roughly one beer out of twelve - presumably a sample to
# keep the embedding step short (TODO confirm).
q = """
WITH data AS (
    SELECT 
        beers.id, beers.name, beers.abv, beers.ibu, beers.srm, beers.descript as beer_descr,
        brew.descript as brewer_descript, brew.name as brewery,
        styles.style_name
    FROM beers
    LEFT JOIN breweries as brew on brew.id = beers.brewery_id
    LEFT JOIN styles on styles.id = beers.style_id
), descriptions AS (
    SELECT 
        id,
        CONCAT(
            'the beer ', COALESCE(name, ''),
            ' from brewery ', COALESCE(brewery, ''),
            ' (', COALESCE(brewer_descript, ''),
            ') crafts the beer ', COALESCE(name, ''),
            ' defined as ', COALESCE(beer_descr, ''),
            '. Spec of the beer are: ABV=', COALESCE(abv, 'unknown'),
            ', IBU=', COALESCE(ibu, 'unknown'),
            ', SRM=', COALESCE(srm, 'unknown')
        ) as to_vectorize
    FROM data
)
SELECT 
    id, to_vectorize
FROM descriptions
WHERE True
    AND id % 12 = 3
;"""
# Run the query over the notebook-level `conn`; `df` holds (id, to_vectorize).
df = pd.read_sql_query(q, con=conn)

In [None]:
from typing import List
import requests

def batched(iterable, batch_size=16):
    """Yield consecutive slices of ``iterable``, each at most ``batch_size`` long.

    ``iterable`` must support ``len()`` and slicing (e.g. a list or a string).
    The last slice is shorter when the length is not a multiple of
    ``batch_size``; nothing is yielded for an empty input.
    """
    total = len(iterable)
    yield from (
        iterable[start:min(start + batch_size, total)]
        for start in range(0, total, batch_size)
    )

class Vectorizer:
    """Small client for the HTTP embedding endpoint at ``Vectorizer.url``."""

    url = "http://vectorizer:8000/embed"
    # Seconds to wait for the service; without a timeout `requests` can block
    # forever on an unresponsive server.
    timeout = 60

    @staticmethod
    def embed(texts: List[str]):
        """Embed each text in turn, one HTTP request per text.

        Returns a list aligned with ``texts``; an entry is ``None`` when the
        service answered with an error or an unusable payload for that text.
        """
        return [Vectorizer._embed_one(tt) for tt in texts]

    @staticmethod
    def _embed_one(text: str):
        """POST ``{"text": text}`` to the service and return the ``"vector"`` field.

        Returns ``None`` when the response has an HTTP error status, is not
        valid JSON, or carries no ``"vector"`` entry. Connection failures and
        timeouts raised by ``requests.post`` itself propagate to the caller.
        """
        payload = {"text": text}

        response = requests.post(
            Vectorizer.url, json=payload, timeout=Vectorizer.timeout
        )
        try:
            response.raise_for_status()  # Raise an exception for HTTP errors
            return response.json()["vector"]
        except (requests.RequestException, KeyError, TypeError, ValueError):
            # Best effort: a bad answer for one text yields None instead of
            # aborting the batch. Only the expected failures are caught, so
            # KeyboardInterrupt and programming errors are no longer swallowed.
            return None


# UC-5 : answer questions in a corpus
Pas vraiment de possibilité native en SQL