In [48]:
from bs4 import BeautifulSoup
from httpx import AsyncClient
import re
import json
import pandas as pd

In [8]:
async def get_soup(url: str, client: AsyncClient) -> BeautifulSoup:
    resp = await client.get(url)
    resp.raise_for_status()
    html = resp.text
    return BeautifulSoup(html, 'html.parser')



In [105]:
roast_map = {
    'torr:Moyen': 'Medium'
}
def translate_roast(tags: list[str]) -> str:
    try:
        torr = next(filter(lambda tag: tag[0:4] == 'torr', tags))
        return roast_map.get(torr, torr)
    except StopIteration:
        return 'unknown'
    
def to_grams(weight: str) -> int:
    m = re.match('^(\d+)\s?(k?g)', weight)
    if not m:
        return None
    val = int(m[1])
    if m[2] == 'g':
        return val
    else:
        return 1000 * val


def to_dataframe(data: dict) -> pd.DataFrame:
    df = pd.DataFrame(data['variants'])
    df.drop(df.query('not available').index, inplace=True)
    df['grams'] = df.option1.apply(to_grams)
    df.dropna(subset='grams', inplace=True)
    df['price_per_gram'] = df['price'] / df['grams']
    df['name'] = data['title']
    df['description'] = BeautifulSoup(data['description'], 'html.parser').text
    df['roast'] = translate_roast(data['tags'])
    return df[['name', 'grams', 'price', 'price_per_gram', 'description']]

In [110]:
async with AsyncClient(base_url='https://www.brulerieduquai.com/en/collections/cafe') as client:
    dfs = []
    main_page = await get_soup('/', client)
    coffees = [a['href'].split('/')[-1] for a in main_page.find_all('a') 
               if a['href'].startswith('/en/collections/cafe/products/')][::2]
    for coffee in coffees:
        print(f"Processing {coffee}...")
        coffeesoup = await get_soup(f'/products/{coffee}', client)
        s = coffeesoup.find_all('script.bold-subscriptions-platform-script')
        s = [s for s in coffeesoup.find_all('script') if 'sswApp.product =' in s.text][0]
        ss: str = [ss for ss in s.text.split('\n') if 'sswApp.product' in ss][0]
        data = json.loads(ss[ss.index('=')+1:-1])
        df = to_dataframe(data)
        if not df.empty:
            dfs.append(df)

df = pd.concat(dfs).reset_index().drop(columns='index')
df

Processing calendrier-avent-cafe...
Processing blue-mountain...
Processing geisha-la-esmeralda...
Processing barbosa-90...
Processing el-injerto-90...
Processing pb-barbosa...
Processing legendary-geisha...
Processing cafe-apero-espresso...
Processing yemenia-khalid-kulaib...
Processing nyeri-kamoini...
Processing cafe-bresil-cerrado...
Processing bad-boy-espresso...
Processing bad-girl-espresso...
Processing yirgacheffe...
Processing harrar...
Processing cafe-assemblage-crema-special...


Unnamed: 0,name,grams,price,price_per_gram,description
0,Typica Jamaïca Blue Mountain Coffee,200,5000,25.0,Clifton Mount Estate is located 1300m on the e...
1,Geisha Panama La Esmeralda Coffee,200,5000,25.0,\n Harvest 2020 - Limited quantityComing from ...
2,Brazil Coffee - Barbosa 90,200,8600,43.0,This coffee was produced by the one and only D...
3,Brazil Coffee PB. Barbosa,200,2000,10.0,"The producer and owner of DBarbosa Coffee, Vit..."
4,Guatemala Coffee - Legendary Geisha,200,5999,29.995,"This coffee is produced by El Injerto, one of ..."
5,Apéro Coffee,340,1850,5.441176,This blend is inspired by fruity liqueurs and ...
6,Apéro Coffee,2000,9206,4.603,This blend is inspired by fruity liqueurs and ...
7,Brazil Coffee - Cerrado,340,1850,5.441176,Here is a coffee from the plantation of the Ba...
8,Brazil Coffee - Cerrado,2000,9206,4.603,Here is a coffee from the plantation of the Ba...
9,Bad Boy Espresso Coffee,340,1850,5.441176,"This blend is optimized for a comforting, roun..."


In [109]:
df.drop(columns='index')

Unnamed: 0,name,grams,price,price_per_gram,description
0,Typica Jamaïca Blue Mountain Coffee,200,5000,25.0,Clifton Mount Estate is located 1300m on the e...
1,Geisha Panama La Esmeralda Coffee,200,5000,25.0,\n Harvest 2020 - Limited quantityComing from ...
2,Brazil Coffee - Barbosa 90,200,8600,43.0,This coffee was produced by the one and only D...
3,Brazil Coffee PB. Barbosa,200,2000,10.0,"The producer and owner of DBarbosa Coffee, Vit..."
4,Guatemala Coffee - Legendary Geisha,200,5999,29.995,"This coffee is produced by El Injerto, one of ..."
5,Apéro Coffee,340,1850,5.441176,This blend is inspired by fruity liqueurs and ...
6,Apéro Coffee,2000,9206,4.603,This blend is inspired by fruity liqueurs and ...
7,Brazil Coffee - Cerrado,340,1850,5.441176,Here is a coffee from the plantation of the Ba...
8,Brazil Coffee - Cerrado,2000,9206,4.603,Here is a coffee from the plantation of the Ba...
9,Bad Boy Espresso Coffee,340,1850,5.441176,"This blend is optimized for a comforting, roun..."


In [63]:
roast_map = {
    'torr:Moyen': 'Medium'
}

def to_dataframe(data: dict) -> pd.DataFrame:
    df = pd.DataFrame(data['variants'])
    df['name'] = data['title']
    df['description'] = data['description']
    df['roast'] = roast_map.get(next(filter(lambda tag: tag[0:4] == 'torr', data['tags'])))
    df['grams'] = df.option1.apply(lambda weight: int(weight[0:-1]) if re.match('\d+\s?g', weight) else 1000*int(weight[0:-2]) )
    df['price_per_gram'] = df['price']/df['grams']
    return df[['name', 'grams', 'price', 'price_per_gram', 'description']]

Unnamed: 0,name,grams,price,price_per_gram,description
0,Créma Spécial Coffee,340,1485,4.367647,<div>Here is a blend inspired by a typical Ita...
1,Créma Spécial Coffee,1000,3700,3.7,<div>Here is a blend inspired by a typical Ita...
2,Créma Spécial Coffee,2000,6729,3.3645,<div>Here is a blend inspired by a typical Ita...


In [59]:
df

Unnamed: 0,id,title,option1,option2,option3,sku,requires_shipping,taxable,featured_image,available,...,weight,compare_at_price,inventory_quantity,inventory_management,inventory_policy,barcode,requires_selling_plan,selling_plan_allocations,grams,price_per_gram
0,212217933,340g,340g,,,10011,True,False,,True,...,356,,0,shopify,continue,877609000034.0,False,"[{'price_adjustments': [{'position': 1, 'price...",340,4.367647
1,44089131335933,1kg,1kg,,,10014,True,False,,True,...,356,,1,shopify,continue,,False,"[{'price_adjustments': [{'position': 1, 'price...",1000,3.7
2,212217937,2kg,2kg,,,10012,True,False,,True,...,2042,,0,shopify,continue,692065520007.0,False,"[{'price_adjustments': [{'position': 1, 'price...",2000,3.3645
