In [1]:
from food_pricing_bot.utils.storage import get_latest_dump_path, get_df, get_dump_path
from food_pricing_bot.bot import PLAY_REGEX
import pandas as pd
import numpy as np
import re
from typing import Union
from sklearn.metrics import mean_squared_error

In [2]:
# Show the pattern (imported from food_pricing_bot.bot) that the parser below matches answers against.
print(PLAY_REGEX)

def get_fractional_price(s: str) -> Union[int, None]:
    """Parse a free-text price answer into an integer amount of cents.

    The answer is matched against ``PLAY_REGEX``; its ``int`` group supplies
    the whole units and its ``dec`` group (zero to two digits) supplies the
    fractional part.

    Args:
        s: Raw answer text, e.g. ``"€4.50"``, ``"2,5"`` or ``"12"``.

    Returns:
        The price in hundredths of a unit (``"€4.50"`` -> ``450``), or
        ``None`` when the text does not match or carries no numeric groups
        (for example when the yes/no alternative of the pattern matched).
    """
    match = re.match(PLAY_REGEX, s)
    if match is None:
        return None

    groups = match.groupdict()
    int_part, dec_part = groups.get("int"), groups.get("dec")
    if int_part is None or dec_part is None:
        return None

    try:
        # A lone fractional digit denotes tenths ("2,5" means 2.50, not 2.05),
        # so right-pad the fractional digits to exactly two places. An empty
        # fractional part pads to "00" and contributes nothing.
        return int(int_part) * 100 + int(dec_part.ljust(2, "0"))
    except ValueError:
        return None

# Sanity checks: currency symbol with a dot separator, comma separator, and
# text that contains no number at all.
assert get_fractional_price("€4.50") == 450
assert get_fractional_price("2,50") == 250
assert get_fractional_price("ciao") is None

^(?P<YoN>Sì!|No.)|\D*(?P<int>\d+)\D*(?P<dec>\d{0,2})\D*$


In [3]:
# Resolve the path of the latest answers dump; later cells read the file through `fpath`.
fpath = get_latest_dump_path()
print(f"The latest dump is located at: {fpath}")

The latest dump is located at: /home/ubuntu/Documents/thesis/food-pricing/data/experiment/answers/2022-09-02T10:39:24.921300+00:00.jsonl


In [4]:
# Transform the answers into a format ready to be joined: one row per
# (chat_id, item_id) pair holding the raw answer and its parsed price.

df = pd.read_json(fpath, lines=True)
# Each "data" cell maps item_id -> raw answer; list the pairs so that
# explode() yields one row per answered item.
df["data"] = df["data"].apply(lambda data: list(data.items()))
df = df.explode("data", ignore_index=True)
parsed_rows = df["data"].apply(
    lambda pair: (pair[0], pair[1], get_fractional_price(pair[1]))
)
df["item_id"], df["answer"], df["pred"] = zip(*parsed_rows)
# Reassign instead of dropping in place so the cell never mutates a frame
# that another name may still reference.
df = df.drop(columns=["data"])
print(f"There is a total of {df['pred'].isna().sum()} unrecognised answers.")
# Keep only the answers that could be parsed into a price.
df = df[df["pred"].notna()]
df.head()

There is a total of 0 unrecognised answers.


Unnamed: 0,chat_id,item_id,answer,pred
0,1131376373,milano_milano-bocconi-navigli_lele-ristorante_40,25.0,2500
1,1131376373,milano_milano-morivione_deb-minimarket_411,4.0,400
2,1131376373,milano_milano-bocconi-navigli_midelizio_18,4.0,400
3,1131376373,civitanova-marche_civitanova-marche_lowengrube...,18.0,1800
4,1131376373,bologna_san-lazzaro_american-cake-dk-san-lazza...,6.0,600


In [5]:
# Get the fact table (only the "test" split is requested); it is joined with
# the answers on "item_id" in a later cell.

fact_df = get_df(split="test")

In [6]:
# Left-join the parsed answers with the fact table on item_id, so every answer
# row is kept even when the fact table has no matching item.

df_merged = df.merge(fact_df, how="left", on="item_id")
df_merged.head()

Unnamed: 0,chat_id,item_id,answer,pred,imgPath,price_fractional,lat,lon,txt,split
0,1131376373,milano_milano-bocconi-navigli_lele-ristorante_40,25.0,2500,food-delivery-crawler/data/images/deliveroo.it...,3200.0,45.464194,9.189635,Filetto di manzo con tartufo nero,test
1,1131376373,milano_milano-morivione_deb-minimarket_411,4.0,400,food-delivery-crawler/data/images/deliveroo.it...,230.0,45.440601,9.190522,Chiodi di garofano 50ge,test
2,1131376373,milano_milano-bocconi-navigli_midelizio_18,4.0,400,food-delivery-crawler/data/images/deliveroo.it...,350.0,45.464194,9.189635,"Lurisia Gazzosa 27,5cl",test
3,1131376373,civitanova-marche_civitanova-marche_lowengrube...,18.0,1800,food-delivery-crawler/data/images/deliveroo.it...,650.0,43.305523,13.72293,Brezel speck burro e obazda Tipico pane bavare...,test
4,1131376373,bologna_san-lazzaro_american-cake-dk-san-lazza...,6.0,600,food-delivery-crawler/data/images/deliveroo.it...,590.0,44.480569,11.424369,Fabolous cake Goloso gioco di consistenze - so...,test


In [7]:
# Persist the merged answers/fact frame as parquet at the path returned by get_dump_path.
merged_fpath = get_dump_path(fname="dump.parquet", folder="merged")
df_merged.to_parquet(merged_fpath)