In [None]:
# Import packages
import numpy as np
import pandas as pd
import re

In [None]:
# Load the raw recipes and attach an empty (all-None) column that will later
# receive the structured ingredients; the last line previews the frame.
df = pd.read_json("data/raw_data.json").assign(parsed_ingredients=None)
df.head()

# Using Claude chat to process data


In [None]:
import json
import os
import requests
import uuid

from dotenv import load_dotenv
from loguru import logger
from tqdm import tqdm

In [None]:
# Pull settings from a local .env file into the process environment;
# override=True lets .env values replace variables that are already set.
load_dotenv(override=True)

# Base URL of the claude.ai web API that the helper functions below call.
API_URL = "https://claude.ai/api"

# Organization id, interpolated into request URLs and bodies (None when unset).
ORGANIZATION_ID = os.getenv("ORGANIZATION_ID")

# Requests are authenticated with the session cookie, read from the
# environment so that the secret never lives in the notebook itself.
COOKIES = dict(sessionKey=os.getenv("SESSION_KEY"))

# Headers sent with every request: a browser-like User-Agent and a JSON body type.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    ),
    "Content-Type": "application/json; charset=utf-8",
}

In [None]:
def claude_create_chat(name: str) -> str:
    """Create a new conversation and return its id.

    A fresh UUID is generated on the client and POSTed, together with
    ``name``, to the organization's ``chat_conversations`` endpoint.

    Args:
        name: Title to give the new conversation.

    Returns:
        The generated UUID as a string; it is the ``uuid`` sent in the request.

    Raises:
        Exception: If the server answers with an error status (4xx/5xx). The
            message is the response body, matching ``claude_make_prompt``.
    """
    conversation_id = str(uuid.uuid4())
    body = {
        "uuid": conversation_id,
        "name": name,
    }

    response = requests.post(
        f"{API_URL}/organizations/{ORGANIZATION_ID}/chat_conversations",
        cookies=COOKIES,
        headers=HEADERS,
        json=body
    )

    # Without this check a failed request (expired session, wrong organization
    # id, ...) would still hand back an id for a conversation that was never
    # created, and the failure would only surface on the first prompt.
    # `ok` rather than `== 200` so that any success status is accepted.
    if not response.ok:
        raise Exception(response.text)

    return conversation_id


def claude_make_prompt(conv_id: str, user_question: str) -> str:
    """Send ``user_question`` to an existing conversation and return the reply.

    The question is POSTed to the ``append_message`` endpoint. The response
    body is split on blank lines, and the completion is read from the JSON
    that follows ``data: `` in the third-from-last piece.

    Args:
        conv_id: UUID of the conversation to append the message to.
        user_question: Text sent both as the prompt and as the message text.

    Returns:
        The stripped ``completion`` string. NOTE: despite the annotation,
        ``None`` is returned (after printing a message) when the response body
        contains fewer than three blank-line-separated pieces.

    Raises:
        Exception: If the HTTP status is not 200; the message is the response body.
    """
    payload = {
        "completion": {
            "prompt": user_question,
            "timezone": "Europe/Moscow",
            "model": "claude-2",
        },
        "organization_uuid": ORGANIZATION_ID,
        "conversation_uuid": conv_id,
        "text": user_question,
    }

    reply = requests.post(
        f"{API_URL}/append_message",
        cookies=COOKIES,
        headers=HEADERS,
        data=json.dumps(payload),
    )
    # Force UTF-8 decoding of the body before it is read as text
    reply.encoding = "utf-8"

    if reply.status_code != 200:
        raise Exception(reply.text)

    chunks = reply.text.split("\n\n")
    if len(chunks) < 3:
        print(f"Failed to generate response for {user_question}")
        return None

    # The piece of interest sits third from the end; its JSON follows "data: "
    event_json = chunks[-3].split("data: ")[1]
    return json.loads(event_json)["completion"].strip()

In [None]:
# Open one dedicated conversation; every ingredient-parsing prompt is sent to it
chat_id = claude_create_chat(name="Convert recipe ingredients list from JSON to structured format")

In [None]:
# Instruction text (in Russian) placed before every batch of recipes.
# English summary of what it asks the model to do:
#   - the JSON below lists the ingredients of each recipe; convert them to a
#     structured form: ingredient name in the nominative case without
#     parenthesised clarifications, quantity, unit of measure,
#     e.g. ['milk', 1, 'litre'];
#   - treat "to taste" as a unit of measure;
#   - output null for empty fields;
#   - output the result as a JSON array of arrays inside a fenced code block.
# The string is runtime text sent to the model, so it is kept verbatim.
main_prompt = """
В json ниже для каждого рецепта перечислены ингредиенты. Приведи их к структурированному виду: название ингредиента в именительном падеже без уточнений в скобках, количество, единица измерения. Например, ['молоко', 1, 'литр'].
"По вкусу" считай как единицу измерения.
Для пустых полей выводи null.
Выведи результат в виде массива массивов json внутри fenced code block.

Вот рецепты:
"""

In [None]:
def fix_claude_json(json_string: str) -> str:
    """Balance the square brackets of a JSON string returned by the model.

    The counts of ``[`` and ``]`` are compared and the missing characters are
    added:

    * too few ``[``: the missing ones are inserted at the position of the
      first ``[`` (or at the very start when the string contains none);
    * too few ``]``: the missing ones are appended at the end, which is what
      a reply that was cut off needs.

    This is a best-effort repair based on plain character counts: brackets
    that occur inside string values are counted as well, and other damage
    (e.g. a trailing comma) is not repaired.

    Args:
        json_string: Possibly unbalanced JSON text.

    Returns:
        The text with equal numbers of ``[`` and ``]``; unchanged if it was
        already balanced.
    """
    opening = json_string.count("[")
    closing = json_string.count("]")

    if opening < closing:
        # Insert at the first "[" when there is one; otherwise fall back to
        # the start of the string instead of failing on the missing character.
        position = max(json_string.find("["), 0)
        return json_string[:position] + "[" * (closing - opening) + json_string[position:]

    if opening > closing:
        # Adding more "[" here could never balance the string (it made the
        # previous loop-based version spin forever); close the open arrays.
        return json_string + "]" * (opening - closing)

    return json_string

In [None]:
# Number of recipes whose ingredient lists are sent to Claude in a single message
recipes_per_message = 20

In [None]:
# Walk through the dataframe in batches of `recipes_per_message` recipes, ask
# Claude to structure each batch and store the answers in "parsed_ingredients".
# `current_recipe` is the number of the batch to start from; the progress-bar
# total accounts for it (presumably so an interrupted run can be resumed by
# starting at a later batch -- confirm before relying on it).
current_recipe = 0

# Make sure the target column has object dtype once, up front, so that a whole
# list-like value can be stored in a single cell.
df["parsed_ingredients"] = df["parsed_ingredients"].astype(object)

with tqdm(total=df.shape[0] - current_recipe * recipes_per_message) as pbar:
    while current_recipe * recipes_per_message < df.shape[0]:
        low = current_recipe * recipes_per_message
        high = min(low + recipes_per_message, df.shape[0])

        # Serialise the ingredient lists of this batch and append them to the prompt
        current_ingredients = df["ingredients"].iloc[low:high].to_json(force_ascii=False, orient="records")
        current_prompt = main_prompt + "\n" + str(current_ingredients)

        # Get the response from Claude
        response = claude_make_prompt(chat_id, current_prompt)
        if response is None:
            # claude_make_prompt signals a failed generation with None; stop
            # with a clear message instead of an AttributeError further down.
            raise RuntimeError(f"Claude returned no completion for recipes {low}-{high - 1}")

        # Take the contents of the fenced json block. Single quotes are turned
        # into double quotes because the reply may use them for strings (the
        # example in the prompt does), which json.loads would reject.
        json_block = response.split("```json")[1].split("```")[0].replace("\'", "\"")
        data = json.loads(fix_claude_json(json_block))

        # Update the dataframe. Results must go to the rows of *this* batch
        # (positions low..high-1), not to rows 0..len(data)-1, otherwise every
        # batch overwrites the first one. zip() also keeps a reply with too
        # many entries from spilling into rows outside the batch.
        for row_label, recipe_ingredients in zip(df.index[low:high], data):
            df.at[row_label, "parsed_ingredients"] = pd.Series(recipe_ingredients)

        current_recipe += 1
        pbar.update(high - low)

        # Checkpoint after every batch so that progress survives a failure
        df.to_json("data/structured_data.json", force_ascii=False, orient="records")

# Final write (also produces the file when the loop body never ran)
df.to_json("data/structured_data.json", force_ascii=False, orient="records")