python-experiments/make-text-cards.py

#!/usr/bin/env python
#
# Usage:
#   python make-text-cards.py <deck> <source-name> <input-text-file> <output-csv-file>

import csv
import json
from typing import Dict, List, Optional

from dotenv import load_dotenv
from markdown import markdown
from openai import OpenAI


# Load environment variables. Create a file named `.env` in the same directory as this file
# and add the following line to it:
#
# OPENAI_API_KEY="your-api-key"
load_dotenv()

def generate_cards(input_texts: List[str], *, source: Optional[str] = None) -> List[Dict[str, Optional[str]]]:
    """Translate Spanish text snippets and turn them into Anki card rows.

    Each snippet is sent to the OpenAI chat completions API together with a
    fixed system prompt and two worked examples. The model is forced to answer
    by calling the ``add_data_to_card`` tool, whose JSON arguments carry the
    translation and, optionally, explanations.

    Args:
        input_texts: The snippets to translate. Phrases wrapped in "[[...]]"
            are the ones the prompt asks the model to explain.
        source: Label copied unchanged into every card's "Source" field.

    Returns:
        One dict per snippet, in input order, with these keys:

        - Front: The original text rendered from Markdown to HTML, with
          "[[...]]" markers turned into bold.
        - Back: The translation, rendered from Markdown to HTML.
        - Notes: The model's explanations rendered from Markdown to HTML, or
          None when the model supplied none.
        - Source: The ``source`` argument.

        An empty ``input_texts`` yields an empty list without creating an API
        client.

    Raises:
        RuntimeError: If the model's reply does not contain exactly one tool
            call.
    """

    # Nothing to translate: return before constructing the API client, so an
    # empty batch needs neither credentials nor network access.
    if not input_texts:
        return []

    # We need to build up a sample dialog between the "user" and the
    # "assistant", before asking our actual question. This "teaches" the model
    # how to respond, essentially by putting words into its mouth.
    system_message = """
You are a translator helping prepare Anki cards. You will be given short text in
Spanish, which will put onto the front of cards. Your job is to translate the
short text to English. Following the translation, you should briefly break break
down any phrases surrounded by [[ ]] and explain how they work. Do not include
any explanations if there are no [[ ]].
"""
    prompt_1 = "Tenía un alma de tigre."
    response_1 = {
        "translation": "He had a tiger's soul."
    }
    prompt_2 = "Ni [[siquiera]] hay una gramola."
    response_2 = {
        "translation": "There isn't even a jukebox.",
        "explanations": "- **siquiera:** The word \"siquiera\" in Spanish is used to add emphasis, typically in negative contexts, similar to the English word \"even.\" In this sentence, \"Ni siquiera\" translates directly to \"not even,\" emphasizing that there isn’t a jukebox at all.",
    }

    # Declare the function that the model should call.
    tools = [{
        "type": "function",
        "function": {
            "name": "add_data_to_card",
            "description": "Add the translation (and optionally explanations) to the current card.",
            "parameters": {
                "type": "object",
                "properties": {
                    "translation": { "type": "string" },
                    "explanations": { "type": "string" },
                },
                "required": ["translation"]
            }
        }
    }]

    # The system prompt and worked examples are identical for every snippet,
    # so build them once instead of on every loop iteration.
    # NOTE(review): the example answers are sent with the "function" role
    # rather than as assistant tool calls - confirm the API still accepts
    # this message shape.
    example_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt_1},
        {"role": "function", "name": "add_data_to_card", "content": json.dumps(response_1)},
        {"role": "user", "content": prompt_2},
        {"role": "function", "name": "add_data_to_card", "content": json.dumps(response_2)},
    ]

    # Generate the translations using GPT-3.5.
    client = OpenAI()

    result = []
    for input_text in input_texts:

        print(f"Input: {input_text}")
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[*example_messages, {"role": "user", "content": input_text}],
            tools=tools,
            # Force the model to answer through our tool, never as free text.
            tool_choice={"type": "function", "function": {"name": "add_data_to_card"}},
        )

        # Extract the tool call from the response. Validate explicitly rather
        # than with `assert`, which is stripped under `python -O` and would
        # fail with an unhelpful TypeError when `tool_calls` is None.
        tool_calls = response.choices[0].message.tool_calls
        if not tool_calls or len(tool_calls) != 1:
            count = len(tool_calls) if tool_calls else 0
            raise RuntimeError(
                f"Expected exactly one tool call for input {input_text!r}, got {count}"
            )
        args = json.loads(tool_calls[0].function.arguments)
        print(f"{json.dumps(args, indent=4)}")

        # Convert [[ and ]] to ** and **, i.e. Markdown bold.
        front = input_text.replace("[[", "**").replace("]]", "**")

        # Render the explanations from Markdown to HTML, if there are any.
        if args.get("explanations"):
            explanations = markdown(args["explanations"])
        else:
            explanations = None

        result.append({
            "Front": markdown(front),
            "Back": markdown(args["translation"]),
            "Notes": explanations,
            "Source": source,
        })

    return result

def texts_to_csv(in_texts_path: str, out_csv_path: str, *, deck: str, source: Optional[str] = None) -> None:
    """Read in a file of text snippets separated by "\\n--\\n" and write the
    generated cards to a CSV file.

    Args:
        in_texts_path: Path of the UTF-8 text file holding the snippets.
        out_csv_path: Path of the CSV file to create or overwrite. It is
            written as UTF-8, semicolon-delimited, preceded by "#key:value"
            header lines.
        deck: Deck name written into the "#deck:" header line.
        source: Label passed through to every card's "Source" column.

    Snippets are stripped of surrounding whitespace and empty ones are
    dropped. If no snippets remain, only the header is written and
    `generate_cards` is not called.
    """

    # Read explicitly as UTF-8: the snippets contain accented characters, and
    # the platform default encoding is not UTF-8 everywhere.
    with open(in_texts_path, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # Strip each snippet and drop empty ones, so that an empty file, blank
    # lines around a separator, or a trailing separator never produce an empty
    # or malformed prompt.
    stripped = (snippet.strip() for snippet in raw_text.split("\n--\n"))
    input_texts = [snippet for snippet in stripped if snippet]

    # Only involve the model when there is something to translate.
    cards = generate_cards(input_texts, source=source) if input_texts else []

    # Write CSV correctly using a library. Note that Anki imports work much
    # better if we provide a header.
    with open(out_csv_path, "w", newline="", encoding="utf-8") as f:
        # The final "#columns:" line is deliberately left unterminated: the
        # CSV header row written below completes it.
        f.write(f"""#separator:Semicolon
#html:true
#notetype:Text Snippet
#deck:{deck}
#columns:""")
        writer = csv.DictWriter(f, fieldnames=["Front", "Back", "Notes", "Source"], delimiter=";")
        writer.writeheader()
        writer.writerows(cards)

# Command line entry point.
if __name__ == "__main__":
    import sys

    # Exactly four positional arguments are expected after the script name;
    # anything else prints the usage line and exits with status 1.
    if len(sys.argv) != 5:
        print(f"Usage: {sys.argv[0]} <deck> <source-name> <input-text-file> <output-csv-file>")
        sys.exit(1)

    # Positional order: deck, source name, input text file, output CSV file.
    deck, source, in_texts_path, out_csv_path = sys.argv[1:]

    texts_to_csv(in_texts_path, out_csv_path, deck=deck, source=source)