In [None]:
from pathlib import Path

from ai_xp.database import FileDatabase

# Resolve both lookup directories to absolute paths (relative to the current
# working directory of the kernel).
inputs_lookup_dir_path = (Path("..") / "inputs").resolve()
outputs_lookup_dir_path = (Path("..") / "generated").resolve()

# Create the database from the two directories and display it as the cell output.
db = FileDatabase.from_paths(
    inputs_lookup_dir_path,
    outputs_lookup_dir_path,
)
db


In [None]:
# Display only the transcript rows whose status is 'success'.
(
    db.transcript_dataframe
    .query("status == 'success'")
)

In [None]:
import pandas as pd
from ai_xp.utils import load_json


# Paths of the transcript files to count, taken from the rows marked as successful.
successful_transcript_paths = (
    db.transcript_dataframe
    # The ("_", "_") row is excluded (presumably a placeholder entry — TODO confirm).
    # errors="ignore" keeps the cell runnable when that row is absent, instead of
    # failing with a KeyError.
    .drop(("_", "_"), errors="ignore")
    .query("status == 'success'")["path"]
    .to_list()
)

# Word count per transcript file: the sum, over every snippet of the loaded JSON,
# of the number of whitespace-separated words in the snippet's "text" field.
word_counts = {
    transcript_output_file_path: sum(
        len(snippet["text"].split())
        for snippet in load_json(transcript_output_file_path)["snippets"]
    )
    for transcript_output_file_path in successful_transcript_paths
}
word_counts

In [None]:
# One row per transcript path (used as the index), with a single "word_count" column.
df = pd.DataFrame.from_dict(
    word_counts,
    orient="index",
    columns=["word_count"],
)
df

See pricing at https://openai.com/api/pricing/. The figures below are the ones listed there for GPT-4.1 nano:

```raw
GPT-4.1 nano

Fastest, most cost-effective model for low-latency tasks
Pricing
Input:
$0.100 / 1M tokens
Cached input:
$0.025 / 1M tokens
Output:
$0.400 / 1M tokens
```

What are tokens and how to count them?

https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them

Rule of thumb: 100 tokens ≈ 75 words, i.e. about 1.33 tokens per word.


In [None]:
import locale


# Use French number formatting for the `:n` format specifier below.
# LC_ALL already includes LC_NUMERIC, so one call is enough. Locale names differ
# between platforms, so several spellings are tried; if none is installed the
# estimate still runs, with numbers formatted by the current locale.
for french_locale_name in ("fr_FR", "fr_FR.UTF-8", "fr_FR.utf8", "French_France.1252"):
    try:
        locale.setlocale(locale.LC_ALL, french_locale_name)
    except locale.Error:
        continue
    break
else:
    print("Locale fr_FR indisponible : formatage des nombres avec la locale courante.")

# Values reused several times below.
mean_word_count_per_transcript = df.mean().item()
input_video_count = len(db.input_dataframe)

print(
    f"Total du nombre de mots de tous les transcripts disponibles : {df.sum().item():n}"
)
print(f"Nombre de mots moyen par transcript: {int(mean_word_count_per_transcript):n}")
print(f"Nombre total de videos à traiter: {input_video_count:n}")

# Extrapolate from the available transcripts to every input video.
estimated_total_input_word_count = int(
    mean_word_count_per_transcript * input_video_count
)
print(
    f"Extrapolation du total de nombre de mots à traiter: {estimated_total_input_word_count:n}"
)

# Rule of thumb used for the conversion: 100 tokens ~= 75 words.
word_to_token_factor = 100 / 75

estimated_total_input_token_count = int(
    estimated_total_input_word_count * word_to_token_factor
)
print(f"Nombre attendu d'input tokens: {estimated_total_input_token_count:n}")

# Assumed average length of one generated output, in words.
estimated_mean_output_word_count = 300
estimated_total_output_word_count = estimated_mean_output_word_count * input_video_count
estimated_total_output_token_count = int(
    estimated_total_output_word_count * word_to_token_factor
)
print(f"Nombre attendu d'output tokens: {estimated_total_output_token_count:n}")

# Alternative pricing, kept for comparison:
# OpenAI o3
# price_per_input_tokens = 10 / 1_000_000
# price_per_output_tokens = 40 / 1_000_000

# GPT-4.1 nano: $0.100 per 1M input tokens, $0.400 per 1M output tokens.
price_per_input_tokens = 0.1 / 1_000_000
price_per_output_tokens = 0.4 / 1_000_000

estimated_input_price = estimated_total_input_token_count * price_per_input_tokens
estimated_output_price = estimated_total_output_token_count * price_per_output_tokens
print(f"{estimated_input_price=:.2f} $")
print(f"{estimated_output_price=:.2f} $")
estimated_total_price = estimated_input_price + estimated_output_price
print(f"{estimated_total_price=:.2f} $")


In [None]:
# Number of entries in the input dataframe.
len(db.input_dataframe)