# Visualize File Database

`FileDatabase` consolidates the JSON input videos and provides a search function that works on both the inputs and the LLM outputs.


## Initialization

In [None]:
from pathlib import Path

from ai_xp.database import FileDatabase

# Both lookup directories are written relative to the current working
# directory; Path.resolve() turns them into absolute paths, so the result
# depends on where the notebook is launched from.
inputs_lookup_dir_path = Path("../inputs").resolve()
outputs_lookup_dir_path = Path("../generated").resolve()

# Build the database from the inputs directory and the generated-outputs directory.
db = FileDatabase.from_paths(inputs_lookup_dir_path, outputs_lookup_dir_path)
# Bare trailing expression: the notebook renders the database object as the cell output.
db

## Demo

### Search

### Search Inputs

In [None]:
# Search the inputs with a deliberately mixed-case term
# (presumably to show that matching is case-insensitive — confirm in FileDatabase.search).
db.search(db.input_dataframe, "CoMmeNt")

In [None]:
# Multi-word query against the inputs.
# NOTE(review): whether the words are matched as one phrase or independently
# is decided by FileDatabase.search — confirm there.
db.search(db.input_dataframe, "monstre unicellulaire")


### Search LLM Outputs

In [None]:
# Same mixed-case term as the inputs search, this time run against the LLM outputs dataframe.
db.search(db.llm_output_dataframe, "CoMmeNt")

### Visualize All Inputs

In [None]:
# Bare expression: the notebook renders the whole inputs dataframe as the cell output.
db.input_dataframe

### Visualize All Metadata

In [None]:
# Bare expression: the notebook renders the whole metadata dataframe as the cell output.
db.metadata_dataframe

### Visualize Successful Metadata

In [None]:
# Keep only the metadata rows whose "status" column equals 'success'.
db.metadata_dataframe.query("status == 'success'")


### Visualize Unsuccessful Metadata

In [None]:
# Complement of the success filter: metadata rows whose "status" is anything other than 'success'.
db.metadata_dataframe.query("status != 'success'")

### Visualize All Transcripts (Listing)

In [None]:
# Bare expression: the notebook renders the whole transcript listing as the cell output.
db.transcript_dataframe

### Visualize Successful Transcripts

In [None]:
# Keep only the transcript rows whose "status" column equals 'success'.
db.transcript_dataframe.query("status == 'success'")


### Visualize Unsuccessful Transcripts

In [None]:
# Complement of the success filter: transcript rows whose "status" is anything other than 'success'.
db.transcript_dataframe.query("status != 'success'")


Retry strategy: for `NoTranscriptFound`, retry by querying the first available transcript.

In [None]:
# Transcript rows that failed with NoTranscriptFound (candidates for the retry
# strategy described above).
# Plain equality is used because ('NoTranscriptFound') is just a parenthesized
# string, not a one-element tuple; `==` states the intent directly and matches
# the other status filters in this notebook.
db.transcript_dataframe.query("status == 'NoTranscriptFound'")


Query uncommon errors:

(likely `VideoUnavailable` meaning the URL is malformed, or the video was deleted.)

In [None]:
# Exclude 'success' and the three known error names listed in the query,
# leaving only the transcript rows with any other (uncommon) status.
db.transcript_dataframe.query(
    "(status not in ('success', 'NoTranscriptFound', 'TranscriptsDisabled', 'RequestBlocked'))"
)


### Visualize Inputs with missing Metadata

In [None]:
# Inputs reported by the database as lacking metadata
# (the exact criterion lives in FileDatabase.inputs_with_missing_metadata).
db.inputs_with_missing_metadata()


### Visualize Inputs with missing Transcripts

In [None]:
# First the unfiltered view, then the same report narrowed by an indexer.
# NOTE(review): the indexer presumably selects a language code and then a
# transcript source (see the indexer couples further down) — confirm in
# FileDatabase.inputs_with_missing_transcripts.
display(db.inputs_with_missing_transcripts())
for indexer in (["en"], ["en", "manually_created"]):
    display(db.inputs_with_missing_transcripts(indexer))


### Visualize Inputs with missing Titles

In [None]:
db.input_dataframe[db.input_dataframe.title == ""]

### Visualize Inputs with "untitled" Title Slugs

Reasons: missing title OR unslugifiable titles

In [None]:
from ai_xp.utils import render_title_slug


# Slugify every input title once; the resulting series is reused by the
# "unslugifiable Titles" cell below.
# (The former bare `title_slug_series` statement was dropped: only the last
# expression of a cell is rendered, so it displayed nothing.)
title_slug_series = db.input_dataframe["title"].apply(render_title_slug)
# Inputs whose title slug is "untitled".
db.input_dataframe[title_slug_series == "untitled"]


### Visualize Inputs with unslugifiable Titles


In [None]:
# Unslugifiable titles: the slug is "untitled" although the title itself is
# not the empty string.
# One boolean mask replaces the former "filter twice, then subtract indexes"
# approach: nothing is recomputed, rows keep their original order, and
# duplicate index labels cannot drop or duplicate rows.
# NOTE: relies on `title_slug_series` computed in the "untitled" Title Slugs cell.
db.input_dataframe[
    (title_slug_series == "untitled") & (db.input_dataframe.title != "")
]

### Visualize All LLM Outputs

In [None]:
# Bare expression: the notebook renders the whole LLM outputs dataframe as the cell output.
db.llm_output_dataframe

### Visualize Inputs with missing LLM Outputs

Inputs whose `video_id` does not appear in the index of the LLM outputs dataframe.

In [None]:
# Remove from the inputs every video_id that already has an LLM output.
# errors="ignore": an output whose video_id is no longer among the inputs
# would otherwise make DataFrame.drop raise KeyError and break the cell.
df = db.input_dataframe.drop(
    db.llm_output_dataframe.index.get_level_values("video_id"),
    errors="ignore",
)
df


#### Get indexers: couples of language code and transcript source

In [None]:
# List the indexer couples known to the database; per the heading above, each
# couple pairs a language code with a transcript source.
db.get_transcript_language_and_source_indexer_couples()


#### Display all missing LLM output candidates for indexer couples.

In [None]:
# Collect, for every (language code, source) couple, the inputs that are
# candidates for a missing LLM output, then show each group under its couple.
# NOTE(review): assumes the returned object is a dict-like mapping keyed by
# (language_code, source) tuples — confirm in FileDatabase.
indexers = db.get_transcript_language_and_source_indexer_couples()
dfs = db.find_missing_llm_outputs_candidates(indexers, keep_successful_only=True)
for (language_code, source), candidates in dfs.items():
    print(language_code, source)
    display(candidates)


In [None]:
# Show the directory the database uses to look up LLM outputs
# (presumably derived from outputs_lookup_dir_path — confirm in FileDatabase.from_paths).
db.llm_output_lookup_dir_path


### Visualize Successful LLM Outputs

In [None]:
# Dataframe of the successful LLM outputs, as returned by FileDatabase.get_success_df.
db.get_success_df()


### Visualize Errored LLM Outputs

In [None]:
# Dataframe of the errored LLM outputs, as returned by FileDatabase.get_errors_df.
db.get_errors_df()


### Visualize Inputs With Missing Outputs

XXX Outdated

An input whose output file does not exist at all counts as a missing output.

Errored output files also count as missing outputs. Detection relies on the error suffix in the output markdown file name: `{title_slug}.{exc_name}.err.md` for an error, versus `{title_slug}.md` for a success.

In [None]:
# Sort by error name so that rows sharing the same exc_name are listed together.
db.inputs_with_missing_outputs().sort_values("exc_name")

Debug: verify that the latest markdown error file is the one used to provide the error name.


In [None]:
# Select the rows whose exc_name differs between keep="last" and keep="first",
# i.e. the inputs for which the choice of error file changes the reported error.
# NOTE(review): `keep` presumably picks which of several error files per input
# is retained — confirm in FileDatabase.inputs_with_missing_outputs.
# NOTE(review): `!=` is also True where both sides are NaN, so rows without any
# exc_name may show up here as well — confirm this is intended.
db.inputs_with_missing_outputs().loc[
    (
        db.inputs_with_missing_outputs(keep="last")["exc_name"]
        != db.inputs_with_missing_outputs(keep="first")["exc_name"]
    )
]

Retry strategy: first download, when no output file has been created yet (strictly speaking this is a "try" rather than a "retry" strategy).

In [None]:
# Rows with no output_path at all, i.e. inputs for which no output file is recorded.
db.inputs_with_missing_outputs().query("output_path.isna()")


Retry strategy: for `NoTranscriptFound`, retry by querying the first available transcript.

In [None]:
# Missing outputs whose error name is NoTranscriptFound (candidates for the
# retry strategy described above).
# Plain equality is used because ('NoTranscriptFound') is just a parenthesized
# string, not a one-element tuple; `==` states the intent directly.
db.inputs_with_missing_outputs().query("exc_name == 'NoTranscriptFound'")


Query uncommon errors:

(likely `VideoUnavailable` meaning the URL is malformed, or the video was deleted.)

In [None]:
# Missing outputs that do have an output file (output_path is set) but whose
# error name is none of the three known ones listed in the query.
# The two string literals are concatenated implicitly; the leading space on
# the second one keeps `)` and `and` apart in the resulting expression.
db.inputs_with_missing_outputs().query(
    "(exc_name not in ('NoTranscriptFound', 'TranscriptsDisabled', 'RequestBlocked'))"
    " and output_path.notna()"
)
