# Visualize File Database

`FileDatabase` consolidates the JSON input videos and provides a search function that works on both the inputs and the LLM outputs.


## Initialization

In [None]:
from pathlib import Path

from ai_xp.database import FileDatabase

# Both lookup directories are written relative to the current working directory
# and made absolute with `resolve()`, so the result depends on where the kernel
# was started.
inputs_lookup_dir_path = Path("../inputs").resolve()
outputs_lookup_dir_path = Path("../generated").resolve()

# Build the database from the two directories (inputs first, outputs second).
db = FileDatabase.from_paths(inputs_lookup_dir_path, outputs_lookup_dir_path)
# Trailing bare expression: the notebook renders the database's representation.
db

## Demo

### Search

### Search Inputs

In [None]:
# Mixed-case query against the inputs: presumably demonstrates that the search
# ignores case — confirm against FileDatabase.search.
db.search(db.input_dataframe, "CoMmeNt")

In [None]:
# Multi-word query (contains a space) run against the input dataframe.
db.search(db.input_dataframe, "monstre unicellulaire")


### Search Outputs

In [None]:
# Same search entry point, but the dataframe passed in is the LLM output one.
db.search(db.llm_output_dataframe, "CoMmeNt")

### Visualize All Inputs

In [None]:
# Bare expression: the notebook renders the whole input dataframe.
db.input_dataframe

### Visualize All Metadata

In [None]:
# Bare expression: the notebook renders the whole metadata dataframe, whatever the status.
db.metadata_dataframe

### Visualize Successful Metadata

In [None]:
# Keep only the metadata rows whose `status` column equals 'success'.
db.metadata_dataframe.query("status == 'success'")


### Visualize Unsuccessful Metadata

In [None]:
# Keep the metadata rows whose `status` column is anything other than 'success'.
# NOTE(review): assuming a pandas dataframe, rows with a missing status are
# selected here too (a null value is "not equal" to 'success') — confirm intended.
db.metadata_dataframe.query("status != 'success'")

### Visualize All Transcripts (Listing)

In [None]:
# Bare expression: the notebook renders the transcript dataframe (a listing,
# per the section title — presumably not the transcript contents themselves).
db.transcript_dataframe

### Visualize Inputs with missing Metadata

In [None]:
# Render whatever the database reports as inputs lacking metadata.
db.inputs_with_missing_metadata()


### Visualize Inputs with missing Transcripts

In [None]:
# First the default view (no indexer argument at all) ...
display(db.inputs_with_missing_transcripts())

# ... then one view per explicit indexer, from the least to the most specific.
# `indexer` is left bound to the last list once the loop is done.
for indexer in (["en"], ["en", "manually_created"]):
    display(db.inputs_with_missing_transcripts(indexer))


In [None]:
# TODO(eschalk): candidate step for the "input -> transcript" pipeline.
# XXX: refreshing is extremely inefficient, but it ensures the freshest view of
# the filesystem is used before the metadata is fetched.
# `db` is rebound to the refreshed instance, so later cells use the new one.
db = db.refresh()
db.fetch_metadata()


### Visualize Inputs with missing Titles

In [None]:
# Boolean-mask selection: input rows whose `title` is the empty string.
db.input_dataframe[db.input_dataframe.title == ""]

In [None]:
# Boolean-mask selection: input rows whose `title_slug` is "untitled"
# (presumably the fallback slug used when no title is available — confirm).
db.input_dataframe[db.input_dataframe.title_slug == "untitled"]

In [None]:
# Rows whose slug is "untitled" although their title is NOT empty, i.e. the
# "untitled" rows that the empty-title filter does not account for.
# Each filter is computed once and named, instead of being repeated inline.
inputs_df = db.input_dataframe
untitled_rows = inputs_df[inputs_df.title_slug == "untitled"]
empty_title_index = inputs_df[inputs_df.title == ""].index

# Index labels present among the "untitled" rows but absent from the
# empty-title rows; `.loc` then selects the matching rows for display.
untitled_rows.loc[untitled_rows.index.difference(empty_title_index)]

### Visualize All Outputs

In [None]:
# Bare expression: the notebook renders the whole LLM output dataframe.
db.llm_output_dataframe

### Visualize Successful Outputs

In [None]:
# Render the dataframe the database returns for successful outputs.
db.get_success_df()


### Visualize Errored Outputs

In [None]:
# Render the dataframe the database returns for errored outputs.
db.get_errors_df()


### Visualize Inputs With Missing Outputs

Inputs whose output file does not exist at all are counted as missing outputs.

Inputs whose output file is an error file are also counted as missing outputs. This relies on the error suffix in the output markdown file name: `{title_slug}.{exc_name}.err.md` for an error, instead of `{title_slug}.md` for a success.

In [None]:
# Group the missing outputs by error name by sorting on the `exc_name` column.
# NOTE(review): assuming a pandas dataframe, rows without an error name (null
# `exc_name`) are placed last by the default sort — confirm.
db.inputs_with_missing_outputs().sort_values("exc_name")

Debug: Verify that the latest markdown error files are used to provide the error name.


In [None]:
# Call order is kept: the default view first, then the two `keep` variants.
missing_outputs = db.inputs_with_missing_outputs()
exc_names_keep_last = db.inputs_with_missing_outputs(keep="last")["exc_name"]
exc_names_keep_first = db.inputs_with_missing_outputs(keep="first")["exc_name"]

# Show the rows whose error name changes with the `keep` argument (presumably
# which of several error files for the same input is retained — confirm).
# NOTE(review): assuming pandas, rows whose `exc_name` is null on both sides
# also compare as different — confirm whether they should be filtered out.
missing_outputs.loc[exc_names_keep_last != exc_names_keep_first]

Retry strategy: First download, no output file created yet (this is not a Retry but a Try strategy actually)

In [None]:
# Keep only the rows with a null `output_path`, i.e. (per the note above) inputs
# for which no output file has been created yet.
db.inputs_with_missing_outputs().query("output_path.isna()")


Retry strategy: For `NoTranscriptFound`, try querying the first available transcripts.

In [None]:
# Missing outputs whose `exc_name` is NoTranscriptFound.
# The right-hand side is deliberately a real one-element list:
# `('NoTranscriptFound')` would be a parenthesised string, not a tuple, and
# would only match whole values thanks to how the query parser treats a bare
# string operand of `in`. A list also makes adding more error names trivial.
db.inputs_with_missing_outputs().query(
    "exc_name in ['NoTranscriptFound']"
)


Query uncommon errors:

(likely `VideoUnavailable` meaning the URL is malformed, or the video was deleted.)

In [None]:
# Rows that have an output path (non-null `output_path`) and whose `exc_name`
# is none of the three listed error names.
# The two adjacent string literals are joined by Python into a single query
# expression; note that no whitespace separates the closing parenthesis of the
# first literal from the `and` that starts the second one.
db.inputs_with_missing_outputs().query(
    "(exc_name not in ('NoTranscriptFound', 'TranscriptsDisabled', 'RequestBlocked'))"
    "and output_path.notna()"
)
