<a href="https://colab.research.google.com/github/davidlealo/vocacional-test/blob/main/Azure_Video_Indexer_to_jsonl_converter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Azure Video Indexer to .jsonl converter
# Compatible con Google Colab

import zipfile
import os
import json
import shutil
from google.colab import files
from datetime import datetime

# --- Step 1: Upload the ZIP file ---
print("\u2B06\uFE0F Sube tu archivo ZIP exportado desde Azure Video Indexer")
uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was dismissed) fail with a clear
# message instead of the opaque StopIteration that next() would raise.
if not uploaded:
    raise ValueError("No se subió ningún archivo ZIP.")

# Only the first uploaded file is used; any additional uploads are ignored.
zip_path = next(iter(uploaded))
extract_dir = "azure_indexer_artifacts"

# --- Step 2: Extract the ZIP contents ---
# Start from an empty directory so files left behind by an interrupted
# earlier run are not mixed into this export.
shutil.rmtree(extract_dir, ignore_errors=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"\n✅ Archivos extraídos en: {extract_dir}")

# --- Step 3: Ask for the shared metadata ---
# These four module-level values are copied into the "metadata" of every
# entry produced by the extract_* functions.
print("\n✏️ Ingresa información contextual para incluir como metadatos comunes")
clase = input("Nombre de la clase o sesión: ")
fecha = input("Fecha (YYYY-MM-DD): ")
curso = input("Curso o unidad (opcional): ")
notas = input("Notas adicionales (opcional): ")

# --- Paso 4: Funciones para procesar artefactos ---
def extract_transcript(path):
    """Build JSONL-ready entries from a speech transcript JSON file.

    One entry is produced for every item of ``recognizedPhrases`` that has
    non-empty display text. The text is taken from the phrase's own
    ``display`` field, falling back to the first ``nBest`` candidate.

    Args:
        path: Path to the transcript JSON file.

    Returns:
        A list of dicts with ``text``, ``source`` and ``metadata`` keys. The
        metadata copies the module-level ``clase``, ``fecha``, ``curso`` and
        ``notas`` values.
    """
    # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
    # json.load would otherwise reject; it also removes the dependency on
    # the platform's default encoding.
    with open(path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)
    entries = []
    for segment in data.get("recognizedPhrases", []):
        text = segment.get("display", "")
        if not text:
            # NOTE(review): Speech Services batch output appears to nest the
            # display text under nBest candidates -- confirm against a real
            # Video Indexer export.
            best = next(iter(segment.get("nBest") or []), {})
            text = best.get("display", "")
        if text:
            entries.append({
                "text": text,
                "source": "transcript",
                "metadata": {
                    "clase": clase,
                    "fecha": fecha,
                    "curso": curso,
                    "notas": notas,
                    "tipo": "transcript"
                }
            })
    return entries

def extract_ocr(path):
    """Build JSONL-ready entries from an OCR JSON file.

    Walks ``regions`` -> ``lines`` and produces one entry per line with
    non-empty text. The text is taken from the line's ``text`` field,
    falling back to the space-joined ``text`` of its ``words``.

    Args:
        path: Path to the OCR JSON file.

    Returns:
        A list of dicts with ``text``, ``source`` and ``metadata`` keys. The
        metadata copies the module-level ``clase``, ``fecha``, ``curso`` and
        ``notas`` values.
    """
    # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
    # json.load would otherwise reject; it also removes the dependency on
    # the platform's default encoding.
    with open(path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)
    entries = []
    for region in data.get("regions", []):
        for line in region.get("lines", []):
            text = line.get("text", "")
            if not text:
                # NOTE(review): the regions/lines/words OCR layout appears to
                # carry text only on each word -- confirm against a real
                # Video Indexer export.
                words = line.get("words") or []
                text = " ".join(
                    word.get("text", "") for word in words if word.get("text")
                )
            if text:
                entries.append({
                    "text": text,
                    "source": "ocr",
                    "metadata": {
                        "clase": clase,
                        "fecha": fecha,
                        "curso": curso,
                        "notas": notas,
                        "tipo": "ocr"
                    }
                })
    return entries

def extract_labels(path):
    """Build JSONL-ready entries from a visual-labels JSON file.

    Produces one entry for every item of ``labels`` that has a non-empty
    ``name``.

    Args:
        path: Path to the labels JSON file.

    Returns:
        A list of dicts with ``text``, ``source`` and ``metadata`` keys. The
        metadata copies the module-level ``clase``, ``fecha``, ``curso`` and
        ``notas`` values.
    """
    # utf-8-sig reads plain UTF-8 and also strips a leading BOM, which
    # json.load would otherwise reject; it also removes the dependency on
    # the platform's default encoding.
    with open(path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)
    entries = []
    for label in data.get("labels", []):
        name = label.get("name", "")
        if not name:
            continue
        entries.append({
            "text": name,
            "source": "visual_label",
            "metadata": {
                "clase": clase,
                "fecha": fecha,
                "curso": curso,
                "notas": notas,
                "tipo": "label"
            }
        })
    return entries

# --- Step 5: Process the three key artifacts ---
# Artifact file name -> function that parses it. Dict order fixes the order
# of the output: transcript first, then OCR, then labels.
artifact_parsers = {
    "transcript.speechservices.json": extract_transcript,
    "ocr.json": extract_ocr,
    "labels.computervision.json": extract_labels,
}

# Search the whole extracted tree rather than only its top level, so a ZIP
# that wraps the artifacts in a folder is still processed.
artifact_paths = {name: [] for name in artifact_parsers}
for root, dirs, filenames in os.walk(extract_dir):
    dirs.sort()  # make the traversal order deterministic
    for filename in sorted(filenames):
        if filename in artifact_paths:
            artifact_paths[filename].append(os.path.join(root, filename))

jsonl_entries = []
for artifact_name, parser in artifact_parsers.items():
    for artifact_path in artifact_paths[artifact_name]:
        jsonl_entries.extend(parser(artifact_path))

# An empty export is almost certainly a mistake; say so instead of silently
# producing a zero-byte file.
if not jsonl_entries:
    print("\n⚠️ No se encontraron artefactos con contenido; el archivo exportado estará vacío.")

# --- Step 6: Export the .jsonl file ---
# Path separators in the class name would point the output at another
# directory, so they are replaced the same way spaces are.
safe_clase = clase.replace(' ', '_').replace('/', '_').replace('\\', '_')
output_file = f"video_indexed_{safe_clase}.jsonl"
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 instead of the platform default.
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n" for entry in jsonl_entries
    )

print(f"\n⬇️ Archivo exportado: {output_file}")
files.download(output_file)

# Cleanup: remove the extracted artifacts and the uploaded ZIP.
shutil.rmtree(extract_dir)
os.remove(zip_path)
