<a href="https://colab.research.google.com/github/davidezadi/first/blob/master/Copy_of_ultimate_Gemini_srt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


# @title The Definitive Splitting & Patching Pipeline - V9.7 (Sanitized Filenames)

# @markdown ### ⚙️ Step 1: Intelligent Dependency Installation
# Try to import the third-party packages first; only when one of them is
# missing (ImportError) install them with pip and import them again.
try:
    import google.generativeai as genai
    from tqdm import tqdm
    from IPython.display import display, HTML
    print("✅ Required libraries already installed.")
except ImportError:
    print("⏳ Installing required libraries (tqdm, google-generativeai, ipython)...")
    # The leading "!" is notebook shell syntax, so this cell is not plain Python.
    !pip install -q google-generativeai tqdm ipython
    import google.generativeai as genai
    from tqdm import tqdm
    from IPython.display import display, HTML
    print("✅ Libraries installed successfully.")

# --- Import necessary libraries ---
from google.colab import drive
import subprocess, os, shutil, json, time, sys, logging, re
from pathlib import Path
from dataclasses import dataclass
import tempfile
from typing import List, NamedTuple, Optional, Dict, Tuple
from collections import defaultdict
import math
import concurrent.futures
import base64
import random

# @markdown ---
# @markdown ### 📂 Step 2: Define Your Workspace & Filenames
WORKSPACE_PATH = "/content/drive/MyDrive/GeminiProcessor"  # @param {type:"string"}
KEYS_FILENAME = "keys.json"  # @param {type:"string"}
PROMPT_FILENAME = "prompt.txt"  # @param {type:"string"}

# @markdown ---
# @markdown ### 🔧 Step 3: Set Processing Options
MODEL_NAME = "gemini-2.5-pro" # @param {type:"string"}
FORCE_REPROCESS = False #@param {type:"boolean"}
CHUNK_DURATION_MINUTES = 9.9999 #@param {type:"number"}
OUTPUT_FILENAME_SUFFIX = "_result" #@param {type:"string"}
# @markdown **PERFORMANCE SETTINGS:**
# @markdown Number of parallel processes. Set this to the number of API keys for maximum speed.
NUM_WORKERS = 10  # @param {type:"integer"}
# @markdown Number of times to retry a failed chunk before giving up.
MAX_RETRIES = 2  # @param {type:"integer"}


# @markdown ---
# @markdown ### ⚡ Step 4: Prevent Runtime Disconnects
ENABLE_KEEP_ALIVE = True  # @param {type:"boolean"}

# --- 1. CORE DATA STRUCTURES & WORKER SCRIPT ---

# Source code of the stand-alone worker script. process_chunk_worker writes this
# text to a temporary worker.py and runs it in a separate Python process.
#
# Command-line arguments read by the script, in order:
#   1. base64-encoded JSON list of API keys
#   2. model name
#   3. base64-encoded prompt text
#   4. path of the audio chunk to upload
#   5. path of the text file that receives the model response
#
# Behaviour of the script:
#   - The key list is shuffled, then the keys are tried one after another.
#   - For each key the audio upload is attempted up to 3 times (sleeping 5 s,
#     then 10 s between attempts) before the key is given up.
#   - On the first non-empty response the stripped text is written to the output
#     path and the uploaded file is deleted (deletion errors are ignored).
#   - If every key fails, or the keys cannot be decoded, a message goes to stderr
#     and the process exits with status 1.
#
# Everything between the triple quotes is runtime text; do not reformat it.
WORKER_SCRIPT_TEMPLATE = """
import google.generativeai as genai
import sys, time, base64, json, random
from pathlib import Path

def run():
    api_keys_b64 = sys.argv[1]
    model_name = sys.argv[2]
    prompt_b64 = sys.argv[3]
    audio_chunk_path_str = sys.argv[4]
    output_path_str = sys.argv[5]

    try:
        api_keys_json = base64.b64decode(api_keys_b64).decode('utf-8')
        api_keys = json.loads(api_keys_json)
        random.shuffle(api_keys)
    except Exception as e:
        print(f"Worker script failed to decode API keys: {e}", file=sys.stderr)
        sys.exit(1)

    prompt = base64.b64decode(prompt_b64).decode('utf-8')
    audio_chunk_path = Path(audio_chunk_path_str)
    output_path = Path(output_path_str)

    gemini_file_to_clean = None
    last_error = None
    success = False

    for api_key in api_keys:
        try:
            genai.configure(api_key=api_key)
            model = genai.GenerativeModel(model_name=model_name)

            for attempt in range(3):
                try:
                    uploaded_file = genai.upload_file(path=str(audio_chunk_path))
                    gemini_file_to_clean = uploaded_file
                    break # Success
                except Exception as e:
                    if attempt < 2:
                        time.sleep(5 * (attempt + 1))
                    else:
                        raise e # Propagate error to trigger key rotation

            response = model.generate_content([prompt, gemini_file_to_clean])

            if not response or not hasattr(response, 'text') or not response.text:
                raise ValueError("API returned an empty or invalid response.")

            output_path.write_text(response.text.strip(), encoding='utf-8')
            success = True
            break # Exit the key loop on success

        except Exception as e:
            key_preview = api_key[-4:]
            print(f"Attempt with key '...{key_preview}' failed: {e}", file=sys.stderr)
            last_error = e
            if gemini_file_to_clean:
                try:
                    genai.delete_file(gemini_file_to_clean.name)
                except Exception:
                    pass
                gemini_file_to_clean = None
            time.sleep(2)

    try:
        if success and gemini_file_to_clean:
            genai.delete_file(gemini_file_to_clean.name)
    except Exception:
        pass

    if not success:
        raise Exception(f"All API key attempts failed. Last error: {last_error}")

if __name__ == '__main__':
    try:
        run()
    except Exception as e:
        print(f"Worker script failed: {e}", file=sys.stderr)
        sys.exit(1)
"""

@dataclass(frozen=True)
class Config:
    """Immutable run settings plus the workspace sub-folders derived from them."""

    workspace_path: Path
    keys_filename: str
    prompt_filename: str
    output_suffix: str
    model_name: str
    prompt_template: str
    chunk_duration_seconds: float
    retry_delay_seconds: int = 5

    @property
    def input_path(self) -> Path:
        """Folder that holds the media files to process."""
        return self.workspace_path / "input"

    @property
    def results_path(self) -> Path:
        """Folder that receives the final patched results."""
        return self.workspace_path / "results"

    @property
    def temp_chunk_path(self) -> Path:
        """Folder inside the results folder that holds per-chunk outputs."""
        return self.results_path / "temp_chunks"

    @property
    def logs_path(self) -> Path:
        """Folder that holds the run logs."""
        return self.workspace_path / "logs"

@dataclass(frozen=True)
class ProcessTask:
    """One unit of work: a single chunk of one media file."""

    media_path: Path            # source media file the chunk is cut from
    original_stem: str          # stem used to name the chunk's output file
    chunk_index: int            # zero-based position of the chunk in the file
    time_offset_seconds: float  # where the chunk starts inside the media file

@dataclass
class FileInfo:
    """Duration of a media file and the number of chunks it is split into."""

    duration: float
    expected_chunks: int

class ValidatedSetup(NamedTuple):
    """Result of setup_and_validate: run configuration, API keys and log file."""

    config: Config
    api_keys: List[str]
    log_file_path: Path

# --- NEW HELPER FUNCTION TO SANITIZE FILENAMES ---
def sanitize_stem(filename_stem: str) -> str:
    """Return a filesystem-safe variant of a filename stem.

    Characters other than word characters, whitespace and hyphens are dropped,
    every run of whitespace/hyphens collapses into a single hyphen, hyphens at
    either end are removed and the result is cut to at most 150 characters.
    """
    without_symbols = re.sub(r'[^\w\s-]', '', filename_stem)
    hyphenated = re.sub(r'[\s-]+', '-', without_symbols)
    return hyphenated.strip('-')[:150]

# Matches "[H]H:MM:SS,mmm" or "MM:SS,mmm" (comma or dot before the fraction).
# Compiled once at import time instead of on every patch attempt.
_SRT_TIMESTAMP_RE = re.compile(r"(?:(\d{1,2}):)?(\d{1,2}):(\d{1,2})[,.](\d{1,3})")


def _shift_srt_timestamp(match, offset_ms: int) -> str:
    """Return the matched timestamp moved by offset_ms, formatted as HH:MM:SS,mmm."""
    h_str, m_str, s_str, ms_str = match.groups()
    # A short fraction such as ",5" means 500 ms, so pad on the right.
    millis = int(ms_str.ljust(3, '0'))
    hours = int(h_str) if h_str is not None else 0
    # Integer milliseconds avoid the float rounding/carry handling entirely.
    total_ms = ((hours * 60 + int(m_str)) * 60 + int(s_str)) * 1000 + millis + offset_ms
    total_ms = max(total_ms, 0)
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"


def try_patch_file(stem: str, config: Config, file_info_cache: Dict[str, FileInfo]):
    """Merge the chunk outputs of one file into its final result, once complete.

    Uses the SANITIZED stem for all lookups. Nothing happens when the final
    result already exists, when the stem has no entry in file_info_cache, or
    while chunk outputs 0 .. expected_chunks - 1 are not all present.

    When every chunk is present, each chunk is parsed as SRT-like blocks
    (index line, "start --> end" line, text lines). Blocks without a parsable
    time line are dropped, timestamps are shifted by the chunk's offset
    (chunk index * config.chunk_duration_seconds) and the blocks are renumbered
    from 1. The merged text is written to the results folder and the chunk
    files are deleted.

    Args:
        stem: Sanitized stem identifying the media file.
        config: Run configuration (folders, output suffix, chunk length).
        file_info_cache: Maps sanitized stems to their expected chunk count.
    """
    final_result_path = config.results_path / f"{stem}{config.output_suffix}.txt"
    if final_result_path.exists():
        return

    info = file_info_cache.get(stem)
    if not info:
        return

    # Accept only names of the exact form "<stem>_chunk_<n><suffix>.txt". The
    # glob alone would also pick up files of another stem that merely starts
    # with "<stem>_chunk_".
    chunk_name_re = re.compile(rf"{re.escape(stem)}_chunk_(\d+){re.escape(config.output_suffix)}\.txt")
    chunks_by_index = {}
    for candidate in config.temp_chunk_path.glob(f"{stem}_chunk_*.txt"):
        name_match = chunk_name_re.fullmatch(candidate.name)
        if name_match:
            chunks_by_index[int(name_match.group(1))] = candidate

    if set(chunks_by_index) != set(range(info.expected_chunks)):
        return

    logging.info(f"✅ All {info.expected_chunks} chunks for '{stem}' are present. Patching, Validating, and Re-indexing now...")

    final_blocks = []
    for chunk_index in sorted(chunks_by_index):
        chunk_file = chunks_by_index[chunk_index]
        # The offset comes from the index in the filename, not the list position.
        offset_ms = int(round(chunk_index * config.chunk_duration_seconds * 1000))
        try:
            chunk_text = chunk_file.read_text(encoding='utf-8')
        except FileNotFoundError:
            # Workers run in parallel threads: another one completed the patch
            # and already removed the chunk files.
            logging.info(f"Chunks for '{stem}' were already patched by another worker.")
            return

        for block in re.split(r'\n\s*\n', chunk_text.strip()):
            lines = block.strip().split('\n')
            if len(lines) < 2:
                continue
            parts = lines[1].split("-->")
            if len(parts) != 2:
                continue
            start_match = _SRT_TIMESTAMP_RE.match(parts[0].strip())
            end_match = _SRT_TIMESTAMP_RE.match(parts[1].strip())
            if not start_match or not end_match:
                continue

            new_start_str = _shift_srt_timestamp(start_match, offset_ms)
            new_end_str = _shift_srt_timestamp(end_match, offset_ms)
            text_lines = "\n".join(lines[2:])
            final_blocks.append(f"{len(final_blocks) + 1}\n{new_start_str} --> {new_end_str}\n{text_lines}")

    if not final_blocks:
        logging.warning(f"No valid subtitle blocks were found in the chunks of '{stem}'; the result will be empty.")

    final_result_path.write_text("\n\n".join(final_blocks), encoding='utf-8')
    logging.info(f"✅ Saved final patched result for '{stem}'. Cleaning up temp chunks.")
    for chunk_file in chunks_by_index.values():
        try:
            chunk_file.unlink()
        except FileNotFoundError:
            # Already removed by a parallel worker; nothing left to clean up.
            pass
        except OSError as e:
            logging.warning(f"Could not delete temp chunk {chunk_file.name}: {e}")

def process_chunk_worker(task: ProcessTask, config: Config, api_keys: List[str], file_info_cache: Dict[str, FileInfo]) -> Tuple[str, Optional[ProcessTask]]:
    """Extract one audio chunk, transcribe it in a subprocess and try to patch.

    Steps:
      1. ffmpeg cuts the chunk out of task.media_path into a temporary folder
         (stream copy for the listed audio formats, otherwise MP3 encoding).
      2. WORKER_SCRIPT_TEMPLATE is written to a temporary worker.py and run with
         the current interpreter; it writes the response text to the chunk
         output file in config.temp_chunk_path.
      3. On success try_patch_file is called for the chunk's file.

    Args:
        task: Chunk to process; task.original_stem is the SANITIZED stem.
        config: Run configuration.
        api_keys: API keys handed to the worker script.
        file_info_cache: Expected chunk counts, passed on to try_patch_file.

    Returns:
        ('success', task) when the chunk output was produced, otherwise
        ('failure', task). Errors are logged, never raised.
    """
    log_prefix = f"[{task.original_stem}_chunk_{task.chunk_index}]"
    try:
        with tempfile.TemporaryDirectory() as local_worker_dir_str:
            local_worker_dir = Path(local_worker_dir_str)
            source_suffix = task.media_path.suffix.lower()
            compatible_audio_formats = ['.mp3', '.flac', '.wav', '.m4a', '.aac', '.ogg']
            can_copy = source_suffix in compatible_audio_formats
            audio_codec = ['-c:a', 'copy'] if can_copy else ['-q:a', '0']
            output_extension = source_suffix if can_copy else '.mp3'
            local_audio_chunk = local_worker_dir / f"chunk_audio{output_extension}"

            logging.info(f"{log_prefix} Extracting audio chunk (Mode: {'Copy' if can_copy else 'Encode'})...")
            command = ['ffmpeg', '-hide_banner', '-loglevel', 'error', '-y', '-i', str(task.media_path), '-ss', str(task.time_offset_seconds), '-t', str(config.chunk_duration_seconds), '-vn'] + audio_codec + ['-map', 'a', str(local_audio_chunk)]
            # Keep stderr so a failed extraction can be diagnosed from the log.
            extraction = subprocess.run(command, stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True, errors='replace')
            if extraction.returncode != 0:
                error_tail = ' | '.join(extraction.stderr.strip().splitlines()[-3:]) or 'no error output'
                raise Exception(f"ffmpeg exited with code {extraction.returncode}: {error_tail}")

            api_keys_json = json.dumps(api_keys)
            api_keys_b64 = base64.b64encode(api_keys_json.encode('utf-8')).decode('utf-8')
            prompt_b64 = base64.b64encode(config.prompt_template.encode('utf-8')).decode('utf-8')
            output_path = config.temp_chunk_path / f"{task.original_stem}_chunk_{task.chunk_index}{config.output_suffix}.txt"
            worker_script_path = local_worker_dir / "worker.py"
            worker_script_path.write_text(WORKER_SCRIPT_TEMPLATE, encoding='utf-8')
            # NOTE(review): the keys travel on the command line and are visible
            # in the process list while the subprocess runs.
            worker_command = [sys.executable, str(worker_script_path), api_keys_b64, config.model_name, prompt_b64, str(local_audio_chunk), str(output_path)]
            logging.info(f"{log_prefix} Starting isolated subprocess for API call...")
            result = subprocess.run(worker_command, capture_output=True, text=True)

            if result.returncode != 0:
                raise Exception(f"Subprocess failed with error: {result.stderr.strip()}")
            logging.info(f"{log_prefix} ✅ Subprocess finished successfully.")
    except Exception as e:
        logging.error(f"{log_prefix} 🚨 PROCESSING FAILED: {e}", exc_info=False)
        return ('failure', task)

    # The chunk output is on disk at this point. A patching problem must not
    # mark the chunk as failed, otherwise the retry loop would repeat the API
    # call for a chunk that already succeeded.
    try:
        try_patch_file(task.original_stem, config, file_info_cache)
    except Exception as e:
        logging.error(f"{log_prefix} Chunk saved, but patching '{task.original_stem}' failed: {e}")
    return ('success', task)

def setup_logging(log_file_path: Path):
    """Route root-logger output to log_file_path and to stdout.

    Existing root handlers are removed first so that re-running the cell does
    not duplicate every message. Both new handlers use the same
    "time [LEVEL] message" format.

    Args:
        log_file_path: File that receives the log (written as UTF-8).
    """
    log_formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)

    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        # Release log files left open by an earlier run.
        if isinstance(stale_handler, logging.FileHandler):
            stale_handler.close()

    # Messages contain emoji, so the file encoding is fixed instead of left to
    # the platform default.
    new_handlers = (
        logging.FileHandler(log_file_path, encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    )
    for handler in new_handlers:
        handler.setFormatter(log_formatter)
        root_logger.addHandler(handler)
    logging.info("Main process logging configured.")

def setup_and_validate(workspace_str, keys_str, prompt_str, chunk_min) -> ValidatedSetup:
    """Mount Google Drive, validate the workspace and build the run configuration.

    Args:
        workspace_str: Path of the workspace folder.
        keys_str: Name of the JSON file (inside the workspace) with the API keys.
        prompt_str: Name of the text file (inside the workspace) with the prompt.
        chunk_min: Chunk length in minutes; must be greater than zero.

    Returns:
        ValidatedSetup with the Config, the list of API keys and the log file path.

    Raises:
        Exception: Google Drive could not be mounted.
        FileNotFoundError: The workspace, keys file, prompt file or input folder
            does not exist.
        ValueError: The keys file is not a non-empty JSON list of non-empty
            strings, the prompt file is empty, or chunk_min is not positive.
        EnvironmentError: ffprobe or ffmpeg is not installed.
    """
    print("--- Phase 1: Setting up and validating environment ---")
    try:
        drive.mount('/content/drive', force_remount=True)
    except Exception as e:
        raise Exception(f"Google Drive mount failed: {e}. If 'Mountpoint... exists', use 'Runtime -> Disconnect and delete'.") from e

    workspace_path = Path(workspace_str)
    # Checked before anything is created inside it, so a wrong path produces
    # this message rather than a bare mkdir error.
    if not workspace_path.exists():
        raise FileNotFoundError(f"🚨 FATAL ERROR: Workspace path ('{workspace_path}') not found.")

    logs_path = workspace_path / "logs"
    logs_path.mkdir(exist_ok=True)
    log_file = logs_path / f"batch_log_{time.strftime('%Y%m%d-%H%M%S')}.log"
    setup_logging(log_file)

    required_paths = {
        'Keys': workspace_path / keys_str,
        'Prompt': workspace_path / prompt_str,
        'Input': workspace_path / "input",
    }
    for name, path in required_paths.items():
        if not path.exists():
            raise FileNotFoundError(f"🚨 FATAL ERROR: {name} path ('{path}') not found.")
    logging.info("✅ Drive mounted and workspace paths verified.")

    try:
        api_keys = json.loads((workspace_path / keys_str).read_text(encoding='utf-8'))
        # An empty list must be rejected too: the pipeline sizes its worker
        # pool from the number of keys.
        if not isinstance(api_keys, list) or not api_keys or not all(isinstance(k, str) and k for k in api_keys):
            raise ValueError("Keys file must be a JSON list of non-empty strings.")
        logging.info(f"✅ Loaded {len(api_keys)} API keys.")
    except Exception as e:
        raise ValueError(f"Could not load/validate '{keys_str}': {e}") from e

    prompt_text = (workspace_path / prompt_str).read_text(encoding='utf-8').strip()
    if not prompt_text:
        raise ValueError(f"'{prompt_str}' is empty.")

    # The chunk length is used as a divisor when chunks are counted.
    if not chunk_min or chunk_min <= 0:
        raise ValueError(f"Chunk duration must be greater than zero minutes (got {chunk_min!r}).")

    # ffprobe measures durations, ffmpeg cuts the chunks; both are required.
    for tool in ("ffprobe", "ffmpeg"):
        if not shutil.which(tool):
            raise EnvironmentError(f"{tool} not found.")

    config = Config(workspace_path=workspace_path, keys_filename=keys_str, prompt_filename=prompt_str, output_suffix=OUTPUT_FILENAME_SUFFIX, model_name=MODEL_NAME, prompt_template=prompt_text, chunk_duration_seconds=chunk_min * 60)
    config.results_path.mkdir(exist_ok=True)
    config.temp_chunk_path.mkdir(exist_ok=True)
    return ValidatedSetup(config=config, api_keys=api_keys, log_file_path=log_file)

def get_media_duration(file_path: Path) -> float:
    """Return the duration that ffprobe reports for file_path, as a float.

    Raises subprocess.CalledProcessError when ffprobe exits with an error and
    ValueError when its output is not a number.
    """
    probe = subprocess.run(
        [
            'ffprobe', '-v', 'error',
            '-show_entries', 'format=duration',
            '-of', 'default=noprint_wrappers=1:nokey=1',
            str(file_path),
        ],
        check=True,
        capture_output=True,
        text=True,
    )
    reported_duration = probe.stdout.strip()
    return float(reported_duration)

# --- REWRITTEN run_pipeline FUNCTION TO HANDLE SANITIZED FILENAMES ---
def run_pipeline(setup: ValidatedSetup) -> Dict[str, Path]:
    """Process every pending media file in the input folder.

    Phases:
      1. Collect input files with a supported extension and map each sanitized
         stem to its file. A file whose sanitized stem is already taken by
         another input is skipped, because both would share chunk files.
      2. Pick the files without a final result. With FORCE_REPROCESS the
         previous result is removed first; try_patch_file never overwrites an
         existing result, so it could not be regenerated otherwise.
      3. Create one task per chunk that has no output yet and run the tasks on
         a thread pool, retrying failed tasks up to MAX_RETRIES times.
      4. Sweep the temp folder once more and patch every file that is complete.

    Args:
        setup: Validated configuration, API keys and log file path.

    Returns:
        Mapping of sanitized stem to the original input file for every input
        that was considered in this run.
    """
    config, api_keys, _ = setup
    VALID_EXTENSIONS = ['.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv', '.mp3', '.wav', '.m4a', '.flac', '.ogg', '.aac']
    file_info_cache = {}
    safe_stem_map = {} # Maps safe_stem back to original Path object

    def final_result_for(safe_stem: str) -> Path:
        return config.results_path / f"{safe_stem}{config.output_suffix}.txt"

    # Sorted so that the winner of a name collision is the same on every run.
    for media_file in sorted(config.input_path.iterdir()):
        if not (media_file.is_file() and media_file.suffix.lower() in VALID_EXTENSIONS):
            continue
        safe_stem = sanitize_stem(media_file.stem)
        clash = safe_stem_map.get(safe_stem)
        if clash is not None:
            logging.error(f"Skipping '{media_file.name}': its sanitized name '{safe_stem}' collides with '{clash.name}'. Rename one of the files.")
            continue
        safe_stem_map[safe_stem] = media_file

    files_to_process = {}
    if FORCE_REPROCESS:
        logging.warning("🔥 FORCE REPROCESS ENABLED: All existing final results will be ignored.")
        for safe_stem, media_file in safe_stem_map.items():
            stale_result = final_result_for(safe_stem)
            try:
                if stale_result.exists():
                    stale_result.unlink()
                    logging.warning(f"Removed previous result '{stale_result.name}' so it can be regenerated.")
            except OSError as e:
                logging.error(f"Cannot replace existing result '{stale_result.name}': {e}. Skipping '{media_file.name}'.")
                continue
            files_to_process[safe_stem] = media_file
    else:
        files_to_process = {
            safe_stem: media_file
            for safe_stem, media_file in safe_stem_map.items()
            if not final_result_for(safe_stem).exists()
        }

    if not files_to_process:
        logging.info("✅ No new input files to process.")
        return safe_stem_map

    all_tasks = []
    logging.info(f"Found {len(files_to_process)} new files to process. Analyzing and preparing tasks...")
    for safe_stem, media_file in files_to_process.items():
        try:
            duration = get_media_duration(media_file)
            num_chunks = math.ceil(duration / config.chunk_duration_seconds)
            file_info_cache[safe_stem] = FileInfo(duration=duration, expected_chunks=num_chunks)
            existing_temp_chunks = {f.name for f in config.temp_chunk_path.glob(f"{safe_stem}_chunk_*.txt")}
            new_tasks = [
                ProcessTask(media_path=media_file, original_stem=safe_stem, chunk_index=i, time_offset_seconds=i * config.chunk_duration_seconds)
                for i in range(num_chunks)
                if f"{safe_stem}_chunk_{i}{config.output_suffix}.txt" not in existing_temp_chunks
            ]
            all_tasks.extend(new_tasks)
            logging.info(f" -> Found {len(new_tasks)} new chunks to process for '{media_file.name}'.")
        except Exception as e:
            logging.error(f"Failed to prepare tasks for {media_file.name}: {e}")

    if not all_tasks:
        logging.info("✅ All chunks for all files are already processed.")
    else:
        # ThreadPoolExecutor rejects max_workers < 1, so never go below one.
        num_workers = max(1, min(NUM_WORKERS, len(api_keys), len(all_tasks)))
        logging.info(f"\nStarting processing for {len(all_tasks)} total new chunks with {num_workers} parallel workers.")
        tasks_to_process = all_tasks
        for attempt in range(MAX_RETRIES + 1):
            if not tasks_to_process:
                break
            if attempt > 0:
                logging.warning(f"\n--- 🔁 Retrying {len(tasks_to_process)} failed chunks (Attempt {attempt + 1}/{MAX_RETRIES + 1}) ---")
                time.sleep(5)
            failed_tasks_this_round = []
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
                futures = {executor.submit(process_chunk_worker, task, config, api_keys, file_info_cache): task for task in tasks_to_process}
                progress_desc = f"Processing Chunks (Attempt {attempt+1})"
                for future in tqdm(concurrent.futures.as_completed(futures), total=len(tasks_to_process), desc=progress_desc):
                    try:
                        status, task = future.result()
                        if status == 'failure':
                            failed_tasks_this_round.append(task)
                    except Exception as exc:
                        task = futures[future]
                        logging.error(f"A worker thread for task [{task.original_stem}_chunk_{task.chunk_index}] raised an unhandled exception: {exc}")
                        failed_tasks_this_round.append(task)
            tasks_to_process = failed_tasks_this_round

    logging.info("\n--- Final check for any completed files to patch ---")
    chunk_stem_re = re.compile(r'(.+)_chunk_')
    stems_with_chunks = set()
    for chunk_file in config.temp_chunk_path.iterdir():
        stem_match = chunk_stem_re.match(chunk_file.name)
        if stem_match:
            stems_with_chunks.add(stem_match.group(1))

    final_check_cache = file_info_cache.copy()
    for stem in stems_with_chunks:
        if stem in final_check_cache:
            continue
        original_file = safe_stem_map.get(stem)
        if original_file is None:
            logging.warning(f"Could not find original input for safe_stem '{stem}' during final check: no matching input file")
            continue
        try:
            duration = get_media_duration(original_file)
            num_chunks = math.ceil(duration / config.chunk_duration_seconds)
            final_check_cache[stem] = FileInfo(duration=duration, expected_chunks=num_chunks)
        except Exception as e:
            logging.warning(f"Could not find original input for safe_stem '{stem}' during final check: {e}")

    for stem in final_check_cache:
        # One file that cannot be patched must not stop the remaining files.
        try:
            try_patch_file(stem, config, final_check_cache)
        except Exception as e:
            logging.error(f"Patching '{stem}' failed during final check: {e}")
    logging.info("Final check complete.")
    return safe_stem_map

# --- 4. SCRIPT ENTRY POINT ---
if __name__ == '__main__':
    # Optional keep-alive: injects a script that clicks the Colab connect button
    # once a minute.
    if 'google.colab' in sys.modules and ENABLE_KEEP_ALIVE:
        js_code = "function ClickConnect(){ console.log('Keeping Colab session active...'); document.querySelector('colab-connect-button').click(); } setInterval(ClickConnect, 60000);"
        display(HTML(f"<script>{js_code}</script>")); print("✅ Runtime Keep-Alive enabled.")

    log_file_path = None
    all_targeted_files_map = {}
    try:
        validated_setup = setup_and_validate(WORKSPACE_PATH, KEYS_FILENAME, PROMPT_FILENAME, CHUNK_DURATION_MINUTES)
        log_file_path = validated_setup.log_file_path
        all_targeted_files_map = run_pipeline(validated_setup)
        logging.info("✅ Script finished successfully.")
    except Exception as e:
        print(f"\n🚨 SCRIPT HALTED: {e}")
        if logging.getLogger().hasHandlers():
            logging.critical(f"SCRIPT HALTED: {e}", exc_info=True)
    finally:
        # The summary is printed even when the run was halted by an error.
        print("\n--- 📊 Final Processing Summary ---")
        if not all_targeted_files_map:
            print("No input files were found or targeted in this run.")
        else:
            results_dir = Path(WORKSPACE_PATH) / "results"
            succeeded_files = []
            failed_files = []

            for safe_stem, original_path in all_targeted_files_map.items():
                # Look for the exact result file. Stripping the suffix from the
                # result names with str.replace would also remove it from the
                # middle of a stem (e.g. "exam_results" with suffix "_result").
                expected_result = results_dir / f"{safe_stem}{OUTPUT_FILENAME_SUFFIX}.txt"
                if expected_result.is_file():
                    succeeded_files.append(original_path.name)
                else:
                    failed_files.append(original_path.name)

            if succeeded_files:
                print("\n✅ Successfully Processed and Patched:")
                for f_name in sorted(succeeded_files):
                    print(f"  - {f_name}")
            if failed_files:
                print("\n🚨 Files That May Have Failed (result not found):")
                for f_name in sorted(failed_files):
                    print(f"  - {f_name}")

        print("\n------------------------------------")
        if log_file_path and log_file_path.exists():
            print(f"\n--> Run ended. Detailed log available at: {log_file_path}")
        else:
            print("\n--> Run ended.")

In [None]:


# ===================================================================================
#
#  SRT SPLITTER & CLEANER (FINAL - ROBUST TIMESTAMP PARSING)
#
# ===================================================================================
# @markdown # **Step 1: Define Your Workspace**
# @markdown Confirm your main folder path and source file suffix below.
# @markdown ---
WORKSPACE_PATH = "/content/drive/MyDrive/GeminiProcessor" # @param {type:"string"}
SOURCE_FILE_SUFFIX = "_result" # @param {type:"string"}

# @markdown # **Step 4: Advanced Line Splitting (Readability-Aware)**
# @markdown ---
# @markdown Enable this to automatically split subtitles that are too long or too fast to read.
SPLIT_LONG_LINES = True #@param {type:"boolean"}
# @markdown Lines longer than this character count will be split.
MAX_LINE_LENGTH = 48 #@param {type:"integer"}
# @markdown Subtitles requiring a reading speed faster than this will be split.
# @markdown (CPS = Characters Per Second. Recommended: 15-22)
MAX_READING_SPEED_CPS = 20 #@param {type:"integer"}


# ===================================================================================
#  --- Initial Setup: Mount Google Drive ---
# ===================================================================================
import os, math
from pathlib import Path
from google.colab import drive
from IPython.display import display, HTML, clear_output

# DRIVE_MOUNTED gates the rest of the cell: it becomes True only when the mount
# call succeeded AND WORKSPACE_PATH is an existing directory.
DRIVE_MOUNTED = False
try:
    drive.mount('/content/drive', force_remount=True)
    workspace_check = Path(WORKSPACE_PATH)
    if not workspace_check.is_dir():
        # Mounted, but the configured folder is missing: warn and stay disabled.
        print(f"⚠️ Warning: WORKSPACE_PATH does not exist: {WORKSPACE_PATH}")
    else:
        print(f"✅ Google Drive mounted. Workspace ready at: {WORKSPACE_PATH}")
        DRIVE_MOUNTED = True
except Exception as e:
    # Any mount problem is reported; DRIVE_MOUNTED stays False.
    print(f"🚨 Failed to mount Google Drive: {e}")

# ===================================================================================
#  --- Main Application Engine & UI ---
# ===================================================================================
if DRIVE_MOUNTED:
    import re, logging, textwrap, sys
    from tqdm.notebook import tqdm
    import ipywidgets as widgets

    # --- SETUP & GLOBAL CONSTANTS ---
    # Display name -> set of characters belonging to that group. The analysis
    # step offers one "remove" checkbox for every group that occurs in the
    # sampled files.
    # NOTE(review): the "Chinese/Japanese/Korean" entry covers only the code
    # points U+4E00..U+9FFF, so kana and Hangul are not matched by it.
    CHAR_SETS_MAP = {
        "Russian/Cyrillic": set("АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"),
        "Chinese/Japanese/Korean": set("".join(chr(c) for c in range(0x4E00, 0x9FFF + 1))),
        "Hindi/Devanagari": set("".join(chr(c) for c in range(0x0900, 0x097F + 1))),
        "Arabic": set("".join(chr(c) for c in range(0x0600, 0x06FF + 1))),
        "Emojis/Symbols": set("".join(chr(c) for c in range(0x1F300, 0x1F64F + 1)) + "★☆♪♫")
    }

    # --- CORE PROCESSING LOGIC ---
    def setup_logging(workspace_path):
        """Send root-logger output to srt_splitter_log.txt and to stdout.

        The log file lives in workspace_path and is overwritten on every call.
        Handlers left on the root logger by earlier cells or runs are removed
        first. logging.basicConfig is not used because it does nothing when the
        root logger already has handlers.
        """
        log_path = Path(workspace_path) / "srt_splitter_log.txt"
        root_logger = logging.getLogger('')
        root_logger.setLevel(logging.INFO)

        for stale_handler in list(root_logger.handlers):
            root_logger.removeHandler(stale_handler)
            # Release log files left open by an earlier run.
            if isinstance(stale_handler, logging.FileHandler):
                stale_handler.close()

        file_handler = logging.FileHandler(log_path, mode='w', encoding='utf-8')
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
        root_logger.addHandler(file_handler)

        # Added unconditionally: FileHandler is itself a StreamHandler, so an
        # isinstance check for "is there a console handler" is always true once
        # the file handler exists.
        console = logging.StreamHandler(sys.stdout)
        console.setLevel(logging.INFO)
        console.setFormatter(logging.Formatter('%(message)s'))
        root_logger.addHandler(console)

    # --- CORRECTED Timecode Helper Functions ---
    def srt_time_to_ms(time_str):
        """Convert 'HH:MM:SS,mmm' or 'MM:SS,mmm' into a millisecond count."""
        # A single colon means the hours field was left out.
        normalized = time_str if time_str.count(':') != 1 else "00:" + time_str
        fields = [int(part) for part in re.split('[:,]', normalized)]
        hours, minutes, seconds, millis = fields
        return (hours * 3600 + minutes * 60 + seconds) * 1000 + millis

    def ms_to_srt_time(ms_total):
        """Format an integer millisecond count as 'HH:MM:SS,mmm'."""
        total_seconds, millis = divmod(ms_total, 1000)
        total_minutes, seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(total_minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

    def should_split_subtitle(text, start_ms, end_ms, max_len, max_cps):
        """Tell whether a subtitle is too long or too fast to read.

        Returns False for a zero or negative duration. Otherwise returns True
        when the text has more than max_len characters or needs more than
        max_cps characters per second.
        """
        span_seconds = (end_ms - start_ms) / 1000.0
        if span_seconds <= 0:
            return False
        char_count = len(text)
        if char_count > max_len:
            return True
        return char_count / span_seconds > max_cps

    def split_subtitle_by_readability(text, start_ms, end_ms, max_len):
        """Split one subtitle into several shorter ones that follow each other.

        The text is wrapped at max_len characters without breaking words. Each
        wrapped line receives a share of the original duration proportional to
        its character count; the last line always ends at end_ms.

        Returns a list of (text, start, end) tuples with times formatted as
        'HH:MM:SS,mmm'. Text that fits on one line comes back as a single tuple
        with the original times.
        """
        wrapped = textwrap.wrap(text, width=max_len, break_long_words=False, replace_whitespace=False)
        if len(wrapped) < 2:
            return [(text, ms_to_srt_time(start_ms), ms_to_srt_time(end_ms))]

        span_ms = end_ms - start_ms
        char_total = sum(len(piece) for piece in wrapped)
        last_index = len(wrapped) - 1

        pieces = []
        cursor_ms = start_ms
        for index, piece in enumerate(wrapped):
            if index == last_index:
                # Pin the final piece to the original end time.
                piece_end_ms = end_ms
            else:
                share = len(piece) / char_total if char_total > 0 else 1 / len(wrapped)
                piece_end_ms = cursor_ms + span_ms * share
            pieces.append((piece, ms_to_srt_time(int(cursor_ms)), ms_to_srt_time(int(piece_end_ms))))
            cursor_ms = piece_end_ms
        return pieces

    def process_single_file(filepath, config):
        """Split one multi-language result file into one .srt file per language.

        The file is scanned for "start --> end" time lines. The lines that
        follow a time line, up to the next one, are searched for the configured
        language prefixes (for example "en:"); the text after the prefix becomes
        that language's subtitle for the time span.

        Keys read from config:
            'languages': dict mapping a line prefix to an output name suffix.
            'blocklist': optional set of characters removed from every text.
            'split_long_lines', 'max_line_length', 'max_cps': readability split.
            'source_suffix': trailing part of the file stem that is dropped.
            'output_dir': folder that receives the .srt files.

        Returns:
            "success", "skipped_empty" for an empty file, or "failed" when an
            error occurred (the error is logged).
        """
        try:
            content = filepath.read_text(encoding='utf-8').strip()
            if not content:
                return "skipped_empty"
            # Repair "SS:mmm" into "SS,mmm" before and after the arrow.
            content = re.sub(r':(\d{3})\s*-->', r',\1 -->', content)
            content = re.sub(r':(\d{3})$', r',\1', content, flags=re.MULTILINE)
            # Start and end are captured as groups so that they never have to be
            # recovered by splitting on an exact ' --> ' string; the pattern
            # accepts any spacing around the arrow.
            matches = list(re.finditer(r'((?:\d{2}:)?\d{2}:\d{2},\d{3})\s*-->\s*((?:\d{2}:)?\d{2}:\d{2},\d{3})', content))
            if not matches:
                raise ValueError("No valid timestamps found.")

            languages = config['languages']
            srt_data = {p: [] for p in languages}
            blocklist = config.get('blocklist', set())
            for i, match in enumerate(matches):
                timestamp = match.group(0)
                body_start = match.end()
                body_end = matches[i+1].start() if i+1 < len(matches) else len(content)
                texts = {p: "" for p in languages}
                for raw_line in content[body_start:body_end].strip().split('\n'):
                    # Prefix detection allows leading whitespace, so indented
                    # lines must be matched here as well.
                    line = raw_line.strip()
                    if not line:
                        continue
                    for p in languages:
                        if line.startswith(p) and ':' in line:
                            texts[p] = line.split(':', 1)[1].strip()
                            break
                if not any(texts.values()):
                    continue

                for p, t in texts.items():
                    clean_t = "".join(c for c in t if c not in blocklist) if blocklist else t
                    # Removing blocked characters can leave only whitespace.
                    clean_t = clean_t.strip()
                    if not clean_t:
                        continue

                    entries = [(clean_t, timestamp)]
                    if config['split_long_lines']:
                        start_ms, end_ms = srt_time_to_ms(match.group(1)), srt_time_to_ms(match.group(2))
                        if should_split_subtitle(clean_t, start_ms, end_ms, config['max_line_length'], config['max_cps']):
                            new_blocks = split_subtitle_by_readability(clean_t, start_ms, end_ms, config['max_line_length'])
                            entries = [(text_chunk, f"{start_chunk_str} --> {end_chunk_str}") for text_chunk, start_chunk_str, end_chunk_str in new_blocks]

                    for text_chunk, time_line in entries:
                        srt_data[p].append(f"{len(srt_data[p]) + 1}\n{time_line}\n{text_chunk}")

            if not any(srt_data.values()):
                raise ValueError("No valid blocks were extracted.")

            # Drop the source suffix only where it ends the stem; replacing it
            # everywhere would also cut it out of names such as "exam_results".
            base_name = filepath.stem
            source_suffix = config['source_suffix']
            if source_suffix and base_name.endswith(source_suffix):
                base_name = base_name[:-len(source_suffix)]

            for p, blocks in srt_data.items():
                if blocks:
                    (config['output_dir'] / f"{base_name}{languages[p]}.srt").write_text('\n\n'.join(blocks) + '\n', 'utf-8')
            return "success"
        except Exception as e:
            logging.error(f"Failed to process '{filepath.name}': {e}")
            return "failed"

    # --- DYNAMIC UI WIDGETS AND FUNCTIONS ---
    lang_checkboxes, char_checkboxes = {}, {}
    analyze_button = widgets.Button(description="Analyze Files", button_style='primary', icon='search', layout=widgets.Layout(width='200px'))
    process_button = widgets.Button(description="Run Processing", button_style='success', icon='cogs', layout=widgets.Layout(width='200px', visibility='hidden'))
    lang_box, char_box, status_area = widgets.VBox(), widgets.VBox(), widgets.Output()

    def on_analyze_click(b):
        """Scan the source files and build the language / character-set options.

        Reads up to 10 files ending in '<SOURCE_FILE_SUFFIX>.txt' from the
        results folder and its 'chunk_results' sub-folder, collects the line
        prefixes (1-4 letters followed by a colon) and the CHAR_SETS_MAP groups
        that occur in them, and creates one checkbox per prefix (ticked) and per
        character group (unticked). The processing button is shown on success.

        Args:
            b: The clicked button; disabled while the analysis runs.
        """
        global lang_checkboxes, char_checkboxes
        b.disabled = True
        lang_box.children, char_box.children = [], []
        process_button.layout.visibility = 'hidden'
        try:
            with status_area:
                clear_output(wait=True); print("🚀 Analyzing files...")
                try:
                    workspace = Path(WORKSPACE_PATH); results_dir = workspace / "results"
                    dirs_to_scan = [results_dir, results_dir / "chunk_results"]
                    files = [p for s_dir in dirs_to_scan if s_dir.is_dir() for p in s_dir.glob(f"*{SOURCE_FILE_SUFFIX}.txt")]
                    if not files:
                        print(f"❌ No source files found ending with '{SOURCE_FILE_SUFFIX}.txt'.")
                        return

                    prefixes, chars = set(), set()
                    for f in files[:10]:
                        content = f.read_text('utf-8')
                        prefixes.update(re.findall(r'^\s*([a-zA-Z]{1,4}:)', content, re.MULTILINE))
                        # One set intersection test per group instead of a
                        # lookup per character.
                        seen_chars = set(content)
                        chars.update(name for name, char_set in CHAR_SETS_MAP.items() if not char_set.isdisjoint(seen_chars))
                    if not prefixes:
                        print("⚠️ Could not detect language prefixes (like 'en:').")
                        return

                    # The checkbox state is the 'value' trait; the former 'v='
                    # keyword is not a Checkbox trait, so the boxes were never
                    # pre-ticked.
                    lang_checkboxes = {p: widgets.Checkbox(value=True, description=p, indent=False) for p in sorted(prefixes)}
                    char_checkboxes = {c: widgets.Checkbox(value=False, description=c, indent=False) for c in sorted(chars)}
                    lang_box.children = [widgets.HTML("<b>Step 2: Select languages to extract</b>")] + list(lang_checkboxes.values())
                    if char_checkboxes:
                        char_box.children = [widgets.HTML("<b>Step 3: Select character sets to remove</b>")] + list(char_checkboxes.values())
                    process_button.layout.visibility = 'visible'
                    clear_output(wait=True); print("✅ Analysis complete. Review options and click 'Run Processing'.")
                except Exception as e:
                    clear_output(wait=True); print(f"🚨 Analysis Error: {e}")
        finally:
            # Single place that re-enables the button, whichever path was taken.
            b.disabled = False

    def on_process_click(b):
        """Run the SRT extraction for the ticked languages and character sets.

        Builds the processing config from the checkbox state and the module-level
        settings, calls ``process_single_file`` on every ``*{SOURCE_FILE_SUFFIX}.txt``
        file in ``results`` and ``results/chunk_results``, and prints a summary into
        ``status_area``.

        Args:
            b: The button widget that fired the click.

        All controls are locked while the run is in progress and released again
        when it ends, whether it finished, aborted early, or raised. Previously a
        failure left the whole UI disabled, and the "No languages selected" path
        left the checkboxes disabled so the selection could not be corrected.
        """
        controls = [b, analyze_button, *lang_checkboxes.values(), *char_checkboxes.values()]
        for control in controls:
            control.disabled = True
        try:
            with status_area:
                clear_output(wait=True)
                try:
                    print("⚙️ Preparing to process...")
                    selected_prefixes = [p for p, cb in lang_checkboxes.items() if cb.value]
                    if not selected_prefixes:
                        print("❌ No languages selected.")
                        return

                    # "en:" -> "_en": the per-language suffix of the output file name.
                    languages = {p: f"_{p.replace(':', '').lower()}" for p in selected_prefixes}
                    selected_chars = [c for c, cb in char_checkboxes.items() if cb.value]
                    blocklist = set().union(*(CHAR_SETS_MAP[name] for name in selected_chars))
                    workspace = Path(WORKSPACE_PATH)
                    setup_logging(workspace)
                    logging.info(f"Languages: {languages}")
                    logging.info(f"Chars to remove: {selected_chars or 'None'}")

                    config = {
                        "source_suffix": SOURCE_FILE_SUFFIX,
                        "languages": languages,
                        "blocklist": blocklist,
                        "split_long_lines": SPLIT_LONG_LINES,
                        "max_line_length": MAX_LINE_LENGTH,
                        "max_cps": MAX_READING_SPEED_CPS,
                        "output_dir": (workspace / "results" / "final_srt_files"),
                    }
                    # parents=True: do not fail when the results folder itself is missing.
                    config['output_dir'].mkdir(parents=True, exist_ok=True)

                    results_dir = workspace / "results"
                    dirs_to_scan = [results_dir, results_dir / "chunk_results"]
                    files = [p for s_dir in dirs_to_scan if s_dir.is_dir() for p in s_dir.glob(f"*{config['source_suffix']}.txt")]

                    print(f"Found {len(files)} files. Starting...")
                    success_count = 0
                    for f in tqdm(files, desc="Processing Files"):
                        if process_single_file(f, config) == "success":
                            success_count += 1

                    print("\n" + "="*50); print(" PROCESSING COMPLETE".center(50)); print("="*50)
                    print(f"  ✅ Successfully processed: {success_count} files")
                    print(f"  🚨 Failed or skipped: {len(files) - success_count} files")
                    print(f"  📂 Final SRTs saved in: {config['output_dir']}")
                    print(f"  ℹ️ Log saved at: {workspace / 'srt_splitter_log.txt'}")
                except Exception as e:
                    # UI boundary: surface the failure in the output area.
                    clear_output(wait=True)
                    print(f"🚨 Processing Error: {e}")
        finally:
            for control in controls:
                control.disabled = False

    # --- Attach Functions to Buttons & Display UI ---
    # Register the click callbacks defined above on their buttons.
    analyze_button.on_click(on_analyze_click)
    process_button.on_click(on_process_click)
    # Layout, top to bottom: instruction line, the two buttons side by side, the
    # language checkboxes, the character-set checkboxes, then the status output.
    display(widgets.VBox([
        widgets.HTML("<b>Click 'Analyze Files' to configure the processor.</b>"),
        widgets.HBox([analyze_button, process_button]), lang_box, char_box, status_area
    ]))

In [None]:


# @markdown ### ⚙️ Step 5: Definitive SRT Splitter (Handles All Known Errors)
# @markdown This final version is the most robust. It automatically:
# @markdown 1.  Fixes invalid timestamp formats (e.g., `SS:ms` -> `SS,ms`).
# @markdown 2.  Handles both `HH:MM:SS,ms` and `MM:SS,ms` timestamps.
# @markdown 3.  Repairs missing SRT index numbers.
# @markdown 4.  Ignores incomplete "garbage" blocks at the end of a file.
# @markdown 5.  Saves all final, clean SRTs to a dedicated output folder.

import os
import re
from pathlib import Path
from google.colab import drive
import shutil

# --- Configuration (Self-Contained) ---
# @markdown **Ensure these values match the settings in your main script!**
# Root folder; the splitter scans `<WORKSPACE_PATH>/results` and
# `<WORKSPACE_PATH>/results/chunk_results`.
WORKSPACE_PATH = "/content/drive/MyDrive/GeminiProcessor" # @param {type:"string"}
# Source files are matched with the glob `*{OUTPUT_FILENAME_SUFFIX}.txt`, and the
# suffix is removed from the file stem to form the base name of the output SRTs.
OUTPUT_FILENAME_SUFFIX = "_result" # @param {type:"string"}

# @markdown **Define the suffixes and the dedicated output folder.**
# Sub-folder of `results` that receives the generated .srt files.
FINAL_SRT_FOLDER_NAME = "final_srt_files" # @param {type:"string"}
# Appended to the base name for the SRT built from the `I:` lines of each block.
PHONETIC_FILE_SUFFIX = "_phonetic" # @param {type:"string"}
# Appended to the base name for the SRT built from the `f:` lines of each block.
FARSI_FILE_SUFFIX = "_farsi" # @param {type:"string"}

# @markdown Check this box to delete the original `.txt` files after a successful split.
# When True, each source .txt is unlinked right after both of its SRTs are written.
DELETE_ORIGINAL_TXT_FILES = False # @param {type:"boolean"}

def _normalize_srt_timestamps(content):
    """Rewrite ``SS:mmm`` millisecond separators on timestamp lines to ``SS,mmm``."""
    # Start timestamp: the colon sits right before the arrow.
    content = re.sub(r':(\d{3})\s*-->', r',\1 -->', content)
    # End timestamp: anchored to the arrow so that a subtitle text line that merely
    # ends in ":123" is left untouched.
    return re.sub(r'(-->\s*[\d:]+):(\d{3})[ \t]*$', r'\1,\2', content, flags=re.MULTILINE)


def _strip_trailing_suffix(stem, suffix):
    """Return *stem* without one trailing *suffix*; other occurrences are kept."""
    if suffix and stem.endswith(suffix):
        return stem[:-len(suffix)]
    return stem


def _extract_dual_language_blocks(content, source_name):
    """Split normalized text into renumbered phonetic and Farsi SRT blocks.

    Args:
        content: File text whose timestamps have already been normalized.
        source_name: File name, used only in the warning messages.

    Returns:
        A ``(phonetic_blocks, farsi_blocks)`` pair of equally long lists. Each
        entry is ``"<index>\\n<timestamp>\\n<text>"`` with indices starting at 1.

    Raises:
        ValueError: If the text contains no recognizable timestamp line.
    """
    timestamp_pattern = r'(?:\d{2}:)?\d{2}:\d{2},\d{3}\s*-->\s*(?:\d{2}:)?\d{2}:\d{2},\d{3}'
    matches = list(re.finditer(timestamp_pattern, content))
    if not matches:
        raise ValueError("After normalization, still no valid timestamps found.")

    phonetic_blocks, farsi_blocks = [], []
    for i, match in enumerate(matches):
        timestamp = match.group(0)
        # A block's text runs from the end of its timestamp to the start of the next one.
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(content)
        text_lines = [line for line in content[match.end():end_pos].strip().split('\n') if line.strip()]

        # --- NORMALIZATION STEP 2: Handle incomplete final blocks ---
        if len(text_lines) < 2:
            print(f"    ⚠️ Warning: Incomplete block after '{timestamp}' in '{source_name}'. Ignoring it.")
            continue

        # Only the first two lines are used; whatever follows (typically the next
        # block's index number) is ignored.
        line_i, line_f = text_lines[0], text_lines[1]
        if not line_i.strip().startswith('I:') or not line_f.strip().startswith('f:'):
            print(f"    ⚠️ Warning: Block after '{timestamp}' in '{source_name}' lacks prefixes. Ignoring it.")
            continue

        # Indices are regenerated, which also repairs missing or wrong numbers.
        index = len(phonetic_blocks) + 1
        phonetic_blocks.append(f"{index}\n{timestamp}\n{line_i.split(':', 1)[1].strip()}")
        farsi_blocks.append(f"{index}\n{timestamp}\n{line_f.split(':', 1)[1].strip()}")

    return phonetic_blocks, farsi_blocks


def srt_splitter_main():
    """
    Finds dual-language text files, normalizes common AI errors, splits them
    into separate SRT files, saves to a dedicated folder, and cleans up.

    Mounts Google Drive, scans ``results/chunk_results`` and ``results`` under
    WORKSPACE_PATH for ``*{OUTPUT_FILENAME_SUFFIX}.txt`` files and, for each one,
    writes ``<base>{PHONETIC_FILE_SUFFIX}.srt`` (from the ``I:`` lines) and
    ``<base>{FARSI_FILE_SUFFIX}.srt`` (from the ``f:`` lines) into
    ``results/{FINAL_SRT_FOLDER_NAME}``. A file that fails is reported and
    counted as skipped; it never aborts the run. Returns None.
    """
    print("--- Starting Definitive Dual-Language SRT Splitter ---")

    try:
        drive.mount('/content/drive', force_remount=True)
        print("✅ Google Drive mounted.")

        workspace = Path(WORKSPACE_PATH)
        results_dir = workspace / "results"
        chunk_results_dir = results_dir / "chunk_results"

        final_output_dir = results_dir / FINAL_SRT_FOLDER_NAME
        # parents=True: a missing results folder must not be a fatal error.
        final_output_dir.mkdir(parents=True, exist_ok=True)
        print(f"✅ Final SRT files will be saved to: '{final_output_dir.relative_to(workspace)}'")

        total_processed = 0
        total_skipped = 0

        for source_dir in (chunk_results_dir, results_dir):
            if not source_dir.is_dir() or source_dir == final_output_dir:
                continue

            print(f"\n🔍 Scanning directory: '{source_dir.relative_to(workspace)}'...")
            glob_pattern = f"*{OUTPUT_FILENAME_SUFFIX}.txt"
            txt_files = list(source_dir.glob(glob_pattern))

            if not txt_files:
                print(f"  ✅ No '{glob_pattern}' files found to process here.")
                continue

            print(f"  Found {len(txt_files)} source files to process.")
            for txt_path in txt_files:
                try:
                    content = txt_path.read_text(encoding='utf-8').strip()
                    if not content:
                        print(f"  ⚠️ Skipping '{txt_path.name}': File is empty.")
                        total_skipped += 1
                        continue

                    # --- NORMALIZATION STEP 1: Fix invalid timestamps (SS:ms -> SS,ms) ---
                    content = _normalize_srt_timestamps(content)
                    phonetic_srt_blocks, farsi_srt_blocks = _extract_dual_language_blocks(content, txt_path.name)

                    # Ensure we actually processed something before saving
                    if not phonetic_srt_blocks:
                        raise ValueError("No valid, complete blocks could be extracted from the file.")

                    # Remove only the trailing suffix; str.replace would also delete
                    # the same text from the middle of the file name.
                    base_name = _strip_trailing_suffix(txt_path.stem, OUTPUT_FILENAME_SUFFIX)

                    # End each file with a newline, as the UI-based SRT processor does.
                    phonetic_srt_path = final_output_dir / f"{base_name}{PHONETIC_FILE_SUFFIX}.srt"
                    phonetic_srt_path.write_text('\n\n'.join(phonetic_srt_blocks) + '\n', encoding='utf-8')

                    farsi_srt_path = final_output_dir / f"{base_name}{FARSI_FILE_SUFFIX}.srt"
                    farsi_srt_path.write_text('\n\n'.join(farsi_srt_blocks) + '\n', encoding='utf-8')

                    print(f"  ✅ Processed '{txt_path.name}' -> Saved 2 SRTs in '{FINAL_SRT_FOLDER_NAME}'")
                    total_processed += 1

                    if DELETE_ORIGINAL_TXT_FILES:
                        txt_path.unlink()
                        print("    🗑️  Deleted original source file.")

                except Exception as e:
                    # Per-file boundary: one bad file must not stop the batch.
                    print(f"  🚨 Error processing '{txt_path.name}': {e}. Skipping this file.")
                    total_skipped += 1
                    continue

        print("\n--- Splitter Process Summary ---")
        print(f"  Successfully processed and split: {total_processed} files")
        print(f"  Skipped or failed: {total_skipped} files")
        if DELETE_ORIGINAL_TXT_FILES and total_processed > 0:
            print("  Cleanup of original .txt source files is complete.")

    except Exception as e:
        print(f"\n🚨 A FATAL SCRIPT ERROR OCCURRED: {e}")
    finally:
        print("\n--> SRT splitter script finished.")

# --- Script Entry Point ---
# Start the splitter only when this code runs as the main module, not when it is
# imported from elsewhere.
if __name__ == '__main__':
    srt_splitter_main()

In [None]:


# @title 🤖 Power-User Downloader (Genuinely Smart URL Detection)
# @markdown ### New Feature: "Telegram Message Detective"
# @markdown The URL detection is now far more powerful. It can find URLs hidden in:
# @markdown - **Hyperlinks** (blue, clickable text)
# @markdown - **Captions** of photos and videos
# @markdown - **Link Previews** and forwarded messages

# =====================================================================================
# @markdown # 🤖 BOT CREDENTIALS
# =====================================================================================
# SECURITY: a live bot token used to be hard-coded on the next line. Anyone who can
# read this notebook (or its history) can control the bot with it, so the old token
# must be revoked via @BotFather. Paste the replacement token in the form field at
# run time and do not commit it. While the placeholder is in place,
# send_telegram_message() skips sending.
TELEGRAM_BOT_TOKEN = "PASTE_YOUR_BOT_TOKEN_HERE"  # @param {type:"string"}
# Chat that receives the bot's status messages.
TELEGRAM_CHAT_ID = "5123471319"  # @param {type:"string"}

# =====================================================================================
# @markdown # ⚙️ DOWNLOAD SETTINGS & COMMANDS
# =====================================================================================
# Passed to yt-dlp as `-P` (download folder) and `-o` (output file name template).
OUTPUT_DIRECTORY = "/content/drive/MyDrive/YT-DLP Downloads"  # @param {type:"string"}
FILENAME_TEMPLATE = "%(uploader)s - %(title)s [%(resolution)s] [%(id)s].%(ext)s" # @param {type:"string"}
# "Best Available" adds no `-f` selector; the other two choices cap the video height.
VIDEO_QUALITY = "Best Available" # @param ["Best Available", "1080p (Full HD)", "720p (HD)"]
POST_PROCESSING = "None" #@param ["None", "Extract Audio (mp3)"]
# Extra command-line flags appended to every yt-dlp call (shell-style quoting applies).
ADDITIONAL_YTDLP_FLAGS = "" # @param {type:"string"}

# =====================================================================================
# SCRIPT (Do not modify below this line)
# =====================================================================================
import os
import shlex
import requests
import time
import subprocess
from urllib.parse import urlparse
from google.colab import drive

# --- 1. Install Dependencies ---
import sys  # used to run pip with the interpreter that executes this notebook

print("Step 1: Installing dependencies...")
# One pip invocation per entry; the last element names the package for error reports.
_pip_requests = [
    ["--force-reinstall", "--no-deps", "https://github.com/yt-dlp/yt-dlp/archive/master.zip"],
    ["urlextract"],
]
# Check every exit code: previously the success line was printed even when pip failed.
_pip_failures = [
    request[-1]
    for request in _pip_requests
    if subprocess.run([sys.executable, "-m", "pip", "install", "-q", *request]).returncode != 0
]
if _pip_failures:
    print(f"🚨 pip could not install: {', '.join(_pip_failures)}")
else:
    print("✅ Dependencies installed.")
print("-" * 50)

from urlextract import URLExtract

# --- Helper Functions ---
def send_telegram_message(text):
    """Send *text* (Markdown) to TELEGRAM_CHAT_ID and return the new message id.

    Best effort: returns None when the token is still the placeholder, when the
    API answers with an error status, or when the request or the parsing of the
    reply fails. It never raises for those cases.
    """
    if TELEGRAM_BOT_TOKEN == "PASTE_YOUR_BOT_TOKEN_HERE":
        return None
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    payload = {"chat_id": TELEGRAM_CHAT_ID, "text": text, "parse_mode": "Markdown", "disable_web_page_preview": True}
    try:
        response = requests.post(url, json=payload, timeout=10)
        if not response.ok:
            return None
        return response.json()['result']['message_id']
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still stop
        # the notebook. Only the error type is printed: the full message of a
        # request error can contain the request URL, which embeds the bot token.
        print(f"⚠️ Telegram sendMessage failed ({type(exc).__name__}).")
        return None

def edit_telegram_message(text, message_id):
    """Replace the text of a previously sent message with *text* (Markdown).

    Does nothing when *message_id* is falsy, which is the case whenever the
    original send failed. Best effort: request failures are reported on stdout
    and never raised. Always returns None.
    """
    if not message_id:
        return
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/editMessageText"
    payload = {"chat_id": TELEGRAM_CHAT_ID, "message_id": message_id, "text": text, "parse_mode": "Markdown", "disable_web_page_preview": True}
    try:
        requests.post(url, json=payload, timeout=10)
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still stop
        # the notebook. Only the error type is printed because the exception text
        # can contain the token-bearing request URL.
        print(f"⚠️ Telegram editMessageText failed ({type(exc).__name__}).")

# <-- THE NEW, SMARTER URL DETECTIVE FUNCTION ---
def get_urls_from_update(update):
    """
    Intelligently extracts URLs from all possible fields in a Telegram update object,
    including text, captions, and hyperlink entities.

    Args:
        update: One element of the ``result`` list returned by ``getUpdates``
            (a dict), or a falsy value.

    Returns:
        A list of distinct URL strings in no particular order. Empty when the
        update has no ``message`` / ``edited_message`` or that message carries
        neither text nor a caption.
    """
    if not update:
        return []

    # Check the primary message object, or an edited message object
    message = update.get('edited_message') or update.get('message')
    if not message:
        return []

    text_content = message.get('text') or message.get('caption')
    if not text_content:
        # Entities only describe ranges of the text, so without text there is
        # nothing to extract (and no need to build a URLExtract instance).
        return []

    # 1. Extract from plain text or caption (the set removes duplicates)
    urls = set(URLExtract().find_urls(text_content))

    # 2. Extract from message entities (for hyperlinks)
    entities = message.get('entities') or message.get('caption_entities') or []
    utf16_text = None
    for entity in entities:
        kind = entity.get('type')
        if kind == 'text_link':
            link = entity.get('url')
            if link:
                urls.add(link)
        elif kind == 'url':
            offset, length = entity.get('offset'), entity.get('length')
            if offset is None or length is None:
                continue
            # Telegram measures entity offsets and lengths in UTF-16 code units,
            # not Python characters. Slicing the str directly goes wrong as soon
            # as an emoji (two code units) precedes the URL, so slice the
            # UTF-16 encoding instead (two bytes per code unit).
            if utf16_text is None:
                utf16_text = text_content.encode('utf-16-le', 'surrogatepass')
            raw = utf16_text[2 * offset:2 * (offset + length)]
            urls.add(raw.decode('utf-16-le', 'surrogatepass'))

    return list(urls)

def get_domain_keyword(url):
    """Return a short site keyword for *url*, used to look up a cookie file.

    Known sites map to a canonical name (``youtu.be`` -> ``'youtube'``,
    ``x.com`` -> ``'twitter'``); any other host yields its first label once a
    leading ``www.`` is removed (``https://www.example.org`` -> ``'example'``).
    URLs without a scheme (``youtube.com/watch?v=...``) are handled as well.

    Returns:
        The keyword, ``''`` when no host can be determined, or None when *url*
        is not a string or cannot be parsed.
    """
    if not isinstance(url, str):
        return None
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # No scheme: urlparse put the host into the path. Re-parse as a
            # network-path reference so the host is recognized.
            parsed = urlparse(f"//{url}")
        host = (parsed.hostname or '').lower()
    except ValueError:
        return None

    if host.startswith('www.'):
        host = host[4:]

    # Short domains are matched as a whole host or a parent domain. A plain
    # substring test would classify "dropbox.com" or "box.com" as x.com.
    domain_aliases = {'youtu.be': 'youtube', 'x.com': 'twitter'}
    for domain, keyword in domain_aliases.items():
        if host == domain or host.endswith('.' + domain):
            return keyword

    known_keywords = ['youtube', 'instagram', 'twitter', 'facebook', 'vimeo', 'dailymotion', 'twitch']
    for keyword in known_keywords:
        if keyword in host:
            return keyword
    return host.split('.')[0]

def find_cookie_file_by_keyword(keyword, drive_root='/content/drive/MyDrive/'):
    """Find a cookie file in *drive_root* whose name contains *keyword*.

    Only regular files directly inside *drive_root* that end in ``.txt`` or
    ``.json`` are considered; the comparison ignores case. Candidates are
    checked in sorted name order so the choice is repeatable when several match.

    Args:
        keyword: Site keyword such as ``'youtube'``; a falsy value matches nothing.
        drive_root: Folder to search. Defaults to the Google Drive root.

    Returns:
        ``(full_path, filename)`` of the first match, or ``(None, None)`` when
        nothing matches or the folder cannot be listed.
    """
    if not keyword:
        return None, None
    needle = keyword.lower()
    try:
        filenames = sorted(os.listdir(drive_root))
    except OSError:
        # Drive not mounted or folder unreadable: treat as "no cookie file".
        return None, None

    for filename in filenames:
        lowered = filename.lower()
        if not lowered.endswith(('.txt', '.json')) or needle not in lowered:
            continue
        full_path = os.path.join(drive_root, filename)
        if os.path.isfile(full_path):
            return full_path, filename
    return None, None

def execute_download(command, url, message_id):
    """Run a download command and mirror its progress to a Telegram message.

    Args:
        command: List of command-line tokens. They are joined with spaces and
            run through the shell, so every token must already be shell-quoted
            by the caller (e.g. with ``shlex.quote``).
        url: The URL being downloaded; only used in the progress text.
        message_id: Telegram message to update with progress lines, at most
            once every five seconds.

    Returns:
        ``(returncode, stderr_text)`` of the finished process.
    """
    import threading

    stderr_chunks = []

    def _drain_stderr(pipe):
        stderr_chunks.append(pipe.read())

    # The context manager closes both pipes once the process has finished.
    with subprocess.Popen(" ".join(command), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                          text=True, encoding='utf-8', errors='replace') as process:
        # Read stderr concurrently. If it were read only after stdout reached EOF,
        # a process that fills the stderr pipe buffer would block forever while
        # this function waits on stdout, which is a deadlock.
        drain = threading.Thread(target=_drain_stderr, args=(process.stderr,), daemon=True)
        drain.start()

        last_edit_time = 0
        for line in process.stdout:
            if '[download]' in line and '%' in line and time.time() - last_edit_time > 5:
                progress_text = line.strip().replace("[download]", "📊").replace("ETA", "\n⏳ ETA")
                edit_telegram_message(f"🚀 **Downloading:**\n`{url}`\n\n`{progress_text}`", message_id)
                last_edit_time = time.time()

        process.wait()
        drain.join()
    return process.returncode, "".join(stderr_chunks)

# --- Main Script Logic ---
# Flow: mount Drive -> load the last processed update id -> fetch newer Telegram
# updates -> download every URL found in them with yt-dlp (retrying with a cookie
# file when the failure looks like an auth problem) -> persist the new update id.
# NOTE(review): updates are not filtered by sender, so a link sent to the bot from
# any chat is downloaded. Consider accepting only TELEGRAM_CHAT_ID.
import sys

print("Step 2: Connecting to Google Drive...")
try:
    drive.mount('/content/drive', force_remount=True)
    print("✅ Google Drive connected.")
except Exception as e:
    send_telegram_message(f"🚨 Bot failed: Could not connect to GDrive. Error: {e}")
    sys.exit()
print("-" * 50)

STATE_FILE = '/content/drive/MyDrive/telegram_downloader_last_update.txt'
last_update_id = 0
try:
    with open(STATE_FILE, 'r') as f:
        last_update_id = int(f.read().strip())
    print(f"✅ Resuming from last processed message (ID: {last_update_id})")
except FileNotFoundError:
    print("ℹ️ No state file found.")
except (OSError, ValueError) as e:
    # Unreadable or corrupt state: start over rather than crash.
    print(f"⚠️ Could not read state file ({e}); starting from the beginning.")
print("-" * 50)

print("Step 4: Fetching new messages...")
new_messages, new_offset = [], last_update_id
try:
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/getUpdates?offset={last_update_id + 1}&timeout=10"
    response = requests.get(url, timeout=15).json()
    if response.get("ok"):
        new_messages = response.get("result", [])
        print(f"✅ Found {len(new_messages)} new message(s).")
        if new_messages:
            new_offset = new_messages[-1]['update_id']
    else:
        print(f"🚨 Telegram API error: {response.get('description')}")
except Exception as e:
    send_telegram_message(f"🚨 Bot failed: Could not connect to Telegram API. Error: {e}")
    sys.exit()
print("-" * 50)

if not new_messages:
    send_telegram_message("🤷‍♀️ I'm all caught up! No new links to download.")
else:
    # Everything below is the same for every URL, so build it once. Each token is
    # shell-quoted because execute_download joins the list into a shell command.
    quality_map = {"1080p (Full HD)": "bestvideo[height<=1080]+bestaudio/best", "720p (HD)": "bestvideo[height<=720]+bestaudio/best"}
    retry_keywords = ['sign in', 'login required', 'private', '403', '401', '429', 'log in', 'unavailable', 'members-only', 'confirm your age', 'unreachable']
    base_command = ["yt-dlp", "-P", shlex.quote(OUTPUT_DIRECTORY), "-o", shlex.quote(FILENAME_TEMPLATE), "--newline", "--progress"]
    if VIDEO_QUALITY in quality_map:
        base_command.extend(["-f", shlex.quote(quality_map[VIDEO_QUALITY])])
    if POST_PROCESSING == "Extract Audio (mp3)":
        base_command.extend(["-x", "--audio-format", "mp3"])
    if ADDITIONAL_YTDLP_FLAGS:
        # Re-quote each parsed flag: shlex.split strips the user's quoting, and
        # an unquoted value such as `duration > 60` would otherwise be treated
        # by the shell as an output redirection.
        base_command.extend(shlex.quote(flag) for flag in shlex.split(ADDITIONAL_YTDLP_FLAGS))

    domains_requiring_cookies = set()
    total_urls_found = 0
    for update in new_messages:
        urls_to_download = get_urls_from_update(update)
        if not urls_to_download:
            continue
        total_urls_found += len(urls_to_download)

        for url in urls_to_download:
            print(f"\nProcessing URL: {url}")
            message_id = send_telegram_message(f"🚀 **Starting download for:**\n`{url}`")
            domain_keyword = get_domain_keyword(url)
            cookie_path, cookie_filename = find_cookie_file_by_keyword(domain_keyword)

            def attempt_download(use_cookies=False):
                """Run yt-dlp for the current URL, optionally with the cookie file."""
                cmd = list(base_command)
                if use_cookies and cookie_path:
                    cmd.extend(["--cookies", shlex.quote(cookie_path)])
                cmd.append(shlex.quote(url))
                return execute_download(cmd, url, message_id)

            if domain_keyword in domains_requiring_cookies:
                # An earlier URL on this domain needed a login: go straight to cookies.
                print(f"⚠️ Domain '{domain_keyword}' is flagged. Attempting direct download with cookies.")
                if not cookie_path:
                    edit_telegram_message(f"🚨 **Download Failed!**\n`{url}`\n\n*Error:* Domain requires login, but no cookie file found.", message_id)
                    continue
                returncode, stderr = attempt_download(use_cookies=True)
            else:
                returncode, stderr = attempt_download(use_cookies=False)

            if returncode == 0:
                edit_telegram_message(f"✅ **Download Complete!**\n`{url}`", message_id)
                continue

            stderr_lower = stderr.lower()
            if any(keyword in stderr_lower for keyword in retry_keywords):
                print(f"Auth error detected. Flagging domain '{domain_keyword}' and retrying.")
                domains_requiring_cookies.add(domain_keyword)
                if not cookie_path:
                    edit_telegram_message(f"🚨 **Download Failed!**\n`{url}`\n\n*Error:* Login required, but no cookie file found for `{domain_keyword}`.", message_id)
                    continue
                returncode, stderr = attempt_download(use_cookies=True)

            if returncode == 0:
                edit_telegram_message(f"✅ **Retry Successful!**\nDownload complete for `{url}`", message_id)
            else:
                # Report the last line yt-dlp marked as an error.
                error_line = next((line for line in reversed(stderr.splitlines()) if line.strip().upper().startswith('ERROR:')), "Unknown error.")
                edit_telegram_message(f"🚨 **Download Failed!**\n`{url}`\n\n*Error:* `{error_line}`", message_id)

    if total_urls_found == 0:
        # New updates were found, but none of them contained a URL.
        send_telegram_message("🤷‍♀️ I checked your new messages but couldn't find any links to download.")

# Persist the highest update id seen so the next run fetches only newer messages.
if new_offset > last_update_id:
    with open(STATE_FILE, 'w') as f:
        f.write(str(new_offset))
    print("\n" + "-" * 50 + "\n✅ All tasks complete. State saved.")