In [None]:
# TTS Trainer - Data Exploration & Analysis

This notebook provides tools for exploring video/audio datasets and analyzing quality metrics for TTS training.

## Features

- **Video/Audio Analysis**: Quality assessment, duration analysis, format compatibility
- **Dataset Exploration**: Distribution analysis, outlier detection, quality scoring
- **Training Recommendations**: Data preprocessing suggestions, model selection guidance
- **Quality Metrics**: Audio quality validation, transcript alignment analysis

## Setup

Before running this notebook, ensure you have:
1. Installed all requirements from `requirements.txt`
2. Videos placed in `resources/videos/`
3. GPU drivers installed (for CUDA acceleration)


In [None]:
# Import required libraries
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Audio, display
import warnings
# NOTE(review): with no category given, this suppresses every warning for the
# rest of the kernel session. Narrow it if any warnings should stay visible.
warnings.filterwarnings('ignore')

# Add src to path for imports.
# The path is derived from the current working directory, so the notebook must
# be started from a directory that is a sibling of `src/`.
PROJECT_ROOT = Path.cwd().parent
SRC_DIR = PROJECT_ROOT / "src"
# Guard the append so re-running this cell does not add duplicate entries.
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# Import project modules (resolvable only after SRC_DIR is on sys.path)
from utils.file_utils import get_video_files, get_audio_files, validate_audio_file
from utils.logging_utils import setup_logger
from pipeline.stages.video_processor import VideoProcessor

# Setup: plotting defaults and the notebook-level logger
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
logger = setup_logger("data_exploration")

print("✅ Libraries imported successfully!")
print(f"📁 Working directory: {Path.cwd()}")
print(f"📁 Project root: {PROJECT_ROOT}")


In [None]:
# TTS Trainer - Data Exploration

This notebook provides tools for exploring and analyzing your video/audio datasets before training.

## Features
- Video metadata analysis
- Audio quality assessment
- Dataset statistics and visualization
- Transcript quality analysis


In [None]:
# Import required libraries
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import soundfile as sf
from pathlib import Path
import json
from IPython.display import Audio, display, HTML
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Add src to path
sys.path.append('../src')

from utils.file_utils import get_video_files, get_audio_files
from pipeline.stages.video_processor import VideoProcessor
from pipeline.validators.audio_quality import AudioQualityValidator

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline
