Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 51 additions & 27 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,6 @@ CAPTURE_PROVIDER=pyaudio
# Default: 16000
AUDIO_SAMPLE_RATE=16000

# Number of audio channels
# Options: 1 (mono), 2 (stereo)
# Default: 1
AUDIO_CHANNELS=1

# Audio chunk size for processing
# Default: 1024
AUDIO_CHUNK_SIZE=1024
Expand Down Expand Up @@ -84,10 +79,12 @@ AWS_LANGUAGE_CODE=en-US
# Default: 10
AWS_MAX_SPEAKERS=10

# AWS connection strategy
# Options: auto, single, dual
# Default: auto
AWS_CONNECTION_STRATEGY=auto
# AWS connection strategy (DEPRECATED - auto-detected based on device channels)
# Connection strategy is now automatically determined:
# - 1 channel device → single AWS connection
# - 2+ channel device → dual AWS connections for optimal transcription
# This setting is ignored and will be removed in a future version.
# AWS_CONNECTION_STRATEGY=auto

# Enable dual connection fallback
# Default: true
Expand All @@ -102,22 +99,6 @@ AWS_CHANNEL_BALANCE_THRESHOLD=0.3
# Default: false
AWS_DUAL_CONNECTION_TEST_MODE=false

# Save split audio files for debugging
# Default: false
AWS_DUAL_SAVE_SPLIT_AUDIO=false

# Save raw stereo audio for debugging
# Default: false
AWS_DUAL_SAVE_RAW_AUDIO=false

# Path to save debug audio files
# Default: debug_audio/
AWS_DUAL_AUDIO_SAVE_PATH=debug_audio/

# Duration to save debug audio (seconds)
# Default: 30
AWS_DUAL_AUDIO_SAVE_DURATION=30

# ======================================================================
# AZURE SPEECH SERVICE CONFIGURATION
# ======================================================================
Expand Down Expand Up @@ -200,6 +181,29 @@ PARTIAL_RESULT_TIMEOUT=2.0
# Default: 0.0
CONFIDENCE_THRESHOLD=0.0

# ======================================================================
# AUDIO SAVING (Provider-Agnostic)
# ======================================================================

# Enable saving raw audio to a WAV file
# Works with all transcription providers (AWS, Azure, Whisper, etc.)
# Default: false
SAVE_RAW_AUDIO=false

# Enable split audio saving (saves the left/right channels separately for stereo input)
# Works with all transcription providers - saves separate L/R channel files alongside the main file
# Only applies when a stereo input device is selected
# Default: false
SAVE_SPLIT_AUDIO=false

# Directory path to save audio files
# Default: debug_audio/
AUDIO_SAVE_PATH=debug_audio/

# Maximum recording duration to save (seconds)
# Default: 30
AUDIO_SAVE_DURATION=30

# ======================================================================
# DATABASE CONFIGURATION (Optional)
# ======================================================================
Expand Down Expand Up @@ -267,13 +271,33 @@ TESTING=false
# AZURE_SPEECH_LANGUAGE=es-ES
# AZURE_ENABLE_SPEAKER_DIARIZATION=true

# Example 3: Development setup with file input
# Example 3: Development setup with file input and audio saving
# TRANSCRIPTION_PROVIDER=aws
# CAPTURE_PROVIDER=file
# SAVE_RAW_AUDIO=true
# AUDIO_SAVE_PATH=./recordings/
# AUDIO_SAVE_DURATION=60
# LOG_LEVEL=DEBUG
# SKIP_AWS_VALIDATION=true
# MOCK_SERVICES=true

# Example 4: Azure with stereo audio recording and channel splitting
# TRANSCRIPTION_PROVIDER=azure
# AZURE_SPEECH_KEY=your_azure_key_here
# AZURE_SPEECH_REGION=eastus
# (No AUDIO_CHANNELS setting needed: select a stereo input device and the channel count is auto-detected)
# SAVE_RAW_AUDIO=true
# SAVE_SPLIT_AUDIO=true
# AUDIO_SAVE_PATH=./meeting_recordings/

# Example 5: AWS with stereo device and split audio saving
# TRANSCRIPTION_PROVIDER=aws
# AWS_ACCESS_KEY_ID=your_access_key_here
# AWS_SECRET_ACCESS_KEY=your_secret_key_here
# SAVE_RAW_AUDIO=true
# SAVE_SPLIT_AUDIO=true
# AUDIO_SAVE_PATH=./debug_audio/

# ======================================================================
# SECURITY NOTES
# ======================================================================
Expand All @@ -297,6 +321,6 @@ TESTING=false
# 5. Visit the project documentation: https://github.com/dev-wei/ymemo

# For more detailed configuration options, see:
# - config/audio_config.py (configuration loading)
# - src/config/audio_config.py (configuration loading)
# - src/config/provider_config.py (provider configurations)
# - README.md (setup instructions)
3 changes: 1 addition & 2 deletions .github/test-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ export AWS_REGION=us-east-1
export TRANSCRIPTION_PROVIDER=aws
export CAPTURE_PROVIDER=pyaudio
export AUDIO_SAMPLE_RATE=16000
export AUDIO_CHANNELS=1

# Additional AWS environment variables for comprehensive mocking
export AWS_SESSION_TOKEN=test-session-token
Expand Down Expand Up @@ -74,5 +73,5 @@ echo " - AWS validation: DISABLED"
echo " - Mock services: ENABLED"
echo " - Log level: WARNING"
echo " - Provider: $TRANSCRIPTION_PROVIDER"
echo " - Audio: ${AUDIO_SAMPLE_RATE}Hz, ${AUDIO_CHANNELS} channel(s)"
echo " - Audio: ${AUDIO_SAMPLE_RATE}Hz, device-based channels"
echo ""
1 change: 0 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ env:
TRANSCRIPTION_PROVIDER: "aws"
CAPTURE_PROVIDER: "pyaudio"
AUDIO_SAMPLE_RATE: "16000"
AUDIO_CHANNELS: "1"
LOG_LEVEL: "WARNING" # Reduce CI log noise

# Disable real service connections
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/ci.yml.backup
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ env:
TRANSCRIPTION_PROVIDER: "aws"
CAPTURE_PROVIDER: "pyaudio"
AUDIO_SAMPLE_RATE: "16000"
AUDIO_CHANNELS: "1"
LOG_LEVEL: "WARNING" # Reduce CI log noise

# Disable real service connections
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/ci.yml.bak
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ env:
TRANSCRIPTION_PROVIDER: "aws"
CAPTURE_PROVIDER: "pyaudio"
AUDIO_SAMPLE_RATE: "16000"
AUDIO_CHANNELS: "1"
LOG_LEVEL: "WARNING" # Reduce CI log noise

# Disable real service connections
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,5 @@ config_local.py
settings_local.py

/debug_audio

/.ruff_cache
13 changes: 8 additions & 5 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ source .venv/bin/activate && python -c "from src.utils.database import test_data

**Centralized Configuration System:**

- All configuration is managed through `config/audio_config.py`
- All configuration is managed through `src/config/audio_config.py`
- Configuration is loaded from environment variables with sensible defaults
- Automatic validation with helpful error messages
- Debug logging shows loaded configuration values
Expand All @@ -154,17 +154,18 @@ source .venv/bin/activate && python -c "from src.utils.database import test_data
*Provider Selection:*

- `TRANSCRIPTION_PROVIDER` - Choose transcription provider ('aws', 'azure', 'whisper', 'google', default: 'aws')
- `aws` provider now intelligently switches between single and dual connections automatically
- `aws` provider automatically detects device channels and chooses optimal connection strategy
- `CAPTURE_PROVIDER` - Choose audio capture provider ('pyaudio', 'file', default: 'pyaudio')

*Audio Settings:*

- `AUDIO_QUALITY` - Audio quality preset ('high' for 44,100 Hz CD-quality, 'average' for 16,000 Hz speech-optimized, default: not set)
- `AUDIO_SAMPLE_RATE` - Sample rate in Hz (default: 16000, overridden by AUDIO_QUALITY if set)
- `AUDIO_CHANNELS` - Number of audio channels (default: 1)
- `AUDIO_CHUNK_SIZE` - Audio chunk size (default: 1024)
- `AUDIO_FORMAT` - Audio format ('int16', 'int24', 'int32', 'float32', default: 'int16')

**Note:** The number of audio channels is detected automatically from the capabilities of the selected input device.

*AWS Configuration:*

- `AWS_REGION` - AWS region (default: 'us-east-1')
Expand All @@ -174,7 +175,9 @@ source .venv/bin/activate && python -c "from src.utils.database import test_data

*AWS Connection Strategy:*

- `AWS_CONNECTION_STRATEGY` - Connection mode ('auto', 'single', 'dual', default: 'auto')
- Connection strategy is now **automatically determined** based on device channels:
  - 1-channel device → Single AWS Transcribe connection
  - 2+ channel device → Dual AWS Transcribe connections for optimal transcription
- `AWS_DUAL_FALLBACK_ENABLED` - Enable fallback to dual connections (true/false, default: true)
- `AWS_CHANNEL_BALANCE_THRESHOLD` - Channel imbalance threshold for fallback (0.0-1.0, default: 0.3)

Expand All @@ -192,7 +195,7 @@ source .venv/bin/activate && python -c "from src.utils.database import test_data
**Configuration Debugging:**

```python
from config.audio_config import print_config_summary
from src.config.audio_config import print_config_summary
print_config_summary() # Shows current configuration
```

Expand Down
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,14 +219,16 @@ YMemo's provider system features enterprise-grade architecture:
<summary><b>🚀 AWS Transcribe Configuration</b></summary>

```bash
# Advanced AWS settings
export AWS_CONNECTION_STRATEGY=dual # Single or dual connections
# Advanced AWS settings (connection strategy now auto-detected)
# export AWS_CONNECTION_STRATEGY=dual # DEPRECATED - auto-detected based on device
export AWS_DUAL_FALLBACK_ENABLED=true # Automatic fallback
export AWS_MAX_SPEAKERS=10 # Speaker diarization limit
export ENABLE_PARTIAL_RESULTS=true # Real-time partial results
```

**Dual-Channel Mode**: YMemo's unique dual-channel architecture splits stereo audio for enhanced accuracy and speaker separation.
**Auto-Detected Connection Strategy**: YMemo automatically chooses the optimal AWS connection strategy based on your audio device:
- **1-channel devices** → Single AWS Transcribe connection
- **2+ channel devices** → Dual AWS Transcribe connections for enhanced accuracy and speaker separation

</details>

Expand Down
Empty file removed config/__init__.py
Empty file.
21 changes: 21 additions & 0 deletions src/audio/audio_file_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,15 @@ def __init__(
# Create directory if needed
self.save_path.mkdir(parents=True, exist_ok=True)

# Log important warning about dual channel audio saving
logger.warning("🎵 DualChannelAudioSaver: CREATING LEFT/RIGHT CHANNEL FILES")
logger.warning(
" ⚠️ This component should ONLY be used when processing stereo input that needs to be split!"
)
logger.warning(
" 💡 If you're seeing this for mono input, there's a configuration error upstream"
)

# Generate timestamp-based filenames
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
left_file = self.save_path / f"left_channel_{timestamp}.wav"
Expand All @@ -304,19 +313,31 @@ def __init__(
logger.info("🎵 DualChannelAudioSaver: Initialized")
logger.info(f" 📁 Left: {left_file}")
logger.info(f" 📁 Right: {right_file}")
logger.info(
" 🎯 Expected usage: Stereo audio input that has been split into separate left/right channels"
)

def start_recording(self) -> bool:
"""Start recording both channels."""
if self.is_active:
return True

logger.info("🎵 DualChannelAudioSaver: Starting dual channel recording")
logger.info(
" 💡 This will create LEFT and RIGHT channel WAV files - ensure input is stereo!"
)

left_started = self.left_writer.start_recording()
right_started = self.right_writer.start_recording()

if left_started and right_started:
self.is_active = True
logger.info("🎵 DualChannelAudioSaver: Both channels recording started")
logger.info(
" 📁 Recording to 2 separate files (left_channel_*.wav and right_channel_*.wav)"
)
return True

logger.error("❌ DualChannelAudioSaver: Failed to start recording")
# Clean up any successful starts
if left_started:
Expand Down
Loading
Loading