### Air traffic control radio - Departure Event & Callsign Identification

In [1]:
!pip install pydub pyannote.audio transformers torch

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Collecting omegaconf<3.0,>=2.1 (from pyannote.audio)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.0-py3-none-any.whl.metadata (1.2 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloadin

In [11]:
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Union
import torch
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from tqdm import tqdm
from datetime import datetime, timedelta
import re
import json

@dataclass
class Segment:
    """Represents a single audio segment with timing information"""
    start_time: datetime
    duration: float
    text: str
    original_file: str

    def to_dict(self):
        """Convert to dictionary with serializable datetime"""
        d = asdict(self)
        d['start_time'] = d['start_time'].isoformat()
        return d

@dataclass
class ProcessingResult:
    """Contains the results of processing a single audio file"""
    input_file: str
    start_time: datetime
    segments: List[Segment]

    @property
    def transcriptions(self) -> Dict[str, str]:
        """Returns a dictionary of start_time: transcribed_text"""
        return {seg.start_time.isoformat(): seg.text for seg in self.segments}

    def to_dict(self):
        """Convert to dictionary with serializable datetime"""
        return {
            'input_file': self.input_file,
            'start_time': self.start_time.isoformat(),
            'segments': [s.to_dict() for s in self.segments]
        }

class AudioProcessor:
    """Process audio files using Voice Activity Detection (VAD) and speech-to-text transcription.

    This class handles the end-to-end pipeline of:
    1. Detecting speech segments using VAD
    2. Transcribing speech content
    3. Maintaining timing information for each segment

    Parameters
    ----------
    output_dir : str, default="processed_audio"
        Directory for output files including JSON transcripts
    vad_model : str, default="pyannote/segmentation-3.0"
        HuggingFace model ID for voice activity detection
    transcription_model : str, default="distil-whisper/distil-large-v3"
        HuggingFace model ID for speech transcription
    hf_token : str, optional
        HuggingFace authentication token for model access

    Directory Structure
    ------------------
    processed_audio/
    └── transcripts/                  # JSON files containing transcriptions
        └── JFK12-Twr1-Dec-13-2024-0430Z_transcripts.json
    """

    def __init__(
        self,
        output_dir: str = "processed_audio",
        vad_model: str = "pyannote/segmentation-3.0",
        transcription_model: str = "openai/whisper-large-v3",
        hf_token: str = None
    ):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)

        # Create directory for JSON outputs
        self.json_dir = self.output_dir / "transcripts"
        self.json_dir.mkdir(exist_ok=True)

        # Set up VAD
        model = Model.from_pretrained(vad_model, use_auth_token=hf_token)
        self.vad = VoiceActivityDetection(segmentation=model)
        self.vad.instantiate({"min_duration_on": 0.0, "min_duration_off": 0.0})

        # Set up transcription
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            transcription_model,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True
        )
        model.to(device)

        processor = AutoProcessor.from_pretrained(transcription_model)

        self.transcriber = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=10,
            torch_dtype=torch_dtype,
            device=device,
        )

    def parse_filename_time(self, filename: str) -> datetime:
        pattern = r'.*-(\w{3})-(\d{2})-(\d{4})-(\d{4})Z'
        match = re.search(pattern, filename)
        if match:
            month, day, year, time = match.groups()
            hour, minute = time[:2], time[2:]
            dt_string = f"{day} {month} {year} {hour}:{minute}"
            return datetime.strptime(dt_string, "%d %b %Y %H:%M")
        raise ValueError(f"Could not parse datetime from filename: {filename}")

    def get_segments(self, audio_path: Union[str, Path]) -> List[tuple]:
        """Get timing information for voice segments in an audio file"""
        vad_result = self.vad(str(audio_path))

        segment_info = []
        for segment, _ in vad_result.itertracks():
            segment_info.append((segment.start, segment.duration))

        return segment_info

    def process_file(self, audio_path: Union[str, Path], segment_limit: Optional[int] = None) -> ProcessingResult:
        audio_path = Path(audio_path)
        file_start_time = self.parse_filename_time(audio_path.name)

        # Get segment timings
        segment_info = self.get_segments(audio_path)
        if segment_limit:
            segment_info = segment_info[:segment_limit]

        # Create JSON file path
        json_path = self.json_dir / f"{audio_path.stem}_transcripts.json"

        # Process segments and update JSON after each transcription
        segments = []
        for start_offset, duration in tqdm(segment_info, desc="Transcribing segments"):
            # Transcribe the segment
            result = self.transcriber(str(audio_path), start=start_offset, end=start_offset + duration)

            # Calculate absolute start time
            segment_start_time = file_start_time + timedelta(seconds=start_offset)

            # Create segment object
            segment = Segment(
                start_time=segment_start_time,
                duration=duration,
                text=result["text"].strip(),
                original_file=str(audio_path)
            )
            segments.append(segment)

            # Create result object with current segments
            current_result = ProcessingResult(
                input_file=str(audio_path),
                start_time=file_start_time,
                segments=segments
            )

            # Save current state to JSON
            with open(json_path, 'w') as f:
                json.dump(current_result.to_dict(), f, indent=2)

        return current_result

    def process_directory(
        self,
        input_dir: Union[str, Path],
        file_pattern: str = "*.mp3"
    ) -> List[ProcessingResult]:
        """Process all matching audio files in a directory through VAD and transcription.

        Parameters
        ----------
        input_dir : str or Path
            Directory containing audio files to process
        file_pattern : str, optional
            Glob pattern to match audio files (default: "*.mp3")

        Returns
        -------
        List[ProcessingResult]
            List of processing results for each audio file
        """
        input_dir = Path(input_dir)
        audio_files = sorted(input_dir.glob(file_pattern))

        results = []
        for audio_path in tqdm(audio_files, desc="Processing files"):
            result = self.process_file(audio_path)
            results.append(result)

        return results

    @staticmethod
    def load_result(json_path: Union[str, Path]) -> ProcessingResult:
        """Load a ProcessingResult from a JSON file"""
        with open(json_path) as f:
            data = json.load(f)

        # Convert datetime strings back to datetime objects
        data['start_time'] = datetime.fromisoformat(data['start_time'])
        segments = []
        for seg in data['segments']:
            seg['start_time'] = datetime.fromisoformat(seg['start_time'])
            segments.append(Segment(**seg))

        return ProcessingResult(
            input_file=data['input_file'],
            start_time=data['start_time'],
            segments=segments
        )

In [9]:
# Example usage:
"""
processor = AudioProcessor(hf_token="your_token_here")

# Process a file - JSON is automatically saved during processing
result = processor.process_file("JFK12-Twr1-Dec-13-2024-0430Z.mp3")

# Load results from JSON later
loaded_result = AudioProcessor.load_result("processed_audio/transcripts/JFK12-Twr1-Dec-13-2024-0430Z_transcripts.json")

# Print transcriptions with timestamps
for segment in loaded_result.segments:
    print(f"{segment.start_time}: {segment.text}")
"""

'\nprocessor = AudioProcessor(hf_token="your_token_here")\n\n# Process a file - JSON is automatically saved during processing\nresult = processor.process_file("JFK12-Twr1-Dec-13-2024-0430Z.mp3")\n\n# Load results from JSON later\nloaded_result = AudioProcessor.load_result("processed_audio/transcripts/JFK12-Twr1-Dec-13-2024-0430Z_transcripts.json")\n\n# Print transcriptions with timestamps\nfor segment in loaded_result.segments:\n    print(f"{segment.start_time}: {segment.text}")\n'

In [12]:
processor = AudioProcessor(hf_token="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")

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

In [13]:
results = processor.process_directory("/content/audio_files2/")
for result in results:
    print(f"\nFile: {result.input_file}")
    print(f"Recording starts at: {result.start_time}")
    print(f"Number of segments: {len(result.segments)}")

Processing files:   0%|          | 0/1 [00:00<?, ?it/s]
Transcribing segments:   0%|          | 0/250 [00:00<?, ?it/s]
Processing files:   0%|          | 0/1 [02:26<?, ?it/s]


TypeError: AutomaticSpeechRecognitionPipeline._sanitize_parameters() got an unexpected keyword argument 'start'

In [4]:
print([segment.text for segment in result.segments])

["Luke, Blacks here, Chicago Docks 810, we'll have you, goodbye.", "Hey, Rex, what's the two heavy cross and leg turbos from the three-wheel left line? Go ahead.", '9.3 limit at 8.6.', "Copa 804, turn left whiskey, left alpha, I'm on the ground for the night.", 'Learn with ease, learn on alpha and monitor ground culture. 804 the next.', 'Good night.', 'Amherst 8 Uniform with Chief Super, contact here at the department.', '詳しくは、マスター・ユニフォームをご覧ください。', 'it.', 'ABEX 152 heavy wind, 300 at 10, runway 31 left.', 'Take off.', '3166AX162H', 'Chef Lou 2637, cross the lake, travel on 23, one left, line for eight.', 'Long away, 215, Joplin, 1637.', 'Tower Delta, 2585, ILS 3108.', 'Delta 2685, Kennedy Tower, wind 300 at 10, runway 31R, cleared to land.', '3rd Land, 310, right side, 2685.', '8X152 Heavy, contact near support.', 'Thank you for your attention.', 'Jet Blue 2637, wind 310 at 10, runway 31 left.', 'Chris, thank you.', 'Please catch up to my live.', 'Challenge 5-9-2, heavy wind, 3-1-0-1-0

In [5]:
print(result.transcriptions)

{datetime.datetime(2024, 12, 13, 4, 30, 0, 30969): "Luke, Blacks here, Chicago Docks 810, we'll have you, goodbye.", datetime.datetime(2024, 12, 13, 4, 30, 51, 347844): "Hey, Rex, what's the two heavy cross and leg turbos from the three-wheel left line? Go ahead.", datetime.datetime(2024, 12, 13, 4, 30, 55, 414719): '9.3 limit at 8.6.', datetime.datetime(2024, 12, 13, 4, 31, 12, 930969): "Copa 804, turn left whiskey, left alpha, I'm on the ground for the night.", datetime.datetime(2024, 12, 13, 4, 31, 16, 980969): 'Learn with ease, learn on alpha and monitor ground culture. 804 the next.', datetime.datetime(2024, 12, 13, 4, 31, 22, 397844): 'Good night.', datetime.datetime(2024, 12, 13, 4, 31, 50, 224719): 'Amherst 8 Uniform with Chief Super, contact here at the department.', datetime.datetime(2024, 12, 13, 4, 31, 53, 582844): '詳しくは、マスター・ユニフォームをご覧ください。', datetime.datetime(2024, 12, 13, 4, 31, 56, 181594): 'it.', datetime.datetime(2024, 12, 13, 4, 32, 27, 737844): 'ABEX 152 heavy wind, 

In [6]:
for k, v in result.transcriptions.items():
    print(k, v)

2024-12-13 04:30:00.030969 Luke, Blacks here, Chicago Docks 810, we'll have you, goodbye.
2024-12-13 04:30:51.347844 Hey, Rex, what's the two heavy cross and leg turbos from the three-wheel left line? Go ahead.
2024-12-13 04:30:55.414719 9.3 limit at 8.6.
2024-12-13 04:31:12.930969 Copa 804, turn left whiskey, left alpha, I'm on the ground for the night.
2024-12-13 04:31:16.980969 Learn with ease, learn on alpha and monitor ground culture. 804 the next.
2024-12-13 04:31:22.397844 Good night.
2024-12-13 04:31:50.224719 Amherst 8 Uniform with Chief Super, contact here at the department.
2024-12-13 04:31:53.582844 詳しくは、マスター・ユニフォームをご覧ください。
2024-12-13 04:31:56.181594 it.
2024-12-13 04:32:27.737844 ABEX 152 heavy wind, 300 at 10, runway 31 left.
2024-12-13 04:32:32.074719 Take off.
2024-12-13 04:32:33.475344 3166AX162H
2024-12-13 04:32:52.679094 Chef Lou 2637, cross the lake, travel on 23, one left, line for eight.
2024-12-13 04:32:56.543469 Long away, 215, Joplin, 1637.
2024-12-13 04:33:02.

In [1]:
!pip install RapidFuzz

Collecting RapidFuzz
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: RapidFuzz
Successfully installed RapidFuzz-3.10.1


In [8]:
from pathlib import Path
import json
from datetime import datetime, timedelta
from typing import List, Dict
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from rapidfuzz import fuzz, process
from tqdm import tqdm

TARGET_PHRASES = [
    "Clear for takeoff", "Cleared for takeoff",
    "Clear for take off", "Cleared for take off",
    "Clear to take off", "Cleared to take off"
]

def parse_filename_time(filename):
    """Extract datetime from ATC recording filename"""
    import re
    pattern = r'.*-(\w{3})-(\d{2})-(\d{4})-(\d{4})Z'
    match = re.search(pattern, filename)
    if match:
        month, day, year, time = match.groups()
        hour, minute = time[:2], time[2:]
        dt_string = f"{day} {month} {year} {hour}:{minute}"
        return datetime.strptime(dt_string, "%d %b %Y %H:%M")
    raise ValueError(f"Could not parse datetime from filename: {filename}")

def setup_transcriber(device=None):
    """Initialize the Whisper transcription model"""
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        "openai/whisper-large-v3",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")

    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device
    )

def detect_keywords(text: str, target_phrases, threshold=80):
    """Find fuzzy matches of target phrases in text"""
    matches = process.extractOne(text, target_phrases, scorer=fuzz.partial_ratio)
    if matches and matches[1] >= threshold:
        return [(matches[0], matches[1])]
    return []

def process_audio_file(
    file_path: str,
    transcriber,
    output_dir: str = "transcripts",
    threshold: int = 80
) -> Dict:
    """Process a single audio file and detect keywords"""

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)

    file_path = Path(file_path)
    start_time = parse_filename_time(file_path.name)

    print(f"Transcribing {file_path.name}...")
    generate_kwargs = {'language': 'english', 'return_timestamps': True}
    result = transcriber(str(file_path), generate_kwargs=generate_kwargs)

    # Process chunks with timestamps
    processed_segments = []
    for chunk in result['chunks']:
        # Get timestamp and text
        timestamp_start = chunk['timestamp'][0]  # in seconds
        timestamp_end = chunk['timestamp'][1]    # in seconds
        text = chunk['text'].strip()

        # Calculate absolute start time for this segment
        seg_start = start_time + timedelta(seconds=timestamp_start)

        # Store segment info with matches
        segment_data = {
            "start_time": seg_start.isoformat(),
            "text": text,
            "duration": timestamp_end - timestamp_start,
            "matches": [
                {"phrase": p, "confidence": s}
                for p, s in detect_keywords(text, TARGET_PHRASES, threshold)
            ]
        }
        processed_segments.append(segment_data)

    final_result = {
        "file": str(file_path),
        "start_time": start_time.isoformat(),
        "segments": processed_segments
    }

    # Save to JSON
    output_file = output_dir / f"{file_path.stem}_transcript.json"
    with open(output_file, 'w') as f:
        json.dump(final_result, f, indent=2)

    return final_result

def process_directory(
    input_dir: str,
    output_dir: str = "transcripts",
    file_pattern: str = "*.mp3"
) -> List[Dict]:
    """Process all audio files in a directory"""

    transcriber = setup_transcriber()

    input_dir = Path(input_dir)
    audio_files = sorted(input_dir.glob(file_pattern))

    results = []
    for audio_path in tqdm(audio_files, desc="Processing files"):
        result = process_audio_file(
            str(audio_path),
            transcriber,
            output_dir
        )
        results.append(result)

        for segment in result["segments"]:
            if segment["matches"]:
                print(f"\nMatches in {audio_path.name} at {segment['start_time']}:")
                print(f"Text: {segment['text']}")
                for match in segment["matches"]:
                    print(f"- '{match['phrase']}' (confidence: {match['confidence']})")

    return results

results = process_directory("/content/audio_files")

Processing files:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing JFK12-Twr1-Dec-13-2024-1400Z.mp3...


You have passed language=english, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=english.
Processing files:   0%|          | 0/1 [01:58<?, ?it/s]


KeyError: 'chunks'

In [9]:
from pathlib import Path
import json
from datetime import datetime, timedelta
from typing import List, Dict
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from rapidfuzz import fuzz, process
from tqdm import tqdm

TARGET_PHRASES = [
    "Clear for takeoff", "Cleared for takeoff",
    "Clear for take off", "Cleared for take off",
    "Clear to take off", "Cleared to take off"
]

def parse_filename_time(filename):
    """Extract datetime from ATC recording filename"""
    import re
    pattern = r'.*-(\w{3})-(\d{2})-(\d{4})-(\d{4})Z'
    match = re.search(pattern, filename)
    if match:
        month, day, year, time = match.groups()
        hour, minute = time[:2], time[2:]
        dt_string = f"{day} {month} {year} {hour}:{minute}"
        return datetime.strptime(dt_string, "%d %b %Y %H:%M")
    raise ValueError(f"Could not parse datetime from filename: {filename}")

def setup_transcriber(device=None):
    """Initialize the Whisper transcription model"""
    if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        "openai/whisper-large-v3",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")

    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        chunk_length_s=30,  # Process in 30-second chunks
        stride_length_s=5    # 5-second overlap between chunks
    )

def detect_keywords(text: str, target_phrases, threshold=80):
    """Find fuzzy matches of target phrases in text"""
    matches = process.extractOne(text, target_phrases, scorer=fuzz.partial_ratio)
    if matches and matches[1] >= threshold:
        return [(matches[0], matches[1])]
    return []

def process_audio_file(
    file_path: str,
    transcriber,
    output_dir: str = "transcripts",
    threshold: int = 80
) -> Dict:
    """Process a single audio file and detect keywords"""

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)

    file_path = Path(file_path)
    start_time = parse_filename_time(file_path.name)

    print(f"Transcribing {file_path.name}...")
    generate_kwargs = {'language': 'english', 'return_timestamps': True}
    result = transcriber(str(file_path), generate_kwargs=generate_kwargs)

    # Process segments with timestamps
    processed_segments = []
    if isinstance(result, dict) and 'text' in result:
        # Single segment case
        segment_data = {
            "start_time": start_time.isoformat(),
            "text": result['text'].strip(),
            "duration": None,  # Duration unknown in this case
            "matches": [
                {"phrase": p, "confidence": s}
                for p, s in detect_keywords(result['text'], TARGET_PHRASES, threshold)
            ]
        }
        processed_segments.append(segment_data)
    else:
        # Multiple segments case
        for idx, segment in enumerate(result):
            if isinstance(segment, dict) and 'timestamp' in segment:
                timestamp_start = segment['timestamp'][0]
                timestamp_end = segment['timestamp'][1]
                text = segment['text'].strip()
            else:
                # If no timestamps, estimate based on position and chunk length
                chunk_length = 30  # seconds
                timestamp_start = idx * chunk_length
                timestamp_end = (idx + 1) * chunk_length
                text = segment.strip()

            seg_start = start_time + timedelta(seconds=timestamp_start)

            segment_data = {
                "start_time": seg_start.isoformat(),
                "text": text,
                "duration": timestamp_end - timestamp_start,
                "matches": [
                    {"phrase": p, "confidence": s}
                    for p, s in detect_keywords(text, TARGET_PHRASES, threshold)
                ]
            }
            processed_segments.append(segment_data)

    final_result = {
        "file": str(file_path),
        "start_time": start_time.isoformat(),
        "segments": processed_segments
    }

    # Save to JSON
    output_file = output_dir / f"{file_path.stem}_transcript.json"
    with open(output_file, 'w') as f:
        json.dump(final_result, f, indent=2)

    return final_result

def process_directory(
    input_dir: str,
    output_dir: str = "transcripts",
    file_pattern: str = "*.mp3"
) -> List[Dict]:
    """Process all audio files in a directory"""

    transcriber = setup_transcriber()

    input_dir = Path(input_dir)
    audio_files = sorted(input_dir.glob(file_pattern))

    results = []
    for audio_path in tqdm(audio_files, desc="Processing files"):
        try:
            result = process_audio_file(
                str(audio_path),
                transcriber,
                output_dir
            )
            results.append(result)

            for segment in result["segments"]:
                if segment["matches"]:
                    print(f"\nMatches in {audio_path.name} at {segment['start_time']}:")
                    print(f"Text: {segment['text']}")
                    for match in segment["matches"]:
                        print(f"- '{match['phrase']}' (confidence: {match['confidence']})")
        except Exception as e:
            print(f"Error processing {audio_path}: {str(e)}")
            continue

    return results

In [10]:
results = process_directory("/content/audio_files")

Processing files:   0%|          | 0/1 [00:00<?, ?it/s]

Transcribing JFK12-Twr1-Dec-13-2024-1400Z.mp3...


Processing files:   0%|          | 0/1 [02:11<?, ?it/s]


KeyboardInterrupt: 

 if device is None:
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        "openai/whisper-large-v3",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True
    )
    model.to(device)
    
    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
    
    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        chunk_length_s=30,  # Process in 30-second chunks
        stride_length_s=5    # 5-second overlap between chunks
    )

In [32]:
def parse_filename_time(filename):
    """Extract datetime from ATC recording filename"""
    import re
    pattern = r'.*-(\w{3})-(\d{2})-(\d{4})-(\d{4})Z'
    match = re.search(pattern, filename)
    if match:
        month, day, year, time = match.groups()
        hour, minute = time[:2], time[2:]
        dt_string = f"{day} {month} {year} {hour}:{minute}"
        return datetime.strptime(dt_string, "%d %b %Y %H:%M")
    raise ValueError(f"Could not parse datetime from filename: {filename}")

In [11]:
result = process_audio_file(
    '/content/audio_files/JFK12-Twr1-Dec-13-2024-1400Z.mp3',
    transcriber=setup_transcriber(),
    output_dir="transcripts",
    threshold=80
  )

Transcribing JFK12-Twr1-Dec-13-2024-1400Z.mp3...




KeyboardInterrupt: 

In [40]:
from pathlib import Path
import json
from datetime import datetime, timedelta
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from rapidfuzz import fuzz, process

# Setup model and processor
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v3",
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
).to(device)

processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=30)

In [33]:
def parse_filename_time(filename):
    """Extract datetime from ATC recording filename"""
    import re
    pattern = r'.*-(\w{3})-(\d{2})-(\d{4})-(\d{4})Z'
    match = re.search(pattern, filename)
    if match:
        month, day, year, time = match.groups()
        hour, minute = time[:2], time[2:]
        dt_string = f"{day} {month} {year} {hour}:{minute}"
        return datetime.strptime(dt_string, "%d %b %Y %H:%M")
    raise ValueError(f"Could not parse datetime from filename: {filename}")

In [None]:
file_path = "/content/audio_files/JFK12-Twr1-Dec-13-2024-1400Z.mp3"
base_time = parse_filename_time(file_path)

result = pipe(
    file_path,
    generate_kwargs={
        'language': 'english'
    },
    return_timestamps=True
)



In [None]:
# 2 min per file


In [None]:
result.keys()

In [None]:
print(type(result['text']))
print(type(result['chunks']))

In [39]:
result['chunks']

[{'timestamp': (0.0, 24.62), 'text': " We'll be right back."},
 {'timestamp': (0.0, 3.36),
  'text': ' Target 343, cleared for takeoff, 3-0-0-1-4'},
 {'timestamp': (4.88, 7.68),
  'text': ' Raef, got the 30-43, cleared for takeoff, 3-1-Lift'},
 {'timestamp': (8.08, 10.24),
  'text': ' 3-1-6-3, good morning, 3-1-Lift, line in wait'},
 {'timestamp': (10.4, 13.28), 'text': ' Good morning, 3-1-Lift, JetBlue 163'},
 {'timestamp': (14.48, 17.76), 'text': ' Tower, high, D-5445, ILS 3-1-Right'},
 {'timestamp': (18.08, 21.12),
  'text': ' JetBlue 445, good morning, 3-0-0-1-3, cleared to land'},
 {'timestamp': (22.4, 25.28), 'text': ' Cleared to land, 3-1-Right, D-5445'},
 {'timestamp': (25.44, 27.28),
  'text': ' JetBlue 4-6, contact Star Trek, you do?'},
 {'timestamp': (27.52, 29.2), 'text': ' Star Trek, Star Trek, 356'},
 {'timestamp': (0.0, 0.62), 'text': ' you'},
 {'timestamp': (0.0, 7.0),
  'text': " We're going to have to return to the gate for the maintenance station."},
 {'timestamp': (

In [36]:
for i, chunk in enumerate(result['chunks']):
    start_time = base_time + timedelta(seconds=chunk['timestamp'][0])
    text = chunk['text']
    print(f"{i+1}: {start_time.isoformat()} - {text}")



1: 2024-12-13T14:00:00 -  We'll be right back.
2: 2024-12-13T14:00:00 -  Target 343, cleared for takeoff, 3-0-0-1-4
3: 2024-12-13T14:00:04.880000 -  Raef, got the 30-43, cleared for takeoff, 3-1-Lift
4: 2024-12-13T14:00:08.080000 -  3-1-6-3, good morning, 3-1-Lift, line in wait
5: 2024-12-13T14:00:10.400000 -  Good morning, 3-1-Lift, JetBlue 163
6: 2024-12-13T14:00:14.480000 -  Tower, high, D-5445, ILS 3-1-Right
7: 2024-12-13T14:00:18.080000 -  JetBlue 445, good morning, 3-0-0-1-3, cleared to land
8: 2024-12-13T14:00:22.400000 -  Cleared to land, 3-1-Right, D-5445
9: 2024-12-13T14:00:25.440000 -  JetBlue 4-6, contact Star Trek, you do?
10: 2024-12-13T14:00:27.520000 -  Star Trek, Star Trek, 356
11: 2024-12-13T14:00:00 -  you
12: 2024-12-13T14:00:00 -  We're going to have to return to the gate for the maintenance station.
13: 2024-12-13T14:00:09 -  No problem, Jeff. 481, continuing Juliet, short of 22R, give ground at 4.9, we'll take care of you when you come back out.
14: 2024-12-13T14

In [20]:
print("\nTranscription:")
if isinstance(result, dict) and 'text' in result:
    print(result['text'])
else:
    for segment in result:
        if isinstance(segment, dict) and 'timestamp' in segment:
            print(f"[{segment['timestamp'][0]:.1f}s - {segment['timestamp'][1]:.1f}s]: {segment['text']}")
        else:
            print(segment)


Transcription:
 We'll be right back. Target 343, cleared for takeoff, 3-0-0-1-4 Raef, got the 30-43, cleared for takeoff, 3-1-Lift 3-1-6-3, good morning, 3-1-Lift, line in wait Good morning, 3-1-Lift, JetBlue 163 Tower, high, D-5445, ILS 3-1-Right JetBlue 445, good morning, 3-0-0-1-3, cleared to land Cleared to land, 3-1-Right, D-5445 JetBlue 4-6, contact Star Trek, you do? Star Trek, Star Trek, 356 you We're going to have to return to the gate for the maintenance station. No problem, Jeff. 481, continuing Juliet, short of 22R, give ground at 4.9, we'll take care of you when you come back out. Juliet, short 22R, Jeff, 481. Mark at 3043, you'll follow us, have a nice day. Mark at 3043, roger. Checkpoint 163, through and left, cleared to takeoff with 30.012 Cleared to takeoff 31L, checkpoint 163 Delta 1845, can you give me a good morning through and left, 5.8? 31L, 5.8, Delta 1845 Tower, good morning, Delta 1993, ILS 31R. Delta 1993, Canary Tower, wind 310 at 14, wind 31R, cleared to la

In [None]:
print("\nTranscription:")
if isinstance(result, dict) and 'text' in result:
    print(result['text'])
else:
    for segment in result:
        if isinstance(segment, dict) and 'timestamp' in segment:
            print(f"[{segment['timestamp'][0]:.1f}s - {segment['timestamp'][1]:.1f}s]: {segment['text']}")
        else:
            print(segment)