In [11]:
import whisperx
import gc 

device = "cpu" 
audio_file = "drone_delta_mix.mp3"
batch_size = 8 # reduce if low on GPU mem
compute_type = "int8" # change to "int8" if low on GPU mem (may reduce accuracy)

In [2]:

model = whisperx.load_model("medium.en", device, compute_type=compute_type)


Downloading (…)1d2350ce/config.json: 100%|██████████| 2.64k/2.64k [00:00<?, ?B/s]
Downloading (…)350ce/vocabulary.txt: 100%|██████████| 422k/422k [00:00<00:00, 662kB/s]
Downloading (…)350ce/tokenizer.json: 100%|██████████| 2.13M/2.13M [00:01<00:00, 1.94MB/s]
Downloading model.bin: 100%|██████████| 1.53G/1.53G [11:37<00:00, 2.19MB/s]
100%|██████████████████████████████████████| 16.9M/16.9M [00:25<00:00, 703kiB/s]
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file C:\Users\k66gu\.cache\torch\whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.


In [12]:
audio = whisperx.load_audio(audio_file)


In [5]:
audio

array([-2.1362305e-04, -2.1362305e-04, -3.0517578e-05, ...,
       -2.7465820e-04, -3.8452148e-03, -7.2631836e-03], dtype=float32)

In [13]:
result = model.transcribe(audio, batch_size=batch_size)


In [14]:
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)


In [15]:
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)


In [54]:
result['segments']

[{'start': 38.958,
  'end': 48.543,
  'text': " Cessnal aircraft, unauthorized UAS reported over the construction site, midfield, between the runways, we don't have an altitude audit.",
  'words': [{'word': 'Cessnal',
    'start': 38.958,
    'end': 39.378,
    'score': 0.598,
    'start_time_hms': '00:00:38',
    'end_time_hms': '00:00:39'},
   {'word': 'aircraft,',
    'start': 39.438,
    'end': 39.818,
    'score': 0.606,
    'start_time_hms': '00:00:39',
    'end_time_hms': '00:00:39'},
   {'word': 'unauthorized',
    'start': 39.859,
    'end': 40.499,
    'score': 0.576,
    'start_time_hms': '00:00:39',
    'end_time_hms': '00:00:40'},
   {'word': 'UAS',
    'start': 40.519,
    'end': 41.019,
    'score': 0.586,
    'start_time_hms': '00:00:40',
    'end_time_hms': '00:00:41'},
   {'word': 'reported',
    'start': 41.659,
    'end': 42.14,
    'score': 0.817,
    'start_time_hms': '00:00:41',
    'end_time_hms': '00:00:42'},
   {'word': 'over',
    'start': 42.36,
    'end': 4

In [55]:
text_info = []
for txt in result['segments']:
    text_info.append(txt['text'])
print(text_info)

[" Cessnal aircraft, unauthorized UAS reported over the construction site, midfield, between the runways, we don't have an altitude audit.", 'Tower, Mexico 1683, Cessnal 10R.', 'Cessnal 1583, Cessnal 10R, wind 060 at 11, runway 10R, cleared to land, traffic reporting position runway 14.', '10R, cleared to land, American 1583.', ' Central aircraft are on frequency.', 'Unauthorized UAS activity.', 'Lid sealed over the construction.', 'Between the runways, altitude between 300 and 350 feet.', 'Type unknown.', 'Pittsburgh tower, Frontier 686.', 'Kind of a dogleg right base to 10R.', 'Frontier 686, Pittsburgh ground.', 'The wind 070 at niner, runway 10R.', 'Clear to land.', 'Alright, 10R.', 'Clear to land.', 'Frontier 686.', " That's all aircraft on an unauthorized UAS activity.", 'Midfield, over the terminal, kind of by the construction area, last reported between 300 and 350 feet.', 'And the last report of that unauthorized UAS was towards the south end of the airfield, south of the termi

In [57]:
text_info = []
for txt in result['segments']:
    text_info.append(txt['text'])
result_text = ''.join(text_info).strip(" ")

"Cessnal aircraft, unauthorized UAS reported over the construction site, midfield, between the runways, we don't have an altitude audit.Tower, Mexico 1683, Cessnal 10R.Cessnal 1583, Cessnal 10R, wind 060 at 11, runway 10R, cleared to land, traffic reporting position runway 14.10R, cleared to land, American 1583. Central aircraft are on frequency.Unauthorized UAS activity.Lid sealed over the construction.Between the runways, altitude between 300 and 350 feet.Type unknown.Pittsburgh tower, Frontier 686.Kind of a dogleg right base to 10R.Frontier 686, Pittsburgh ground.The wind 070 at niner, runway 10R.Clear to land.Alright, 10R.Clear to land.Frontier 686. That's all aircraft on an unauthorized UAS activity.Midfield, over the terminal, kind of by the construction area, last reported between 300 and 350 feet.And the last report of that unauthorized UAS was towards the south end of the airfield, south of the terminal and construction area between 300 and 350 feet AGL.We don't have a type of

In [26]:
def _seconds_to_time(time_value):
    """
    Convert seconds to time based on a value
    Args:
        time_value (float): Time in seconds
    Returns:
        str: String formatted in HH:MM:SS
    """
    m,s = divmod(time_value, 60)
    h, m = divmod(m, 60)
    s = int(s)
    m = int(m)
    h = int(h)
    new_value = f'{h:02d}:{m:02d}:{s:02d}'
    return new_value


def _create_hms_time(segments):
    """
    Create new datetime values that abstract away from ever increasing seconds.
    Output is now of HH:MM:SS
    Returns:
        new_segments (list): Updated whisper results with new column
    """
    for item in segments:
        item['start_time_hms'] = _seconds_to_time(item['start'])
        item['end_time_hms'] = _seconds_to_time(item['end'])
    return segments

In [45]:
segments = _create_hms_time(result['segments'])

In [53]:
result

{'segments': [{'start': 38.958,
   'end': 48.543,
   'text': " Cessnal aircraft, unauthorized UAS reported over the construction site, midfield, between the runways, we don't have an altitude audit.",
   'words': [{'word': 'Cessnal',
     'start': 38.958,
     'end': 39.378,
     'score': 0.598,
     'start_time_hms': '00:00:38',
     'end_time_hms': '00:00:39'},
    {'word': 'aircraft,',
     'start': 39.438,
     'end': 39.818,
     'score': 0.606,
     'start_time_hms': '00:00:39',
     'end_time_hms': '00:00:39'},
    {'word': 'unauthorized',
     'start': 39.859,
     'end': 40.499,
     'score': 0.576,
     'start_time_hms': '00:00:39',
     'end_time_hms': '00:00:40'},
    {'word': 'UAS',
     'start': 40.519,
     'end': 41.019,
     'score': 0.586,
     'start_time_hms': '00:00:40',
     'end_time_hms': '00:00:41'},
    {'word': 'reported',
     'start': 41.659,
     'end': 42.14,
     'score': 0.817,
     'start_time_hms': '00:00:41',
     'end_time_hms': '00:00:42'},
    {'w

In [None]:
{'segments': [{'start': 38.958,
   'end': 48.543,
   'text': " Cessnal aircraft, unauthorized UAS reported over the construction site, midfield, between the runways, we don't have an altitude audit.",
   'words': [{'word': 'Cessnal',
     'start': 38.958,
     'end': 39.378,
     'score': 0.598,
     'start_time_hms': '00:00:38',
     'end_time_hms': '00:00:39'}]}]}