In [3]:
import os
import whisper
import polars as pl
from babel.dates import format_time

In [6]:
# Function to transcribe audio using OpenAI's transcription service
def transcribe_audio(model, file_path):
    """Send an audio file to the hosted transcription endpoint and reformat the reply.

    Args:
        model: Despite the name, the notebook's call site passes the API client
            object in this position. When the argument exposes an ``audio``
            attribute it is used as the client; otherwise the function falls
            back to a module-level ``client``, which is what the original code
            always used.
        file_path: Path of the audio file; it is opened in binary mode.

    Returns:
        Whatever ``process_transcription`` returns for the SRT-formatted response.

    Raises:
        NameError: If ``model`` is not a client-like object and no global
            ``client`` exists.
        OSError: If ``file_path`` cannot be opened.
    """
    # The first argument used to be ignored, so the function only worked when a
    # global `client` happened to exist in the kernel. Prefer the object the
    # caller actually handed in; keep the global as a backward-compatible fallback.
    api_client = model if hasattr(model, "audio") else client

    with open(file_path, "rb") as audio_file:
        transcription = api_client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="srt",
        )

    # The upload is finished, so the file is closed before post-processing.
    # NOTE(review): assumes the "srt" response is returned as a plain string,
    # since process_transcription calls str methods on it; confirm against the
    # SDK version in use.
    return process_transcription(transcription)

# Helper: convert one SRT timestamp ("HH:MM:SS,mmm") to the "H:MM:SS" display form
def _srt_timestamp_to_clock(timestamp):
    """Return ``timestamp`` as ``H:MM:SS`` with the milliseconds dropped.

    Args:
        timestamp: Text such as ``"00:01:05,250"`` (a ``.`` before the
            milliseconds is tolerated as well).

    Returns:
        A string like ``"0:01:05"``. If the text does not have three numeric
        ``:``-separated fields, the stripped input is returned unchanged so a
        malformed cue does not abort the whole transcript.
    """
    cleaned = timestamp.strip()
    # Everything after the first comma/dot is the fractional part; discard it.
    whole_seconds = cleaned.replace('.', ',').split(',')[0]
    fields = whole_seconds.split(':')
    if len(fields) != 3 or not all(field.isdecimal() for field in fields):
        return cleaned
    hours, minutes, seconds = (int(field) for field in fields)
    return f"{hours}:{minutes:02d}:{seconds:02d}"


# Function to process the raw transcription into the desired format
def process_transcription(transcription):
    """Turn SRT text into one ``[H:MM:SS]text`` line per subtitle cue.

    Each cue is expected to be an index line, a ``start --> end`` line and one
    or more text lines, with cues separated by a blank line. Blocks with fewer
    than three non-blank lines are skipped.

    Args:
        transcription: The SRT document as a string. ``None`` or an empty
            string yields an empty result.

    Returns:
        The formatted cues joined with newlines (empty string if there are none).
    """
    if not transcription:
        return ''

    # Accept Windows/old-Mac line endings so the blank-line split below works.
    normalized = transcription.replace('\r\n', '\n').replace('\r', '\n')

    processed_lines = []
    for block in normalized.split('\n\n'):
        # Drop blank lines left over from runs of more than two newlines.
        lines = [line for line in block.split('\n') if line.strip()]
        if len(lines) < 3:
            continue
        start_time = lines[1].split('-->')[0]
        # A cue may wrap over several lines; keep all of them, not just the first.
        text = ' '.join(line.strip() for line in lines[2:])
        processed_lines.append(f"[{_srt_timestamp_to_clock(start_time)}]{text}")
    return '\n'.join(processed_lines)


In [None]:

# Run the hosted-API transcription on the sample ad clip. The bare call is the
# cell's last expression, so the notebook displays the returned value.
# NOTE(review): `client` is not defined in any cell visible here; confirm it is
# created before this cell runs on a fresh kernel.
transcribe_audio(
    client,
    file_path="../data/ad_audio_testing/pres_trimmed_incl_scene-P-1632-82726.wav",
)

In [1]:
# Run the `whisper` command-line tool on the same clip, with the language set to
# English and the "large" model; the timestamped segments it prints are recorded
# below this cell.
# NOTE(review): the last recorded segment (00:30 to 00:59.98, "Thanks for
# watching.") is not part of the ad copy above it; possibly spurious text on
# trailing non-speech audio. Verify against the recording.
!whisper "../data/ad_audio_testing/pres_trimmed_incl_scene-P-1632-82726.wav" --language en --model large

[00:00.000 --> 00:02.960]  John McCain promised a clean campaign.
[00:03.180 --> 00:05.420]  Then attacked Governor Bush with misleading ads.
[00:05.420 --> 00:09.480]  McCain says he's the only candidate who can beat Gore on campaign finance.
[00:09.600 --> 00:14.940]  But news investigations reveal McCain solicits money from lobbyists with interest before his committee
[00:14.940 --> 00:17.700]  and pressures agencies on behalf of contributors.
[00:18.100 --> 00:21.180]  He attacks special interests, but the Wall Street Journal reports
[00:21.180 --> 00:23.800]  McCain's campaign is crawling with lobbyists.
[00:23.960 --> 00:25.960]  His conservative hometown paper warns
[00:25.960 --> 00:29.200]  it's time the rest of the nation learns about the McCain we know.
[00:30.000 --> 00:59.980]  Thanks for watching.


In [4]:
# Local (in-process) transcription with the `whisper` package, as opposed to
# the shell command or the hosted API used in other cells.
# `model` and `result` are kernel-level names; a later cell reads `result`.
model = whisper.load_model("large")
# `result` is a dict; the recorded outputs show a 'text' key (full transcript)
# and a 'segments' key (list of dicts with 'id', 'seek', 'start', 'end', 'text',
# 'tokens', 'temperature', 'avg_logprob', 'compression_ratio', 'no_speech_prob').
result = model.transcribe("../data/ad_audio_testing/pres_trimmed_incl_scene-P-1632-82726.wav")
# NOTE(review): the output stored under this cell begins with the 'text' key,
# so it looks like it was produced by printing the whole `result`, not just
# the segments; re-run to refresh it.
print(result['segments'])



{'text': " John McCain promised a clean campaign. Then attacked Governor Bush with misleading ads. McCain says he's the only candidate who can beat Gore on campaign finance. But news investigations reveal McCain solicits money from lobbyists with interest before his committee and pressures agencies on behalf of contributors. He attacks special interests, but the Wall Street Journal reports McCain's campaign is crawling with lobbyists. His conservative hometown paper warns It's time the rest of the nation learns about the McCain we know. Thanks for watching.", 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.96, 'text': ' John McCain promised a clean campaign.', 'tokens': [50365, 2619, 49725, 10768, 257, 2541, 5129, 13, 50513], 'temperature': 0.0, 'avg_logprob': -0.11165671599538703, 'compression_ratio': 1.669811320754717, 'no_speech_prob': 0.14629918336868286}, {'id': 1, 'seek': 0, 'start': 3.18, 'end': 5.42, 'text': ' Then attacked Governor Bush with misleading ads.', 'tokens'

In [5]:
# Print the per-segment records (timing, text, token ids, decoding statistics).
# Relies on `result` still being in the kernel from the cell that called
# model.transcribe(); this is the same statement that cell ends with.
print(result['segments'])

[{'id': 0, 'seek': 0, 'start': 0.0, 'end': 2.96, 'text': ' John McCain promised a clean campaign.', 'tokens': [50365, 2619, 49725, 10768, 257, 2541, 5129, 13, 50513], 'temperature': 0.0, 'avg_logprob': -0.11165671599538703, 'compression_ratio': 1.669811320754717, 'no_speech_prob': 0.14629918336868286}, {'id': 1, 'seek': 0, 'start': 3.18, 'end': 5.42, 'text': ' Then attacked Governor Bush with misleading ads.', 'tokens': [50524, 1396, 12692, 14550, 15782, 365, 36429, 10342, 13, 50636], 'temperature': 0.0, 'avg_logprob': -0.11165671599538703, 'compression_ratio': 1.669811320754717, 'no_speech_prob': 0.14629918336868286}, {'id': 2, 'seek': 0, 'start': 5.5600000000000005, 'end': 9.46, 'text': " McCain says he's the only candidate who can beat Gore on campaign finance.", 'tokens': [50643, 49725, 1619, 415, 311, 264, 787, 11532, 567, 393, 4224, 45450, 322, 5129, 10719, 13, 50838], 'temperature': 0.0, 'avg_logprob': -0.11165671599538703, 'compression_ratio': 1.669811320754717, 'no_speech_prob