In [53]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably including the API key the OpenAI client needs later - confirm).
load_dotenv()

True

In [54]:
from moviepy.editor import AudioFileClip
import tempfile
from io import BytesIO
import base64
import os
import nest_asyncio
# Patch asyncio to allow nested event loops, so asyncio.run() can be used
# from notebook cells even though the kernel already runs its own loop.
nest_asyncio.apply()


def convert_audio_to_chunks(
    input_file_path, binary_chunk_size=18 * 1024 * 1024
):  # 18MB of binary data
    """Load an audio file (or the audio track of an MP4) and split it into byte chunks.

    If the path ends in ``.mp4`` (case-insensitive), the audio track is
    extracted to a temporary MP3 file with moviepy and that file's bytes are
    used. Any other path is read as-is, without conversion.

    Note: the split is done purely by byte offset; chunk boundaries are not
    aligned to audio frames.

    Args:
        input_file_path: Path to the source file, as ``str`` or ``os.PathLike``.
        binary_chunk_size: Maximum size of each chunk in bytes. Must be > 0.

    Returns:
        A list of ``bytes`` objects, each at most ``binary_chunk_size`` long,
        in file order. An empty file yields an empty list.

    Raises:
        ValueError: If ``binary_chunk_size`` is not positive.
        OSError: If the input file cannot be read.
    """
    if binary_chunk_size <= 0:
        raise ValueError("binary_chunk_size must be a positive number of bytes")

    # Accept pathlib.Path and other os.PathLike inputs, not only str.
    source_path = os.fspath(input_file_path)

    if source_path.lower().endswith(".mp4"):
        # Reserve a temp path and close our descriptor immediately: the encoder
        # opens the file by name, and whether a file can be opened a second
        # time while a handle is still open is platform-dependent.
        fd, temp_audio_file_name = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        try:
            clip = AudioFileClip(source_path)
            try:
                clip.write_audiofile(temp_audio_file_name, codec="mp3")
            finally:
                # Release the reader held by the clip.
                clip.close()

            with open(temp_audio_file_name, "rb") as temp_audio_file_rb:
                audio_data = temp_audio_file_rb.read()
        finally:
            # Always remove the temp file, even if extraction or reading failed.
            os.remove(temp_audio_file_name)
    else:
        # No conversion needed: read the bytes straight from the source file.
        with open(source_path, "rb") as audio_file:
            audio_data = audio_file.read()

    # Split the raw bytes into fixed-size chunks (no Base64 encoding is applied).
    return [
        audio_data[offset : offset + binary_chunk_size]
        for offset in range(0, len(audio_data), binary_chunk_size)
    ]

In [55]:
# Source media to transcribe: paths ending in ".mp4" get their audio extracted
# to MP3 first; any other path is read as-is.
input_file_path = 'data/short_dsa.mp4'  # or a path to an existing .mp3 file
chunks = convert_audio_to_chunks(input_file_path)
# Number of chunks produced (each one becomes a separate transcription request).
print(len(chunks))

MoviePy - Writing audio in /var/folders/9q/qp70wln55bd5fkdxdy0jw5t80000gn/T/tmp0rgyjtbt.mp3


                                                                      

MoviePy - Done.
1




In [56]:
import asyncio
from openai import AsyncOpenAI


async def transcribe_chunk(client, chunk, index):
    """Transcribe one chunk of MP3 bytes with the ``whisper-1`` model.

    The bytes are uploaded directly from memory as a ``(filename, content)``
    pair rather than via a temporary file. This avoids blocking disk I/O
    inside the coroutine and avoids reopening a still-open temporary file by
    name, which is platform-dependent.

    Args:
        client: Object exposing an awaitable
            ``client.audio.transcriptions.create(...)`` (an ``AsyncOpenAI``
            instance in this notebook).
        chunk: Raw audio bytes for this chunk.
        index: Position of the chunk in the original sequence.

    Returns:
        A ``(index, transcript_obj)`` tuple; the index lets the caller restore
        ordering and adjust timestamps later.
    """
    # The ".mp3" filename tells the API which audio format the bytes are in.
    transcript_obj = await client.audio.transcriptions.create(
        model="whisper-1",
        file=(f"chunk_{index}.mp3", chunk),
        response_format="verbose_json",
    )

    # Include the chunk index to help with ordering and timestamp adjustments later
    return index, transcript_obj


async def transcribe_all_chunks(chunks):
    """Transcribe every chunk concurrently and return the results in chunk order.

    Args:
        chunks: Iterable of raw audio byte chunks.

    Returns:
        A list of ``(index, transcript_obj)`` tuples sorted by ``index``.
    """
    client = AsyncOpenAI()

    # One coroutine per chunk, all sharing the same client.
    pending = []
    for position, audio_bytes in enumerate(chunks):
        pending.append(transcribe_chunk(client, audio_bytes, position))

    indexed_transcripts = list(await asyncio.gather(*pending))

    # Sort explicitly by chunk index so the ordering does not rely on gather().
    indexed_transcripts.sort(key=lambda pair: pair[0])
    return indexed_transcripts


def _segment_field(segment, name):
    """Read ``name`` from a segment given either as a mapping or as an object."""
    if isinstance(segment, dict):
        return segment[name]
    return getattr(segment, name)


def adjust_timestamps_and_combine(transcripts):
    """Merge per-chunk segments into one list with timestamps on a shared timeline.

    Each chunk's segment times are shifted by the (already shifted) end time
    of the last segment of the preceding chunks. The input transcripts are
    not modified, so calling this function repeatedly on the same data gives
    the same result.

    Args:
        transcripts: Iterable of ``(index, transcript_obj)`` pairs in playback
            order. ``transcript_obj.segments`` is a sequence of segments, each
            exposing ``text``, ``start`` and ``end`` either as dict keys or as
            attributes. A chunk with no segments (empty or ``None``) is skipped.

    Returns:
        A list of ``{"text", "start", "end"}`` dicts in order.
    """
    combined_transcript = []
    total_duration = 0

    for _, transcript_obj in transcripts:
        segments = transcript_obj.segments or []
        last_end = None

        for segment in segments:
            # Shift into local variables instead of mutating the caller's data.
            start = _segment_field(segment, "start") + total_duration
            end = _segment_field(segment, "end") + total_duration
            combined_transcript.append(
                {
                    "text": _segment_field(segment, "text"),
                    "start": start,
                    "end": end,
                }
            )
            last_end = end

        # The next chunk starts where this chunk's last segment ended; a chunk
        # without segments leaves the running offset unchanged.
        if last_end is not None:
            total_duration = last_end

    return combined_transcript

In [57]:
async def transcribe_audio_chunks(chunks):
    """Transcribe all chunks and return one combined, time-adjusted transcript.

    Args:
        chunks: Audio byte chunks, in playback order.

    Returns:
        The list of segment dicts produced by ``adjust_timestamps_and_combine``.
    """
    return adjust_timestamps_and_combine(await transcribe_all_chunks(chunks))

In [58]:
# asyncio.run() is invoked from inside the notebook, whose kernel already has
# an event loop running; this relies on the earlier nest_asyncio.apply() call.
transcripts = asyncio.run(transcribe_audio_chunks(chunks))


In [59]:
# Render each segment as "(<start>): <text> " and join the pieces in order.
rendered_segments = []
for transcript in transcripts:
    rendered_segments.append(f"({transcript['start']}): {transcript['text']} ")
text = "".join(rendered_segments)
print(text)

(0.0):  So at this point, we're pretty familiar with all of the data types (2.799999952316284):  that come native to C. We have characters, and floats, and integers, (6.840000152587891):  and doubles. (8.119999885559082):  And we're also familiar now with the CS50 data types of strings and bools. (13.279999732971191):  But that doesn't limit everything that we can do. (15.5600004196167):  We surely can do more. (17.200000762939453):  Indeed, with structures, that gives us an ability (19.200000762939453):  to start to define our own data types that (21.040000915527344):  might be useful for our own programs. (23.520000457763672):  What's cool about structures is they allow (25.239999771118164):  us to unify many different variables of different data types (29.15999984741211):  into a single brand new type. (31.799999237060547):  And we can give that new type its own unique type name (34.84000015258789):  as a way to identify it. (36.439998626708984):  This isn't the first time we've see

In [60]:
# Save the transcript to a text file.
# Write as UTF-8 explicitly: the platform default encoding may be unable to
# represent non-ASCII characters that can appear in transcribed text.
with open("transcript.txt", "w", encoding="utf-8") as text_file:
    text_file.write(text)