# Whisper Transcription

Simple audio transcription following OpenAI "getting started" guide for the whisper API


In [4]:
%pip install -r requirements.txt


Collecting onnxruntime (from -r requirements.txt (line 6))
  Downloading onnxruntime-1.16.3-cp311-cp311-macosx_10_15_x86_64.whl.metadata (4.3 kB)
Collecting coloredlogs (from onnxruntime->-r requirements.txt (line 6))
  Using cached coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
Collecting flatbuffers (from onnxruntime->-r requirements.txt (line 6))
  Using cached flatbuffers-23.5.26-py2.py3-none-any.whl.metadata (850 bytes)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->-r requirements.txt (line 6))
  Using cached humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
Downloading onnxruntime-1.16.3-cp311-cp311-macosx_10_15_x86_64.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached flatbuffers-23.5.26-py2.py3-none-any.whl (26 kB)
Installing collected packages: flatbuffers, humanfriendly, coloredlogs, onnxruntime
Successfully installed coloredlogs-15.0.1 flatbuffer

In [2]:
# Load env variables

import os
from dotenv import dotenv_values, load_dotenv

load_dotenv()

True

# Basic OpenAI Whisper API Transcription

Essentially a copy of the OpenAI "getting started" guide for the whisper API, but with a few tweaks to make it work for me.

In [58]:
import os
from pydub import AudioSegment
from pydub.utils import make_chunks

# Set the audio path we want to work on
input_audio_path = "audio/hwb_user_interviews/charlott practitioner north.mp3"

output_folder = os.path.join(os.path.dirname(input_audio_path), os.path.basename(input_audio_path).split('.')[0])
subject_name = os.path.basename(input_audio_path).split('.')[0]

os.makedirs(output_folder, exist_ok=True)

myaudio = AudioSegment.from_file(input_audio_path, format=input_audio_path.split(".")[-1])
chunk_length_ms = 600000  # 10 minutes in milliseconds
chunks = make_chunks(myaudio, chunk_length_ms)  # Make chunks of 10 minutes
chunk_paths = []

for i, chunk in enumerate(chunks):
    chunk_name = f"{output_folder}/{subject_name}_chunk{i}.{input_audio_path.split('.')[-1]}"
    chunk_paths.append(chunk_name)
    print("exporting", chunk_name)
    chunk.export(chunk_name, format=input_audio_path.split(".")[-1])

#  chunks now contains the 10 minute a

exporting audio/hwb_user_interviews/charlott practitioner north/charlott practitioner north_chunk0.mp3
exporting audio/hwb_user_interviews/charlott practitioner north/charlott practitioner north_chunk1.mp3
exporting audio/hwb_user_interviews/charlott practitioner north/charlott practitioner north_chunk2.mp3
exporting audio/hwb_user_interviews/charlott practitioner north/charlott practitioner north_chunk3.mp3
exporting audio/hwb_user_interviews/charlott practitioner north/charlott practitioner north_chunk4.mp3
exporting audio/hwb_user_interviews/charlott practitioner north/charlott practitioner north_chunk5.mp3


In [55]:
from docx import Document
from openai import OpenAI

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    # api_key="My API Key",
)

def transcribe_audio_obj(model, prompt, response_format, audio_file):
    transcript = client.audio.transcriptions.create(
            model=model,
            prompt=prompt,
            response_format=response_format,
            file=audio_file,
            language="en",
        )
    
    return transcript

def transcribe_audio(audio_file_path="", model="whisper-1", prompt="Transcribe the following audio?", response_format="srt"):
    with open(audio_file_path, 'rb') as audio_file:
        transcript = transcribe_audio_obj(model, prompt, response_format, audio_file)
        print(f"transcribed audio file: {audio_file_path}")
    return transcript


# def transcribe_chunk(audio_segment=None, model="whisper-1", prompt="Transcribe the following audio?", response_format="srt"):
#     with audio_segment.export() as audio_file:
#         transcript = transcribe_audio_obj(
#             model, prompt, response_format, audio_file)
#         print(f"transcribed audio file: {audio_file_path}")
#     return transcript

In [67]:
# for each item in chunk_paths, transcribe it and write the result to a text file, names the same as the audio file with a txt extension
def transcribe_files(file_list=[], prompt="Transribe this audio as accurately as possible."):
    for i, file_path in enumerate(file_list):
        transcript = transcribe_audio(file_path, prompt="Interview with Charlott, a teacher from North Wales, about her use of and opinions about the Hwb learning platform and the suite of online teaching and learning tools it proides.")
        with open(f"{file_path}.txt", "w") as f:
            f.write(transcript)
        print(f"wrote transcript to {file_path}.txt")

# transcribe_files()

In [68]:
# find all the mp3 files in the given path. return a list of paths to the mp3 files

def find_mp3s(path):
    mp3s = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".mp3"):
                mp3s.append(os.path.join(root, file))
    return mp3s


gareth_mp3s = find_mp3s("audio/hwb_user_interviews/gareth gwyr")

transcribe_files(gareth_mp3s, prompt="An interview with a teacher at Gwyr comprehensive school in Swansea, about the use of the Hwb learning platform and its suite of online tools.")



transcribed audio file: audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk4.mp3
wrote transcript to audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk4.mp3.txt
transcribed audio file: audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk5.mp3
wrote transcript to audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk5.mp3.txt
transcribed audio file: audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk2.mp3
wrote transcript to audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk2.mp3.txt
transcribed audio file: audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk3.mp3
wrote transcript to audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk3.mp3.txt
transcribed audio file: audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk1.mp3
wrote transcript to audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk1.mp3.txt
transcribed audio file: audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk0.mp3
wrote transcript to audio/hwb_user_interviews/gareth gwyr/gareth 

In [42]:
transcript = transcribe_audio(
    audio_file_path="audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk0.mp3",
    prompt="An interview with a teacher at Gwyr comprehensive school about the use of the Hwb learning platform and its suite of online tools."
)

done


In [43]:
transcript

"1\n00:00:00,000 --> 00:00:21,000\nI was head of IT at Gwyr, I'm head of computer science at Gwyr, but I've been here for 25 years, I know I'm only 21, and I've used Hwb since day one really, bit slow getting into it.\n\n2\n00:00:21,000 --> 00:00:44,000\nI'm not head of, well I am head of department now, but I've changed now to head of digital strategy across the school, so I look after, I did anyway, but they've kind of given it a new term now, because we were looking for another IT teacher, we couldn't get anyone, so we advertised for a head of department and I went sideways to a new title, but we couldn't get anyone.\n\n3\n00:00:44,000 --> 00:00:58,000\nBecause I know you, I know what you're referring to, but for Ines who probably doesn't know, my understanding from when we've known each other a long time is you were kind of looking after the IT systems, doing a lot of non-teaching sort of functions, are you still teaching?\n\n4\n00:00:58,000 --> 00:01:27,000\nYeah, I still teach, b

# Speaker Diarization

Experiment with [`pyannote.audio`](https://github.com/pyannote/pyannote-audio), an open-source toolkit written in Python for **speaker diarization**. 

In [69]:
%pip install   pyannote.audio

python(23515) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Collecting pyannote.audio
  Downloading pyannote.audio-3.1.1-py2.py3-none-any.whl.metadata (9.3 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl (29 kB)
Collecting einops>=0.6.0 (from pyannote.audio)
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting huggingface-hub>=0.13.0 (from pyannote.audio)
  Downloading huggingface_hub-0.20.2-py3-none-any.whl.metadata (12 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.1.3-py3-none-any.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting omegaconf<3.0,>=2.1 (from pyannote.audio)
  Using cached omegaconf-2.3.0-py3-none-any.whl (79 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB

We'll take a 10 minute chunk of one of the Hwb videos to test it out.

In [3]:
from pydub import AudioSegment

audio_path = "audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk0.mp3"
audio_chunk = AudioSegment.from_file(audio_path, format="mp3")

In [2]:
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained(
    'pyannote/speaker-diarization-3.0', use_auth_token="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")

  torchaudio.set_audio_backend("soundfile")
  from .autonotebook import tqdm as notebook_tqdm
  torchaudio.set_audio_backend("soundfile")
torchvision is not available - cannot save figures
speaker-embedding.onnx: 100%|██████████| 26.5M/26.5M [00:01<00:00, 17.8MB/s]


In [4]:
diarization = pipeline(audio_path)

In [5]:
# dump the diarization output to disk using RTTM format
with open("audio/hwb_user_interviews/gareth gwyr/gareth gwyr_chunk0.mp3.rttm", "w") as rttm:
    diarization.write_rttm(rttm)
    diarization.write

ValueError: Space-separated RTTM file format does not allow file URIs containing spaces (got: "gareth gwyr_chunk0").