### Necessary imports

In [None]:
import os
import whisper
from dotenv import load_dotenv

# Load environment variables from the .env file so that the os.getenv()
# lookups further down (CONFIG, AUDIO_PATH, TEXT_PATH, INITIAL_PROMPT)
# can read them
load_dotenv()

### Whisper model choice and load paths, initial prompt from .env

In [None]:
# Read the run configuration from the environment (populated from .env).
# CONFIG is the Whisper checkpoint name (large-v2 is the intended choice) and
# AUDIO_PATH is the file to transcribe. Both are required, so fail fast with
# a readable message before the expensive model load instead of letting
# load_model(None) / transcribe(None) raise an obscure error later.
modelName = os.getenv('CONFIG')
audioPath = os.getenv('AUDIO_PATH')
if not modelName:
    raise RuntimeError("CONFIG is not set; add the Whisper model name (e.g. large-v2) to .env")
if not audioPath:
    raise RuntimeError("AUDIO_PATH is not set; add the path of the audio file to .env")

model = whisper.load_model(modelName)

textPath = os.getenv('TEXT_PATH')
# Optional: stays None when INITIAL_PROMPT is missing from the environment
initialPrompt = os.getenv('INITIAL_PROMPT')


`verbose=True` prints the transcription as it is produced, with timestamps and segments, which makes it easy to compare the outputs of short transcriptions while adjusting parameters.

Audio used: excerpt from ITeam-meeting, anknutnastiftelser.wav.

Prompt used: "En del av ett möte om dataset, Chalmers, testamentera, digitalisering av fysiska papper, use case, stiftelser, stipendier, OCR:a, skanna."

In [None]:
# Transcribe the audio with sampling temperature 0.8 and the prompt from .env;
# verbose=True prints each segment with its timestamps while decoding
result = model.transcribe(
    audioPath,
    task="transcribe",
    initial_prompt=initialPrompt,
    temperature=0.8,
    fp16=False,
    verbose=True,
)

### Adjusting the temperature

In [None]:
# Transcribe the audio with the sampling temperature lowered to 0.4
result = model.transcribe(
    audioPath,
    task="transcribe",
    initial_prompt=initialPrompt,
    temperature=0.4,
    fp16=False,
    verbose=True,
)

# Observation: the lower temperature works much better for this audio

### Adjusting compression ratio threshold
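Default is 2.4. If the gzip compression ratio of a decoded segment's text is above this value, the decoding is treated as failed and retried at the next fallback temperature (when more than one temperature is given). A high ratio indicates repetitive, less coherent text.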

In [None]:
# Run the same transcription with compression_ratio_threshold raised to 5
# and then to 10 (result holds the output of the last run).
#
# Observations:
#   5:  some difference but not major;
#       keeps words like "tror jag" and the like
#   10: from "Jag ser två utmaningar med den datamängden." (with 5)
#       to "Så att den datamängden är ju, jag ser två utmaningar med den datamängden."
#       where the second is verbatim
for ratioLimit in (5, 10):
    result = model.transcribe(
        audioPath,
        task="transcribe",
        initial_prompt=initialPrompt,
        temperature=0.4,
        compression_ratio_threshold=ratioLimit,
        fp16=False,
        verbose=True,
    )

### Adjusting logprob threshold
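Default is -1.0. If the average log probability of the decoded tokens in a segment falls below this value, the decoding is treated as failed and retried at the next fallback temperature (when more than one temperature is given). It is considered a moderate threshold, rejecting low-confidence decodings while keeping most of the meaningful transcription.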

In [None]:
# logprob_threshold raised to 0.0; the intent noted for this run is to keep
# only highly confident predictions
# Observation: also eliminates some "redundant" words like "tror jag"
result = model.transcribe(
    audioPath,
    task="transcribe",
    initial_prompt=initialPrompt,
    temperature=0.4,
    compression_ratio_threshold=5,
    logprob_threshold=0.0,
    fp16=False,
    verbose=True,
)

In [None]:
# logprob_threshold lowered to -2.0
# Observation: keeps some "redundancy", e.g.
# from "Den andra utmaningen med det är vad man kan göra med den här datan när man väl har den."
# to "Den andra utmaningen med det, det är vad man då kan göra med den här datan när man väl har den."
result = model.transcribe(
    audioPath,
    task="transcribe",
    initial_prompt=initialPrompt,
    temperature=0.4,
    compression_ratio_threshold=5,
    logprob_threshold=-2.0,
    fp16=False,
    verbose=True,
)

# Overall, no huge differences for this audio

### Adjust no speech threshold
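Default is 0.6. Based on the model's estimated probability that a segment contains no speech. A segment is considered silent, and skipped, when this probability is above the threshold and the average log probability is below `logprob_threshold`.

Lowering the threshold skips more segments, so quiet or unclear speech might be missed. Increasing the threshold skips fewer segments, so more background noise or faint speech can end up in the transcription.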

In [None]:
# Run the same transcription with no_speech_threshold lowered to 0.3 and then
# increased to 0.8 (result holds the output of the last run).
#
# Observations:
#   0.3: splits into more segments, ex: segment 1: "Så att den datamängden är ju...",
#        segment 2: "Jag ser två utmaningar med den datamängden.".
#        Seems to follow more natural speech better segment-wise
#   0.8: as expected: trims more, fewer segments
for silenceLimit in (0.3, 0.8):
    result = model.transcribe(
        audioPath,
        task="transcribe",
        initial_prompt=initialPrompt,
        temperature=0.4,
        compression_ratio_threshold=5,
        logprob_threshold=-1.0,
        no_speech_threshold=silenceLimit,
        fp16=False,
        verbose=True,
    )

### Adjusting beam size
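Default is 5 on the command line (in the Python API the default is `None`, i.e. no beam search). Essentially, beam size (from the beam search algorithm) is the number of candidate sequences that are kept alive at each decoding step; when decoding finishes, the candidate with the best cumulative probability is chosen as the result. Whisper only uses beam search when the temperature is 0.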

Lowering beam size gives faster computation but likely less accuracy, and the other way around.

In [None]:
# Compare beam sizes 9 and 2.
# temperature is set to 0.0 here: Whisper's transcribe() discards beam_size
# (and patience) whenever temperature > 0 and samples instead, so with the
# previous temperature=0.4 the beam size was never actually applied.

# increase to 9
result = model.transcribe(
    audioPath,
    fp16=False,
    task="transcribe",
    verbose=True,
    temperature=0.0,
    compression_ratio_threshold=5,
    logprob_threshold=-1.0,
    no_speech_threshold=0.5,
    beam_size=9,
    initial_prompt=initialPrompt,
)

# decrease to 2
result = model.transcribe(
    audioPath,
    fp16=False,
    task="transcribe",
    verbose=True,
    temperature=0.0,
    compression_ratio_threshold=5,
    logprob_threshold=-1.0,
    no_speech_threshold=0.5,
    beam_size=2,
    initial_prompt=initialPrompt,
)

# Observations recorded with the earlier temperature=0.4 runs (beam size not
# in effect there) -- TODO re-check against the temperature=0.0 output:
# as expected, higher accuracy with larger beam size
# beam size 9: "Så att den datamängden är ju, jag ser två utmaningar med den datamängden."
# beam size 2: "Jag ser två utmaningar med den datamängden."

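### Adjusting number of "best of" candidates during sampling
Default is 5 on the command line (`None` in the Python API). Determines the number of independent candidate sequences that are sampled when the temperature is above 0; the best-scoring candidate is kept. It is not used by beam search (temperature 0), where `beam_size` plays the corresponding role.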

Default is usually enough but for higher accuracy, with computational resources, it can be increased.

### Adjusting patience during beam search
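Default is 1.0, which is equivalent to conventional beam search. Patience controls how many finished sequence candidates beam search collects before it stops and selects the best sequence. The parameter is multiplicative: the limit is `beam_size * patience`, so setting it to 2.0 doubles the number of candidates considered compared to the default. It requires `beam_size` and, like beam search itself, only applies when the temperature is 0.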

Increased patience can lead to higher transcription quality, but the computational requirements need to be considered, especially for multiple or long audio files.

In [None]:
# Compare beam-search patience 1.0 (baseline) and 2.0.
# temperature is set to 0.0 here: Whisper's transcribe() discards beam_size
# and patience whenever temperature > 0, so with the previous temperature=0.4
# neither value was actually applied.

# baseline
result = model.transcribe(
    audioPath,
    fp16=False,
    task="transcribe",
    verbose=True,
    temperature=0.0,
    compression_ratio_threshold=5,
    logprob_threshold=-1.0,
    no_speech_threshold=0.5,
    beam_size=5,
    patience=1.0,
    initial_prompt=initialPrompt,
)

# double default
result = model.transcribe(
    audioPath,
    fp16=False,
    task="transcribe",
    verbose=True,
    temperature=0.0,
    compression_ratio_threshold=5,
    logprob_threshold=-1.0,
    no_speech_threshold=0.5,
    beam_size=5,
    patience=2.0,
    initial_prompt=initialPrompt,
)

# Observation recorded with the earlier temperature=0.4 runs (patience not in
# effect there) -- TODO re-check against the temperature=0.0 output:
# for this audio, not a large difference, some spelling variations of unusual words, slightly more verbatim with increased patience

### Adjusting length penalty
Default is `None`; when set, the value must be in the range [0, 1]. Adjusts the balance between the length of a candidate transcription and its probability when the decoded candidates are ranked.

Higher value encourages model to have more detailed and comprehensive transcriptions even if they are longer. Lets the model favor longer transcriptions.

Lower value is more appropriate for generating summaries or key points from the audio, eliminates filler words.

The default (`None`) is neutral in the sense that Whisper then applies simple length normalization: candidates are ranked by their average log probability per token, so neither shorter nor longer outputs are systematically favoured.

In [None]:
# Compare length_penalty 0.2 and 0.8.
# length_penalty only matters when several candidates are ranked against each
# other. With the previous temperature=0.4 Whisper dropped beam_size and, as
# no best_of is given, decoded a single sampled candidate, so the penalty had
# nothing to choose between. temperature=0.0 keeps beam_size=5 active.

# increase to 0.2
result = model.transcribe(
    audioPath,
    fp16=False,
    task="transcribe",
    verbose=True,
    temperature=0.0,
    compression_ratio_threshold=5,
    logprob_threshold=-1.0,
    no_speech_threshold=0.5,
    beam_size=5,
    patience=1.0,
    length_penalty=0.2,
    initial_prompt=initialPrompt,
)

# increase to 0.8
result = model.transcribe(
    audioPath,
    fp16=False,
    task="transcribe",
    verbose=True,
    temperature=0.0,
    compression_ratio_threshold=5,
    logprob_threshold=-1.0,
    no_speech_threshold=0.5,
    beam_size=5,
    patience=1.0,
    length_penalty=0.8,
    initial_prompt=initialPrompt,
)

# Observation recorded with the earlier temperature=0.4 runs (length penalty
# not in effect there) -- TODO re-check against the temperature=0.0 output:
# for this file, no differences really

### Suppress tokens
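Default is `-1`, which suppresses most special characters except common punctuation. The parameter takes a list of token IDs (not words) that the decoder must never emit. By suppressing the corresponding token IDs it can be used to avoid transcribing common noise words or specific phrases like "uh" or "um", or to control punctuation such as "." and ",".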

### Prefix
Default is none. A string that appears at the beginning of the transcription, e.g. "This is a meeting about" when the audio starts with "new guidelines". Mainly useful for starting a transcription in a specific context such as for audio chunks of meetings.