In [1]:
import whisper

from symspellpy import SymSpell, Verbosity

import re
import difflib

In [2]:
# Path to the audio file (mp3) that gets transcribed
music_file_path = "../../data/input/Godzilla.mp3"

# Path to the English frequency dictionary that SymSpell loads later on
dictionary_path = "../../data/input/frequency_dictionary_en_82_765.txt"

# 1. Transcribe mp3 directly

In [3]:
# Load Whisper's "base" model; the transcription cells below call it
base_model = whisper.load_model("base")

In [None]:
# Transcribe the audio file with Whisper's default options
result_1 = base_model.transcribe(music_file_path)

# Print the full transcript text
print(result_1["text"])

 Ugh, you're a monster I can swallow a bottle of alcohol and a feel like Godzilla Better hit the deck like the cartilla My whole squad's I'm here walking around the party Across between a zombie, a pocket of symbibombida Rain-hearing witches probably the same reason I wrestle with vanias, shadis and a bitch I'm posseed I'm considerate to cross me across the mistake If they sleep in armidum, hose better get insomnia A.D.H.D. hot-drocksie cut, that's the prophecy In dirt, in dirt, in dirt, in a K-me lay, in a set If they can play date, been a vacate, retreat Like a vacate, may day This beat is crick-ray, ray-chay, ha-chay, ha-chay Matt Fee all the way to the bank, got spring flames They cannot tame a play-cake, the monster You get in my way, I'm gonna feed you to the monster I'm no more during the day, but at night I do a monster When the moush eyes like ice roll truckers I look like a villain that ain't always blocked by stars Up to the fire, spit a monster But I'm the best on the Louis

# 2. Transcribe mp3 line by line

In [None]:
# Transcribe the audio file again, this time requesting word-level timestamps;
# the next cell reads each segment's "words" list ("word", "start", "end")
result_2 = base_model.transcribe(music_file_path, word_timestamps=True)

In [None]:
# Pause threshold (seconds)
pause_threshold = 0.1  # 100 milliseconds


def split_words_on_pauses(segments, pause_threshold):
    """Group word tokens into lines, breaking at long pauses.

    A new line is started whenever the gap between the end of one word and
    the start of the next one is larger than ``pause_threshold``. The gap is
    measured between consecutive words even when they belong to different
    segments, so a pause that falls on a segment boundary also breaks the line.

    Args:
        segments: Iterable of segment dicts. Each segment may contain a
            "words" list whose items provide "word", "start" and "end".
            Segments without "words" are skipped.
        pause_threshold: Minimum silence, in seconds, that triggers a new line.

    Returns:
        A list of strings, one per detected line.
    """
    lines = []
    current_line = []
    previous_end = None  # end time of the most recently consumed word

    for segment in segments:
        for word in segment.get("words", []):  # some segments may not have "words"
            # Close the running line before adding a word that follows a long pause
            if current_line and word["start"] - previous_end > pause_threshold:
                lines.append("".join(current_line))
                current_line = []

            current_line.append(word["word"])
            previous_end = word["end"]

    # Add the last line. Word tokens already carry their own leading space,
    # so they are concatenated without an extra separator (same as above).
    if current_line:
        lines.append("".join(current_line))

    return lines


# Build the lines from the word-level transcription
lines = split_words_on_pauses(result_2["segments"], pause_threshold)

# Print the lines
for line in lines:
    print(line)

 Ugh,
 you're a monster I can swallow a bottle of alcohol and a feel like Godzilla Better hit the deck like the cartilla My whole squad's I'm here walking around the party Across between a zombie,
 a pocket of symbibombida Rain-hearing witches probably the same reason I wrestle with vanias, shadis and a bitch I'm posseed I'm considerate to cross me across the mistake If they sleep in armidum, hose better get insomnia A.D.H.D. hot-drocksie cut,
 that's the prophecy And I keep a day, but an A.K.A. Me lafe in the city, like a play date Been a vacate,
 retreat,
 like a vacate, me day This beat is Craig Ray, J-A-J-A-J-H-J Matt Feet all the way to the bank, got sprang flames They cannot tame a play cake, they're monster You get in my way, I'mma feed you to the monster I'm no more during the day, but at night I do a monster When the moose shines like ice real chargers I look like a villain that a little blockbuster I'm too a fiabid, a monster Burdened and the little oevee carcass Fire,
 I'm t

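With this code, we start a new line whenever the pause between two consecutive words exceeds a given threshold (here 100 milliseconds). The example mp3 does not split cleanly because it is an Eminem track, presumably because the rapid delivery leaves very few pauses between words.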

# 3. Transcribe mp3 and correct misspellings with SymSpell

In [3]:
# Create the transcription with Whisper's base model
# NOTE(review): this repeats the model load and plain transcription from
# section 1 — presumably so this section can be run on its own; confirm
# before removing the duplication.
base_model = whisper.load_model("base")
result = base_model.transcribe(music_file_path)

## 3.1. Use Directly

In [4]:
# SymSpell spell checker (suggestions up to edit distance 2)
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load the English frequency dictionary (term in column 0, count in column 1).
# "utf-8-sig" reads plain UTF-8 and additionally drops a leading byte-order
# mark, which would otherwise be glued onto the first dictionary term.
# NOTE(review): a BOM on the first entry would explain why "the" gets
# "corrected" to "they" in the outputs of this notebook — confirm against
# the dictionary file.
# load_dictionary() reports a missing file by returning False, so fail loudly
# instead of silently running every lookup against an empty dictionary.
if not sym_spell.load_dictionary(
    dictionary_path, term_index=0, count_index=1, encoding="utf-8-sig"
):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")

True

In [6]:
# Spell-correct the raw transcript, one whitespace-separated token at a time
transcribed_text = result["text"]
corrected_words = []

for word in transcribed_text.split():
    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
    # Keep the original token when SymSpell offers no suggestion
    if suggestions:
        corrected_word = suggestions[0].term
    else:
        corrected_word = word
    corrected_words.append(corrected_word)

corrected_text = " ".join(corrected_words)

# Show the transcript before and after the correction
print(
    "\n--- WHISPER Transcript ---\n",
    transcribed_text,
    "\n--- After SYMSPELL Correction ---\n",
    corrected_text,
    sep="\n",
)



--- WHISPER Transcript ---

 Ugh, you're a monster I can swallow a bottle of alcohol and a feel like Godzilla Better hit the deck like the cartilla My whole squad's I'm here walking around the party Across between a zombie, a pocket of symbibombida Rain-hearing witches probably the same reason I wrestle with vanias, shadis and a bitch I'm posseed I'm considerate to cross me across the mistake If they sleep in armidum, hose better get insomnia A.D.H.D. hot-drocksie cut, that's the prophecy In dirt, in dirt, in dirt, in a K-me lay, in a set If they can play date, been a vacate, retreat Like a vacate, may day This beat is crick-ray, ray-chay, ha-chay, ha-chay Matt Fee all the way to the bank, got spring flames They cannot tame a play-cake, the monster You get in my way, I'm gonna feed you to the monster I'm no more during the day, but at night I do a monster When the moush eyes like ice roll truckers I look like a villain that ain't always blocked by stars Up to the fire, spit a monster 

In [9]:
# Break both texts into whitespace-separated words so that the diff is
# computed word by word (one word per diff entry)
transcribed_lines = transcribed_text.split()
corrected_lines = corrected_text.split()

# Unified diff from the original words to the corrected words
diff = difflib.unified_diff(transcribed_lines, corrected_lines, lineterm='')

# Print one diff entry per output line
print(*diff, sep='\n')

--- 
+++ 
@@ -1,8 +1,8 @@
-Ugh,
+ghz
 you're
 a
 monster
-I
+a
 can
 swallow
 a
@@ -13,27 +13,27 @@
 a
 feel
 like
-Godzilla
-Better
+godzilla
+better
 hit
-the
+they
 deck
 like
-the
-cartilla
-My
+they
+castilla
+by
 whole
-squad's
-I'm
+squads
+i'm
 here
 walking
 around
-the
+they
 party
-Across
+across
 between
 a
-zombie,
+zombie
 a
 pocket
 of
@@ -41,28 +41,28 @@
 Rain-hearing
 witches
 probably
-the
+they
 same
 reason
-I
+a
 wrestle
 with
-vanias,
-shadis
+vanish
+shades
 and
 a
 bitch
-I'm
-posseed
-I'm
+i'm
+posted
+i'm
 considerate
 to
 cross
 me
 across
-the
+they
 mistake
-If
+of
 they
 sleep
 in
@@ -73,237 +73,237 @@
 insomnia
 A.D.H.D.
 hot-drocksie
-cut,
+cut
 that's
-the
+they
 prophecy
-In
-dirt,
-in
-dirt,
-in
-dirt,
-in
-a
-K-me
-lay,
+in
+dirty
+in
+dirty
+in
+dirty
+in
+a
+home
+lay
 in
 a
 set
-If
+of
 they
 can
 play
-date,
+date
 been
 a
-vacate,
+vacated
 retreat
-Like
-a
-vacate,
+like
+a
+vacated
 may
 day
-This
+this
 beat
 is
 crick-ray,
 ray-chay,
 ha-ch

## 3.2. Preserve Punctuation

In [10]:
transcribed_text = result["text"]

# Tokenise into words (letters, digits, apostrophes) and single punctuation marks
tokens = re.findall(r"[\w']+|[.,!?;]", transcribed_text)

# Spell-correct the word tokens; punctuation tokens pass through untouched
corrected_tokens = []
for token in tokens:
    if not re.match(r"[\w']+", token):
        # Punctuation mark: keep as is
        corrected_tokens.append(token)
        continue

    suggestions = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=2)
    if suggestions:
        corrected_tokens.append(suggestions[0].term)
    else:
        # No suggestion available: keep the original word
        corrected_tokens.append(token)

# Re-join with single spaces, then pull punctuation back onto the preceding word
corrected_text = re.sub(r"\s+([.,!?;])", r"\1", " ".join(corrected_tokens))

print("\n--- WHISPER Transcript ---\n", transcribed_text, sep="\n")

print("\n--- After SYMSPELL Correction ---\n", corrected_text, sep="\n")


--- WHISPER Transcript ---

 Ugh, you're a monster I can swallow a bottle of alcohol and a feel like Godzilla Better hit the deck like the cartilla My whole squad's I'm here walking around the party Across between a zombie, a pocket of symbibombida Rain-hearing witches probably the same reason I wrestle with vanias, shadis and a bitch I'm posseed I'm considerate to cross me across the mistake If they sleep in armidum, hose better get insomnia A.D.H.D. hot-drocksie cut, that's the prophecy In dirt, in dirt, in dirt, in a K-me lay, in a set If they can play date, been a vacate, retreat Like a vacate, may day This beat is crick-ray, ray-chay, ha-chay, ha-chay Matt Fee all the way to the bank, got spring flames They cannot tame a play-cake, the monster You get in my way, I'm gonna feed you to the monster I'm no more during the day, but at night I do a monster When the moush eyes like ice roll truckers I look like a villain that ain't always blocked by stars Up to the fire, spit a monster 

We handled the punctuation issues, but there is still an uppercase issue.

## 3.3. Handle Uppercase Problem

In [11]:
transcribed_text = result["text"]

# Separate words and punctuation
tokens = re.findall(r"[\w']+|[.,!?;]", transcribed_text)

# Correct the words while preserving their capitalization
corrected_tokens = []
for token in tokens:
    if re.match(r"[\w']+", token):  # If it's a word, correct it
        # Look the word up in lowercase, because the lookup is case-sensitive
        suggestions = sym_spell.lookup(token.lower(), Verbosity.CLOSEST, max_edit_distance=2)
        corrected_word = suggestions[0].term if suggestions else token.lower()

        # Restore the original capitalization.
        # str.istitle() is False for words containing an apostrophe
        # ("I'm", "That's"), which would leave them lowercased, so the first
        # character is inspected instead.
        if len(token) > 1 and token.isupper():  # Whole word in capital letters
            corrected_word = corrected_word.upper()
        elif token[0].isupper():  # Only the first letter is capitalized
            corrected_word = corrected_word.capitalize()

        corrected_tokens.append(corrected_word)
    else:
        corrected_tokens.append(token)  # Leave punctuation as is

# Recombine the sentences
corrected_text = " ".join(corrected_tokens)

# Remove extra spaces (especially before punctuation)
corrected_text = re.sub(r"\s+([.,!?;])", r"\1", corrected_text)

# Result
print("\n--- WHISPER TRANSCRIPTION ---\n")
print(transcribed_text)

print("\n--- AFTER SYMSPELL CORRECTION (Capitalization Retained) ---\n")
print(corrected_text)



--- WHISPER TRANSCRIPTION ---

 Ugh, you're a monster I can swallow a bottle of alcohol and a feel like Godzilla Better hit the deck like the cartilla My whole squad's I'm here walking around the party Across between a zombie, a pocket of symbibombida Rain-hearing witches probably the same reason I wrestle with vanias, shadis and a bitch I'm posseed I'm considerate to cross me across the mistake If they sleep in armidum, hose better get insomnia A.D.H.D. hot-drocksie cut, that's the prophecy In dirt, in dirt, in dirt, in a K-me lay, in a set If they can play date, been a vacate, retreat Like a vacate, may day This beat is crick-ray, ray-chay, ha-chay, ha-chay Matt Fee all the way to the bank, got spring flames They cannot tame a play-cake, the monster You get in my way, I'm gonna feed you to the monster I'm no more during the day, but at night I do a monster When the moush eyes like ice roll truckers I look like a villain that ain't always blocked by stars Up to the fire, spit a monst

Even with the capitalization problem handled, SymSpell can still change correct words into wrong ones (e.g. the -> they).