In [1]:
#!pip install flashtext
#!pip install rapidfuzz
#!pip install sounddevice 
#!pip install torch 
#!pip install transformers
#!pip install scipy 
#!pip install numpy


In [1]:
import torch
import numpy as np
import sounddevice as sd
from scipy.signal import resample
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [2]:
from flashtext import KeywordProcessor
from rapidfuzz import process

In [3]:
# Device & dtype setup
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(torch_dtype)

cuda:0
torch.float16


### Load Whisper Large V3 Turbo
#### model from hugging face: 
##### (1)https://huggingface.co/openai/whisper-large-v3-turbo 
##### (2)https://huggingface.co/primeline/whisper-large-v3-turbo-german 

In [4]:
#model_id = "primeline/whisper-large-v3-turbo-german"
#model_id = "openai/whisper-large-v3-turbo"
#If you load the model from local Machine
#model_id = r"D:\ASR_Model\whisper-large-v3-turbo"
model_id = r"D:\ASR_Model\whisper-large-v3-turbo-german"

In [5]:
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    #low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=Tr

In [6]:
processor = AutoProcessor.from_pretrained(model_id)

In [7]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)



### Initialize flashtext KeywordProcessor

In [8]:

keyword_processor = KeywordProcessor()

### Custom vocabulary corrections / Add custom vocabulary

In [9]:

custom_vocab = {
    "*Blutdruck*": ["blut druck", "bluttruck", "blut druk", "blud druck"],
    "*Herzinfark*t": ["herz infarkt", "herz in fakt", "herzin fackt", "hertz infarkt"],
    "*Diabetes*": ["di abetes", "diabetis", "diabedes", "di abetis"],
    "*Asthma*": ["asma", "ast ma", "asth mar", "azma"],
    "*Krebs*": ["crebs", "crabs", "kreps", "kreppz"],
    "*Fieber*": ["fiba", "fiba", "fever", "fiebar"],
    "*Infektion*": ["infec tion", "in fektion", "infekshun", "infactshion"],
    "*Antibiotika*": ["antibiotika", "anti biotika", "antybiotika", "anti biotika"],
    "*Schmerzmittel*": ["schmertz mittel", "shmertzmittel", "smertz mittel", "schmerz mitl"],
    "*Notaufnahme*": ["not aufnahme", "noto fnahme", "notaufnahm", "note aufnahme"],
    "*Herzrhythmusstörung*": ["herz rhythmus störung", "hertz rithmus störung", "herz rytmus störung"],
    "*Lungenentzündung*": ["lungen entzuendung", "lungen entzündung", "lungen enzündung", "lungen entzundung"],
    "*Arzneimittel*": ["arz neimittel", "arznei mittel", "arznaimittel", "ars nai mittel"],
    "*Jet Engine*": ["jet injun", "judging", "jett in june","jet in june"],
    "*Quantum Bit*": ["quantum bet", "kwantum bit"],
    "*ChatGenie*": ["chat genie", "chat jeanie", "chat gini", "chatt ginny"],
    "*Foysal*": ["abdullah", "abdulla", "abdula"],
}

### Populate FlashText and a map for fuzzy matching

In [17]:

dialect_map = {}
for correct_term, variants in custom_vocab.items():
    for variant in variants:
        keyword_processor.add_keyword(variant.lower(), correct_term)
        dialect_map[variant.lower()] = correct_term

variant_list = list(dialect_map.keys())

# Fuzzy fallback function
def fuzzy_correction(text, threshold=90):
    words = text.split()
    corrected = []
    for word in words:
        if word in variant_list:
            corrected.append(dialect_map[word])
        else:
            match = process.extractOne(word, variant_list)
            if match and match[1] >= threshold:
                corrected.append(dialect_map[match[0]])
            else:
                corrected.append(word)
    return " ".join(corrected)

### Real-time streaming function

In [18]:

def real_time_transcribe(duration_chunk=3.0, input_rate=44100, model_rate=16000):
    buffer = []
    print("Start speaking...\n")

    def callback(indata, frames, time, status):
        if status:
            print("Status:", status)
        audio = indata[:, 0]  # mono
        resampled = resample(audio, int(len(audio) * model_rate / input_rate)).astype(np.float32)
        result = pipe(resampled, generate_kwargs={"language": "german", "task": "transcribe"})
        text = result.get("text", "").strip()

        if text:
            flashtext_corrected = keyword_processor.replace_keywords(text.lower())
            final_corrected = fuzzy_correction(flashtext_corrected)
            buffer.append(final_corrected)
            print(" ".join(buffer), end="\r")

    try:
        with sd.InputStream(callback=callback,
                            channels=1,
                            samplerate=input_rate,
                            blocksize=int(duration_chunk * input_rate),
                            dtype='float32'):
            while True:
                sd.sleep(1000)
    except KeyboardInterrupt:
        print("\nTranscription stopped.")
        print("\nFinal Transcript:\n", " ".join(buffer))



In [20]:
#Start real-time transcription
real_time_transcribe()

Start speaking...

beim telefonieren mein telefon klingelt. ich nehme den anruf an. *Diabetes* ist meine meine freundin maria. sie fragt, wie *Diabetes* mir geht. wir sprechen über unsere pläne für das wochenende. maria schlägt vor, ins kino zu gehen. wir nehmen zu *Lungenentzündung* wir vereinbaren eine zeit. verabschieden uns *Lungenentzündung* legen auch beim telefonieren
Transcription stopped.

Final Transcript:
 beim telefonieren mein telefon klingelt. ich nehme den anruf an. *Diabetes* ist meine meine freundin maria. sie fragt, wie *Diabetes* mir geht. wir sprechen über unsere pläne für das wochenende. maria schlägt vor, ins kino zu gehen. wir nehmen zu *Lungenentzündung* wir vereinbaren eine zeit. verabschieden uns *Lungenentzündung* legen auch beim telefonieren


### References
##### https://pypi.org/project/flashtext/
##### https://pypi.org/project/RapidFuzz/