In [1]:

import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer,WhisperFeatureExtractor
import soundfile as sf


model="ARTPARK-IISc/whisper-small-vaani-kannada"

# Load tokenizer and feature extractor individually
feature_extractor = WhisperFeatureExtractor.from_pretrained(model)
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Kannada", task="transcribe")


# Create the processor manually
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and model
model = WhisperForConditionalGeneration.from_pretrained(model).to(device)


In [26]:
# Load and preprocess the audio file
audio_file_path = "data/sample.wav"  # replace with your audio file path

# load audio
audio_data, sample_rate = sf.read(audio_file_path)
audio_data = audio_data[:10000]

print(sample_rate)
print(audio_data.size)
print(audio_data.dtype)
# Ensure the audio is 16kHz (Whisper expects 16kHz audio)
if sample_rate != 16000:
    import torchaudio
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000, dtype=torch.float64)
    # audio_data = resampler(torch.tensor(audio_data).unsqueeze(0)).squeeze().numpy()
    audio_data_tensor = torch.tensor(audio_data).double()
    print(audio_data_tensor.dtype)
    audio_data_tensor = audio_data_tensor.unsqueeze(0).squeeze().numpy()
    print(audio_data_tensor.shape)
    audio_data = resampler(torch.tensor(audio_data))


# Use the processor to prepare the input features
input_features = processor(audio_data, sampling_rate=16000, return_tensors="pt").input_features.to(device)

# Generate transcription (disable gradient calculation during inference)
with torch.no_grad():
    predicted_ids = model.generate(input_features)

# Decode the generated IDs into human-readable text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

print(transcription)

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


48000
10000
float64
torch.float64
(10000,)
ಮುಂದೇನೆಜ್‌ನ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತಿದೆ ಎಲ್ಲ ಕಾಣುತ


### ASR for kannada

In [1]:
# Kannada transcription
import torch
from transformers import pipeline
import soundfile as sf
import librosa

# path to the audio file to be transcribed
audio = "data/sample.wav"
audio, sampling_rate = librosa.load(audio, sr=16000)
# audio_file, sample_rate = sf.read(audio)
audio_file = audio[:10000]
print(audio_file.shape)
# device = "cuda:0" if torch.cuda.is_available() else "cpu"


# if sample_rate != 16000:
#     import torchaudio
#     resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000, dtype=torch.float64)
#     # audio_data = resampler(torch.tensor(audio_data).unsqueeze(0)).squeeze().numpy()
#     audio_data_tensor = torch.tensor(audio_file).double()
#     audio_data_tensor = audio_data_tensor.unsqueeze(0).squeeze().numpy()
#     audio_data = resampler(torch.tensor(audio_file))

# # audio_data = audio_data.numpy()

# # Use the processor to prepare the input features
# input_features = processor(audio_data, sampling_rate=16000, return_tensors="pt").input_features.to(device)
# input_features = input_features.numpy()
# print(input_features.shape)


transcribe = pipeline(task="automatic-speech-recognition", model="vasista22/whisper-kannada-tiny", chunk_length_s=30, device="cpu")
# transcribe.model.config.forced_decoder_ids = transcribe.tokenizer.get_decoder_prompt_ids(language="kn", task="transcribe")
# print(transcribe.model.config.forced_decoder_ids)

print('Transcription: ', transcribe(audio_file))

(10000,)


Device set to use cpu


IndexError: index -2 is out of bounds for dimension 0 with size 0