In [None]:
#  Install necessary libraries
!pip install transformers torch torchaudio pyaudio pydub huggingface_hub librosa soundfile

In [None]:
# I seems that some librareis are needed in order to install pyaudio.
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
!pip install PyAudio

In [9]:
# Pydub couldn't be installed in the above block, so I add it now here
!pip install pydub



In [7]:
# Import libraries
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer
import pyaudio
import wave
import librosa
import soundfile as sf
from pydub import AudioSegment
from IPython.display import Audio
import os


## Part 1: Loading model and evaluating

In [None]:
# Load the fine-tuned model from Hugging Face.
# Replace 'your-username/your-repo-name' with your Hugging Face model repository ID.
MODEL_ID = "your-username/your-repo-name"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor (combines feature extractor and tokenizer).
# Only the processor load is guarded by the try block: a failure to load the
# model itself is a different problem and should surface with its own
# traceback instead of being reported as a processor error and retried.
try:
    processor = WhisperProcessor.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading processor. Trying feature extractor and tokenizer separately: {e}")
    # Fallback: load tokenizer and feature extractor separately.
    # This is closer to the original training script's setup.
    tokenizer = WhisperTokenizer.from_pretrained(MODEL_ID, language="spanish", task="transcribe") # Or your target language e.g. "gn"
    feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_ID)
    # Build the processor from its parts so that later cells, which use
    # `processor`, work no matter which branch ran.
    processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    loaded_parts = "model, tokenizer, and feature extractor"
else:
    # Expose the individual components under the same names the fallback
    # branch defines, so both branches leave the same variables behind.
    tokenizer = processor.tokenizer
    feature_extractor = processor.feature_extractor
    loaded_parts = "model and processor"

model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)
print(f"Successfully loaded {loaded_parts} for {MODEL_ID}")

model.eval() # Set the model to evaluation mode

# Regarding the language used for generation:
# The training script used 'spanish' in model.generate().
# For best results with a Guarani fine-tuned model, you might want to specify Guarani.
# Whisper uses language codes (e.g., 'gn' for Guarani if your tokenizer was adapted or supports it).
# If you used 'spanish' as the target language token consistently during fine-tuning, you might stick to it.
# You can set this in the generation config or pass it to the generate function.
# Example:
# model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="gn", task="transcribe")
# Or, if the processor was loaded with Spanish as default and fine-tuning adapted it for Guarani without changing the language token:
# model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="spanish", task="transcribe")
# For now, this notebook assumes the loaded model's config is appropriate, or that you pass the language to generate/pipeline.

## Part 2. Getting audio as input

In [11]:
# Cell 4: Record audio for a fixed duration
# Records RECORD_SECONDS of mono 16-bit audio from the default input device,
# writes it to WAV_FILENAME, then converts it to MP3_FILENAME with pydub.
FORMAT = pyaudio.paInt16  # Audio format (16-bit PCM)
CHANNELS = 1              # Number of audio channels (1 for mono, 2 for stereo)
RATE = 16000              # Sample rate (Whisper expects 16kHz)
CHUNK = 1024              # Number of frames per buffer
RECORD_SECONDS = 5        # Duration of recording in seconds
WAV_FILENAME = "recorded_audio.wav"
MP3_FILENAME = "recorded_audio.mp3"

audio = pyaudio.PyAudio()
stream = None
frames = []

# The try/finally guarantees the stream and the PortAudio session are released
# even when opening the device or reading from it raises (e.g. no microphone).
try:
    # Start Recording
    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)

    print("Recording...")
    for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
        frames.append(stream.read(CHUNK))
    print("Finished recording.")
finally:
    # Stop Recording
    if stream is not None:
        stream.stop_stream()
        stream.close()
    audio.terminate()

# Save the recorded data as a WAV file. The context manager closes the file
# even if writing fails. The sample width comes from the module-level helper,
# so it does not depend on the already terminated PyAudio instance.
with wave.open(WAV_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(pyaudio.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))

print(f"Audio saved as {WAV_FILENAME}")

# Convert WAV to MP3 using pydub
try:
    sound = AudioSegment.from_wav(WAV_FILENAME)
    sound.export(MP3_FILENAME, format="mp3")
    print(f"Converted to {MP3_FILENAME}")
    # Display the audio player in the notebook
    display(Audio(MP3_FILENAME))
    # Optional: remove the intermediate WAV file. This is done last so the WAV
    # is still on disk whenever the error message below is printed.
    os.remove(WAV_FILENAME)
except Exception as e:
    print(f"Could not convert to MP3. Make sure ffmpeg is installed and in your PATH. Error: {e}")
    print("You can still use the WAV file for prediction if MP3 conversion fails, but ensure your prediction block loads the correct file type.")

OSError: [Errno -9996] Invalid input device (no default output device)

In [10]:
# Cell 5: Record audio on demand (press Enter, then record for a fixed duration)
# Stopping a recording with a key press is hard to do in a notebook (especially
# in environments like Colab without specific widgets); it would need threads
# or a GUI. The previous unbounded `while True` read loop could only be ended
# by interrupting the kernel and silently swallowed stream errors, so it has
# been removed. This cell uses the simpler approach: wait for Enter, then
# record for RECORD_SECONDS_MANUAL seconds.

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000 # Whisper expects 16kHz
CHUNK = 1024
RECORD_SECONDS_MANUAL = 10 # Duration of the recording in seconds
WAV_FILENAME_MANUAL = "recorded_audio_manual.wav"
MP3_FILENAME_MANUAL = "recorded_audio_manual.mp3"

print(f"Prepare to record. Press Enter to start recording for {RECORD_SECONDS_MANUAL} seconds.")
input() # Wait for user to press Enter

# The device is opened only after Enter is pressed, so nothing is captured
# (and no input buffer fills up) while waiting for the user.
audio = pyaudio.PyAudio()
stream = None
frames = []

# The try/finally guarantees the stream and the PortAudio session are released
# even when opening the device or reading from it raises.
try:
    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)

    print(f"Recording for {RECORD_SECONDS_MANUAL} seconds...")
    for _ in range(int(RATE / CHUNK * RECORD_SECONDS_MANUAL)):
        frames.append(stream.read(CHUNK))
    print("Finished recording.")
finally:
    if stream is not None:
        stream.stop_stream()
        stream.close()
    audio.terminate()

# Save the recorded data as a WAV file; the context manager closes the file
# even if writing fails.
with wave.open(WAV_FILENAME_MANUAL, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(pyaudio.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
print(f"Audio saved as {WAV_FILENAME_MANUAL}")

try:
    sound = AudioSegment.from_wav(WAV_FILENAME_MANUAL)
    sound.export(MP3_FILENAME_MANUAL, format="mp3")
    print(f"Converted to {MP3_FILENAME_MANUAL}")
    display(Audio(MP3_FILENAME_MANUAL))
    # Remove the WAV last, so the fallback below never points at a file that
    # has already been deleted.
    os.remove(WAV_FILENAME_MANUAL)
except Exception as e:
    print(f"Could not convert to MP3: {e}. Using WAV.")
    MP3_FILENAME_MANUAL = WAV_FILENAME_MANUAL # Fallback to WAV if MP3 fails

OSError: [Errno -9996] Invalid input device (no default output device)

## Part 3. Getting predictions with new model

In [None]:
# Cell 6: Make predictions with the recorded audio
# Ensure this uses the filename from the recording block you ran (e.g., MP3_FILENAME)
AUDIO_TO_TRANSCRIBE = MP3_FILENAME # Or MP3_FILENAME_MANUAL if you used that block

if not os.path.exists(AUDIO_TO_TRANSCRIBE):
    print(f"Audio file {AUDIO_TO_TRANSCRIBE} not found. Please record audio first.")
else:
    print(f"Transcribing {AUDIO_TO_TRANSCRIBE}...")
    # Load the audio file using librosa (handles resampling and conversion to mono float)
    # Whisper feature_extractor expects a 1D numpy array at 16kHz.
    speech_array, sampling_rate = librosa.load(AUDIO_TO_TRANSCRIBE, sr=16000, mono=True)

    # Preprocess the audio.
    # No `padding` argument is passed on purpose: the Whisper feature extractor
    # pads (or truncates) to the fixed 30-second window by default, which is
    # the input length the Whisper encoder requires. Passing padding=True pads
    # only to the longest clip in the batch, so a short recording produces a
    # shorter feature matrix that the model rejects.
    # If you loaded processor = WhisperProcessor.from_pretrained(MODEL_ID)
    inputs = processor(speech_array, sampling_rate=16000, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)
    # If you loaded feature_extractor and tokenizer separately:
    # input_features = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt").input_features.to(DEVICE)


    # Generate token IDs
    with torch.no_grad():
        # You might need to specify the language here if your model requires it for Guarani.
        # Example: generated_ids = model.generate(input_features, language="gn")
        # If your model was trained with "spanish" as the target token:
        # generated_ids = model.generate(input_features, language="spanish")
        # Or rely on the model's default config if set:
        generated_ids = model.generate(input_features)

    # Decode the token IDs to text
    # If you loaded processor:
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # If you loaded tokenizer separately:
    # transcription = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print("\nTranscription:")
    print(transcription)

    # Alternative: Using the pipeline (can be more convenient)
    # from transformers import pipeline
    # print("\nUsing pipeline for transcription:")
    # # If you loaded feature_extractor and tokenizer separately, ensure processor is created for pipeline
    # # if 'processor' not in locals():
    # # processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)

    # pipe = pipeline("automatic-speech-recognition",
    #                 model=model,
    #                 tokenizer=processor.tokenizer, # or your loaded tokenizer
    #                 feature_extractor=processor.feature_extractor, # or your loaded feature_extractor
    #                 device=DEVICE)
    # result = pipe(AUDIO_TO_TRANSCRIBE, generate_kwargs={"language": "gn"}) # or "spanish" or remove if config is set
    # print(result["text"])

## Part 4. Comparing model sizes

In [None]:
# Cell 7: Load a specific older version of the model (your 'tiny' version)
# Replace 'your-username/your-repo-name' with your Hugging Face model ID
# Replace 'commit_hash_of_tiny_version' with the actual commit hash from Hugging Face
MODEL_ID_FOR_COMPARISON = "your-username/your-repo-name"
COMMIT_HASH_TINY = "commit_hash_of_tiny_version" # <--- PASTE THE CORRECT COMMIT HASH HERE

print(f"Loading 'tiny' version from commit: {COMMIT_HASH_TINY}")

# Only the processor load is guarded by the try block: a failure to load the
# model itself should surface with its own traceback instead of being reported
# as a processor error and retried.
try:
    # Attempt to load with WhisperProcessor first
    processor_tiny = WhisperProcessor.from_pretrained(MODEL_ID_FOR_COMPARISON, revision=COMMIT_HASH_TINY)
except Exception as e:
    print(f"Error loading 'tiny' processor with revision: {e}. Trying tokenizer/feature_extractor separately.")
    # Fallback if processor loading fails for that revision
    tokenizer_tiny = WhisperTokenizer.from_pretrained(MODEL_ID_FOR_COMPARISON, revision=COMMIT_HASH_TINY)
    feature_extractor_tiny = WhisperFeatureExtractor.from_pretrained(MODEL_ID_FOR_COMPARISON, revision=COMMIT_HASH_TINY)
    # Build the processor from its parts so `processor_tiny` exists no matter
    # which branch ran.
    processor_tiny = WhisperProcessor(feature_extractor=feature_extractor_tiny, tokenizer=tokenizer_tiny)
    loaded_message_tiny = "Successfully loaded 'tiny' model, tokenizer, and feature_extractor using revision."
else:
    # Expose the individual components under the same names the fallback
    # branch defines, so both branches leave the same variables behind.
    tokenizer_tiny = processor_tiny.tokenizer
    feature_extractor_tiny = processor_tiny.feature_extractor
    loaded_message_tiny = "Successfully loaded 'tiny' model and processor using revision."

model_tiny = WhisperForConditionalGeneration.from_pretrained(MODEL_ID_FOR_COMPARISON, revision=COMMIT_HASH_TINY).to(DEVICE)
print(loaded_message_tiny)


model_tiny.eval()

# Now you have 'model_tiny' (and its processor/tokenizer/feature_extractor) loaded.
# You can compare its performance to your current 'model' (the 'small' version).
# For example, you could re-run the prediction cell (Cell 6) but use model_tiny:

# Example of using the loaded tiny model for prediction:
# if 'speech_array' in locals(): # Check if speech_array from previous prediction exists
#     inputs_tiny = processor_tiny(speech_array, sampling_rate=16000, return_tensors="pt").input_features.to(DEVICE)
#     # Or if using separate tokenizer/feature_extractor_tiny:
#     # inputs_tiny = feature_extractor_tiny(speech_array, sampling_rate=16000, return_tensors="pt").input_features.to(DEVICE)

#     with torch.no_grad():
#         generated_ids_tiny = model_tiny.generate(inputs_tiny) # Add language if needed
#     transcription_tiny = processor_tiny.batch_decode(generated_ids_tiny, skip_special_tokens=True)[0]
#     # Or with separate tokenizer_tiny:
#     # transcription_tiny = tokenizer_tiny.batch_decode(generated_ids_tiny, skip_special_tokens=True)[0]
#     print("\nTranscription from 'tiny' model:")
#     print(transcription_tiny)
# else:
#     print("Run audio recording and preprocessing first to compare 'tiny' model.")