In [None]:
"""
Step 1: Download the video and convert it to a WAV file
"""
!apt update
!apt install -y ffmpeg
!python -m pip install -U pytubefix

In [None]:
from pytubefix import YouTube
from pytubefix.cli import on_progress

# Source video and the location the download is written to.
url = "https://youtu.be/lnln7QkR30w"
folder, filename = "yt", "sample.mp4"

# on_progress is pytubefix's CLI progress callback.
yt = YouTube(url, on_progress_callback=on_progress)
print(yt.title)

# Save the highest-resolution stream as <folder>/<filename>.
ys = yt.streams.get_highest_resolution()
ys.download(output_path=folder, filename=filename)

In [None]:
!ffmpeg -i yt/sample.mp4 -acodec pcm_s16le -ar 16000 -ac 1 -y sample.wav

In [None]:
"""
Install whisperx + cudnn (needed for colab's 12.4 CUDA)
"""
!pip install whisperx
!apt install libcudnn8 libcudnn8-dev -y

In [None]:
"""
Step 2: Transcribe the audio file using WhisperX
"""

import os
import whisperx
import gc
import requests
import json

import torch

# os.environ["HF_TOKEN"] = ''

# --- Configuration ---
device = "cuda"
audio_file = "sample.wav"
batch_size = 4
compute_type = "float16"  # or "int8" for lower memory usage

# Each stage prints a short hint on failure and then re-raises. Calling exit()
# inside a notebook shuts the kernel down (losing all state) and hides the
# traceback; re-raising stops the cell and keeps the full error visible.

# --- WhisperX Transcription ---
print(f"Loading WhisperX model for '{compute_type}' computation on '{device}'...")
try:
    model = whisperx.load_model("large-v3", device, compute_type=compute_type)
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading WhisperX model: {e}")
    print("Please ensure you have the necessary dependencies and a compatible device (CUDA if 'device' is 'cuda').")
    raise

print(f"Loading audio from '{audio_file}'...")
try:
    audio = whisperx.load_audio(audio_file)
    print("Audio loaded successfully.")
except Exception as e:
    print(f"Error loading audio file: {e}")
    print("Please check if the audio file exists and is a valid .wav format.")
    raise

print("Starting transcription...")
try:
    result = model.transcribe(audio, batch_size=batch_size)
    print("Transcription complete.")

    # result["segments"] holds the per-segment text; join the segments into a
    # single string. Each piece is stripped first so that any leading/trailing
    # whitespace on a segment does not produce doubled spaces in the output.
    full_transcription = " ".join(
        segment["text"].strip() for segment in result["segments"]
    )
    print("\n--- Full Transcription ---")
    print(full_transcription)

except Exception as e:
    print(f"Error during transcription: {e}")
    print("This might be due to insufficient GPU memory, an issue with the audio file, or other system-level problems.")
    raise

# free up memory
del model
# del model_a # if you used align model
gc.collect()
# gc.collect() only drops the Python references; ask PyTorch to hand its
# cached CUDA blocks back to the driver as well.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
"""
Step 3: Clean the transcribed text using Ollama
"""
import requests

# Configuration
OLLAMA_ENDPOINT = "http://192.168.20.51:11434/api/generate"  # Update with your server URL
MODEL_NAME = "gemma3:27b"  # e.g., "mistral", "llama2", or your custom model

# ASR-generated text produced by the transcription step
asr_text = str(full_transcription)

# Construct the prompt with clear instructions
prompt = f"""
### Instruction:
Clean and correct the following ASR-generated text:
1. Add appropriate punctuation (periods, commas, question marks).
2. Fix any obvious speech recognition errors.
3. Preserve the original language it was spoken in (mostly persian and english).
4. Preserve all original content meaning.

### Input Text:
{asr_text}

### Cleaned Text:
"""

print(prompt)

# prompt.txt keeps a single-line copy with newlines written as the two
# characters "\n". Only the file gets this escaped form: the request below is
# serialized with `json=`, which already escapes newlines, so sending the
# escaped copy would put literal backslash-n sequences in front of the model.
escaped_prompt = prompt.replace('\n', '\\n')

# Explicit UTF-8 so non-ASCII (e.g. Persian) text is written the same way
# regardless of the platform's default encoding.
with open("prompt.txt", "w", encoding="utf-8") as prompt_file:
    prompt_file.write(escaped_prompt)

# Prepare the request payload
# NOTE(review): num_ctx caps the context window; a long transcript may exceed
# 4096 tokens and be truncated by the server - confirm for your inputs.
payload = {
    "model": MODEL_NAME,
    "prompt": prompt,
    "stream": False,  # Set to True if you want streaming response
    "options": {
        "temperature": 0.0,  # Low temperature for minimal creativity
        "num_ctx": 4096      # Context window size
    }
}

# Send request to Ollama
try:
    response = requests.post(
        OLLAMA_ENDPOINT,
        json=payload,
        headers={"Content-Type": "application/json"},
        timeout=300  # Increase timeout for longer texts
    )
    response.raise_for_status()

    # Extract cleaned text from response. A dedicated name is used so the
    # WhisperX `result` from the transcription step is not overwritten.
    ollama_result = response.json()
    cleaned_text = ollama_result.get("response", "").strip()

except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")
    # Re-raise: the text-to-speech step reads cleaned_text.txt, so continuing
    # after a failed request would use a missing or stale file.
    raise

# An empty answer would silently produce an empty cleaned_text.txt.
if not cleaned_text:
    raise ValueError("Ollama returned an empty response; cleaned_text.txt was not written.")

print("Cleaned Text Output:")
print(cleaned_text)

with open("cleaned_text.txt", "w", encoding="utf-8") as cleaned_file:
    cleaned_file.write(cleaned_text)

In [None]:
"""
Step 4: Convert the cleaned text to speech using Piper
"""
!pip install piper-tts

In [None]:
!cat cleaned_text.txt | piper \
  --update-voices \
  --model fa_IR-reza_ibrahim-medium \
  --output_file reza_ibrahim.wav