In [1]:
# Install ffmpeg
!apt-get install -qq ffmpeg

# Update packages to latest versions
!pip install --upgrade pip
!pip install --upgrade transformers datasets[audio] accelerate

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting datasets[audio]
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets[audio])
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting torchcodec>=0.6.0 (from datasets[audio])
  Downloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Downloading datasets-4.3.0-py3-none-any.whl (506 kB)
Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [1]:
import subprocess
from pathlib import Path

import torch
from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [2]:
import pandas as pd

# Read the table of recordings; the printed preview shows the columns
# diagnosis, ori_script, audio_download_url and audio_playable_url.
csv_name = 'capstone_audio.csv'
df = pd.read_csv(csv_name)

row_count = len(df)
print(f"Loaded {row_count} rows")

# Plain-text preview of the first rows
preview = df.head()
print(preview)

Loaded 6 rows
    diagnosis                                         ori_script  \
0        copd  Good morning, Ms. Sharma. How are you feeling ...   
1  gallstones  Good morning, please come in and take a seat. ...   
2   arthritis  Good morning, Mr. Taylor. Come in and have a s...   
3      anemia  Good morning, Ms. Sharma. I understand you've ...   
4         dka  Hello there, I'm Dr. Lee. I hear you haven't b...   

                                  audio_download_url  \
0  https://drive.google.com/uc?export=download&id...   
1  https://drive.google.com/uc?export=download&id...   
2  https://drive.google.com/uc?export=download&id...   
3  https://drive.google.com/uc?export=download&id...   
4  https://drive.google.com/uc?export=download&id...   

                                  audio_playable_url  
0  https://drive.google.com/file/d/1vfpynTJCOBDJ0...  
1  https://drive.google.com/file/d/1wOH5tSpnyclwn...  
2  https://drive.google.com/file/d/1ACJkQn7SZjcjP...  
3  https://drive.goo

In [3]:
# Use the first GPU in half precision when CUDA is available; otherwise fall
# back to the CPU in full precision.
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32

model_id = "openai/whisper-large-v3-turbo"

# `dtype` is the current keyword argument. The older `torch_dtype=` spelling
# triggers "`torch_dtype` is deprecated! Use `dtype` instead!" on the
# transformers release installed by this notebook.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor bundles the tokenizer and the audio feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline around the loaded model. Long recordings are
# handled in 30-second chunks (chunk_length_s) with batch_size=16.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=16,
    dtype=torch_dtype,
    device=device,
)

# Process each audio file: download it, convert it to WAV, transcribe it with
# the Whisper pipeline, and collect one transcript per row (in row order).
# A row that fails at any step contributes an empty string so that
# `transcripts` always stays aligned with `df`.
transcripts = []

for idx, row in df.iterrows():
    audio_url = row['audio_download_url']
    diagnosis = row['diagnosis']

    print(f"\nProcessing {idx+1}/{len(df)}: {diagnosis}")

    # Temporary files for this row
    audio_filename = f"audio_{idx}.m4a"
    wav_filename = f"audio_{idx}.wav"
    try:
        # Download audio file. Argument lists avoid passing the CSV-supplied
        # URL through a shell, and check=True turns a failed download into an
        # exception (a `!wget` failure would go unnoticed here).
        subprocess.run(["wget", "-q", audio_url, "-O", audio_filename], check=True)

        # Convert to wav (-y: overwrite an existing output, -loglevel quiet: no logs)
        subprocess.run(
            ["ffmpeg", "-y", "-loglevel", "quiet", "-i", audio_filename, wav_filename],
            check=True,
        )

        # Transcribe (return_timestamps=True for long audio)
        result = pipe(wav_filename, return_timestamps=True)

        transcript = result["text"]
        print(f"Completed: {len(transcript)} characters")

    except Exception as e:
        # Best effort: report the problem and keep going with the next row
        print(f"Error: {e}")
        transcript = ""

    finally:
        # Clean up intermediate files whether or not the row succeeded
        Path(audio_filename).unlink(missing_ok=True)
        Path(wav_filename).unlink(missing_ok=True)

    transcripts.append(transcript)

# Attach the transcripts as a new column (one entry per row, in row order)
df = df.assign(wlv3t_transcript=transcripts)

# Write back to the same CSV the data was loaded from (no index column)
df.to_csv('capstone_audio.csv', index=False)

print("Done! Updated capstone_audio.csv")

column_names = list(df.columns)
print(f"Columns: {column_names}")

# Preview the script next to the new transcript for the first rows
preview_columns = ['diagnosis', 'audio_download_url', 'ori_script', 'wlv3t_transcript']
print(df[preview_columns].head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0



Processing 1/6: copd


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


Completed: 4970 characters

Processing 2/6: gallstones
Completed: 4671 characters

Processing 3/6: arthritis
Completed: 4358 characters

Processing 4/6: anemia
Completed: 5087 characters

Processing 5/6: dka
Completed: 4228 characters

Processing 6/6: gerd
Completed: 3883 characters
Done! Updated capstone_audio.csv
Columns: ['diagnosis', 'ori_script', 'audio_download_url', 'audio_playable_url', 'wlv3t_transcript']
    diagnosis                                 audio_download_url  \
0        copd  https://drive.google.com/uc?export=download&id...   
1  gallstones  https://drive.google.com/uc?export=download&id...   
2   arthritis  https://drive.google.com/uc?export=download&id...   
3      anemia  https://drive.google.com/uc?export=download&id...   
4         dka  https://drive.google.com/uc?export=download&id...   

                                          ori_script  \
0  Good morning, Ms. Sharma. How are you feeling ...   
1  Good morning, please come in and take a seat. ...   
2  Go

In [4]:
!pip install jiwer
import jiwer
import pandas as pd
from pathlib import Path

# Path to your CSV
csv_path = Path.cwd() / "capstone_audio.csv"

# Load CSV
df = pd.read_csv(csv_path)

print(f"Loaded {len(df)} rows")
print(f"Columns: {list(df.columns)}")

# Create a jiwer transformation: strip punctuation, lowercase, collapse repeated
# spaces and trim, so scoring is not affected by punctuation, casing or spacing.
transformation = jiwer.Compose([
    jiwer.RemovePunctuation(),
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip()
])

# Normalize all references (ori_script) and hypotheses (wlv3t_transcript).
# A failed transcription is saved as "" and is read back from the CSV as NaN;
# str(NaN) would turn it into the literal word "nan" and distort its WER, so
# missing hypotheses are mapped back to the empty string first.
refs = [transformation(str(r)) for r in df["ori_script"]]
hyps = [transformation(str(h)) for h in df["wlv3t_transcript"].fillna("")]

# Word error rate per recording: normalized reference vs. normalized hypothesis
wer_scores = [
    jiwer.wer(reference, hypothesis)
    for reference, hypothesis in zip(refs, hyps)
]

# Keep each score next to the row it belongs to
df['wer'] = wer_scores

# Unweighted mean of the per-recording scores
average_wer = sum(wer_scores) / len(wer_scores)

print("\n=== RESULTS ===")
print("Model: openai/whisper-large-v3-turbo")
print("\nIndividual WER scores:")
for diagnosis_label, score in zip(df['diagnosis'], df['wer']):
    print(f"  {diagnosis_label}: {score:.4f}")
print(f"\nAverage WER: {average_wer:.4f}")

# Write the table, now including the 'wer' column, to a separate CSV
output_path = "capstone_audio_with_wer.csv"
df.to_csv(output_path, index=False)

print(f"\n Saved CSV with WER scores to: {output_path}")
column_names = list(df.columns)
print(f"Columns: {column_names}")

# Two-column overview: one WER value per diagnosis
print("\n=== SUMMARY TABLE ===")
summary_columns = ['diagnosis', 'wer']
print(df[summary_columns])

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m107.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.14.3
Loaded 6 rows
Columns: ['diagnosis', 'ori_script', 'audio_download_url', 'audio_playable_url', 'wlv3t_transcript']

=== RESULTS ===
Model: openai/whisper-large-v3-turbo

Individual WER scores:
  copd: 0.1083
  gallstones: 0.2437
  arthritis: 0.1203
  anemia: 0.1508
  dka: 0.1611
  gerd: 0.0363

Average WER: 0.1368

 Saved CSV with WER scores to: capstone_audio_with_wer.csv
Columns: ['diagnosis', 'or

In [5]:
import jiwer
import pandas as pd
from pathlib import Path

# Read the CSV that already carries the per-recording WER scores
csv_path = "capstone_audio_with_wer.csv"
df = pd.read_csv(csv_path)

row_count = len(df)
column_names = list(df.columns)
print(f"Loaded {row_count} rows")
print(f"Columns: {column_names}")

# Normalization pipeline: drop punctuation, lowercase, collapse repeated
# spaces, then trim leading/trailing whitespace (applied in this order)
normalization_steps = [
    jiwer.RemovePunctuation(),
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
]
transformation = jiwer.Compose(normalization_steps)

# Prepare list to store alignment visualizations (one string per row, in row order)
alignment_visualizations = []

# Loop through all audio files
for idx, row in df.iterrows():
    diagnosis = row["diagnosis"]
    ref = transformation(str(row["ori_script"]))

    # An empty transcript is read back from the CSV as NaN; str(NaN) would align
    # the reference against the literal word "nan", so treat it as empty text.
    raw_hyp = row["wlv3t_transcript"]
    hyp = transformation("" if pd.isna(raw_hyp) else str(raw_hyp))

    print(f"Processing {idx+1}/{len(df)}: {diagnosis}")

    # Generate sentence-level word alignment (single reference/hypothesis pair)
    alignment = jiwer.process_words([ref], [hyp])
    alignment_str = jiwer.visualize_alignment(alignment)

    # Append to list
    alignment_visualizations.append(alignment_str)

# Store each row's alignment text in a new column
df = df.assign(alignment_visualization=alignment_visualizations)

# Write the extended table into the current working directory
output_path = Path.cwd().joinpath("capstone_audio_alignment_large.csv")
df.to_csv(output_path, index=False)

print(f"\n✓ Saved sentence-level alignment to {output_path}")


Loaded 6 rows
Columns: ['diagnosis', 'ori_script', 'audio_download_url', 'audio_playable_url', 'wlv3t_transcript', 'wer']
Processing 1/6: copd
Processing 2/6: gallstones
Processing 3/6: arthritis
Processing 4/6: anemia
Processing 5/6: dka
Processing 6/6: gerd

✓ Saved sentence-level alignment to /content/capstone_audio_alignment_large.csv
