In [21]:
import os

In [2]:
# Work from the whisper.cpp checkout so the relative paths used below
# (./main, models/, samples/) resolve. Set the WHISPER_CPP_DIR environment
# variable to point at your own checkout; the default is the original author's path.
os.chdir(os.environ.get('WHISPER_CPP_DIR', '/Users/carlos.salas/Documents/vista_hackathon_2024/whisper.cpp/'))

In [3]:
# Sanity check: show the kernel's current working directory after the chdir
os.getcwd()

'/Users/carlos.salas/Documents/vista_hackathon_2024/whisper.cpp'

## Getting and downloading models

_____
### For the base ggml model in English, we can run:
```bash 
./models/download-ggml-model.sh base.en
```

#### This downloads the model into the models folder, and we can use it with:
```bash
./main -m models/ggml-base.en.bin -f samples/jfk.wav
```
____
### For [Quantized models](https://github.com/ggerganov/whisper.cpp?tab=readme-ov-file#quantization), we can run:
```bash
make quantize
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
```

#### This writes the quantized model to the models folder (nothing is downloaded), and we can use it with:
```bash
./main -m models/ggml-base.en-q5_0.bin samples/jfk.wav
```
____
### For CoreML Models we can run:
```bash
./models/generate-coreml-model.sh base.en
```

Then run:
```bash
make clean
WHISPER_COREML=1 make -j
```

#### This generates the Core ML model in the models folder (nothing is downloaded); we still pass the regular model file to `main`:
```bash
./main -m models/ggml-base.en.bin -f samples/jfk.wav
```

### Available models (under models folder)
- ggml-base.en-q5_0.bin
- ggml-base.en.bin 
    * Note: this will be the Core ML-optimized one if you ran the Core ML steps.




# Test models

In [10]:
# Base model: transcribe the bundled JFK sample with the English base ggml model
!./main -m models/ggml-base.en.bin -f samples/jfk.wav

whisper_init_from_file_with_params_no_state: loading model from 'models/ggml-base.en.bin'
whisper_init_with_params_no_state: use gpu    = 1
whisper_init_with_params_no_state: flash attn = 0
whisper_init_with_params_no_state: gpu_device = 0
whisper_init_with_params_no_state: dtw        = 0
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51864
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 512
whisper_model_load: n_audio_head  = 8
whisper_model_load: n_audio_layer = 6
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 512
whisper_model_load: n_text_head   = 8
whisper_model_load: n_text_layer  = 6
whisper_model_load: n_mels        = 80
whisper_model_load: ftype         = 1
whisper_model_load: qntvr         = 0
whisper_model_load: type          = 2 (base)
whisper_model_load: adding 1607 extra tokens
whisper_model_load: n_langs       = 99
whisper_model_load:    Metal total size =   147.37 MB
whisper_model_load: mod

In [11]:
# Test the quantized model
!./main -m models/ggml-base.en-q5_0.bin samples/jfk.wav

whisper_init_from_file_with_params_no_state: loading model from 'models/ggml-base.en-q5_0.bin'
whisper_init_with_params_no_state: use gpu    = 1
whisper_init_with_params_no_state: flash attn = 0
whisper_init_with_params_no_state: gpu_device = 0
whisper_init_with_params_no_state: dtw        = 0
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51864
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 512
whisper_model_load: n_audio_head  = 8
whisper_model_load: n_audio_layer = 6
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 512
whisper_model_load: n_text_head   = 8
whisper_model_load: n_text_layer  = 6
whisper_model_load: n_mels        = 80
whisper_model_load: ftype         = 8
whisper_model_load: qntvr         = 2
whisper_model_load: type          = 2 (base)
whisper_model_load: adding 1607 extra tokens
whisper_model_load: n_langs       = 99
whisper_model_load:    Metal total size =    54.71 MB
whisper_model_load

### Using [CoreML](https://github.com/ggerganov/whisper.cpp?tab=readme-ov-file#core-ml-support)

In [22]:
# One-time setup, left commented out so re-running the notebook does not reinstall.
# When enabling these in a notebook, prefer `%pip install ...` over `!pip install ...`
# so the packages are installed into the kernel's own environment.
# !pip install ane_transformers
# !pip install openai-whisper
# !pip install coremltools
# !pip install sounddevice

In [34]:
# Generates the Core ML version of the base.en model (commented out after the one-time run).
# NOTE(review): the recorded output ends with
#   xcrun: error: unable to find utility "coremlc", not a developer tool or in PATH
# so the final compile step did not run — presumably a full Xcode install is required; confirm.
# !./models/generate-coreml-model.sh base.en

Torch version 2.3.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.2.0 is the most recent version that has been tested.
100%|████████████████████████████████████████| 139M/139M [06:43<00:00, 360kiB/s]
ModelDimensions(n_mels=80, n_audio_ctx=1500, n_audio_state=512, n_audio_head=8, n_audio_layer=6, n_vocab=51864, n_text_ctx=448, n_text_state=512, n_text_head=8, n_text_layer=6)
  assert x.shape[1:] == self.positional_embedding.shape[::-1], "incorrect audio shape"
  assert inputs.size(1) == self.num_channels
  scale = float(dim_per_head)**-0.5
Converting PyTorch Frontend ==> MIL Ops: 100%|▉| 829/830 [00:00<00:00, 9768.91 o
Running MIL frontend_pytorch pipeline: 100%|█| 5/5 [00:00<00:00, 145.40 passes/s
Running MIL default pipeline: 100%|████████| 78/78 [00:01<00:00, 53.57 passes/s]
Running MIL backend_mlprogram pipeline: 100%|█| 12/12 [00:00<00:00, 204.53 passe
done converting
xcrun: error: unable to find utility "coremlc", not a developer tool or in PATH

In [None]:
# Test the coreML model
!./main -m models/ggml-base.en-q5_0.bin samples/jfk.wav

## Testing real time

In [29]:
import tempfile
import sounddevice as sd
import subprocess
import wave

In [19]:
def transcribe_to_txt(input_filename: str, output_filename: str):
    """Transcribe an audio file to a text file using the whisper.cpp ``./main`` binary.

    Runs ``./main -f <input_filename> -otxt -of <output_filename> -np`` relative to
    the current working directory, so the kernel must be running from the
    whisper.cpp checkout. No ``-m`` flag is passed, so the binary's default model
    is used.

    Args:
        input_filename: Path of the audio file to transcribe.
        output_filename: Output path handed to ``-of``. Callers in this notebook
            read the transcript from ``output_filename + '.txt'``.

    Raises:
        RuntimeError: If the whisper process exits with a non-zero status. The
            message includes the captured stderr so the failure is not silent.
        FileNotFoundError: If ``./main`` does not exist (raised by ``subprocess.run``).
    """
    print('Running whisper transcription...')
    # Compose the command of all components
    command = ['./main', '-f', input_filename, '-otxt', '-of', output_filename, '-np']

    # Execute the command, capturing output so errors can be reported
    result = subprocess.run(command, capture_output=True, text=True)

    # Surface failures instead of silently discarding the result
    if result.returncode != 0:
        raise RuntimeError(
            f'whisper transcription failed with exit code {result.returncode}: '
            f'{result.stderr.strip()}'
        )

In [33]:

# Sanity check: ./main and the temp files below are resolved relative to this directory
os.getcwd()

'/Users/carlos.salas/Documents/vista_hackathon_2024/whisper.cpp'

In [30]:
def callback(indata, frames, time, status):
    """Input-stream callback: save one block of audio, transcribe it, and print the text.

    Args:
        indata: The recorded audio block, written as-is to the WAV file.
            Assumes 16-bit mono samples at 16 kHz, matching the WAV header
            set below and the stream settings used in this notebook.
        frames: Number of frames in the block (unused).
        time: Timing information supplied by the stream (unused).
        status: Stream status flags; printed when set.

    Raises:
        Whatever ``transcribe_to_txt`` raises, or ``FileNotFoundError`` if no
        transcript file was produced. The transcript file is removed in either case.
    """
    # Report (print, not raise) any stream status flags
    if status:
        print(status)

    # Create a tempfile to save the audio to, with autodeletion
    with tempfile.NamedTemporaryFile(delete=True, suffix='.wav', prefix='audio_', dir='.') as tmpfile:
        # Save this block of audio to a .wav file
        with wave.open(tmpfile.name, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono audio
            wav_file.setsampwidth(2)  # 16-bit audio
            wav_file.setframerate(16000)  # Sample rate
            wav_file.writeframes(indata)

        # Strip only the trailing extension; str.replace would also rewrite any
        # other '.wav' occurring earlier in the path
        output_filename = os.path.splitext(tmpfile.name)[0]
        transcript_path = output_filename + '.txt'

        try:
            # Transcribe the audio to text using our whisper.cpp wrapper
            transcribe_to_txt(tmpfile.name, output_filename)

            # Print the transcribed text
            with open(transcript_path, 'r') as file:
                print(file.read())
        finally:
            # Clean up the transcript even if transcription or reading failed
            if os.path.exists(transcript_path):
                os.remove(transcript_path)


In [31]:
# Audio capture settings: the recording is meant to be processed in fixed-length chunks
samplerate = 16000  # samples per second
buffer_size_seconds = 5  # intended chunk length, in seconds
buffer_size = samplerate * buffer_size_seconds  # intended chunk length, in samples

In [32]:
# Start recording
try:
    # blocksize is measured in frames (samples), not seconds, so pass the
    # 5-second buffer expressed in samples. Passing buffer_size_seconds here
    # would invoke the callback every 5 samples.
    with sd.InputStream(callback=callback, dtype='int16', channels=1, samplerate=samplerate, blocksize=buffer_size):
        print("Recording... Press Ctrl+C to stop.")
        while True:
            # Sleep between checks instead of busy-waiting on the main thread
            sd.sleep(100)
except KeyboardInterrupt:
    print('Recording stopped.')

Recording... Press Ctrl+C to stop.
Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisper transcription...

Running whisp

## Now with speaker diarization

### First we will pull the diarize model using this:
```bash
./models/download-ggml-model.sh small.en-tdrz
```

In [37]:
# Download the small.en-tdrz (diarization) model using the whisper.cpp download script
!./models/download-ggml-model.sh small.en-tdrz

Downloading ggml model small.en-tdrz from 'https://huggingface.co/akashmjn/tinydiarize-whisper.cpp' ...
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1228  100  1228    0     0  10828      0 --:--:-- --:--:-- --:--:-- 10867
100  465M  100  465M    0     0  1531k      0  0:05:10  0:05:10 --:--:-- 1531kk      0  0:05:56  0:00:14  0:05:42 1413k      0  0:05:31  0:00:39  0:04:52 1217k:05:20  0:00:51  0:04:29 1632k2k      0  0:05:21  0:00:53  0:04:28 1578k0:04:25 1410k 0  0:05:17  0:01:02  0:04:15 1660k:17  0:01:15  0:04:02 1627k1495k      0  0:05:18  0:01:30  0:03:48 1485k  1499k      0  0:05:17  0:01:39  0:03:38 1625k  0:05:14  0:01:59  0:03:15 1670k0:02:08  0:03:08 1407k0     0  1513k      0  0:05:14  0:02:54  0:02:20 1527k0  0:05:14  0:02:56  0:02:18 1483k 1472k   0     0  1512k      0  0:05:14  0:03:22  0:01:52 1331k      0  0:05:14  0:03:23  0:01:51 1391k 0  1513k      

In [None]:
!./main -m /Users/carlos.salas/Documents/vista_hackathon_2024/whisper.cpp/models/ggml-small.en-tdrz.bin -f samples/jfk.wavgit