In [None]:
#!pip install whisper-timestamped

In [None]:
import whisper_timestamped as whisper
import torch

# Folder of a locally fine-tuned Whisper checkpoint. In the cells shown it is
# referenced only by the commented-out load_model call below.
fine_tuned_model_path = ".\\whisper-ft\\" 

# Alternative: load the fine-tuned checkpoint instead of the stock model.
#model = whisper.load_model(fine_tuned_model_path, device="cuda" if torch.cuda.is_available() else "cpu")
# Load the 'openai/whisper-base' model, on the GPU when CUDA is available and
# on the CPU otherwise. `model` is a notebook-level global that
# transcribe_processed_audio() reads directly.
model = whisper.load_model('openai/whisper-base', device='cuda' if torch.cuda.is_available() else 'cpu')

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
import pandas as pd
import re

def remove_punctuation(s):
    '''
    Normalise a piece of text for word matching.
    Keeps only ASCII letters, digits and whitespace characters (everything
    else, e.g. apostrophes and commas, is dropped) and lower-cases the result.
    '''
    kept_chars = re.findall(r'[a-zA-Z0-9\s]', s)
    cleaned = ''.join(kept_chars)
    return cleaned.lower()

def transcribe_processed_audio(processed_audio_file):
    '''
    Transcribes an audio file word by word.
    Assumes the whisper model has already been loaded into the notebook-level
    variable `model`.
    Input is a *processed* audio path (mono, 16kHz, .wav)
    Returns a pandas dataframe with one row per transcribed word and the
    columns 'word', 'start', 'end' and 'prob'
    '''
    print(f'Transcribing audio at {processed_audio_file} ...\n')
    waveform = whisper.load_audio(processed_audio_file)
    transcription = whisper.transcribe(model, waveform, language="en")

    # Flatten segments -> words; every word becomes one
    # [cleaned text, start time, end time, confidence] row.
    rows = [
        [
            remove_punctuation(entry['text'].strip()),
            float(entry['start']),
            float(entry['end']),
            entry['confidence'],
        ]
        for segment in transcription["segments"]
        for entry in segment['words']
    ]

    return pd.DataFrame(rows, columns=['word', 'start', 'end', 'prob'])

In [None]:
import demucs.separate
from pydub import AudioSegment
import os

def create_vocals_stem(audio_path, remove_stems=True):
    '''
    Creates the vocals stem in the appropriate format for Whisper (16kHz, mono, .wav)
    Inputs:
        audio_path: path (relative or absolute) of the song to process
        remove_stems: when True (default), deletes the intermediate Demucs
            stems and their folder once the processed file has been written
    Outputs:
        processed_file_path: absolute path of "<name>-vocals-processed.wav",
            written in the same folder as the input file
    '''
    # Single source of truth for the Demucs model: it is passed to "-n" and
    # also names the sub-folder the stems are read back from.
    demucs_model = "mdx_extra"

    # Define path names
    print(f'Preprocessing audio at {audio_path} ...\n')
    audio_path = os.path.abspath(audio_path)
    song_dir = os.path.dirname(audio_path)

    root = os.path.splitext(os.path.basename(audio_path))[0]

    # Stems are expected under "separated/<model>/<track name>/", relative to
    # the current working directory.
    demucs_path = os.path.abspath(os.path.join("separated", demucs_model, root))
    vocals_path = os.path.join(demucs_path, "vocals.wav")
    no_vocals_path = os.path.join(demucs_path, "no_vocals.wav")

    # Split vocals
    demucs.separate.main(["--two-stems", "vocals", "-n", demucs_model, audio_path])

    audio = AudioSegment.from_file(vocals_path)

    # Convert to mono / 16kHz only when the stem is not already in that format
    if audio.channels > 1:
        audio = audio.set_channels(1)

    if audio.frame_rate != 16000:
        audio = audio.set_frame_rate(16000)

    # os.path.join keeps this portable; a hard-coded "\\" separator would end
    # up inside the file name on non-Windows systems.
    output_path = os.path.abspath(os.path.join(song_dir, f"{root}-vocals-processed.wav"))
    audio.export(output_path, format="wav")
    print(f'Vocals stems extracted and converted to 16kHz mono.\nAudio saved at {output_path}\n')

    # Clean up files (os.rmdir only succeeds once the folder is empty)
    if remove_stems:
        os.remove(vocals_path)
        os.remove(no_vocals_path)
        os.rmdir(demucs_path)

    return output_path

In [None]:
# End-to-end run on a single song: isolate the vocals, then transcribe them
# word by word into `df`.
# NOTE(review): the input path is written with Windows separators - adjust it
# before running on another platform.
processed_path = create_vocals_stem('.\\blah\\balance.mp3')
df = transcribe_processed_audio(processed_path)

Preprocessing audio at .\blah\balance.mp3 ...

Selected model is a bag of 4 models. You will see that many progress bars per track.
Separated tracks will be stored in C:\Users\dacla\Documents\auto-censoring-local\separated\mdx_extra
Separating track c:\Users\dacla\Documents\auto-censoring-local\blah\balance.mp3


100%|███████████████████████████████████████████████████████████████████████| 264.0/264.0 [00:02<00:00, 100.10seconds/s]
100%|███████████████████████████████████████████████████████████████████████| 264.0/264.0 [00:02<00:00, 101.47seconds/s]
100%|███████████████████████████████████████████████████████████████████████| 264.0/264.0 [00:02<00:00, 100.92seconds/s]
100%|███████████████████████████████████████████████████████████████████████| 264.0/264.0 [00:02<00:00, 105.23seconds/s]


Vocals stems extracted and converted to 16kHz mono.
Audio saved at c:\Users\dacla\Documents\auto-censoring-local\blah\balance-vocals-processed.wav

Transcribing audio at c:\Users\dacla\Documents\auto-censoring-local\blah\balance-vocals-processed.wav ...



100%|██████████| 25738/25738 [00:09<00:00, 2606.53frames/s]


In [46]:
# Display the whole word-level table: cap each column at 60 characters and
# lift the row limit so no rows are elided.
# Note: pd.set_option changes these settings globally, so they also apply to
# every later cell.
pd.set_option('display.max_colwidth', 60)
pd.set_option('display.max_rows', None)
df

Unnamed: 0,word,start,end,prob
0,they,5.9,5.92,0.16
1,love,5.92,6.2,0.615
2,the,6.2,6.44,0.993
3,taste,6.44,6.72,0.985
4,of,6.72,6.96,0.995
5,blood,6.96,7.36,0.992
6,i,7.9,8.18,0.393
7,dont,8.18,8.32,0.988
8,know,8.32,8.46,0.986
9,what,8.46,8.58,0.989
