In [1]:
import pandas as pd
from preprocessing import read_srt_in_memory, extract_tokens_with_sentences, isolate_speech
from audio_complexity import extract_complexity, improve_timesteps
from filter_in_out import mark_non_english_in_df, mark_notes_in_df, mark_excluded_words, mark_numbers_in_df
from translation import batch_translate_and_align
from feature_extraction import FeatureExtractor
from model import predict_with_bias, BinaryClassifier
from subtitle_generation import create_srt_file
from utils import device
from types import SimpleNamespace
import torch
import wandb
import time

Using device: cuda


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model only once
api = wandb.Api()
# Fetch the training run so its stored config can be reused to rebuild the network.
run = api.run("/humorless5218-gymnasium-berchtesgaden/Intelligent Subtitles Simple NN 5/swdvym3w")
# NOTE(review): this rebinds the `config` attribute of the wandb module to a plain
# namespace built from the run's config; the same object is passed to the classifier below.
wandb.config = SimpleNamespace(**run.config)
# Uncomment to download the checkpoint; the load below expects 'best_model.pth'
# to already exist in the working directory.
#run.file("best_model.pth").download(replace=True)
model = BinaryClassifier(input_features=5, config=wandb.config).to(device)
model.load_state_dict(torch.load('best_model.pth', map_location=device))
# Switch to inference mode (the printed architecture contains Dropout and BatchNorm layers).
model.eval()

BinaryClassifier(
  (model): Sequential(
    (0): Linear(in_features=5, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.14279874595232844, inplace=False)
    (4): Linear(in_features=64, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.14279874595232844, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.14279874595232844, inplace=False)
    (12): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [3]:
# Base name of the clip; later cells build derived paths from it
# (e.g. "data/" + name + "_vocals.wav").
name = 'boys2'
reference_file = 'data/boys2.srt'  # subtitle file read by read_srt_in_memory
audio_file = 'data/boys2.mp4'  # media file passed to the audio processing steps

In [4]:
# Read the reference subtitles and split them into tokens that carry their sentence.
srt_lines_in_memory = read_srt_in_memory(reference_file)
original_tokens = extract_tokens_with_sentences(srt_lines_in_memory)
audio_complexity_results = extract_complexity(audio_file, original_tokens, device=str(device), batch_size=16) # device is passed as a string
df = pd.DataFrame(audio_complexity_results)
# NOTE(review): the two assignments below assume extract_complexity returns exactly one
# result per token, in token order; a length mismatch raises a ValueError here — confirm.
df['position'] = [token['position'] for token in original_tokens]
df['sentence'] = [token['sentence'] for token in original_tokens]
df

Using device: cuda for audio complexity extraction
Initializing WhisperModel on cuda with compute_type int8
Getting transcription from audio file...


Unnamed: 0,word,audio_complexity,position,sentence
0,i,0.110596,1,I am sorry.
1,am,0.110596,2,I am sorry.
2,sorry,0.002441,3,I am sorry.
3,he's,0.132812,1,He's very tired.
4,very,0.004883,2,He's very tired.
...,...,...,...,...
311,char,1.000000,16,"Do you know, I can't remember the last time I ..."
312,you're,0.284912,1,You're a doll.
313,a,1.000000,2,You're a doll.
314,doll,1.000000,3,You're a doll.


In [5]:
# Presumably releases the token list now that `df` carries position/sentence —
# TODO confirm that no later cell still needs the tokens.
original_tokens = []

## Isolate Voice

In [13]:
# Import all dependencies
import os
import numpy as np
import torch
import torchaudio
import tempfile
import subprocess
from scipy.io import wavfile
from demucs.pretrained import get_model
from demucs.apply import apply_model

def process_audio_with_demucs(audio_file, device, output_dir="data"):
    """
    Process audio with Demucs to separate vocals from other audio components.

    The input is decoded with ffmpeg to a temporary stereo WAV at the model's
    sample rate, run through the pretrained "htdemucs" model, and written as
    two 16-bit PCM files: ``<name>_vocals.wav`` and ``<name>_no_vocals.wav``,
    where ``<name>`` is the input file name without its extension.

    Args:
        audio_file: Path to the input audio or video file
        device: Device the model and the waveform are moved to
        output_dir: Directory to save the processed audio files (default: "data")

    Returns:
        Tuple of paths to the processed files (vocals_path, no_vocals_path),
        or (None, None) if the audio could not be extracted.
    """

    # Load model
    model = get_model("htdemucs").to(device)
    print("Using device to process audio:", device)
    sample_rate = model.samplerate

    def extract_audio_from_video(file_path):
        """Decode file_path to a temporary WAV via ffmpeg and load it as (waveform, sr)."""
        temp_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        temp_wav.close()
        cmd = ['ffmpeg', '-i', file_path, '-vn', '-acodec', 'pcm_s16le',
               '-ar', str(sample_rate), '-ac', '2', '-y', temp_wav.name]
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # Return the rate actually reported by the loader so the resample
            # check in the caller compares real values.
            return torchaudio.load(temp_wav.name)
        except Exception as exc:
            # Best effort: keep the (None, None) contract, but say what went wrong.
            print(f"Could not extract audio from {file_path}: {exc}")
            return None, None
        finally:
            try:
                os.unlink(temp_wav.name)
            except OSError:
                pass

    def to_int16_pcm(track):
        """Convert a (channels, samples) float tensor to a (samples, channels) int16 array."""
        # Clamp first: samples outside [-1, 1] would wrap around when cast to int16.
        clipped = track.detach().cpu().clamp(-1.0, 1.0)
        return (clipped * 32767).numpy().astype(np.int16).T

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Extract file name without extension
    name = os.path.splitext(os.path.basename(audio_file))[0]

    # Extract audio from video/audio file
    waveform, sr = extract_audio_from_video(audio_file)
    if waveform is None:
        return None, None

    # Ensure correct format: model sample rate and exactly two channels
    if sr != sample_rate:
        waveform = torchaudio.functional.resample(waveform, sr, sample_rate)
    if waveform.shape[0] == 1:
        waveform = waveform.repeat(2, 1)
    elif waveform.shape[0] > 2:
        waveform = waveform[:2]

    # Process audio (a leading batch dimension is added for apply_model)
    waveform = waveform.unsqueeze(0).to(device)
    with torch.no_grad():
        sources = apply_model(model, waveform)

    # Extract and save vocals
    vocal_idx = model.sources.index("vocals")
    vocals_path = os.path.join(output_dir, name + "_vocals.wav")
    wavfile.write(vocals_path, sample_rate, to_int16_pcm(sources[0, vocal_idx]))

    # Create and save no_vocals track: the sum of every non-vocal stem
    other_idx = [i for i in range(len(model.sources)) if i != vocal_idx]
    no_vocals = sources[0, other_idx].sum(dim=0)
    no_vocals_path = os.path.join(output_dir, name + "_no_vocals.wav")
    wavfile.write(no_vocals_path, sample_rate, to_int16_pcm(no_vocals))

    return vocals_path, no_vocals_path

In [5]:
# Runs the vocal separation through isolate_speech imported from `preprocessing`.
# NOTE(review): presumably the packaged equivalent of process_audio_with_demucs — confirm.
isolate_speech(audio_file, device)

Using device to process audio: cuda


('data\\boys2_vocals.wav', 'data\\boys2_no_vocals.wav')

In [None]:
# Import all dependencies at the top
import os
import numpy as np
import torch
import torchaudio
import tempfile
import subprocess
from scipy.io import wavfile
from pydub import AudioSegment
# Assuming these imports are from your environment
from demucs.pretrained import get_model
from demucs.apply import apply_model

# Device setup
# NOTE: rebinds `device`, which the first cell imported from `utils`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model
# NOTE: rebinds `model`, which the model-loading cell bound to the BinaryClassifier;
# running this cell means later cells see the Demucs model under that name.
model = get_model("htdemucs").to(device)
sample_rate = model.samplerate

def extract_audio_from_video(file_path):
    """
    Decode the audio of a media file into a waveform via ffmpeg.

    ffmpeg writes a temporary 16-bit stereo WAV at the module-level
    ``sample_rate``; the file is loaded with torchaudio and always removed
    afterwards.

    Args:
        file_path: Path to the input audio or video file.

    Returns:
        (waveform, sr) as returned by ``torchaudio.load``, or (None, None)
        if ffmpeg or the loader fails.
    """
    temp_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    temp_wav.close()
    cmd = ['ffmpeg', '-i', file_path, '-vn', '-acodec', 'pcm_s16le',
           '-ar', str(sample_rate), '-ac', '2', '-y', temp_wav.name]
    try:
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        waveform, sr = torchaudio.load(temp_wav.name)
        # Report the rate the loader actually saw, so the caller's
        # `sr != sample_rate` check is meaningful.
        return waveform, sr
    except Exception as exc:
        # Best effort: keep the (None, None) contract, but say what went wrong.
        print(f"Could not extract audio from {file_path}: {exc}")
        return None, None
    finally:
        try:
            os.unlink(temp_wav.name)
        except OSError:
            pass


waveform, sr = extract_audio_from_video(audio_file)
if waveform is None:
    # extract_audio_from_video signals failure with (None, None); stop with a clear
    # message instead of failing later on `waveform.shape`.
    raise RuntimeError(f"Could not extract audio from {audio_file}")

# Ensure correct format: model sample rate and exactly two channels
if sr != sample_rate:
    waveform = torchaudio.functional.resample(waveform, sr, sample_rate)
if waveform.shape[0] == 1:
    waveform = waveform.repeat(2, 1)
elif waveform.shape[0] > 2:
    waveform = waveform[:2]

# Process audio (a leading batch dimension is added for apply_model)
waveform = waveform.unsqueeze(0).to(device)
with torch.no_grad():
    sources = apply_model(model, waveform)

# Create output directory
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Extract and save vocals
vocal_idx = model.sources.index("vocals")
vocals = sources[0, vocal_idx].cpu()
# Clamp to [-1, 1] first: samples outside that range would wrap around in int16.
vocals_np = (vocals.clamp(-1.0, 1.0) * 32767).numpy().astype(np.int16)
wavfile.write(os.path.join(output_dir, name + "_vocals.wav"), sample_rate, vocals_np.T)

# Create and save no_vocals track: the sum of every non-vocal stem
no_vocals = torch.zeros_like(sources[0, 0])
for i in range(len(model.sources)):
    if i != vocal_idx:
        no_vocals += sources[0, i]
# The summed stems are the likeliest to exceed [-1, 1], so clamp before the cast.
no_vocals_np = (no_vocals.cpu().clamp(-1.0, 1.0) * 32767).numpy().astype(np.int16)
wavfile.write(os.path.join(output_dir, name + "_no_vocals.wav"), sample_rate, no_vocals_np.T)

## Improve timesteps

In [6]:
# Re-run the timing extraction on the isolated vocals track and attach the
# resulting start/end times to df.
aligned_results = improve_timesteps("data/" + name + "_vocals.wav", df, device=str(device), batch_size=16)
# NOTE(review): assumes one aligned result per df row, in row order; a length
# mismatch raises a ValueError on assignment — confirm.
df['start_time'] = [result['start_time'] for result in aligned_results]
df['end_time'] = [result['end_time'] for result in aligned_results]

# Print the updated DataFrame
print(df)

Using device: cuda for improved timestep extraction
Getting improved transcription from audio file...
       word  audio_complexity  position  \
0         i          0.110596         1   
1        am          0.110596         2   
2     sorry          0.002441         3   
3      he's          0.132812         1   
4      very          0.004883         2   
..      ...               ...       ...   
311    char          1.000000        16   
312  you're          0.284912         1   
313       a          1.000000         2   
314    doll          1.000000         3   
315      ah          1.000000         1   

                                              sentence  start_time  end_time  
0                                          I am sorry.        0.62      1.02  
1                                          I am sorry.        0.62      1.02  
2                                          I am sorry.        1.02      1.42  
3                                     He's very tired.        1.4

In [8]:
print(df.to_string())

            word  audio_complexity  position                                                                                                                                                             sentence  start_time  end_time
0              i          0.110596         1                                                                                                                                                          I am sorry.        0.62      1.02
1             am          0.110596         2                                                                                                                                                          I am sorry.        0.62      1.02
2          sorry          0.002441         3                                                                                                                                                          I am sorry.        1.02      1.42
3           he's          0.132812         1                            

## Speech speed

In [19]:
import re
import numpy as np
import pandas as pd
import nltk
# Uncomment the following line if running for the first time
# nltk.download('cmudict')
from nltk.corpus import cmudict

def process_text_dataframe(df):
    """
    Process a DataFrame containing words and timings to compute syllables-per-second
    metrics and a normalized speed value based on a sigmoid function.

    Assumes the DataFrame has the following columns:
      - "word": the text of the word.
      - "position": numerical order in the sentence (with each sentence starting with 1).
      - "start_time" and "end_time": timing values (may contain nan).

    The input DataFrame is not modified; all work happens on a copy.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with a 'sentence_group' column
        identifying each sentence and a 'speed' column in (0, 1). Any existing
        'syllables_per_second' column is not carried over.
    """

    cmu_dict = cmudict.dict()

    def count_syllables(word):
        """
        Count syllables using the CMU dictionary.
        If a word is not available, falls back to counting vowel-group matches.
        """
        word = word.lower().strip(".,;:'\"!?")
        if word in cmu_dict:
            pronunciation = cmu_dict[word][0]  # use first pronunciation
            # Phonemes ending in a digit are the ones counted as syllables.
            return sum(1 for ph in pronunciation if ph[-1].isdigit())
        return len(re.findall(r'[aeiouy]+', word))

    def sentence_syllables_per_second(group):
        """
        Return syllables per second for one sentence, or NaN without usable timing.
        """
        group = group.sort_values(by='position')
        has_start = group['start_time'].notna().to_numpy()
        has_end = group['end_time'].notna().to_numpy()
        if not (has_start.any() and has_end.any()):
            return np.nan
        # Span from the first word with a start time to the last word with an end time.
        first = int(has_start.argmax())
        last = len(group) - 1 - int(has_end[::-1].argmax())
        duration = group['end_time'].iloc[last] - group['start_time'].iloc[first]
        if not duration > 0:
            # Kept from the original behaviour: a non-positive span counts as rate 0
            # (and therefore takes part in the overall mean).
            return 0.0
        syllables = sum(count_syllables(w) for w in group['word'].iloc[first:last + 1])
        return syllables / duration

    # Work on a copy so the caller's DataFrame is left untouched.
    result = df.copy()
    result = result.drop(columns='syllables_per_second', errors='ignore')

    # Assign a unique sentence group identifier. Assumes each sentence starts with position == 1.
    result['sentence_group'] = (result['position'] == 1).cumsum()

    # Compute metrics per sentence. Iterating over the groups avoids
    # DataFrameGroupBy.apply operating on the grouping column (deprecated in pandas).
    sps_by_group = pd.Series(
        {group_id: sentence_syllables_per_second(group)
         for group_id, group in result.groupby('sentence_group')},
        dtype=float,
    )

    # Sentences without timing fall back to the mean of the sentences that have one.
    overall_mean = sps_by_group.mean()
    sps_by_group = sps_by_group.fillna(overall_mean)

    # Give every word the rate of its sentence.
    syllables_per_second = result['sentence_group'].map(sps_by_group)

    # Apply sigmoid normalization.
    center = 4.0  # center of the sigmoid (adjust as needed)
    scale = 2     # scale factor controlling steepness
    result['speed'] = 1 / (1 + np.exp(-(syllables_per_second - center) / center * scale))

    return result.reset_index(drop=True)

df = process_text_dataframe(df)

  sentence_metrics = df.groupby('sentence_group').apply(compute_sentence_metrics).reset_index()


In [8]:
import pandas as pd
import re
import nltk

# Uncomment for first-time setup.
# nltk.download('cmudict')
from nltk.corpus import cmudict

cmu_dict = cmudict.dict()

def count_syllables(word):
    """
    Return the number of syllables in ``word``.

    The word is lower-cased and stripped of surrounding punctuation. If it is
    a key of the module-level ``cmu_dict``, the phonemes of its first listed
    pronunciation that end in a digit are counted; otherwise the count is the
    number of vowel groups in the spelling.
    """
    token = word.lower().strip(".,;:'\"!?")

    if token not in cmu_dict:
        # Unknown word: approximate by counting runs of vowels.
        return sum(1 for _ in re.finditer(r'[aeiouy]+', token))

    # Only the first pronunciation is considered when several are listed.
    first_pronunciation = cmu_dict[token][0]
    return sum(phoneme[-1].isdigit() for phoneme in first_pronunciation)

def compute_sentence_metrics(group):
    """
    Compute the duration and syllables-per-second of one sentence.

    The rows are ordered by 'position'. The sentence spans from the first row
    with a start_time to the last row with an end_time; syllables are counted
    (via ``count_syllables``) for the words inside that span, inclusive.

    Returns:
        pd.Series with 'duration' and 'syllables_per_second'. Both are None
        when the sentence has no start or no end time; the rate is 0 when the
        duration is not positive.
    """
    ordered = group.sort_values(by='position').reset_index(drop=True)
    has_start = ordered['start_time'].notna()
    has_end = ordered['end_time'].notna()

    # Without at least one start and one end time there is nothing to measure.
    if not (has_start.any() and has_end.any()):
        return pd.Series({
            'duration': None,
            'syllables_per_second': None,
        })

    first_label = ordered.index[has_start][0]
    last_label = ordered.index[has_end][-1]
    duration = ordered.loc[last_label, 'end_time'] - ordered.loc[first_label, 'start_time']

    # Label-based slice: both ends are included.
    words_in_span = ordered.loc[first_label:last_label, 'word']
    syllable_count = words_in_span.apply(count_syllables).sum()

    if duration > 0:
        rate = syllable_count / duration
    else:
        rate = 0

    return pd.Series({
        'duration': duration,
        'syllables_per_second': rate,
    })

# Create a unique group for each sentence instance.
# We assume that each sentence instance starts with position == 1.
df['sentence_group'] = (df['position'] == 1).cumsum()

# Compute metrics per sentence instance.
# Only the columns compute_sentence_metrics reads are selected, so apply no longer
# operates on the grouping column (pandas deprecates that and warns about it).
sentence_metrics = (
    df.groupby('sentence_group')[['word', 'position', 'start_time', 'end_time']]
    .apply(compute_sentence_metrics)
    .reset_index()
)

# Sentences without timing come back as None; cast to float so they become NaN
# and the mean/fillna below work on a numeric column.
syllables_per_second = sentence_metrics['syllables_per_second'].astype(float)

# Compute the overall mean syllables per second across sentences with valid timing.
overall_mean = syllables_per_second.mean()

print(f"Overall mean syllables per second: {overall_mean}")

# For sentences missing valid timings, fallback to the overall mean.
sentence_metrics['syllables_per_second'] = syllables_per_second.fillna(overall_mean)

# Merge the computed syllables_per_second back into the original DataFrame.
# This assigns the sentence's value for every word in that sentence.
# A column left over from an earlier run of this cell is dropped first.
df = df.drop(columns='syllables_per_second', errors='ignore')

df = df.merge(sentence_metrics[['sentence_group', 'syllables_per_second']], on='sentence_group', how='left')

def sigmoid_normalize_sps(value, center=4.0, scale=2):
    """
    Squash a syllables-per-second value into (0, 1) with a sigmoid.

    Args:
        value: Syllables-per-second rate.
        center: Rate that maps to 0.5 (default 4.0).
        scale: Factor controlling steepness (default 2); the sigmoid input is
            ``(value - center) / center * scale``.

    Returns:
        ``1 / (1 + exp(-(value - center) / center * scale))``.
    """
    return 1 / (1 + np.exp(-(value - center) / center * scale))

# Apply the sigmoid normalization to scale syllables_per_second
df['speed'] = df['syllables_per_second'].apply(sigmoid_normalize_sps)

# Summary statistics of the raw rate, taken over words (each word carries its
# sentence's value), printed before the helper column is dropped.
print(f"Min syllables per second: {df['syllables_per_second'].min()}")
print(f"Max syllables per second: {df['syllables_per_second'].max()}")
print(f"Mean syllables per second: {df['syllables_per_second'].mean()}")
print(f"Median syllables per second: {df['syllables_per_second'].median()}")

# The raw rate was only needed to derive 'speed'.
df = df.drop('syllables_per_second', axis=1)

print(df)

Overall mean syllables per second: 3.8318697948422686
Min syllables per second: 0.0
Max syllables per second: 11.111111111111128
Mean syllables per second: 4.191647106066254
Median syllables per second: 3.8251366120218613
       word  audio_complexity  position  \
0         i          0.110596         1   
1        am          0.110596         2   
2     sorry          0.002441         3   
3      he's          0.132812         1   
4      very          0.004883         2   
..      ...               ...       ...   
311    char          1.000000        16   
312  you're          0.284912         1   
313       a          1.000000         2   
314    doll          1.000000         3   
315      ah          1.000000         1   

                                              sentence  start_time  end_time  \
0                                          I am sorry.        0.62      1.02   
1                                          I am sorry.        0.62      1.02   
2                    

  sentence_metrics = df.groupby('sentence_group').apply(compute_sentence_metrics).reset_index()


In [9]:
print(df.to_string())

            word  audio_complexity  position                                                                                                                                                             sentence  start_time  end_time  sentence_group     speed
0              i          0.110596         1                                                                                                                                                          I am sorry.        0.62      1.02               1  0.622459
1             am          0.110596         2                                                                                                                                                          I am sorry.        0.62      1.02               1  0.622459
2          sorry          0.002441         3                                                                                                                                                          I am sorry.        1.02     

## Speech volume

In [24]:
import librosa
import numpy as np
import pandas as pd

# Load the audio file (update "name" accordingly)
vocals_path = "data/" + name + "_vocals.wav"
print("Loading audio file...")
# sr=None is passed so the loader keeps the file's own sample rate (see librosa.load docs);
# NOTE(review): librosa.load presumably mixes to mono by default — confirm that is intended.
vocals_audio, sr_vocals = librosa.load(vocals_path, sr=None)

def calculate_segment_volume(audio, sr, start_time, end_time):
    """
    Return the root-mean-square amplitude of ``audio`` between two timestamps.

    Args:
        audio: Sequence of samples (sliced and passed to numpy).
        sr: Sample rate used to convert the times to sample indices.
        start_time: Segment start in seconds.
        end_time: Segment end in seconds.

    Returns:
        The RMS of the selected samples, or 0.0 if the selection is empty.
    """
    total = len(audio)

    # Clamp the start into [0, total - 1].
    lo = min(int(start_time * sr), total - 1)
    if lo < 0:
        lo = 0

    # Clamp the end to the audio length, but keep at least one sample after the start.
    hi = min(int(end_time * sr), total)
    if hi <= lo:
        hi = lo + 1

    window = audio[lo:hi]
    if len(window) == 0:
        return 0.0
    return np.sqrt(np.mean(np.square(window)))

# Assume the dataframe is stored in a variable called df.
sentence_volumes = {}

# Group by sentence_group and compute the volume for groups with valid times.
for group_id, group_df in df.groupby("sentence_group"):
    valid_start = group_df["start_time"].dropna()
    valid_end = group_df["end_time"].dropna()

    if not valid_start.empty and not valid_end.empty:
        # The sentence spans from its first known start to its last known end.
        start_time = valid_start.iloc[0]
        end_time = valid_end.iloc[-1]
        sentence_volume = calculate_segment_volume(vocals_audio, sr_vocals, start_time, end_time)
    else:
        sentence_volume = np.nan  # No valid times available

    sentence_volumes[group_id] = sentence_volume
    sentence_text = group_df["sentence"].iloc[0] if "sentence" in group_df.columns else f"Group {group_id}"
    print(f"Sentence: {sentence_text[:30]}... Volume: {sentence_volume if not np.isnan(sentence_volume) else 'NaN'}")

# Compute the mean volume using only valid sentence volumes
valid_volumes = [vol for vol in sentence_volumes.values() if not np.isnan(vol)]
mean_volume = np.mean(valid_volumes) if valid_volumes else 0.0
print(f"Mean sentence volume (valid only): {mean_volume:.6f}")

# Compute the volume ratio for each sentence group:
# If the sentence has no valid volume (NaN) assign ratio 1, otherwise calculate vol/mean_volume.
sentence_volume_ratios = {}
for group_id, vol in sentence_volumes.items():
    if np.isnan(vol) or mean_volume == 0:
        ratio = 1
    else:
        ratio = vol / mean_volume
    sentence_volume_ratios[group_id] = ratio

# Create a new column in df assigning each word the ratio of its sentence
df["volume_ratio"] = df["sentence_group"].map(sentence_volume_ratios)

unweighted_mean_ratio = np.mean(list(sentence_volume_ratios.values()))
print(f"Unweighted mean volume ratio: {unweighted_mean_ratio:.6f}")

min_volume_ratio = 0 # minimum volume ratio
max_volume_ratio = 2 # twice as loud as the mean

# Scale the volume ratios to the range [0, 1].
# Ratios below 1 (quieter than the mean) are the interesting part; anything louder
# than twice the mean is treated the same, so the scaled value is capped at 1.
df['speech_volume'] = (
    (df['volume_ratio'] - min_volume_ratio) / (max_volume_ratio - min_volume_ratio)
).clip(lower=0, upper=1)
df = df.drop('volume_ratio', axis=1)
print(df)


Loading audio file...
Sentence: I am sorry.... Volume: 0.006802065763622522
Sentence: He's very tired.... Volume: 0.00592344393953681
Sentence: I understand he's had a few su... Volume: 0.0222382340580225
Sentence: The name's William Butcher.... Volume: 0.02317729778587818
Sentence: Mr. Butcher, I can't apologize... Volume: 0.03512658178806305
Sentence: What was he like?... Volume: 0.01467239111661911
Sentence: Who?... Volume: 0.0471954345703125
Sentence: Homelander.... Volume: NaN
Sentence: Growing up as a lad, what was ... Volume: 0.011143782176077366
Sentence: I am sure you don't want to ta... Volume: 0.017884807661175728
Sentence: Oh, there is nothing I want to... Volume: 0.00849794503301382
Sentence: And who knows better than you?... Volume: 0.01870792917907238
Sentence: When he was a little boy, five... Volume: 0.019063251093029976
Sentence: He'd cuddle up to me.... Volume: 0.029250038787722588
Sentence: He'll have stories about Davy ... Volume: 0.0205435361713171
Sentence: But I

In [11]:
print(df.to_string())

            word  audio_complexity  position                                                                                                                                                             sentence  start_time  end_time  sentence_group     speed  speech_volume
0              i          0.110596         1                                                                                                                                                          I am sorry.        0.62      1.02               1  0.622459       0.183038
1             am          0.110596         2                                                                                                                                                          I am sorry.        0.62      1.02               1  0.622459       0.183038
2          sorry          0.002441         3                                                                                                                                         

In [22]:
import os
import librosa
import numpy as np
import pandas as pd

def get_sentence_speech_volume(name, df, data_dir="data"):
    """
    Process sentence volume metrics and add a 'speech_volume' column to the DataFrame.

    The function:
      - Loads the vocals audio file from data_dir using the provided name.
      - Computes the volume (root mean square) for each sentence based on valid start/end times.
      - Normalizes the volume with respect to the mean (with a cap at twice the mean).
      - Returns a new DataFrame with a 'speech_volume' column; the input is not modified.

    Args:
        name (str): Base name to locate the vocals file (file must be {data_dir}/{name}_vocals.wav).
        df (pd.DataFrame): DataFrame containing columns: 'sentence_group', 'start_time',
                           'end_time', and optionally 'sentence'.
        data_dir (str): Directory where the vocals audio file is stored.

    Returns:
        pd.DataFrame: Updated DataFrame with a new 'speech_volume' column in [0, 1].
        Sentences without valid timing get the neutral ratio 1, i.e. 0.5.
    """

    # Load the audio file
    vocals_path = os.path.join(data_dir, f"{name}_vocals.wav")
    print("Loading audio file from:", vocals_path)
    vocals_audio, sr_vocals = librosa.load(vocals_path, sr=None)

    def calculate_segment_volume(audio, sr, start_time, end_time):
        """Return the RMS of the samples between two timestamps (0.0 if empty)."""
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)
        # Ensure indices are within valid bounds
        start_sample = max(0, min(start_sample, len(audio) - 1))
        end_sample = max(start_sample + 1, min(end_sample, len(audio)))
        segment = audio[start_sample:end_sample]
        return np.sqrt(np.mean(np.square(segment))) if len(segment) > 0 else 0.0

    sentence_volumes = {}
    # Compute volume for each sentence_group
    for group_id, group_df in df.groupby("sentence_group"):
        valid_start = group_df["start_time"].dropna()
        valid_end = group_df["end_time"].dropna()
        if not valid_start.empty and not valid_end.empty:
            # The sentence spans from its first known start to its last known end.
            start_time = valid_start.iloc[0]
            end_time = valid_end.iloc[-1]
            sentence_volume = calculate_segment_volume(vocals_audio, sr_vocals, start_time, end_time)
        else:
            sentence_volume = np.nan
        sentence_volumes[group_id] = sentence_volume
        sentence_text = group_df["sentence"].iloc[0] if "sentence" in group_df.columns else f"Group {group_id}"
        print(f"Sentence: {sentence_text[:30]}... Volume: {sentence_volume if not np.isnan(sentence_volume) else 'NaN'}")

    # Compute overall mean volume only from valid sentences
    valid_volumes = [vol for vol in sentence_volumes.values() if not np.isnan(vol)]
    mean_volume = np.mean(valid_volumes) if valid_volumes else 0.0
    print(f"Mean sentence volume (valid only): {mean_volume:.6f}")

    # Compute volume ratios for each sentence group (default to 1 if missing or mean_volume is 0)
    sentence_volume_ratios = {group_id: (vol / mean_volume if (not np.isnan(vol) and mean_volume > 0) else 1)
                              for group_id, vol in sentence_volumes.items()}

    # Map the sentence volume ratio to each row. It is kept as a standalone Series
    # so no temporary column is written into the caller's DataFrame.
    volume_ratio = df["sentence_group"].map(sentence_volume_ratios)

    unweighted_mean_ratio = np.mean(list(sentence_volume_ratios.values()))
    print(f"Unweighted mean volume ratio: {unweighted_mean_ratio:.6f}")

    # Normalize volume ratio to the range [0, 1]
    min_volume_ratio = 0  # minimum reference
    max_volume_ratio = 2  # reference for twice as loud as the mean

    # Ratios above max_volume_ratio are capped so the result stays within [0, 1].
    speech_volume = (
        (volume_ratio - min_volume_ratio) / (max_volume_ratio - min_volume_ratio)
    ).clip(lower=0, upper=1)

    # assign() returns a new frame; a pre-existing 'volume_ratio' column is not
    # carried over, matching the columns the function produced before.
    df = df.assign(speech_volume=speech_volume).drop(columns='volume_ratio', errors='ignore')

    print("Updated DataFrame with 'speech_volume':")
    print(df)
    return df

# Example usage:
df_updated = get_sentence_speech_volume(name, df)

Loading audio file from: data\boys2_vocals.wav
Sentence: I am sorry.... Volume: 0.006802065763622522
Sentence: He's very tired.... Volume: 0.00592344393953681
Sentence: I understand he's had a few su... Volume: 0.0222382340580225
Sentence: The name's William Butcher.... Volume: 0.02317729778587818
Sentence: Mr. Butcher, I can't apologize... Volume: 0.03512658178806305
Sentence: What was he like?... Volume: 0.01467239111661911
Sentence: Who?... Volume: 0.0471954345703125
Sentence: Homelander.... Volume: NaN
Sentence: Growing up as a lad, what was ... Volume: 0.011143782176077366
Sentence: I am sure you don't want to ta... Volume: 0.017884807661175728
Sentence: Oh, there is nothing I want to... Volume: 0.00849794503301382
Sentence: And who knows better than you?... Volume: 0.01870792917907238
Sentence: When he was a little boy, five... Volume: 0.019063251093029976
Sentence: He'd cuddle up to me.... Volume: 0.029250038787722588
Sentence: He'll have stories about Davy ... Volume: 0.0205435

In [21]:
print(df_updated.to_string())

            word  audio_complexity  position                                                                                                                                                             sentence  sentence_group  start_time  end_time     speed  speech_volume
0              i          0.110596         1                                                                                                                                                          I am sorry.               1        0.62      1.02  0.622459       0.166964
1             am          0.110596         2                                                                                                                                                          I am sorry.               1        0.62      1.02  0.622459       0.166964
2          sorry          0.002441         3                                                                                                                                         

## Ambient volume

In [12]:
import numpy as np

print("Calculating word volumes for vocals and instrumental tracks...")
# Loading audio files and ensuring matching sample rates
# NOTE: this cell only imports numpy; `librosa`, `name`, `vocals_audio` and `sr_vocals`
# must already exist from earlier cells.
print("Loading audio files...")
no_vocals_path = "data/" + name + "_no_vocals.wav"
no_vocals_audio, sr_no_vocals = librosa.load(no_vocals_path, sr=None)

# Both tracks are indexed with the same sample positions below, so they must share one rate.
if sr_vocals != sr_no_vocals:
    print(f"Warning: Sample rates differ (vocals: {sr_vocals}, no_vocals: {sr_no_vocals})")
    print("Resampling no_vocals to match vocals sample rate...")
    no_vocals_audio = librosa.resample(no_vocals_audio, orig_sr=sr_no_vocals, target_sr=sr_vocals)
    sr_no_vocals = sr_vocals

# Common sample rate used for both tracks from here on.
sr = sr_vocals


def get_fallback_times(df, idx, epsilon=0.01):
    """
    Return a usable (start_time, end_time) pair for the word at index label ``idx``.

    Missing (NaN) timestamps are filled in from neighbouring rows:

    * a missing start_time becomes the end_time of the closest earlier row
      that has one;
    * a missing end_time becomes the start_time of the closest later row
      that has one.

    If only one bound can be determined, the other is placed ``epsilon`` away
    from it; if neither can, the interval ``(0, epsilon)`` is returned.

    Neighbours are located by row position rather than by index label, so the
    lookup also works when ``df`` does not carry a default 0..n-1 index (for
    example after rows were filtered out). Index labels must be unique.

    Args:
        df (pd.DataFrame): Frame with "start_time" and "end_time" columns.
        idx: Index label of the row to resolve.
        epsilon (float): Length of the stand-in interval used when a bound
            cannot be recovered from the neighbouring rows.

    Returns:
        tuple: (start_time, end_time), neither of them missing.

    Raises:
        KeyError: If ``idx`` is not a label of ``df.index`` or one of the two
            time columns is absent.
    """
    # Translate the label into a row position once; everything below is positional.
    pos = df.index.get_loc(idx)
    starts = df["start_time"].to_numpy()
    ends = df["end_time"].to_numpy()

    start_time = starts[pos]
    end_time = ends[pos]

    # Missing start: walk backwards to the nearest row that has an end_time.
    if pd.isna(start_time):
        for j in range(pos - 1, -1, -1):
            if pd.notna(ends[j]):
                start_time = ends[j]
                break

    # Missing end: walk forwards to the nearest row that has a start_time.
    if pd.isna(end_time):
        for j in range(pos + 1, len(starts)):
            if pd.notna(starts[j]):
                end_time = starts[j]
                break

    # Last resort: derive the missing bound from the known one, or use a
    # tiny interval at the very beginning when nothing is known at all.
    if pd.isna(start_time) and pd.notna(end_time):
        start_time = end_time - epsilon
    if pd.isna(end_time) and pd.notna(start_time):
        end_time = start_time + epsilon
    if pd.isna(start_time) and pd.isna(end_time):
        start_time = 0
        end_time = epsilon

    return start_time, end_time

# Volume of each word in both stems, keyed by the word's DataFrame index label.
word_volumes = {}

# A ratio above 1 means the speech is louder than the ambient track for that word.
for idx, word in df.iterrows():
    # Missing timestamps are filled in from neighbouring words.
    start_time, end_time = get_fallback_times(df, idx)
    word_text = word["word"]  # only used by the optional debug print below

    # Measure the same time window in the vocals stem and in the ambient stem.
    vocals_volume = calculate_segment_volume(vocals_audio, sr, start_time, end_time)
    no_vocals_volume = calculate_segment_volume(no_vocals_audio, sr, start_time, end_time)

    # Vocals-to-ambient ratio; when the ambient level is not positive a fixed
    # 10.0 is used instead of dividing by zero.
    if no_vocals_volume > 0:
        ratio = vocals_volume / no_vocals_volume
    else:
        ratio = 10.0

    word_volumes[idx] = dict(
        vocals_volume=vocals_volume,
        no_vocals_volume=no_vocals_volume,
        ratio=ratio,
    )

    # Debug aid:
    # print(f"Word: {word_text}, Vocals: {vocals_volume:.6f}, Instrumental: {no_vocals_volume:.6f}, Ratio: {ratio:.2f}")

# emphasize values below 1
def log_normalize(x, C=4):
    """
    Map a non-negative value onto [0, 1] on a logarithmic scale.

    Computes ``log(1 + x) / log(1 + C)`` and caps the result at 1, so ``x = 0``
    maps to 0 and every ``x >= C`` maps to 1. Because of the logarithm, inputs
    below 1 are spread over a larger part of the output range than a linear
    scaling would give them.

    Args:
        x: Scalar, NumPy array or sequence of numbers (expected to be >= 0).
        C: Input value at which the output saturates at 1 (expected to be > 0).

    Returns:
        A NumPy float for scalar input, or an array of the same shape as ``x``
        for array-like input.
    """
    # np.asarray lets lists through and leaves NumPy scalars/arrays untouched.
    scaled = np.log(1 + np.asarray(x)) / np.log(1 + C)
    # Element-wise cap; the builtin min() only works for scalars.
    return np.minimum(scaled, 1)

# Turn every vocals/ambient ratio into a score: 1 - log_normalize(ratio).
# The score is 1 for a ratio of 0 and falls to 0 once log_normalize saturates.
normalized_ratios = {
    idx: 1 - log_normalize(data["ratio"])
    for idx, data in word_volumes.items()
}

# Write the scores back in the frame's own row order and show the result.
df["ambient_volume"] = [normalized_ratios[label] for label in df.index]
print(df)

Calculating word volumes for vocals and instrumental tracks...
Loading audio files...
       word  audio_complexity  position  \
0         i          0.110596         1   
1        am          0.110596         2   
2     sorry          0.002441         3   
3      he's          0.132812         1   
4      very          0.004883         2   
..      ...               ...       ...   
311    char          1.000000        16   
312  you're          0.284912         1   
313       a          1.000000         2   
314    doll          1.000000         3   
315      ah          1.000000         1   

                                              sentence  start_time  end_time  \
0                                          I am sorry.        0.62      1.02   
1                                          I am sorry.        0.62      1.02   
2                                          I am sorry.        1.02      1.42   
3                                     He's very tired.        1.42      2.76 

In [25]:
def get_word_ambient_volume(name, df, vocals_audio, sr_vocals, data_dir="data"):
    """
    Add a per-word "ambient_volume" score comparing the vocals track with the
    instrumental (no-vocals) track.

    For every row, the RMS level of both tracks is measured over the word's
    time window. The vocals-to-instrumental ratio is then converted to
    ``1 - min(log(1 + ratio) / log(5), 1)``: the score is 1 when the ratio is 0
    and reaches 0 once the vocals are at least four times as loud as the
    instrumental track. A non-positive instrumental level is treated as a
    ratio of 10.0.

    Rows with missing timestamps borrow them from neighbouring rows. The
    neighbour search is positional, so ``df`` may carry any index (it does not
    have to be a default 0..n-1 index).

    Args:
        name (str): Base name used to form the filename (expects {name}_no_vocals.wav in data_dir).
        df (pd.DataFrame): DataFrame containing word and timing info.
            Must include the columns "start_time" and "end_time".
        vocals_audio (np.ndarray): Audio samples of the vocals track.
        sr_vocals (int): Sample rate of the vocals audio.
        data_dir (str): Directory where the no_vocals file is stored (default: "data").

    Returns:
        pd.DataFrame: The same ``df`` object, modified in place, with the added
        "ambient_volume" column.

    Raises:
        KeyError: If "start_time" or "end_time" is missing from ``df``.
    """
    import os
    import numpy as np
    import pandas as pd
    import librosa

    # Load the instrumental (no_vocals) audio file at the rate stored in the file.
    no_vocals_path = os.path.join(data_dir, f"{name}_no_vocals.wav")
    print("Loading instrumental audio file from:", no_vocals_path)
    no_vocals_audio, sr_no_vocals = librosa.load(no_vocals_path, sr=None)

    # Both tracks must share one sample rate so that a timestamp maps to the
    # same sample index in each of them.
    if sr_vocals != sr_no_vocals:
        print(f"Warning: Sample rates differ (vocals: {sr_vocals}, instrumental: {sr_no_vocals})")
        print("Resampling instrumental track to match vocals sample rate...")
        no_vocals_audio = librosa.resample(no_vocals_audio, orig_sr=sr_no_vocals, target_sr=sr_vocals)

    sr = sr_vocals

    # Plain arrays of the timing columns: positional access, independent of df.index.
    starts = df["start_time"].to_numpy()
    ends = df["end_time"].to_numpy()

    def calculate_segment_volume(audio, sr, start_time, end_time):
        """RMS of `audio` between two timestamps; 0.0 if the slice is empty."""
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)
        # Clamp to the audio bounds and keep at least one sample in the window.
        start_sample = max(0, min(start_sample, len(audio) - 1))
        end_sample = max(start_sample + 1, min(end_sample, len(audio)))
        segment = audio[start_sample:end_sample]
        return np.sqrt(np.mean(np.square(segment))) if len(segment) > 0 else 0.0

    def get_fallback_times(pos, epsilon=0.01):
        """(start_time, end_time) for the row at position `pos`, never missing."""
        start_time, end_time = starts[pos], ends[pos]

        # Missing start: nearest earlier row that has an end_time.
        if pd.isna(start_time):
            for j in range(pos - 1, -1, -1):
                if pd.notna(ends[j]):
                    start_time = ends[j]
                    break
        # Missing end: nearest later row that has a start_time.
        if pd.isna(end_time):
            for j in range(pos + 1, len(starts)):
                if pd.notna(starts[j]):
                    end_time = starts[j]
                    break

        # Final fallbacks if a bound is still missing.
        if pd.isna(start_time) and pd.notna(end_time):
            start_time = end_time - epsilon
        if pd.isna(end_time) and pd.notna(start_time):
            end_time = start_time + epsilon
        if pd.isna(start_time) and pd.isna(end_time):
            start_time, end_time = 0, epsilon

        return start_time, end_time

    def log_normalize(x, C=4):
        """log(1 + x) / log(1 + C), capped at 1."""
        normalized = np.log(1 + x) / np.log(1 + C)
        return min(normalized, 1)

    # One score per row, collected in row order so that duplicate or
    # non-default index labels cannot mix rows up.
    ambient_scores = []
    for pos in range(len(df)):
        start_time, end_time = get_fallback_times(pos)
        vocals_volume = calculate_segment_volume(vocals_audio, sr, start_time, end_time)
        no_vocals_volume = calculate_segment_volume(no_vocals_audio, sr, start_time, end_time)
        # If the instrumental level is not positive, use a default high ratio
        # instead of dividing by zero.
        ratio = vocals_volume / no_vocals_volume if no_vocals_volume > 0 else 10.0
        ambient_scores.append(1 - log_normalize(ratio))

    df["ambient_volume"] = ambient_scores
    print(df)
    return df

# Example usage. The function adds the column to `df` itself and returns that
# same object, so `df_updated` and `df` refer to one DataFrame afterwards.
df_updated = get_word_ambient_volume(
    name=name,
    df=df,
    vocals_audio=vocals_audio,
    sr_vocals=sr_vocals,
    data_dir="data",
)

Loading instrumental audio file from: data\boys2_no_vocals.wav
       word  audio_complexity  position  \
0         i          0.110596         1   
1        am          0.110596         2   
2     sorry          0.002441         3   
3      he's          0.132812         1   
4      very          0.004883         2   
..      ...               ...       ...   
311    char          1.000000        16   
312  you're          0.284912         1   
313       a          1.000000         2   
314    doll          1.000000         3   
315      ah          1.000000         1   

                                              sentence  sentence_group  \
0                                          I am sorry.               1   
1                                          I am sorry.               1   
2                                          I am sorry.               1   
3                                     He's very tired.               2   
4                                     He's very ti

In [13]:
# Dump the whole frame as plain text (to_string() prints all rows and columns) to
# inspect the speed, speech_volume and ambient_volume columns listed in the output below.
print(df.to_string())

            word  audio_complexity  position                                                                                                                                                             sentence  start_time  end_time  sentence_group     speed  speech_volume  ambient_volume
0              i          0.110596         1                                                                                                                                                          I am sorry.        0.62      1.02               1  0.622459       0.183038        0.796028
1             am          0.110596         2                                                                                                                                                          I am sorry.        0.62      1.02               1  0.622459       0.183038        0.796028
2          sorry          0.002441         3                                                                                         

## Remaining pipeline as usual: filtering, translation, feature extraction

In [14]:
# Display the token records; as the output below shows, each one is a dict with
# 'token', 'sentence' and 'position' keys.
original_tokens

[{'token': 'i', 'sentence': 'I am sorry.', 'position': 1},
 {'token': 'am', 'sentence': 'I am sorry.', 'position': 2},
 {'token': 'sorry', 'sentence': 'I am sorry.', 'position': 3},
 {'token': "he's", 'sentence': "He's very tired.", 'position': 1},
 {'token': 'very', 'sentence': "He's very tired.", 'position': 2},
 {'token': 'tired', 'sentence': "He's very tired.", 'position': 3},
 {'token': 'i',
  'sentence': "I understand he's had a few surprise visitors today, but you tell him he's got one more.",
  'position': 1},
 {'token': 'understand',
  'sentence': "I understand he's had a few surprise visitors today, but you tell him he's got one more.",
  'position': 2},
 {'token': "he's",
  'sentence': "I understand he's had a few surprise visitors today, but you tell him he's got one more.",
  'position': 3},
 {'token': 'had',
  'sentence': "I understand he's had a few surprise visitors today, but you tell him he's got one more.",
  'position': 4},
 {'token': 'a',
  'sentence': "I understan

In [7]:
# Bookkeeping columns, each initialised to one constant for every row.
df['display'] = None
df['set_manually'] = False
df['process'] = True

# Words handed to the non-English filter as exceptions.
# NOTE(review): presumably these are exempt from being flagged - confirm in filter_in_out.
exception_words = ["i", "no", "so"]

# Run the four marking passes in their original order; each pass receives the
# frame returned by the previous one.
df = (
    df.pipe(mark_non_english_in_df, exception_words)
    .pipe(mark_notes_in_df)
    .pipe(mark_excluded_words)
    .pipe(mark_numbers_in_df)
)


Excluded 5 words based on EXCLUDED_WORDS list


In [8]:
# Translate the words of df and align them, running on the configured device.
# NOTE(review): batch_translate_and_align is a project helper not shown here; its result is
# assumed to be records that pd.DataFrame accepts and that contain 'german_translation' - confirm.
translation_results = batch_translate_and_align(df, device=str(device)) # device is passed as a string
df_translations = pd.DataFrame(translation_results)
# Assigning a Series aligns on index labels, so df_translations has to share df's index
# for every row to receive its translation.
df['translation'] = df_translations['german_translation']

Using device: cuda for translation and alignment


Device set to use cuda
2025-03-03 16:00:51,534 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [None]:
from feature_extraction import FeatureExtractor

# Compute the per-word and per-sentence features and attach them to df as columns.
# NOTE(review): the output recorded for this cell is a TypeError from
# get_sentence_speech_volume ("missing 1 required positional argument: 'df'"), i.e. the call
# below does not match the method's signature in feature_extraction - confirm and fix.
feature_extractor = FeatureExtractor(device=str(device))
# Only rows whose 'process' flag is set are written by the next two statements.
df.loc[df['process'], 'word_occurrence'] = df.loc[df['process'], 'word'].apply(feature_extractor.get_word_occurrence)
df.loc[df['process'], 'word_complexity'] = df.loc[df['process'], 'word'].apply(feature_extractor.get_word_complexity)
# The sentence-complexity helper returns a frame; only its "sentence_complexity" column is copied.
df_sentence_complexity = feature_extractor.get_sentence_complexity(df)
df["sentence_complexity"] = df_sentence_complexity["sentence_complexity"]
word_importances = feature_extractor.get_word_importance(df, batch_size=32)
df["word_importance"] = word_importances
# The remaining extractors take df and their return value replaces it.
df = feature_extractor.get_sentence_speed(df)
df = feature_extractor.get_sentence_speech_volume(name, df)
df = feature_extractor.get_word_ambient_volume(name, df)
# Bare last expression: display the resulting frame.
df

Using cuda for feature extraction.
Max entropy: 4.33504986479276


TypeError: FeatureExtractor.get_sentence_speech_volume() missing 1 required positional argument: 'df'

In [11]:
# Dump the whole frame as plain text; the output below lists the filter, translation and
# feature columns followed by sentence_group, speed, speech_volume and ambient_volume.
print(df.to_string())

            word  audio_complexity  position                                                                                                                                                             sentence  start_time  end_time  display  set_manually  process           translation  word_occurrence  word_complexity  sentence_complexity  word_importance  sentence_group     speed  speech_volume  ambient_volume
0              i          0.110596         1                                                                                                                                                          I am sorry.        0.62      1.02    False         False     True                     i         0.000000              0.0             0.413319         0.003906               1  0.622459       0.185914        0.796390
1             am          0.110596         2                                                                                                                              

In [19]:
# Dump the whole frame as plain text again; in this recorded output the audio columns
# (sentence_group, speed, speech_volume, ambient_volume) come before the filter,
# translation and feature columns.
print(df.to_string())

            word  audio_complexity  position                                                                                                                                                             sentence  start_time  end_time  sentence_group     speed  speech_volume  ambient_volume  display  set_manually  process           translation  word_occurrence  word_complexity  sentence_complexity  word_importance
0              i          0.110596         1                                                                                                                                                          I am sorry.        0.62      1.02               1  0.622459       0.183038        0.796028    False         False     True                     i         0.000000              0.0             0.413319         0.003906
1             am          0.110596         2                                                                                                                              