## Hearing Your Way Through Music Recordings:  A Text Alignment and Synthesis Approach &ndash; Demo Notebook

In this notebook, we demonstrate the use of this processing pipeline with four case studies:
1. [Case Study: Measure Numbers](#1.-Case-Study:-Measure-Numbers) (Schubert Winterreise, Beethoven Piano Sonatas)
2. [Case Study: Chords and Harmonies](#2.-Case-Study:-Chords-and-Harmonies) (Beethoven Piano Sonatas, Schubert Winterreise)
3. [Case Study: Leitmotifs](#3.-Case-Study:-Leitmotifs) (Wagner Operas)
4. [Case Study: Structure](#4.-Case-Study:-Structure) (Beethoven Piano Sonatas)

In [1]:
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import torch
import pandas as pd
import librosa
import IPython.display as ipd

import textalignsynth

In [2]:
# global settings
# Flag forwarded as `show_progress_bar` to the commenter calls in the case studies.
show_progress_bar = True

# Root directory under which the case-study cells look up audio and annotations.
data_basedir = './data'

# Use the GPU when torch reports that CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"{device=}")

def play_segment(audio, fs, play_segment_sec=(0, 10)):
    """Display an inline audio player for a recording or an excerpt of it.

    Args:
        audio: Sample sequence supporting slicing along its first axis
            (the notebook passes the array returned by ``librosa.load``).
        fs: Sampling rate in Hz, used both to convert seconds to sample
            indices and as the playback rate.
        play_segment_sec: ``(start, end)`` of the excerpt in seconds
            (ints or floats), or ``None`` to play the whole recording.
            Defaults to the first ten seconds.
    """
    if play_segment_sec is None:
        ipd.display(ipd.Audio(audio, rate=fs))
        return

    # Slice bounds must be integers: round so that fractional second values
    # (e.g. 0.5) or a float sampling rate do not raise a TypeError.
    start_idx = int(round(play_segment_sec[0] * fs))
    end_idx = int(round(play_segment_sec[1] * fs))
    ipd.display(ipd.Audio(audio[start_idx:end_idx], rate=fs))

device='cuda'


## 1. Case Study: Measure Numbers

### a) Schubert Winterreise

<img src=figures/measures.jpg width="600">

In [3]:
# Input selection: one recording and the matching measure annotation file.
fn = 'Schubert_D911-22_SC06'
dataset_dir = os.path.join(data_basedir, 'Schubert_Winterreise')
path_audio = os.path.join(dataset_dir, '01_RawData', 'audio_wav')
path_annot = os.path.join(dataset_dir, '02_Annotations', 'ann_audio_measure')

# Decode the recording; no `sr` is passed, so librosa's default rate applies.
audio_file = os.path.join(path_audio, f'{fn}.wav')
x, x_fs = librosa.load(audio_file)

print('Original recording:')
play_segment(x, x_fs)

# Text comment generation: the ';'-separated annotation rows are handed to
# get_measure_comments with start=1 and step=1.
annot_file = os.path.join(path_annot, f'{fn}.csv')
x_measure_annot = pd.read_csv(annot_file, sep=';').values.tolist()
measure_comment_list = textalignsynth.get_measure_comments(
    x_measure_annot, start=1, step=1
)

# Text-to-speech synthesis, post-processing, and superposition.
commenter = textalignsynth.Commenter(device=device, language='en')

# Keyword arguments forwarded unchanged to the commenter call below.
comment_params = dict(
    speed=1.0,
    t_min=None,
    t_max=None,
    pos_rel=0.1,
    pos_offset_abs=0.0,
    offset_loc=-5.0,
    offset_glob=-5.0,
    w_glob_loc=0.5,
)

x_commented = commenter(
    x, x_fs, measure_comment_list,
    show_progress_bar=show_progress_bar,
    **comment_params,
)

print('Commented recording:')
play_segment(x_commented, x_fs)

Original recording:


100%|██████████| 64/64 [00:02<00:00, 31.33it/s]

Commented recording:





### b) Beethoven Piano Sonatas

In [4]:
# Locate the Beethoven recording and its measure annotations.
beethoven_root = os.path.join(data_basedir, 'BeethovenPianoSonatas')
path_audio = os.path.join(beethoven_root, '1_Audio')
path_annot = os.path.join(beethoven_root, '2_Annotations', 'ann_audio_measure')
fn = 'Beethoven_Op002No1-01_FG58'

wav_path = os.path.join(path_audio, f'{fn}.wav')
csv_path = os.path.join(path_annot, f'{fn}.csv')

# Load the waveform together with the sampling rate reported by librosa.
x, x_fs = librosa.load(wav_path)

print('Original recording:')
play_segment(x, x_fs)

# Text comment generation from the rows of the ';'-separated annotation file.
x_measure_annot = pd.read_csv(csv_path, sep=';').values.tolist()
measure_comment_list = textalignsynth.get_measure_comments(
    x_measure_annot,
    start=1,
    step=1,
)

# Text-to-speech synthesis, post-processing, and superposition
# (this cell requests language 'de' from the commenter).
commenter = textalignsynth.Commenter(device=device, language='de')

# Settings passed as keyword arguments to the commenter.
comment_params = dict(
    speed=1.1,
    t_min=None,
    t_max=0.8,
    pos_rel=0.1,
    pos_offset_abs=0.0,
    offset_loc=-5.0,
    offset_glob=-5.0,
    w_glob_loc=0.5,
)

x_commented = commenter(
    x,
    x_fs,
    measure_comment_list,
    show_progress_bar=show_progress_bar,
    **comment_params,
)

print('Commented recording:')
play_segment(x_commented, x_fs)

Original recording:


100%|██████████| 200/200 [00:15<00:00, 12.95it/s]

Commented recording:





## 2. Case Study: Chords and Harmonies

<img src=figures/chords.jpg width="500">

### a) Beethoven Piano Sonatas

In [5]:
# Recording identifier plus the audio / chord-annotation directories.
fn = 'Beethoven_Op002No1-01_FG58'
sonatas_dir = os.path.join(data_basedir, 'BeethovenPianoSonatas')
path_audio = os.path.join(sonatas_dir, '1_Audio')
path_annot = os.path.join(sonatas_dir, '2_Annotations', 'ann_audio_chord')

# Read the audio samples and the sampling rate.
x, x_fs = librosa.load(
    os.path.join(path_audio, f'{fn}.wav')
)

print('Original recording:')
play_segment(x, x_fs)

# Text comment generation: only the 'start' and 'majmin' columns of the
# ';'-separated annotation file are kept and passed on as a list of rows.
chord_table = pd.read_csv(os.path.join(path_annot, f'{fn}.csv'), sep=';')
chord_annot_list = chord_table[['start', 'majmin']].values.tolist()
chord_comment_list = textalignsynth.get_chord_comments(
    chord_annot_list,
    filter_valid=True,
    remove_repeated=True,
)

# Text-to-speech synthesis, post-processing, and superposition.
commenter = textalignsynth.Commenter(device=device, language='en')

# Keyword arguments for the commenter call.
comment_params = dict(
    speed=1.35,
    t_min=None,
    t_max=0.75,
    pos_rel=0.0,
    pos_offset_abs=0.0,
    offset_loc=-5.0,
    offset_glob=-5.0,
    w_glob_loc=0.5,
)

x_commented = commenter(
    x, x_fs, chord_comment_list,
    show_progress_bar=show_progress_bar,
    **comment_params,
)

print('Commented recording:')
play_segment(x_commented, x_fs)

Original recording:


100%|██████████| 231/231 [00:11<00:00, 19.28it/s]

Commented recording:





### b) Schubert Winterreise

In [6]:
# Paths to the Winterreise audio files and chord annotations.
winterreise_dir = os.path.join(data_basedir, 'Schubert_Winterreise')
path_audio = os.path.join(winterreise_dir, '01_RawData', 'audio_wav')
path_annot = os.path.join(winterreise_dir, '02_Annotations', 'ann_audio_chord')

fn = 'Schubert_D911-22_SC06'
recording_path = os.path.join(path_audio, f'{fn}.wav')
annotation_path = os.path.join(path_annot, f'{fn}.csv')

# Load samples and sampling rate of the selected recording.
x, x_fs = librosa.load(recording_path)

print('Original recording:')
play_segment(x, x_fs)

# Text comment generation from the 'start' / 'majmin' annotation columns.
chord_columns = ['start', 'majmin']
chord_annot_list = (
    pd.read_csv(annotation_path, sep=';')[chord_columns]
    .values
    .tolist()
)
chord_comment_list = textalignsynth.get_chord_comments(
    chord_annot_list, filter_valid=True, remove_repeated=True
)

# Text-to-speech synthesis, post-processing, and superposition.
commenter = textalignsynth.Commenter(device=device, language='en')

# Options handed to the commenter as keyword arguments.
comment_params = dict(
    speed=1.3,
    t_min=None,
    t_max=None,
    pos_rel=0.1,
    pos_offset_abs=0.0,
    offset_loc=-5.0,
    offset_glob=-5.0,
    w_glob_loc=0.5,
)

x_commented = commenter(
    x, x_fs, chord_comment_list,
    show_progress_bar=show_progress_bar,
    **comment_params,
)

print('Commented recording:')
play_segment(x_commented, x_fs)

Original recording:


100%|██████████| 74/74 [00:03<00:00, 19.79it/s]

Commented recording:





## 3. Case Study: Leitmotifs

### Wagner Operas

<img src=figures/leitmotifs.jpg width="500">

In [7]:
# Select the Wagner recording and the leitmotif annotation directory.
fn = 'Wagner_WWV086D-1_Krauss1953'
wagner_dir = os.path.join(data_basedir, 'WagnerRingShortened')
path_audio = os.path.join(wagner_dir, '01_RawData', 'audio_wav')
path_annot = os.path.join(wagner_dir, '02_Annotations', 'ann_audio_leitmotifs')

# Load samples and sampling rate.
x, x_fs = librosa.load(os.path.join(path_audio, f'{fn}.wav'))

print('Original recording:')
play_segment(x, x_fs)

# Text comment generation: annotations are ordered by their 'start' column
# before the ['start', 'motif'] pairs are extracted.
leitmotif_annot = (
    pd.read_csv(os.path.join(path_annot, f'{fn}.csv'), sep=";")
    .sort_values(by='start')
)
leitmotif_annot_list = leitmotif_annot[['start', 'motif']].values.tolist()
leitmotif_comment_list = textalignsynth.get_leitmotif_comments(
    leitmotif_annot_list
)

# Text-to-speech synthesis, post-processing, and superposition
# (language 'de' is requested from the commenter in this cell).
commenter = textalignsynth.Commenter(device=device, language='de')

# Keyword arguments passed through to the commenter.
comment_params = dict(
    speed=0.8,
    t_min=None,
    t_max=None,
    pos_rel=1.0,
    pos_offset_abs=0.0,
    offset_loc=0.0,
    offset_glob=0.0,
    w_glob_loc=0.5,
)

x_commented = commenter(
    x, x_fs, leitmotif_comment_list,
    show_progress_bar=show_progress_bar,
    **comment_params,
)

print('Commented recording:')
play_segment(x_commented, x_fs)

Original recording:


100%|██████████| 6/6 [00:00<00:00, 17.14it/s]

Commented recording:





## 4. Case Study: Structure

### Beethoven Piano Sonatas

<img src=figures/structure.jpg width="500">

In [8]:
# Directories for the Beethoven audio and the fine-grained structure annotations.
sonata_base = os.path.join(data_basedir, 'BeethovenPianoSonatas')
path_audio = os.path.join(sonata_base, '1_Audio')
path_annot = os.path.join(sonata_base, '2_Annotations', 'ann_audio_structureFine')

fn = 'Beethoven_Op002No1-01_FG58'

# Load the recording (samples and sampling rate).
wav_file = os.path.join(path_audio, f'{fn}.wav')
x, x_fs = librosa.load(wav_file)

print('Original recording:')
play_segment(x, x_fs)

# Text comment generation from the 'start' / 'structure' annotation columns.
structure_table = pd.read_csv(os.path.join(path_annot, f'{fn}.csv'), sep=';')
structure_annot_list = structure_table[['start', 'structure']].values.tolist()
structure_comment_list = textalignsynth.get_structure_comments(structure_annot_list)

# Text-to-speech synthesis, post-processing, and superposition.
commenter = textalignsynth.Commenter(device=device, language='en')

# Keyword arguments supplied to the commenter call.
comment_params = dict(
    speed=1.0,
    t_min=None,
    t_max=None,
    pos_rel=1.0,
    pos_offset_abs=0.0,
    offset_loc=-5.0,
    offset_glob=-5.0,
    w_glob_loc=0.0,
)

x_commented = commenter(
    x, x_fs, structure_comment_list,
    show_progress_bar=show_progress_bar,
    **comment_params,
)

print('Commented recording:')
play_segment(x_commented, x_fs)

Original recording:


100%|██████████| 13/13 [00:00<00:00, 17.77it/s]

Commented recording:



