In [1]:
# Requirements
import pandas as pd
import numpy as np
from madmom.features import DBNBeatTrackingProcessor, RNNBeatProcessor
from madmom.features import DBNDownBeatTrackingProcessor, RNNDownBeatProcessor
from scipy.io import wavfile
import numpy as np
import pandas as pd
import json
import pandas as pd
import networkx as nx

##### Track beats and downbeats, align them manually when possible
Functions to extract beats, downbeats, format detected data into handable dataframe format, compare and align data


In [2]:
def extract_beats(audio_path):
    """ Tracks beats using madmom DBNBeatTracker algorithm 
    Input: music audio wav file
    Output: timestamps of detected beats
    """
    proc = DBNBeatTrackingProcessor(fps=100, correct=True)
    act = RNNBeatProcessor()(audio_path)
    beats = proc(act)
    beats = pd.DataFrame(beats, columns=['timestamp'])
    beats['BeatOnsetLocation']=True
    return beats
    
def extract_downbeats(audio_path, sugg_beats_per_bar=[2, 3, 4, 6, 8, 9]):
    """ Tracks downbeats using madmom DBNDownBeatTracker algorithm 
    Input: music audio wav file, suggested possible number of beats per bar to be modeled
    Output: timestamps of detected beats, downbeats numerotation
    Note: downbeats might not reflect actual beats but a multiple or a fraction of them
    """
    proc = DBNDownBeatTrackingProcessor(beats_per_bar=sugg_beats_per_bar, fps=100, correct=True)
    act = RNNDownBeatProcessor()(audio_path)
    downbeats = proc(act)
    downbeats = pd.DataFrame(downbeats, columns=['timestamp', 'Beat'])
    downbeats['BeatOnsetLocation']=True
    return downbeats

def add_bar_number(df):
    """ Add column containing bar number to dataframe containing beats timestamps and positions within a bar
    Input: beats sequences with timestamps and positions within a bar
    Output: dataframe with extra column containing bar numerotation
    """
    df_ = df.copy()
    df_['Bar'] = df_.Beat.apply(lambda x: 1 if x==1 else 0)
    count_bar = 0
    for id, row in df_.iterrows():
        if row.Bar == 1:
            count_bar+=1
        df_.Bar.iloc[id] = count_bar
    return df_

def adjust_beats_downbeats(beats, downbeats, verbose=True):
    """ Compares beats and downbeats, matches them, filters relevant beats and extracts time signature period
    Input: beats and downbeats in dataframe format
    Output: beats sequences matched with time signature period
    """
    
    # Get time signature first estimation:
    bar_signature = downbeats.Beat.unique()
    beats_per_bar = max(bar_signature)
    if verbose:
        print("Downbeats detected by DBNDownBeatTrackingProcessor coupled with RNNDownBeatProcessor:", bar_signature)
        print("Corresponding time signature:", beats_per_bar)

    # 1. Compare sizes: 
    if verbose:
        print(f"Total number of beats detected by DBNBeatTrackingProcessor coupled with RNNBeatProcessor: {len(beats)}")
        print(f"Total number of downbeats detected by DBNDownBeatTrackingProcessor coupled with RNNDownBeatProcessor: {len(downbeats)}")
    # 2. Compare longest timestamp
    min_max_timestamp = min(max(beats.timestamp), max(downbeats.timestamp))
    # 3. Crop both sequences to minimum max timestamp
    beats_c = beats.query('timestamp<@min_max_timestamp')
    db_c = downbeats.query('timestamp<@min_max_timestamp')
    # 4. Compare sizes again
    comparable = len(beats_c)%len(db_c) == 0 or len(db_c)%len(beats_c) == 0 
    if verbose:
        print("After temporal alignment:")
        print(f"    >> Number of beats: {len(beats_c)}")
        print(f"    >> Number of downbeats: {len(db_c)}")
        print(f"    >> Comparable sizes ?", comparable)
    if not comparable:
        raise ValueError("Beats and downbeats sequences not comparable")
    # 5. Check which is a multiple of the other
    if comparable:
        if len(beats_c)%len(db_c) == 0:
            pass # do nothing yet bc should'nt be the case, but to test with other files to be sure
        if len(db_c)%len(beats_c) == 0:
            # check the multiple:
            multiple = int(len(db_c)/len(beats_c))
            # keep only one downbeat every 'multiple'
            db_filt = db_c.iloc[::multiple, :]
            
    # Get beats refined annotation:
    bar_signature_new = db_filt.Beat.unique()
    adj_time_signature = len(bar_signature_new)
    bar_signature_filt = np.arange(1, len(bar_signature_new)+1)
    if verbose:
        print(f"Beats left after alignment and filtering: {bar_signature_new}, corresponding time signature: {len(bar_signature_new)}")
        print("New annotation:", bar_signature_filt)

    db_filt = db_filt.replace({'Beat': dict(zip(bar_signature_new, bar_signature_filt))})
    db_filt = db_filt.reset_index(drop=True)
    db_filt = add_bar_number(db_filt)
    db_filt['Meter'] = str(adj_time_signature)+'/'+ str(adj_time_signature)
    db_filt['BeatOnsetIndex'] = db_filt.index + 1

    return db_filt

##### Load audio wav file into continuous time dataframe
Loads audio into dataframe format containing timestamps information

In [3]:
def load_audio(audio_path):
    """ Loads audio wav file into handable dataframe containing timestamps and channel values for each sample
    Input: audio wav file path
    Output: dataframe
    """
    sampling_rate, audio_data = wavfile.read(audio_path)
    audio_length = audio_data.shape[0]/sampling_rate
    timestamp = np.linspace(0., audio_length, audio_data.shape[0])
    df_audio = pd.DataFrame({'timestamp': timestamp, 'LeftChannel': audio_data[:,0], 'RightChannel':audio_data[:,1]})
    return df_audio, sampling_rate

##### Adjust beats data to compare with audio data
Rounds beats timestamps to comparable timestamps, labels beats to corresponding audio sample

In [4]:
def adjust_timestamp(df, sampling_rate):
    """Rounds beats timestamps to closest multiple of audio sampling period below, and adds column indicating sample index for identification ease
    Input: beats dataframe containing the beats timestamps
    Output: dataframe with adujsted timestamps and additional column sample_id identifiying to which audio sample the beats align to
    """
    df_ = df.copy()
    if not 'sample_id' in df_.columns:
        # Modify only if no modification has been done before
        df_['timestamp'] = df_['timestamp'].apply(lambda x: x - x%(1/sampling_rate))
        df_['sample_id'] = df_['timestamp'].apply(lambda x: x*sampling_rate)
        df_['sample_id'] = df_['sample_id'].apply(np.floor).apply(int)
    return df_

##### Import tree annotation labels
Read tree, extract leaves chords and positions and format to dataframe

In [5]:
def extract_annotated_chords_sequence(tree_path, output_csv_path=None):
    """ Extracts annotated chords ordered sequence from JSON tree
    Input: JSON tree file, where annotated chords are tree leaves
    Output: dataframe containing ordered chords, bar and beat within a bar positions
    """
    with open(tree_path, 'r') as json_file:
        tree_dict = json.load(json_file)
    graph = nx.json_graph.tree_graph(tree_dict, ident='label', children='children')
    tree_leaves = [x for x in graph.nodes() if graph.out_degree(x)==0 and graph.in_degree(x)==1]
    chords = []
    bars = []
    beats = []
    for x in tree_leaves:
        attr = x.split(sep=':')
        chords.append(attr[0])
        position = attr[1].split(sep='.')
        bars.append(int(position[0]))
        if len(position) > 1: # In case the beat position within a bar is specified
            beats.append(int(position[1]))
        else:
            beats.append(1) # In case beat position not specified, infer 1 as the beat position
    df_sequence = pd.DataFrame({'AnnotatedLabel': chords, 'Bar': bars, 'Beat': beats})
    
    if list(df_sequence.Bar) != list(df_sequence.Bar.sort_values(ascending=True)):
        raise "Sequence not correctly ordered"
    
    if output_csv_path!=None:
        df_sequence.to_csv(output_csv_path)
        
    return df_sequence

##### Merge beats and chords labels information to continuous audio dataframe and format to desired output

In [12]:
def merge_and_format(df_audio, df_beats, df_sequence):
    """ Merges audio, beats and annotated labels dataframes into a single output,
    aligns according to sample index, bar and beat number,
    adds additional columns to identify beats and labels onset.
    
    Input: audio, beats and annotated labels dataframes
    Output: dataframe summarizing audio, beats and labels annotations alignment
    """
    result = pd.merge(
        df_audio, df_beats, right_on="sample_id", left_index=True, how="left", sort=False
    )
    result['BeatOnsetLocation']=result['BeatOnsetLocation'].fillna(False)
    result[['Beat', 'Bar']] = result[['Beat', 'Bar']].fillna(method='ffill')
    result['Meter']=result['Meter'].fillna(method='bfill')
    result['Meter']=result['Meter'].fillna(method='ffill')
    result = result.drop(columns='timestamp_y').rename(columns={'timestamp_x':'timestamp'}).set_index('sample_id')
    final = pd.merge(result, df_sequence, on=['Bar', 'Beat'], how='outer')
    final['LabelOnset'] = False
    final.loc[~final.duplicated(subset=['Bar', 'Beat']) & ~final['AnnotatedLabel'].isna(), 'LabelOnset'] = True
    final.loc[~final['LabelOnset'], 'AnnotatedLabel'] = np.nan
    final['AnnotatedLabelRegion'] = final['AnnotatedLabel'].fillna(method='ffill')
    final = final[['timestamp', 'LeftChannel', 'RightChannel', 'Meter', 'Bar', 'Beat', 'BeatOnsetLocation', 'BeatOnsetIndex', 'LabelOnset', 'AnnotatedLabel', 'AnnotatedLabelRegion']]
    final[['Beat','Bar', 'BeatOnsetIndex', 'AnnotatedLabel','AnnotatedLabelRegion']] = final[['Beat','Bar', 'BeatOnsetIndex', 'AnnotatedLabel','AnnotatedLabelRegion']].replace({np.nan: 0})
    return final


##### In practice: application to working (and not working...) examples

Bach prelude in C example:

In [3]:
# Audio wav file
audio_file = "examples/BachC.wav"
# Annotated JSON tree
tree_path = "examples/BachC.json"
# Path to write output to in CSV format
output_name = "examples/BachC_output"
beats_output_path = output_name + "_beats_seq.csv"
output_path = output_name + ".csv"

In [13]:
beats = extract_beats(audio_file)
downbeats = extract_downbeats(audio_file)
df_beats_adj = adjust_beats_downbeats(beats, downbeats)

df_beats_adj.to_csv(beats_output_path)

df_audio, sampling_rate = load_audio(audio_file)
df_beats_adj = adjust_timestamp(df_beats_adj, sampling_rate)
df_tree = extract_annotated_chords_sequence(tree_path)
final_output = merge_and_format(df_audio, df_beats_adj, df_tree)

final_output.to_csv(output_path)

  file_sample_rate, signal = wavfile.read(filename, mmap=True)
  file_sample_rate, signal = wavfile.read(filename, mmap=True)


Downbeats detected by DBNDownBeatTrackingProcessor coupled with RNNDownBeatProcessor: [1. 2. 3. 4. 5. 6. 7. 8.]
Corresponding time signature: 8.0
Total number of beats detected by DBNBeatTrackingProcessor coupled with RNNBeatProcessor: 142
Total number of downbeats detected by DBNDownBeatTrackingProcessor coupled with RNNDownBeatProcessor: 275
After temporal alignment:
    >> Number of beats: 137
    >> Number of downbeats: 274
    >> Comparable sizes ? True
Beats left after alignment and filtering: [1. 3. 5. 7.], corresponding time signature: 4
New annotation: [1 2 3 4]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_.Bar.iloc[id] = count_bar
  sampling_rate, audio_data = wavfile.read(audio_path)


In [14]:
# For Bach C example, show first beat region in output:
final_output.iloc[33070:33080]

Unnamed: 0,timestamp,LeftChannel,RightChannel,Meter,Bar,Beat,BeatOnsetLocation,BeatOnsetIndex,LabelOnset,AnnotatedLabel,AnnotatedLabelRegion
33070,0.749887,5009920,6165504,4/4,0.0,0.0,False,0.0,False,0,0
33071,0.749909,5394944,5394944,4/4,0.0,0.0,False,0.0,False,0,0
33072,0.749932,4624384,6165760,4/4,0.0,0.0,False,0.0,False,0,0
33073,0.749955,4239104,6936832,4/4,0.0,0.0,False,0.0,False,0,0
33074,0.749977,4624128,6551040,4/4,1.0,1.0,True,1.0,True,C,C
33075,0.75,4239104,6551040,4/4,1.0,1.0,False,0.0,False,0,C
33076,0.750023,3083008,8478208,4/4,1.0,1.0,False,0.0,False,0,C
33077,0.750045,3853568,8092160,4/4,1.0,1.0,False,0.0,False,0,C
33078,0.750068,5009664,6551808,4/4,1.0,1.0,False,0.0,False,0,C
33079,0.750091,3853824,6165248,4/4,1.0,1.0,False,0.0,False,0,C


In [15]:
# Example of close data around the 5th beat
final_output.iloc[183892:183900]

Unnamed: 0,timestamp,LeftChannel,RightChannel,Meter,Bar,Beat,BeatOnsetLocation,BeatOnsetIndex,LabelOnset,AnnotatedLabel,AnnotatedLabelRegion
183892,4.169887,-169559808,-23892736,4/4,1.0,4.0,False,0.0,False,0,C
183893,4.16991,-165705728,-18882816,4/4,1.0,4.0,False,0.0,False,0,C
183894,4.169933,-161852160,-13872896,4/4,1.0,4.0,False,0.0,False,0,C
183895,4.169955,-156842752,-9248768,4/4,1.0,4.0,False,0.0,False,0,C
183896,4.169978,-149906176,-3853824,4/4,2.0,1.0,True,5.0,True,Dm7,Dm7
183897,4.170001,-141427712,2312448,4/4,2.0,1.0,False,0.0,False,0,Dm7
183898,4.170023,-133335552,9634048,4/4,2.0,1.0,False,0.0,False,0,Dm7
183899,4.170046,-124087040,18497280,4/4,2.0,1.0,False,0.0,False,0,Dm7


In [16]:
# Visualize rows corresponding to beat onsets
final_output[final_output['BeatOnsetLocation']]

Unnamed: 0,timestamp,LeftChannel,RightChannel,Meter,Bar,Beat,BeatOnsetLocation,BeatOnsetIndex,LabelOnset,AnnotatedLabel,AnnotatedLabelRegion
33074,0.749977,4624128,6551040,4/4,1.0,1.0,True,1.0,True,C,C
74528,1.689978,47784960,38921984,4/4,1.0,2.0,True,2.0,False,0,C
110690,2.509978,165320192,164935424,4/4,1.0,3.0,True,3.0,False,0,C
147735,3.350001,-38921728,-95954688,4/4,1.0,4.0,True,4.0,False,0,C
183896,4.169978,-149906176,-3853824,4/4,2.0,1.0,True,5.0,True,Dm7,Dm7
...,...,...,...,...,...,...,...,...,...,...,...
5056505,114.659998,610413824,510604800,4/4,34.0,1.0,True,133.0,True,G7,G7
5104133,115.739998,378040832,412723712,4/4,34.0,2.0,True,134.0,False,0,G7
5148674,116.749999,114452480,-45473024,4/4,34.0,3.0,True,135.0,False,0,G7
5196744,117.840021,36609280,50096896,4/4,34.0,4.0,True,136.0,False,0,G7


In [17]:
# Visualize rows corresponding to exact label onsets
final_output[final_output['LabelOnset']]

Unnamed: 0,timestamp,LeftChannel,RightChannel,Meter,Bar,Beat,BeatOnsetLocation,BeatOnsetIndex,LabelOnset,AnnotatedLabel,AnnotatedLabelRegion
33074,0.749977,4624128,6551040,4/4,1.0,1.0,True,1.0,True,C,C
183896,4.169978,-149906176,-3853824,4/4,2.0,1.0,True,5.0,True,Dm7,Dm7
327662,7.429979,95184128,57419264,4/4,3.0,1.0,True,9.0,True,G7,G7
470987,10.679979,-108673024,-45087488,4/4,4.0,1.0,True,13.0,True,C,C
616518,13.980003,589990144,727179264,4/4,5.0,1.0,True,17.0,True,Am,Am
765135,17.350003,-65511680,-72833536,4/4,6.0,1.0,True,21.0,True,D7,D7
913311,20.710004,75145728,266285824,4/4,7.0,1.0,True,25.0,True,G,G
1060605,24.050004,-15028992,-31600128,4/4,8.0,1.0,True,29.0,True,CM7,CM7
1204370,27.309982,214647040,51253248,4/4,9.0,1.0,True,33.0,True,Am,Am
1349900,30.609983,310601984,177652224,4/4,10.0,1.0,True,37.0,True,D7,D7


Bach prelude in Eb example

In [18]:
audio_file = "examples/BachEb.wav"
tree_path = "examples/BachEb.json"
output_path = "examples/BachEb_output.csv"

beats = extract_beats(audio_file)
downbeats = extract_downbeats(audio_file)

# Write temporary beats and downbeats tracked sequences
beats.to_csv("incomplete_beats_"+output_path)
downbeats.to_csv("incomplete_downbeats_"+output_path)

df_beats_adj = adjust_beats_downbeats(beats, downbeats) # Will output message error at this line
df_audio, sampling_rate = load_audio(audio_file)
df_beats_adj = adjust_timestamp(df_beats_adj, sampling_rate)
df_tree = extract_annotated_chords_sequence(tree_path)
final_output = merge_and_format(df_audio, df_beats_adj, df_tree)

final_output.to_csv(output_path)

  file_sample_rate, signal = wavfile.read(filename, mmap=True)
  file_sample_rate, signal = wavfile.read(filename, mmap=True)


Downbeats detected by DBNDownBeatTrackingProcessor coupled with RNNDownBeatProcessor: [1. 2.]
Corresponding time signature: 2.0
Total number of beats detected by DBNBeatTrackingProcessor coupled with RNNBeatProcessor: 245
Total number of downbeats detected by DBNDownBeatTrackingProcessor coupled with RNNDownBeatProcessor: 239
After temporal alignment:
    >> Number of beats: 234
    >> Number of downbeats: 238
    >> Comparable sizes ? False


ValueError: Beats and downbeats sequences not comparable

Bach Eb comment: downbeats tracking algorithm detects audio is binary but fails at recognizing 4 beats a bar. Additionnally, beats and downbeats tracking algorithms mismatch each other about 4 beats.

**Visualization tool:**

- Write df of beats, downbeats or adjusted sequence to CSV format with .to_csv()

- Load audio file into Sonic Visualiser and add CSV sequence as an annotation layer (for example for BachC: load BachC.wav and add BachC_output_beats_seq.csv as a layer, for Bach Eb load the incomplete beats and downbeats CSV files)