# Evaluation

## Preliminaries

### Running the models

We need to run each model on each of our test sets in both decoding modes (greedy and sampling) and store the results in the `out` subdirectory of the model directory. The four commands to run for each model are in the `run_model_eval.sh` script, which expects the model directory as an argument, e.g. `./run_model_eval.sh ../v01`. To speed this up, consider parallelizing the runs on a cluster and tuning the batch size.

**Note:** The script guesses whether to feed drums to the model by checking whether the model directory name contains `drums`. If your models are named differently or expect different sets of instruments, you will need to modify the script.

### Computing style profiles

For evaluation on the synthetic test set, we need to compute the reference style profiles for all styles in the test set. This is done using the script `compute.sh` in the `style_profiles` directory. The different types of style profiles are defined in the `config.yaml` and `config_drums.yaml` files.

In [None]:
!(cd style_profiles; ./compute.sh)

## Imports and definitions

In [None]:
import collections
import concurrent.futures as cf
import csv
import functools
import gzip
import itertools
import json
import os
import pickle

from note_seq import notebook_utils
from note_seq import sequences_lib
from note_seq import midi_synth
from note_seq.protobuf import music_pb2
from confugue import Configuration
from museflow.io.note_sequence_io import NoteSequenceDB
from museflow.note_sequence_utils import filter_sequence
import numpy as np
import pandas as pd
import scipy
from tqdm.notebook import tqdm

from groove2groove.eval.style_profiles import extract_all_stats
from groove2groove.eval.notes_chroma_similarity import chroma_similarity

In [None]:
DATA_DIR = '../../data/synth'
BODHIDHARMA_DIR = '../../data/bodhidharma'
STYLE_PROFILE_DIR = 'style_profiles'
STYLE_PROFILE_CFG_PATH = os.path.join(STYLE_PROFILE_DIR, 'config.yaml')
STYLE_PROFILE_DRUMS_CFG_PATH = os.path.join(STYLE_PROFILE_DIR, 'config_drums.yaml')
OUT_PREFIX = 'out/'
INSTRUMENTS = ['Bass', 'Piano', 'Guitar', 'Strings']
DRUMS = ['Drums']

In [None]:
# Load the reference style profiles: one JSON file per instrument, laid out as
# {profile type -> {style -> profile}}. They are re-indexed here as
# ref_profiles[instrument][style][profile type] -> numpy array.
ref_profiles = {}
for instr in INSTRUMENTS + DRUMS:
    instr_profiles = {}
    profile_path = os.path.join(STYLE_PROFILE_DIR, f'{instr}.json')
    with open(profile_path) as f:
        profiles_by_type = json.load(f)
    for profile_type, style_dict in profiles_by_type.items():
        for style, profile in style_dict.items():
            assert profile is not None
            if profile is None:
                continue
            instr_profiles.setdefault(style, {})[profile_type] = np.asarray(profile)
    ref_profiles[instr] = instr_profiles

In [None]:
def filter_sequences(sequences, **kwargs):
    """Apply `filter_sequence` (with `copy=True`) to each sequence and return the results as a list.

    Any extra keyword arguments are forwarded unchanged to `filter_sequence`.
    """
    filtered = []
    for seq in sequences:
        filtered.append(filter_sequence(seq, copy=True, **kwargs))
    return filtered

In [None]:
def cosine_similarity(hist1, hist2):
    """Return the cosine similarity of two histograms.

    Both inputs are flattened to 1-D vectors before being compared, so histograms of any
    dimensionality are accepted as long as they contain the same number of elements.

    Args:
        hist1: Array-like histogram.
        hist2: Array-like histogram with the same number of elements as `hist1`.

    Returns:
        One minus the cosine distance between the two flattened histograms.
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed to expose
    # `scipy.spatial` as an attribute.
    from scipy.spatial import distance

    # `distance.cosine` is defined for 1-D vectors. Passing `(1, n)` row vectors relied on SciPy
    # silently squeezing length-1 dimensions, which was deprecated in SciPy 1.7.
    return 1. - distance.cosine(np.ravel(hist1), np.ravel(hist2))

In [None]:
# Bind `extract_all_stats` to the style profile configuration for pitched instruments...
with open(STYLE_PROFILE_CFG_PATH, 'rb') as cfg_file:
    STYLE_PROFILE_FN = Configuration.from_yaml(cfg_file).bind(extract_all_stats)

# ...and to the separate configuration used for drums.
with open(STYLE_PROFILE_DRUMS_CFG_PATH, 'rb') as drums_cfg_file:
    STYLE_PROFILE_DRUMS_FN = Configuration.from_yaml(drums_cfg_file).bind(extract_all_stats)

def evaluate_style(sequences, ref_stats=None, ref_sequences=None, is_drum=False, separate_drums=False):
    """Measure how closely the style profile of `sequences` matches a reference profile.

    Args:
        sequences: Sequences to extract the style statistics from.
        ref_stats: Precomputed reference statistics, keyed by statistic name. If `None`, they
            are extracted from `ref_sequences` instead.
        ref_sequences: Reference sequences; only used when `ref_stats` is `None`.
        is_drum: Use the drum style profile configuration instead of the default one.
        separate_drums: If set together with `is_drum`, append `'_drums'` to the metric names.

    Returns:
        A dict mapping each statistic name present in both the extracted and the reference
        statistics to the cosine similarity between the two.
    """
    if is_drum:
        extract_fn = STYLE_PROFILE_DRUMS_FN
    else:
        extract_fn = STYLE_PROFILE_FN

    stats = extract_fn(data=sequences)
    if ref_stats is None:
        ref_stats = extract_fn(data=ref_sequences)

    suffix = '_drums' if is_drum and separate_drums else ''
    metrics = {}
    for name in stats:
        if name not in ref_stats:
            continue
        metrics[name + suffix] = cosine_similarity(stats[name], ref_stats[name])

    return metrics

In [None]:
def evaluate_content(sequence, reference):
    """Compute the content similarity between a sequence and a reference.

    Both sequences are first passed through `filter_sequence(..., drums=False)` (presumably to
    discard drum notes), then compared using `chroma_similarity`.

    Returns:
        A dict with a single key, `'content'`, holding the similarity value.
    """
    sequence_filtered = filter_sequence(sequence, drums=False, copy=True)
    reference_filtered = filter_sequence(reference, drums=False, copy=True)
    similarity = chroma_similarity(
        sequence_filtered, reference_filtered,
        sampling_rate=12, window_size=24, stride=12, use_velocity=False)
    return {'content': similarity}

In [None]:
def evaluate_content_par(sequences, references):
    """Run `evaluate_content` on paired sequences and references using a process pool.

    Returns:
        A list with one `evaluate_content` result per (sequence, reference) pair, in input order.
    """
    pairs = zip(sequences, references)
    with cf.ProcessPoolExecutor(max_workers=12) as pool:
        # Wrap the lazy result iterator in a progress bar and consume it before the pool closes.
        progress = tqdm(pool.map(_evaluate_content_par_task, pairs),
                        total=len(sequences), desc='content', leave=False)
        return list(progress)

def _evaluate_content_par_task(args):
    """Worker for `evaluate_content_par`: unpack a (sequence, reference) pair and evaluate it."""
    sequence, reference = args
    return evaluate_content(sequence, reference)

In [None]:
def evaluate_style_one_instrument(data, outputs, out_dict, instr, ref_instr=None, micro=False):
    """Evaluate one model on all style metrics for one instrument.

    Args:
        data: DataFrame with `src_style` and `tgt_style` columns, one row per output.
        outputs: The model outputs (note sequences), in the same order as the rows of `data`.
        out_dict: Dict of lists; one metrics dict per evaluated group is appended to
            `out_dict['style']`.
        instr: Name of the instrument to extract from the outputs.
        ref_instr: Instrument whose reference style profiles are used. Defaults to `instr`.
        micro: If true, group the outputs by (source style, target style) pairs; otherwise
            group them by target style only.
    """
    if not ref_instr:
        ref_instr = instr

    # Filter the outputs to include only the desired instrument, then join them with the metadata.
    # NOTE: the filtered outputs get a default integer index, so this join assumes that `data`
    # also has a default integer index.
    outputs_filtered = filter_sequences(outputs, instrument_re=f'^{instr}$')
    outputs_and_metadata = pd.concat([
        data[['src_style', 'tgt_style']],
        pd.Series(outputs_filtered, name='output')
    ], axis=1)

    # In the non-micro case, group by a scalar column name rather than a one-element list:
    # pandas >= 2.0 yields 1-tuples as group keys for a length-1 list, which would never match
    # the (string) style names in `ref_profiles` and would silently skip every group.
    group_cols = ['src_style', 'tgt_style'] if micro else 'tgt_style'
    grouped = outputs_and_metadata.groupby(group_cols)

    for key, df in tqdm(grouped, desc='style', leave=False):
        tgt_style = key[1] if micro else key
        if tgt_style not in ref_profiles[ref_instr]:
            # No reference profile for this style and instrument
            continue

        out_dict['style'].append(
            evaluate_style(df['output'], ref_stats=ref_profiles[ref_instr][tgt_style], is_drum=instr in DRUMS))

In [None]:
def evaluate_style_nano(data, outputs, out_dict, max_over_programs=False):
    """Compute the 'nano' style metrics for one model, one task per output, in a process pool.

    Each output is paired with the corresponding row of `data`, and the rows returned by the
    worker are appended to `out_dict['style']`.
    """
    task_fn = functools.partial(_evaluate_style_nano_par_task, max_over_programs=max_over_programs)
    with cf.ProcessPoolExecutor(max_workers=12) as pool:
        task_args = ((output, data_row) for output, (_, data_row) in zip(outputs, data.iterrows()))
        progress = tqdm(pool.map(task_fn, task_args), total=len(outputs), desc='style', leave=False)
        for out_rows in progress:
            out_dict['style'].extend(out_rows)

def _evaluate_style_nano_par_task(args, max_over_programs):
    """Worker for `evaluate_style_nano`: score one output against its own style input.

    Args:
        args: An `(output, data_row)` pair; `data_row` provides `style_seq`, `src_key` and
            `style_key`.
        max_over_programs: If true, compare every reference track with all output tracks of the
            same kind (drum / non-drum) and keep the per-metric maximum. Otherwise, compare it
            only with the output track that has the same program.

    Returns:
        A list with one metrics dict per (program, is_drum) combination found in the reference.
    """
    output, data_row = args
    reference = data_row['style_seq']

    def _programs(seq):
        # Distinct (program, is_drum) combinations occurring in the sequence, in sorted order
        return sorted({(note.program, note.is_drum) for note in seq.notes})

    out_rows = []
    for program, is_drum in _programs(reference):
        reference_filtered = filter_sequence(reference, programs=[program], drums=is_drum, copy=True)

        if max_over_programs:
            candidates = _programs(output)
        else:
            candidates = [(program, is_drum)]

        # Only compare drums with drums and pitched tracks with pitched tracks.
        candidate_metrics = [
            evaluate_style(
                [filter_sequence(output, programs=[out_program], drums=out_is_drum, copy=True)],
                ref_sequences=[reference_filtered], is_drum=is_drum)
            for out_program, out_is_drum in candidates
            if out_is_drum == is_drum
        ]

        out_row = dict(pd.DataFrame(candidate_metrics).max())
        out_row['src_key'] = data_row['src_key']
        out_row['style_key'] = data_row['style_key']
        out_rows.append(out_row)
    return out_rows

In [None]:
def melt_results(results, **kwargs):
    """Convert nested per-model, per-group result tables to a single long-format table.

    `results[model][group]` must be a DataFrame. Each one is melted (`kwargs` are forwarded to
    `DataFrame.melt`) and tagged with `group` and `model` columns before concatenation.
    """
    frames = []
    for model in results:
        per_group = results[model]
        frames.extend(
            per_group[group].melt(**kwargs).assign(group=group, model=model)
            for group in per_group
        )
    return pd.concat(frames)

We cache the metric values for each model in `metrics_cache`. The cache key combines the name of the output column with a hash of the serialized note sequences (plus an optional tag). This ensures that when we re-run the evaluation with new outputs, we save time by only recomputing the values we don't have yet. However, if you change the references or the metric definitions, you need to clear the cache; otherwise you will get incorrect results.

In [None]:
# Maps cache keys (see `metrics_cache_key`) to the per-metric results computed by `evaluate_all`
# for one column of outputs.
metrics_cache = {}

In [None]:
# Setting this to True forces all requested values to be re-computed from scratch and to
# overwrite the values in the cache. However, this does not clear old, unused keys from the cache
# (use `metrics_cache_clear` for that).
overwrite_cache = False

In [None]:
# Uncomment to restore a cache saved by a previous run. Only unpickle files that you trust.
# with open('metrics_cache.pickle', 'rb') as f:
#     metrics_cache = pickle.load(f)

In [None]:
def metrics_cache_key(outputs_series, tag=None):
    """Build the cache key for one column of outputs.

    The key is a tuple of the series name and the sum of the pandas hashes of the serialized
    sequences, followed by `tag` if a non-empty tag is given.
    """
    def _serialize(seq):
        return seq.SerializeToString()

    content_hash = pd.util.hash_pandas_object(outputs_series.map(_serialize)).sum()
    if not tag:
        return (outputs_series.name, content_hash)
    return (outputs_series.name, content_hash, tag)

def metrics_cache_clear(series_name):
    """Remove every entry of `metrics_cache` whose key starts with `series_name`."""
    # Iterate over a snapshot of the keys, since the dict is modified inside the loop.
    for cache_key in list(metrics_cache):
        if cache_key[0] == series_name:
            del metrics_cache[cache_key]

In [None]:
def evaluate_all(data, outputs, eval_style=True, tag=None):
    """Evaluate all models on all metrics.

    Results are cached per output column in `metrics_cache` (see `metrics_cache_key`); cached
    columns are not re-computed unless `overwrite_cache` is set.

    Args:
        data: DataFrame with per-example metadata, with the same index as `outputs`. It must
            contain a `src_seq` column (the content reference). If it has a `tgt_style` column,
            the style metrics are computed against the reference style profiles; otherwise the
            'nano' style metrics are computed (see `evaluate_style_nano`).
        outputs: DataFrame with one column of note sequences per model. The column named
            `'source'`, if present, is treated specially, since the source instruments do not
            necessarily correspond to the target ones.
        eval_style: Whether to compute the style metrics. If false, only the content metric is
            computed.
        tag: Optional tag to include in the cache key.

    Returns:
        A tuple `(results, results_err, metrics)` where `results` and `results_err` are
        DataFrames with one column per model, holding the mean and the standard deviation of
        each metric, and `metrics` maps each model to a dict of DataFrames with the individual
        metric values.
    """
    metrics = collections.defaultdict(lambda: collections.defaultdict(list))
    assert np.array_equal(outputs.index, data.index)

    for col in tqdm(outputs.columns):
        cache_key = metrics_cache_key(outputs[col], tag)
        if not overwrite_cache and cache_key in metrics_cache:
            metrics[col] = metrics_cache[cache_key]
            continue

        # Style metrics
        if eval_style:
            if 'tgt_style' in data.columns:
                # We have style labels
                if col == 'source':
                    # We don't know which source instrument to choose, so we compute the maximum over all instruments.
                    for ref_instr in tqdm(INSTRUMENTS + DRUMS, leave=False):
                        metrics_tmp = collections.defaultdict(list)
                        for src_instr in tqdm(INSTRUMENTS if ref_instr not in DRUMS else DRUMS, leave=False):
                            out_dict = collections.defaultdict(list)
                            evaluate_style_one_instrument(data, outputs[col], out_dict, src_instr, ref_instr,
                                                          micro=True)  # group by source-target style pairs
                            for key, vals in out_dict.items():
                                metrics_tmp[key].append(vals)

                        for key in metrics_tmp:
                            # Check that the lists are of the same length
                            assert len({len(m) for m in metrics_tmp[key]}) == 1
                            for metrics_to_maximize_over in zip(*metrics_tmp[key]):
                                metrics[col][key].append(dict(pd.DataFrame(metrics_to_maximize_over).max()))
                else:
                    for instr in tqdm(INSTRUMENTS + DRUMS, leave=False):
                        evaluate_style_one_instrument(data, outputs[col], metrics[col], instr)
            else:
                # No style labels; compute nano metrics
                evaluate_style_nano(data, outputs[col], metrics[col],
                                    max_over_programs=(col == 'source'))

        # Content metric
        outputs_and_metadata = pd.concat([
            data[['src_seq']],
            pd.Series(outputs[col], name='output')
        ], axis=1)
        metrics[col]['content'].extend(evaluate_content_par(*zip(*(
            (row['output'], row['src_seq']) for _, row in outputs_and_metadata.iterrows()))))

        metrics_cache[cache_key] = metrics[col].copy()

    # Convert the lists of per-example metric dicts to DataFrames
    metrics = {
        col: {
            m: pd.DataFrame(metrics[col][m])
            for m in metrics[col]
        }
        for col in metrics
    }

    results = pd.DataFrame()
    results_err = pd.DataFrame()
    for col in metrics:
        # Only aggregate the metric tables that exist: 'style' is missing when the style metrics
        # were not computed (e.g. eval_style=False), which previously raised a KeyError here.
        tables = [metrics[col][m] for m in ('style', 'content') if m in metrics[col]]
        # numeric_only=True skips the string key columns (src_key, style_key) that the nano
        # style metrics carry; pandas >= 2.0 no longer drops such columns silently.
        results = results.join(pd.concat(
            [table.mean(numeric_only=True) for table in tables]
        ).rename(col), how='outer')
        results_err = results_err.join(pd.concat(
            [table.std(numeric_only=True) for table in tables]
        ).rename(col), how='outer')

    return results, results_err, metrics

## Synthetic test set

In [None]:
# Evaluate the models on the synthetic test set, using the style labels from the metadata.

# Model directories (located in the parent directory) whose outputs will be evaluated
logdirs = ['v01', 'v01_vel', 'v01_drums', 'v01_drums_vel', 'v01_drums_vel_perf']
section = 'test'

# `data` holds the per-example metadata, `outputs` one column of note sequences per system
data = pd.DataFrame()
outputs = pd.DataFrame()

with gzip.open(os.path.join(DATA_DIR, section, 'final', 'meta.json.gz'), 'rb') as f:
    metadata = json.load(f)

# Load triplets of keys: source, style, target
with open(f'triplets_{section}.tsv') as f:
    key_triplets = list(csv.reader(f, delimiter='\t'))
    data['src_key'], data['style_key'], data['tgt_key'] = zip(*key_triplets)

# Add style names (the target style is the style of the style input)
data['src_style'] = pd.Series(metadata[key]['style'] for key in data['src_key'])
data['tgt_style'] = pd.Series(metadata[key]['style'] for key in data['style_key'])

# Load source and target sequences
with NoteSequenceDB(os.path.join(DATA_DIR, section, 'final', 'all.db')) as db, db.begin() as txn:
    outputs['source'], outputs['style'], outputs['target'] = zip(*((txn.get(src), txn.get(sty), txn.get(tgt)) for src, sty, tgt in key_triplets))
    for seq in itertools.chain(outputs['source'], outputs['style'], outputs['target']):
        for instrument_info in seq.instrument_infos:
            # Strip the 'BB ' prefix from the instrument names (presumably so that they match
            # the names in INSTRUMENTS and DRUMS)
            assert instrument_info.name.startswith('BB ')
            instrument_info.name = instrument_info.name[len('BB '):]
    # The source sequences also serve as the reference for the content metric
    data['src_seq'] = outputs['source']

# Load model outputs; each output is stored under the key '<source key>_<style key>'
for logdir in logdirs:
    for decoding in ['greedy', 'sample06']:
        col = f'{logdir}_{decoding}'
        with NoteSequenceDB(os.path.join('..', logdir, f'{OUT_PREFIX}{section}_{decoding}.db')) as db, db.begin() as txn:
            outputs[col] = pd.Series(txn.get(f'{src}_{style}') for src, style, _ in key_triplets)

# Compute the metrics
results_test, results_test_err, results_test_all = evaluate_all(data, outputs)
results_test_long = melt_results(results_test_all)

# Display the table of mean metric values
results_test

## Synthetic test set (nano)

In [None]:
# Evaluate the models on the synthetic test set again, this time without style labels: `data`
# gets no `tgt_style` column, so `evaluate_all` computes the 'nano' style metrics against the
# style inputs in `style_seq`.

# Model directories (located in the parent directory) whose outputs will be evaluated
logdirs = ['v01', 'v01_vel', 'v01_drums', 'v01_drums_vel', 'v01_drums_vel_perf']
section = 'test'

# `data` holds the per-example metadata, `outputs` one column of note sequences per system
data = pd.DataFrame()
outputs = pd.DataFrame()

with gzip.open(os.path.join(DATA_DIR, section, 'final', 'meta.json.gz'), 'rb') as f:
    metadata = json.load(f)

# Load triplets of keys: source, style, target
with open(f'triplets_{section}.tsv') as f:
    key_triplets = list(csv.reader(f, delimiter='\t'))
    data['src_key'], data['style_key'], data['tgt_key'] = zip(*key_triplets)

# Load source and target sequences
with NoteSequenceDB(os.path.join(DATA_DIR, section, 'final', 'all.db')) as db, db.begin() as txn:
    outputs['source'], outputs['style'], outputs['target'] = zip(*((txn.get(src), txn.get(sty), txn.get(tgt)) for src, sty, tgt in key_triplets))
    for seq in itertools.chain(outputs['source'], outputs['style'], outputs['target']):
        for instrument_info in seq.instrument_infos:
            # Strip the 'BB ' prefix from the instrument names
            assert instrument_info.name.startswith('BB ')
            instrument_info.name = instrument_info.name[len('BB '):]
    # References for the content metric and for the nano style metrics, respectively
    data['src_seq'] = outputs['source']
    data['style_seq'] = outputs['style']

# Load model outputs; each output is stored under the key '<source key>_<style key>'
for logdir in logdirs:
    for decoding in ['greedy', 'sample06']:
        col = f'{logdir}_{decoding}'
        with NoteSequenceDB(os.path.join('..', logdir, f'{OUT_PREFIX}{section}_{decoding}.db')) as db, db.begin() as txn:
            outputs[col] = pd.Series(txn.get(f'{src}_{style}') for src, style, _ in key_triplets)

# Compute the metrics. The tag keeps the cache entries separate from those of the evaluation
# with style labels, which uses the same output columns.
results_test_nano, results_test_nano_err, results_test_nano_all = evaluate_all(data, outputs, tag='nano')
results_test_nano_long = melt_results(results_test_nano_all)

# Display the table of mean metric values
results_test_nano

## Bodhidharma

In [None]:
# Evaluate the models on the Bodhidharma data. `data` gets no `tgt_style` column, so
# `evaluate_all` computes the 'nano' style metrics against the style inputs in `style_seq`.

# Model directories (located in the parent directory) whose outputs will be evaluated
logdirs = ['v01', 'v01_vel', 'v01_drums', 'v01_drums_vel', 'v01_drums_vel_perf']
section = 'bodh'

# `data` holds the per-example metadata, `outputs` one column of note sequences per system
data = pd.DataFrame()
outputs = pd.DataFrame()

with gzip.open(os.path.join(BODHIDHARMA_DIR, 'final', 'meta.json.gz'), 'rb') as f:
    metadata = json.load(f)

# Load pairs of keys: source, style
with open(f'pairs_{section}.tsv') as f:
    key_pairs = list(csv.reader(f, delimiter='\t'))
    data['src_key'], data['style_key'] = zip(*key_pairs)

# Load source and style sequences
with NoteSequenceDB(os.path.join(BODHIDHARMA_DIR, 'final', 'vel_norm_biab', 'all.db')) as db, db.begin() as txn:
    data['src_seq'], data['style_seq'] = zip(*((txn.get(src), txn.get(sty)) for src, sty in key_pairs))
outputs['source'] = data['src_seq']
outputs['style'] = data['style_seq']

# Load model outputs; each output is stored under the key '<source key>_<style key>'.
# An empty NoteSequence is passed as the default value for keys missing from the database.
for logdir in logdirs:
    for decoding in ['greedy', 'sample06']:
        col = f'{logdir}_{decoding}'
        with NoteSequenceDB(os.path.join('..', logdir, f'{OUT_PREFIX}{section}_{decoding}.db')) as db, db.begin() as txn:
            outputs[col] = pd.Series(txn.get(f'{src}_{style}', music_pb2.NoteSequence())
                                     for src, style in key_pairs)

# Compute the metrics
results_bodh, results_bodh_err, results_bodh_all = evaluate_all(data, outputs)
results_bodh_long = melt_results(results_bodh_all)

# Display the table of mean metric values
results_bodh

In [None]:
# Persist the metric cache to disk so that it can be reloaded in a later session.
with open('metrics_cache.pickle', 'wb') as cache_file:
    pickle.dump(metrics_cache, cache_file)