## imports


In [19]:
import sys
import os

# Add the parent of the current working directory to the import path.
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate entries onto sys.path.
PARENT_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)

In [20]:
from pathlib import Path
import json
import numpy as np
from rich.progress import (
    Progress,
    SpinnerColumn,
    TimeElapsedColumn,
    MofNCompleteColumn,
)
import numpy as np
import pretty_midi
import redis
from concurrent.futures import ProcessPoolExecutor, as_completed
from itertools import product

## functions

In [21]:
def update_best_match(
    redis_client: "redis.Redis",
    key: str,
    track_name_row: str,
    file_bm: str,
    sim_bm: float,
    metric: str,
) -> "list[str]":
    """
    Build the best-match list for one file after considering one candidate.

    The list currently stored at ``$.{metric}`` in the RedisJSON document
    ``key`` is read, the candidate is added unless its track is excluded, and
    the ten most similar entries are returned. Entries are strings of the form
    ``"<file>@<similarity>"``.

    NOTE: nothing is written back to Redis; the write is currently disabled,
    so the returned list is the only result of this call.

    Args:
        redis_client: Redis client object (only ``json().get`` is used).
        key (str): Key of the JSON document holding the per-metric lists.
        track_name_row (str): Track name of the file the list belongs to;
            candidates from this same track are rejected.
        file_bm (str): Candidate filename; its track name is the part before
            the first underscore.
        sim_bm (float): Similarity value associated with the candidate.
        metric (str): Name of the metric whose list is read.

    Returns:
        list[str]: At most ten entries sorted by similarity, highest first.
        When the candidate is rejected, the stored list is returned without
        the candidate (still capped at ten) instead of ``None``, so callers
        that assign the result never lose the list.
    """
    # the code reads the first element of the reply as the stored list;
    # a missing document/path yields None or an empty reply
    reply = redis_client.json().get(key, f"$.{metric}")
    best_matches = list(reply[0] or []) if reply else []

    # rsplit keeps a stray "@" inside a filename from corrupting the parse
    track_names = {
        entry.rsplit("@", 1)[0].split("_")[0] for entry in best_matches
    }

    # reject candidates whose track is already listed or is the row's own track
    new_track_name = file_bm.split("_")[0]
    if new_track_name in track_names or new_track_name == track_name_row:
        return best_matches[:10]

    # add the new entry and rank by similarity, best first
    best_matches.append(f"{file_bm}@{sim_bm}")
    best_matches.sort(key=lambda entry: float(entry.rsplit("@", 1)[1]), reverse=True)

    # redis_client.json().set(key, f"$.{metric}", best_matches[:3])

    return best_matches[:10]


In [22]:
def calc_sims(
    rows,
    all_rows,
    metric,
    mod_table,
    index,
):
    """
    Compare every file in ``rows`` against every file in ``all_rows``.

    For each (row, column) pair the row's untransposed metric vector (Redis
    key ``{metric}:{track}_{segment}:t00s00``) is compared by cosine
    similarity against each of the column's ``N_TRANSPOSITIONS`` transposed
    vectors (keys ``...:tTTs00``); the best similarity is passed to
    ``update_best_match``. A file compared with itself is given similarity 1.0.

    Relies on the module-level constant ``N_TRANSPOSITIONS`` and on
    ``update_best_match`` being defined before the call.

    Args:
        rows: File names handled by this worker; each must split into exactly
            three parts on "_" (track, segment, remainder).
        all_rows: File names to compare against (same naming scheme).
        metric: Metric name, used as the Redis key prefix.
        mod_table: Currently unused (the shift loop is disabled, see TODO
            below); kept so existing callers keep working.
        index: Worker number, used only in log/progress labels.

    Returns:
        The list returned by the most recent ``update_best_match`` call that
        produced one; an empty list if no call did.

    Raises:
        KeyError: If a required metric vector is not stored in Redis.
    """
    print(f"[SUBR{index:02d}] starting subprocess {index:02d}")
    r = redis.Redis(host="localhost", port=6379, db=0)
    best_matches = []

    def load_vector(key):
        # this client returns raw bytes; fail with the key name when it is
        # absent instead of an AttributeError on None
        raw = r.get(key)
        if raw is None:
            raise KeyError(f"no metric stored in Redis under {key!r}")
        return np.array([float(x) for x in raw.decode().split(",")])

    progress = Progress(
        SpinnerColumn(),
        *Progress.get_default_columns(),
        TimeElapsedColumn(),
        MofNCompleteColumn(),
        refresh_per_second=1,
    )
    sim_task = progress.add_task(
        f"[SUBR{index:02d}] calculating sims", total=len(rows) * len(all_rows)
    )
    with progress:
        for i, row_file in enumerate(rows):
            print(
                f"[SUBR{index:02d}] {i:04d}/{len(rows):04d} calculating sims for file {row_file}"
            )
            track_name_row, segment_name_row, _ = row_file.split("_")

            # the reference vector and its norm depend only on the row, so
            # fetch them once here instead of once per column
            main_metric = load_vector(
                f"{metric}:{track_name_row}_{segment_name_row}:t00s00"
            )
            main_norm = np.linalg.norm(main_metric)

            for col_file in all_rows:
                if col_file == row_file:
                    value = {
                        "sim": 1.0,
                        "mutations": {"transpose": 0, "shift": 0},
                        "row_file": row_file,
                        "col_file": col_file,
                        "metric": metric,
                    }
                else:
                    track_name_col, segment_name_col, _ = col_file.split("_")
                    sim_best_mutation = -1
                    best_shift = -1
                    best_trans = -1
                    for t in range(N_TRANSPOSITIONS):
                        # for t, s in mod_table:
                        s = 0  # TODO: REMOVE BEFORE SHIFTING
                        comp_metric = load_vector(
                            f"{metric}:{track_name_col}_{segment_name_col}:t{t:02d}s{s:02d}"
                        )
                        # cosine similarity; an all-zero vector has no
                        # direction, so score it 0.0 rather than dividing by 0
                        denom = main_norm * np.linalg.norm(comp_metric)
                        similarity = (
                            np.dot(main_metric, comp_metric) / denom if denom else 0.0
                        )

                        if similarity > sim_best_mutation:
                            sim_best_mutation = similarity
                            best_trans = t
                            best_shift = s

                    value = {
                        "sim": float(sim_best_mutation),
                        "mutations": {"transpose": best_trans, "shift": best_shift},
                        "row_file": row_file,
                        "col_file": col_file,
                        "metric": metric,
                    }

                # update comparison object
                # r.json().set(f"cmp:{row_file}:{col_file}:{metric}", "$", value)

                # update row file object; keep the previous list when the
                # helper reports nothing, so a skip never replaces it with None
                updated = update_best_match(
                    r,
                    f"file:{row_file}",
                    track_name_row,
                    col_file,
                    value["sim"],
                    metric,
                )
                if updated is not None:
                    best_matches = updated
                progress.advance(sim_task)

    print(f"[SUBR{index:02d}] subprocess complete")

    return best_matches

In [23]:
def calc_sims_vectorized(
    rows,
    all_rows,
    metric,
    index,
):
    """
    Vectorized comparison of every file in ``rows`` against ``all_rows``.

    The transposed metric vectors of all column files (Redis keys
    ``{metric}:{track}_{segment}:tTTs00`` for ``N_TRANSPOSITIONS``
    transpositions) are loaded once into a single array. For each row file its
    untransposed vector (``...:t00s00``) is then compared against that array
    by cosine similarity, and the best transposition per column is passed to
    ``update_best_match``. A file compared with itself is given similarity 1.0.

    Relies on the module-level constant ``N_TRANSPOSITIONS`` and on
    ``update_best_match`` being defined before the call.

    Args:
        rows: File names handled by this worker; each must split into exactly
            three parts on "_" (track, segment, remainder).
        all_rows: File names to compare against (same naming scheme).
        metric: Metric name, used as the Redis key prefix.
        index: Worker number, used only in log/progress labels.

    Returns:
        The list returned by the most recent ``update_best_match`` call that
        produced one; an empty list if no call did.

    Raises:
        KeyError: If a required metric vector is not stored in Redis.
    """
    print(f"[SUBR{index:02d}] starting subprocess {index:02d}")
    r = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)
    best_matches = []

    def load_vector(key):
        raw = r.get(key)
        if raw is None:
            raise KeyError(f"no metric stored in Redis under {key!r}")
        # with decode_responses=True the reply is already a str; calling
        # .decode() on it raises AttributeError. Bytes are still tolerated.
        if isinstance(raw, bytes):
            raw = raw.decode()
        return np.array([float(x) for x in raw.split(",")])

    progress = Progress(
        SpinnerColumn(),
        *Progress.get_default_columns(),
        TimeElapsedColumn(),
        MofNCompleteColumn(),
        refresh_per_second=1,
    )
    sim_task = progress.add_task(
        f"[SUBR{index:02d}] calculating sims", total=len(rows) * len(all_rows)
    )

    # Column vectors and their norms do not depend on the row, so they are
    # loaded on the first row only (once the vector length is known) and
    # reused for every later row.
    all_metrics = None
    col_norms = None

    with progress:
        for i, row_file in enumerate(rows):
            print(
                f"[SUBR{index:02d}] {i:04d}/{len(rows):04d} calculating sims for file {row_file}"
            )
            track_name_row, segment_name_row, _ = row_file.split("_")

            # Get the main metric for the current row
            main_metric = load_vector(
                f"{metric}:{track_name_row}_{segment_name_row}:t00s00"
            )

            if all_metrics is None:
                # shape: (column file, transposition, vector component)
                all_metrics = np.zeros(
                    (len(all_rows), N_TRANSPOSITIONS, len(main_metric))
                )
                for j, col_file in enumerate(all_rows):
                    track_name_col, segment_name_col, _ = col_file.split("_")
                    for t in range(N_TRANSPOSITIONS):
                        all_metrics[j, t] = load_vector(
                            f"{metric}:{track_name_col}_{segment_name_col}:t{t:02d}s00"
                        )
                col_norms = np.linalg.norm(all_metrics, axis=2)

            # Cosine similarity against every column and transposition at once
            dots = np.einsum("i,jki->jk", main_metric, all_metrics)
            denom = np.linalg.norm(main_metric) * col_norms
            # an all-zero vector has no direction: score it 0.0 instead of
            # producing NaN/inf from a division by zero
            zero_mask = denom == 0
            denom[zero_mask] = 1.0
            similarities = dots / denom
            similarities[zero_mask] = 0.0

            # Find best similarities and corresponding transpositions
            best_similarities = np.max(similarities, axis=1)
            best_transpositions = np.argmax(similarities, axis=1)

            for col_file, sim, trans in zip(
                all_rows, best_similarities, best_transpositions
            ):
                if col_file == row_file:
                    value = {
                        "sim": 1.0,
                        "mutations": {"transpose": 0, "shift": 0},
                        "row_file": row_file,
                        "col_file": col_file,
                        "metric": metric,
                    }
                else:
                    value = {
                        "sim": float(sim),
                        "mutations": {"transpose": int(trans), "shift": 0},
                        "row_file": row_file,
                        "col_file": col_file,
                        "metric": metric,
                    }

                # Update row file object; keep the previous list when the
                # helper reports nothing, so a skip never replaces it with None
                updated = update_best_match(
                    r,
                    f"file:{row_file}",
                    track_name_row,
                    col_file,
                    value["sim"],
                    metric,
                )
                if updated is not None:
                    best_matches = updated
                progress.advance(sim_task)

    print(f"[SUBR{index:02d}] subprocess complete")
    return best_matches

## init

In [24]:
# Dataset location and comparison settings.
P_DATASET = "../data/datasets/test/train"
METRIC = "pitch_histogram"
N_BEATS = 8
N_TRANSPOSITIONS = 12

# File names in the dataset directory with the ".mid" extension stripped, sorted.
names = [name[:-4] for name in os.listdir(P_DATASET) if name.endswith(".mid")]
names.sort()

# os.cpu_count() returns None when the count cannot be determined; fall back
# to a single chunk so array_split always receives a valid integer.
num_processes = os.cpu_count() or 1
split_keys = np.array_split(names, num_processes)
# every (transposition, beat) index pair
mod_table = list(product(range(N_TRANSPOSITIONS), range(N_BEATS)))
r = redis.Redis(host="localhost", port=6379, db=0)

## run

In [25]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
calc_sims_vectorized(split_keys[0], names, METRIC, 0)
end_time = time.perf_counter()

execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

Output()

[SUBR00] starting subprocess 00
