In [None]:
# !/usr/bin/python3 -m pip install --upgrade pip
# !pip install -e ..
# !pip install nvidia-cudnn-cu11

In [None]:
import os
import uuid
from pathlib import Path
from typing import List
from tqdm import tqdm
import json

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

import tensorflow as tf

import numpy as np
import pandas as pd
from midistral.audio_analysis import (
    genre_classes,
    get_chords,
    get_mood_and_genre,
    instruments_classes,
    mood_classes,
)
from midistral.midi_utils import get_instruments, get_midi_and_ogg_from_abc
from midistral.types import AudioTextDescription, InferenceApproach
from midistral.audio_analysis import SIMPLIFIED_MOODS, SIMPLIFIED_GENRES, get_simplified_genres, get_simplified_moods
from midistral.infer import generate_abc_notation
from sklearn.metrics import classification_report
import itertools

In [None]:
def get_vector(genres: List[str], labels: List[str]):
    """Encode the active tags of one sample as a multi-hot vector over ``labels``.

    Args:
        genres: Tags that are active for the sample. Despite the parameter
            name, any tag family works as long as it is drawn from ``labels``.
        labels: Ordered vocabulary; position ``i`` of the result corresponds
            to ``labels[i]``.

    Returns:
        A list of ``len(labels)`` integers holding 1 at the position of every
        active tag and 0 everywhere else.

    Raises:
        ValueError: If a tag in ``genres`` does not occur in ``labels``.
    """
    # ``list.index`` maps each tag to its first position and raises on unknown tags.
    active_positions = {labels.index(tag) for tag in genres}
    return [1 if position in active_positions else 0 for position in range(len(labels))]


def evaluate_prediction(
    labels: List[str],
    predicted: List[List[str]],
    ground_truth: List[List[str]],
    output_dict: bool = True,
):
    """Score multi-label tag predictions with scikit-learn's classification report.

    Each sample's tag list is converted to a multi-hot row with ``get_vector``
    so that predictions and ground truth become two aligned indicator matrices.

    Args:
        labels: Ordered tag vocabulary; also used as the report's target names.
        predicted: Predicted tags, one list per sample.
        ground_truth: Expected tags, one list per sample, in the same sample
            order as ``predicted``.
        output_dict: Forwarded to ``classification_report``.

    Returns:
        Whatever ``classification_report`` returns for the two matrices.

    Raises:
        ValueError: If any tag is missing from ``labels`` (raised by ``get_vector``).
    """
    # Predictions are encoded first, then ground truth, one row per sample.
    y_pred = np.array([get_vector(sample_tags, labels) for sample_tags in predicted])
    y_true = np.array([get_vector(sample_tags, labels) for sample_tags in ground_truth])

    return classification_report(
        y_true,
        y_pred,
        target_names=labels,
        output_dict=output_dict,
    )


In [None]:
# Project folders are resolved relative to the notebook's working directory.
NOTEBOOKS_FOLDER = Path.cwd()
OUTPUT_FOLDER = NOTEBOOKS_FOLDER.parent / "output"
DATA_FOLDER = NOTEBOOKS_FOLDER.parent / "data"

# JSON file listing constraint combinations, each under a "constraints" key.
num_samples_by_constraint_p = OUTPUT_FOLDER.joinpath(
    "datasets", "num_samples_by_constraint.json"
)

TEST_CASES = []
with num_samples_by_constraint_p.open("r") as f:
    constraints_split = json.load(f)

# Keep only the combinations that constrain at least one of genre, mood or instruments.
TEST_CASES.extend(
    [
        entry["constraints"]
        for entry in constraints_split
        if any(
            len(entry["constraints"][key]) > 0
            for key in ("genre", "mood", "instrument_summary")
        )
    ]
)

# Debug override: uncomment to run the notebook on a single hand-written test case.
# TEST_CASES = [{"instrument_summary": ["trumpet"], "mood": ["calm"], "genre": ["emotional"]}]

In [None]:
for approach in [InferenceApproach.FINETUNED_2]:
# for approach in [InferenceApproach.PROMPT_ONLY, InferenceApproach.RAG]:
    # One JSONL annotation file and one scratch audio folder per inference approach.
    ANNOTATION_OUTPUT_PATH = OUTPUT_FOLDER / f"annotations_{approach.value}_output.jsonl"
    TMP_AUDIO_FOLDER = OUTPUT_FOLDER / "tmp_audio" / approach.value
    TMP_AUDIO_FOLDER.mkdir(exist_ok=True, parents=True)

    # NOTE: the file is opened in append mode, so re-running this cell adds rows to
    # any existing annotation file instead of replacing it.
    with ANNOTATION_OUTPUT_PATH.open('a', encoding='utf-8') as annotation_f:
        for ind, r in enumerate(tqdm(TEST_CASES)):
            instrument_summary_gt = [i.lower() for i in r["instrument_summary"]]

            # run inference
            # Debug stub: uncomment to bypass the model with a fixed ABC tune.
            # abc_notation_text = "X: 1\nM: 4/4\nL: 1/8\nQ:1/4=120\nK:D\nV:1\n%%MIDI program 0\n G/2G/2c/2A/2| B/2B/2d/2G/2| A/2A/2F/2G/2| B/2B/2d/2G/2|G/2G/2c/2A/2| B/2B/2d/2G/2| A/2A/2F/2G/2| B/2B/2d/2G/2|G/2G/2c/2A/2| B/2B/2d/2G/2| A/2A/2F/2G/2| B/2B/2d/2G/2| B/2B/2d/2G/2| A/2A/2F/2G/2| B/2B/2d/2G/2| B/2B/2d/2G/2| A/2A/2F/2G/2| B/2B/2d/2G/2|\n"
            des = AudioTextDescription(genre=r["genre"], mood=r["mood"], instruments=instrument_summary_gt, midi_instruments_num=None)
            abc_notation_text, text_description = generate_abc_notation(des, approach)

            # generate audio
            try:
                midi, ogg = get_midi_and_ogg_from_abc(abc_notation_text)
                file_uuid = str(uuid.uuid4())
                midi_path = TMP_AUDIO_FOLDER / f"{file_uuid}.midi"
                ogg_path = TMP_AUDIO_FOLDER / f"{file_uuid}.ogg"
                # TMP_AUDIO_FOLDER was created above, so the files can be written directly.
                midi_path.write_bytes(midi)
                ogg_path.write_bytes(ogg)

                # analyse it
                # The chord results are not stored in the row; a failure here is
                # handled by the outer handler like any other audio failure.
                chords_out, chord_summary, chord_summary_occurence = get_chords(ogg_path)
                try:
                    mood_tags, mood_cs, genre_tags, genre_cs = get_mood_and_genre(ogg_path)
                except Exception as e:
                    print(e)
                    mood_tags, mood_cs, genre_tags, genre_cs = [], [], [], []

                try:
                    instrument_numbers_sorted, instrument_summary = get_instruments(midi_path)
                except Exception as e:
                    print(e)
                    instrument_numbers_sorted, instrument_summary = [], []
            except Exception as e:
                # Best effort: a tune that cannot be rendered or analysed is still logged,
                # with empty predictions. Report the cause instead of swallowing it silently.
                print(f"[test case {ind}] audio generation/analysis failed: {e!r}")
                file_uuid = None
                mood_tags, mood_cs, genre_tags, genre_cs = [], [], [], []
                instrument_numbers_sorted, instrument_summary = [], []

            # log it
            row = {
                "file_uuids": file_uuid,
                "abc_notation_texts": abc_notation_text,
                "text_descriptions": text_description,
                "mood_preds": mood_tags,
                "mood_cs": mood_cs,
                # Only the first two predicted tags are simplified and kept.
                "simplified_mood_preds": get_simplified_moods(mood_tags[:2]),
                "simplified_mood_gt": r["mood"],
                "genre_tags": genre_tags,
                "genre_cs": genre_cs,
                "simplified_genre_preds": get_simplified_genres(genre_tags[:2]),
                "simplified_genre_gt": r["genre"],
                "instruments_preds": [i.lower() for i in instrument_summary],
                "instruments_gt": instrument_summary_gt,
            }

            # Flush after every row so finished rows are not left sitting in the write buffer.
            annotation_f.write(json.dumps(row) + "\n")
            annotation_f.flush()

            # if ind > 10:
            #     break


In [None]:
def _mask_unconstrained(
    ground_truth: List[List[str]], predicted: List[List[str]]
) -> List[List[str]]:
    """Blank the predictions of samples whose ground truth lists no tag.

    A sample with an empty ground-truth list had no constraint for that tag
    family, so whatever was predicted for it is replaced by an empty list.

    Args:
        ground_truth: Expected tags, one list per sample.
        predicted: Predicted tags, one list per sample, in the same order.

    Returns:
        A new list with the prediction kept where the ground truth is
        non-empty and ``[]`` everywhere else.
    """
    return [
        pred if len(gt) > 0 else []
        for gt, pred in zip(ground_truth, predicted)
    ]


for approach in [InferenceApproach.FINETUNED_2]:
# for approach in [InferenceApproach.PROMPT_ONLY, InferenceApproach.FINETUNED_2, InferenceApproach.RAG]:

    print(approach)

    # Read back the JSONL annotation file stored for this approach (one row per test case).
    ANNOTATION_OUTPUT_PATH = OUTPUT_FOLDER / f"annotations_{approach.value}_output.jsonl"
    df = pd.read_json(ANNOTATION_OUTPUT_PATH, lines=True)

    # genre
    genre_gt_l = df["simplified_genre_gt"].to_list()
    genre_preds_l = df["simplified_genre_preds"].to_list()
    no_constraints_genre_preds_l = _mask_unconstrained(genre_gt_l, genre_preds_l)
    genre_report = evaluate_prediction(
        genre_classes, no_constraints_genre_preds_l, genre_gt_l
    )

    # mood
    mood_gt_l = df["simplified_mood_gt"].to_list()
    mood_preds_l = df["simplified_mood_preds"].to_list()
    no_constraints_mood_preds_l = _mask_unconstrained(mood_gt_l, mood_preds_l)
    mood_report = evaluate_prediction(
        mood_classes, no_constraints_mood_preds_l, mood_gt_l
    )

    # instrument
    instruments_gt_l = df["instruments_gt"].to_list()
    instruments_preds_l = df["instruments_preds"].to_list()
    no_constraints_instruments_preds_l = _mask_unconstrained(
        instruments_gt_l, instruments_preds_l
    )
    instruments_report = evaluate_prediction(
        instruments_classes,
        no_constraints_instruments_preds_l,
        instruments_gt_l
    )
    print(instruments_report)

    # Summary: micro-averaged scores of each tag family.
    print(f"genre : {genre_report['micro avg']}")
    print(f"mood  : {mood_report['micro avg']}")
    print(f"inst  : {instruments_report['micro avg']}")

In [None]:
# Sanity check on a two-sample toy input: the first sample has no tags on either
# side, the second predicts three moods where only "emotional" is expected.
evaluate_prediction(
    labels=mood_classes,
    predicted=[[], ["calm", "emotional", "positive"]],
    ground_truth=[[], ["emotional"]],
)