## Use Madmom for Chord Recognition

In [None]:
# Install Python libraries (TODO: add to requirement.txt for Docker)

# pip install pandas==1.3.5
# pip install numpy==1.19.5
# pip install scipy==1.10
# pip install matplotlib==3.6
# pip install madmom

# install ffmpeg on Ubuntu
# apt update
# apt install ffmpeg

In [None]:
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import pytorch_lightning as pl
from scipy.spatial.distance import cosine

import sys
sys.path.insert(0, '/app')
from evaluate import load_pitchclass2vec_model

from collections import defaultdict
from itertools import groupby
import re

# Signal that every import in this cell resolved.
print("done")

# Seed the random number generators once, through Lightning's helper.
RANDOM_SEED = 42
pl.seed_everything(RANDOM_SEED)

In [7]:
# Import the required modules
import numpy as np
from madmom.features.chords import CNNChordFeatureProcessor, CRFChordRecognitionProcessor
from madmom.processors import SequentialProcessor


# Stage 1: instantiate the CNN chord feature extractor.
feature_processor = CNNChordFeatureProcessor()

# Stage 2: instantiate the CRF chord recognizer.
chord_recognizer = CRFChordRecognitionProcessor()

# Chain both stages so that one call runs them one after the other.
processing_stages = [feature_processor, chord_recognizer]
sequential_processor = SequentialProcessor(processing_stages)


print("done")

done


In [8]:
# Path of the audio file to analyse
audio_file = r"/Users/jie/dev/DBMS/jie_test_music/Bitch_Dont_Kill_My_Vibe.mp3"
# Run the processor chain on the audio file to recognise its chords
chords = sequential_processor(audio_file)

# Every entry unpacks into (start, end, label); print one line per chord.
for start, end, label in chords:
    print(f"Start: {start:.2f}, End: {end:.2f}, Chord: {label}")

print("done")

Start: 0.00, End: 1.00, Chord: N
Start: 1.00, End: 2.30, Chord: F:min
Start: 2.30, End: 8.40, Chord: C#:maj
Start: 8.40, End: 9.50, Chord: F:min
Start: 9.50, End: 15.40, Chord: C#:maj
Start: 15.40, End: 16.40, Chord: F:min
Start: 16.40, End: 16.80, Chord: G#:maj
Start: 16.80, End: 20.10, Chord: C#:maj
Start: 20.10, End: 20.90, Chord: G#:maj
Start: 20.90, End: 21.80, Chord: A#:min
Start: 21.80, End: 22.40, Chord: D#:maj
Start: 22.40, End: 23.50, Chord: F:min
Start: 23.50, End: 23.90, Chord: G#:maj
Start: 23.90, End: 28.90, Chord: C#:maj
Start: 28.90, End: 29.70, Chord: G#:maj
Start: 29.70, End: 30.60, Chord: F:min
Start: 30.60, End: 31.10, Chord: G#:maj
Start: 31.10, End: 36.10, Chord: C#:maj
Start: 36.10, End: 37.10, Chord: D#:maj
Start: 37.10, End: 38.30, Chord: F:min
Start: 38.30, End: 44.30, Chord: C#:maj
Start: 44.30, End: 45.30, Chord: F:min
Start: 45.30, End: 55.80, Chord: C#:maj
Start: 55.80, End: 58.70, Chord: N
Start: 58.70, End: 59.80, Chord: F:min
Start: 59.80, End: 65.70, C

In [9]:
# Show the container type and the shape of the recognised chords, one per line.
print(type(chords), chords.shape, sep="\n")

<class 'numpy.ndarray'>
(132,)


In [4]:
# Compute the chroma of a song with madmom's DeepChromaProcessor.
# Reference: https://madmom.readthedocs.io/en/v0.16/_modules/madmom/audio/chroma.html#DeepChromaProcessor
from madmom.audio.chroma import DeepChromaProcessor

audio_file = r"/Users/jie/dev/DBMS/jie_test_music/Bitch_Dont_Kill_My_Vibe.mp3"
dcp = DeepChromaProcessor()
chroma = dcp(audio_file)

In [5]:
# Show the container type and the shape of the chroma result, one per line.
print(type(chroma), chroma.shape, sep="\n")

<class 'numpy.ndarray'>
(3104, 12)


In [None]:
# Summarise the chords array: entry count, container type, first three entries.
for summary in (len(chords), type(chords), chords[:3]):
    print(summary)

In [None]:
from pitchclass2vec.pitchclass2vec import NaiveEmbeddingModel
from pitchclass2vec import encoding, model

# Chord encoding class handed to the embedding model.
encoder = encoding.RootIntervalDataset

embedding_model = NaiveEmbeddingModel(
    encoding_model=encoder,
    # Author's rationale: dim=3 because each of the '24 basic chords' only contains 3 notes.
    embedding_dim=3,
    norm=False,
)

 

In [None]:
# from tasks.segmentation.functional import LSTMBaselineModel
from tasks.segmentation.deeplearning_models.lstm import LSTMBaselineModel
import torch 

# Embed the chord by embedding model (store in /out)
def embed_chord(p2v, c):
    """Return the embedding of chord ``c``, falling back to the "N" embedding.

    Args:
        p2v: Embedding lookup that is indexed by chord label (``p2v[label]``).
        c: Chord label to embed.

    Returns:
        ``p2v[c]`` when that lookup succeeds, otherwise ``p2v["N"]``.
    """
    try:
        return p2v[c]
    except Exception:
        # Best-effort fallback for any label the model cannot embed. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate instead of being silently turned into an "N" embedding.
        return p2v["N"]

# Load the pre-trained embedding model
# p2v = load_pitchclass2vec_model("root-interval", "fasttext", "/app/out/root_interval_best/root-interval-fasttext-with-Processed-ChoCo.ckpt")
print("done: loaded the model")

# Load the pre-trained LSTM model for prediction.
# NOTE: this rebinds `model`, shadowing the `model` module imported from
# pitchclass2vec in an earlier cell.
CKPT_PATH = '/app/segmentation_out/18_run.ckpt'
model = LSTMBaselineModel.load_from_checkpoint(CKPT_PATH)

# Obtain the embedded chords: one embedding per recognised chord label.
# Each entry of `chords` is a (start, end, label) record, so only field 2 is a
# chord symbol. Iterating over the whole record (as the previous version did)
# also "embedded" the start/end times and averaged them into the result.
chords_str = [chord[2] for chord in chords]
corpus_embedded = np.stack([embed_chord(embedding_model, label) for label in chords_str])

print(f"corpus_embedded.shape: {corpus_embedded.shape}",end='\n')


# Obtain correct input format: add a batch dimension in the first position
corpus_embedded = torch.tensor(corpus_embedded).unsqueeze(0)
print(f"corpus_embedded.shape: {corpus_embedded.shape}",end='\n')
# ------------------------------------------------------------------------------------------------------------

# Evaluation mode, then move the model and the input to the GPU when one is available
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# .float() casts the input to float32 before it is moved to the device
corpus_embedded = corpus_embedded.float().to(device)



# Prediction: per the original note, returns (x, y) where x is the classification
# output and y is x after softmax — TODO confirm against LSTMBaselineModel.
with torch.no_grad():
    predictions = model.evaluation_forward(corpus_embedded)

print(f"Prediction shape: {predictions[1].shape}")



In [None]:
# Pick the highest-scoring class index along the last axis of the second
# prediction output. The original note expected shape (1, 115); presumably
# that is (batch, number of chords) — TODO confirm.
predicted_labels = predictions[1].argmax(dim=-1)

print(f"Input: chords_str: {chords_str}")
print('\n')
print(f"Output: predicted_labels: {predicted_labels}")

In [None]:
# Labels predicted for the first item along the leading axis.
label_list = predicted_labels[0]

# Keep index i whenever the label at i differs from the label at i + 1.
change_point = [
    idx
    for idx, (current, following) in enumerate(zip(label_list, label_list[1:]))
    if current != following
]

# Field 1 of a chord entry is its end time; collect it at every change point.
end_time = [chords[idx][1] for idx in change_point]
print(f"end_time: {end_time}")

In [None]:
from sklearn.preprocessing import OneHotEncoder
predicted_indices = torch.argmax(predictions[1], dim=-1)

# Convert the class indices into one-hot vectors
predicted_onehot = torch.zeros_like(predictions[1]).scatter_(-1, predicted_indices.unsqueeze(-1), 1)

# Convert to a NumPy array and flatten any leading axes so the result is 2-D
# (rows x classes), which is the input layout OneHotEncoder.inverse_transform expects.
predicted_onehot = predicted_onehot.cpu().numpy()
predicted_onehot = predicted_onehot.reshape(-1, predicted_onehot.shape[-1])

# Use inverse_transform to map the one-hot vectors back to the original labels.
# FIXME: the original cell left this assignment empty (a syntax error). Assign
# the OneHotEncoder that was fitted on the training labels; it is not available
# anywhere in this notebook.
label_encoder = None
if label_encoder is None:
    raise RuntimeError(
        "label_encoder is not set: assign the fitted OneHotEncoder before decoding the labels"
    )
predicted_labels = label_encoder.inverse_transform(predicted_onehot)

print(f"Predicted labels: {predicted_labels}")