## Use Madmom for Chords Recognition

In [1]:
# Install python libray (TODO: Add to requirement.txt for Docker)

# pip install pandas==1.3.5
# pip install numpy==1.19.5
# pip install scipy==1.10
# pip install matplotlib==3.6
# pip install madmom

# install ffmpeg on Ubuntu
# apt update
# apt install ffmpeg

In [1]:
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import pytorch_lightning as pl
from scipy.spatial.distance import cosine

import sys
sys.path.insert(0, '/app')
from evaluate import load_pitchclass2vec_model

from collections import defaultdict
from itertools import groupby
import re

print("done")
# RANDOM_SEED = 42
# pl.seed_everything(seed=RANDOM_SEED)

done


In [2]:
# 导入所需的模块
import numpy as np
from madmom.features.chords import CNNChordFeatureProcessor, CRFChordRecognitionProcessor
from madmom.processors import SequentialProcessor

# 音频文件路径
audio_file = '/app/jie_test_music/Drake_Passionfruit.mp3'

# 创建一个特征提取器实例
feature_processor = CNNChordFeatureProcessor()

# 创建一个和弦识别器实例
chord_recognizer = CRFChordRecognitionProcessor()

# 将两个处理器串联成一个序列处理器
sequential_processor = SequentialProcessor([feature_processor, chord_recognizer])

# 应用处理器到音频文件上，识别和弦
chords = sequential_processor(audio_file)

# 打印识别出的和弦
for chord in chords:
    start, end, label = chord
    print(f"Start: {start:.2f}, End: {end:.2f}, Chord: {label}")

# 如果你想将和弦信息保存到一个文件中，你可以这样做：
# with open('/app/jie_test_music/Drake_Passionfruit_chords.txt', 'w') as f:
#     for chord in chords:
#         f.write(f"{chord[0]:.2f}\t{chord[1]:.2f}\t{chord[2]}\n")

print("done")

Start: 0.00, End: 4.30, Chord: E:maj
Start: 4.30, End: 7.50, Chord: C#:min
Start: 7.50, End: 10.40, Chord: F#:maj
Start: 10.40, End: 11.60, Chord: D#:min
Start: 11.60, End: 13.00, Chord: B:maj
Start: 13.00, End: 13.90, Chord: G#:min
Start: 13.90, End: 15.20, Chord: F#:maj
Start: 15.20, End: 22.20, Chord: E:maj
Start: 22.20, End: 25.00, Chord: C#:min
Start: 25.00, End: 25.80, Chord: F#:maj
Start: 25.80, End: 26.20, Chord: D#:min
Start: 26.20, End: 30.20, Chord: B:maj
Start: 30.20, End: 31.00, Chord: D#:min
Start: 31.00, End: 32.20, Chord: F#:maj
Start: 32.20, End: 38.60, Chord: E:maj
Start: 38.60, End: 41.20, Chord: N
Start: 41.20, End: 42.50, Chord: A:maj
Start: 42.50, End: 47.30, Chord: B:maj
Start: 47.30, End: 48.50, Chord: G#:min
Start: 48.50, End: 49.40, Chord: G#:maj
Start: 49.40, End: 50.70, Chord: N
Start: 50.70, End: 53.80, Chord: E:maj
Start: 53.80, End: 54.80, Chord: C:maj
Start: 54.80, End: 56.10, Chord: E:maj
Start: 56.10, End: 58.30, Chord: N
Start: 58.30, End: 59.00, Chor

In [3]:
# print the info of chords array
print(len(chords))
print(type(chords))
print(chords[0:3])

115
<class 'numpy.ndarray'>
[(0. ,  4.3, 'E:maj') (4.3,  7.5, 'C#:min') (7.5, 10.4, 'F#:maj')]


In [4]:
# Embed the chord by embedding model (store in /out)
def embed_chord(p2v, c):
    try:
        return p2v[c]
    except:
        return p2v["N"]

# Load the embedding model
p2v = load_pitchclass2vec_model("root-interval", "fasttext", "/app/out/first_run_with_whole_ChocoDataSet.ckpt")
print("done")

done


In [5]:
chords_str = [chord[2] for chord in chords]
embedded_chord_test = embed_chord(p2v,chords_str[0])

corpus_embedded = np.stack([
    np.mean(np.array([embed_chord(p2v, c) for c in x]), axis=0) for x in chords
])

print(f"embedded_chord_test.shape: {embedded_chord_test.shape}") #The length is 100 because we set 'embedding_dim': 100 in embedding trainning process
print(f"corpus_embedded.shape: {corpus_embedded.shape}",end='\n')
print('\n')
print(f"embedded_chord_test: {embedded_chord_test}")
# corpus_embedded = np.stack([
#     np.mean(np.array([embed_chord(p2v, c) for c in x]), axis=0) for x in chords
# ])

embedded_chord_test.shape: (100,)
corpus_embedded.shape: (115, 100)


embedded_chord_test: [ 1.5023417   1.4714259  -1.8749816  -0.19704878  0.32284945  1.6470559
 -0.1832118   0.4308313   0.1617653  -2.3423574  -0.94857323 -1.3680321
  1.8316647   2.4073575  -0.2145201   1.2042994   0.42670178 -2.555657
 -3.300817   -0.6872867   0.09748369 -1.297121   -2.6374292   4.260552
  0.5044306  -2.362741   -0.6666601  -2.8838537   1.414785   -0.41518974
 -0.6042384  -0.8536912  -1.0217173   0.46472374  1.2799902   1.5056918
  1.0058668   3.063817    1.0289304  -0.2772213   0.95502794  1.1806269
  1.7442743  -1.0042367  -1.8859016   3.0172048   2.3404994   2.7114666
  0.02019075 -0.02266622 -2.360383   -0.86830443 -0.7102659   1.6148638
  0.9213932  -1.3960586   0.41863513  2.1318412  -0.6925771   5.36633
 -0.15068114  1.1545246  -1.5879536   1.1653125  -0.6584067   3.99355
 -1.8545915  -1.0097904  -1.0863103  -0.6686289  -3.2702658   1.3489337
 -1.1630509   1.3578134   2.557721    0.84791994  

In [6]:
from tasks.segmentation.functional import LSTMBaselineModel
import torch 

# Obtain correct input format
corpus_embedded = torch.tensor(corpus_embedded).unsqueeze(0)

# Use LSTM model for prediction:
CKPT_PATH = '/app/segmentation_out/third_run.ckpt.ckpt'


model = LSTMBaselineModel.load_from_checkpoint(CKPT_PATH)


model.eval() # evalutaion mode

# Move to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
corpus_embedded = corpus_embedded.to(device)


with torch.no_grad():
    predictions = model.evaluation_forward(corpus_embedded)



Result after classification:tensor([[[-1.3092,  0.7941,  0.0061,  ..., -1.3589, -0.7658,  1.0796],
         [-1.8537,  0.9083, -0.1224,  ..., -1.4891, -1.0478,  1.4031],
         [-2.0752,  1.0064, -0.0923,  ..., -1.5634, -1.1503,  1.6124],
         ...,
         [ 1.2023,  3.1554,  1.2440,  ..., -1.6685,  0.7210,  3.3394],
         [ 1.0413,  3.0076,  1.1419,  ..., -1.6034,  0.6502,  3.1438],
         [ 0.7760,  2.6215,  0.9623,  ..., -1.4010,  0.5920,  2.6845]]],
       device='cuda:0')
Result after softmax: tensor([[[-1.3092,  0.7941,  0.0061,  ..., -1.3589, -0.7658,  1.0796],
         [-1.8537,  0.9083, -0.1224,  ..., -1.4891, -1.0478,  1.4031],
         [-2.0752,  1.0064, -0.0923,  ..., -1.5634, -1.1503,  1.6124],
         ...,
         [ 1.2023,  3.1554,  1.2440,  ..., -1.6685,  0.7210,  3.3394],
         [ 1.0413,  3.0076,  1.1419,  ..., -1.6034,  0.6502,  3.1438],
         [ 0.7760,  2.6215,  0.9623,  ..., -1.4010,  0.5920,  2.6845]]],
       device='cuda:0')


In [11]:
print(predictions[0].shape)
print(predictions[1].shape)

torch.Size([1, 115, 11])
torch.Size([1, 115, 11])


In [26]:
predicted_labels = torch.argmax(predictions[1], dim=-1)  # shape will be (1, 115)
print(f"Input: chords_str: {chords_str}")
print('\n')
print(f"Output: predicted_labels: {predicted_labels}")

Input: chords_str: ['E:maj', 'C#:min', 'F#:maj', 'D#:min', 'B:maj', 'G#:min', 'F#:maj', 'E:maj', 'C#:min', 'F#:maj', 'D#:min', 'B:maj', 'D#:min', 'F#:maj', 'E:maj', 'N', 'A:maj', 'B:maj', 'G#:min', 'G#:maj', 'N', 'E:maj', 'C:maj', 'E:maj', 'N', 'B:maj', 'E:maj', 'C#:min', 'F#:maj', 'B:maj', 'D#:min', 'G#:min', 'F#:maj', 'E:maj', 'C#:min', 'B:maj', 'D#:min', 'F#:maj', 'E:maj', 'C#:min', 'F#:maj', 'B:maj', 'F#:maj', 'E:maj', 'C#:min', 'B:maj', 'F#:maj', 'E:maj', 'C#:min', 'F#:maj', 'D#:min', 'B:maj', 'D#:min', 'F#:maj', 'E:maj', 'C#:min', 'F#:maj', 'D#:min', 'B:maj', 'G#:maj', 'E:maj', 'C#:min', 'B:maj', 'D#:min', 'F#:maj', 'E:maj', 'C#:min', 'F#:maj', 'D#:min', 'B:maj', 'D#:min', 'F#:maj', 'E:maj', 'C#:min', 'B:maj', 'D#:min', 'B:maj', 'F#:maj', 'E:maj', 'C#:min', 'F#:maj', 'D#:min', 'B:maj', 'D#:min', 'F#:maj', 'E:maj', 'N', 'C#:maj', 'N', 'D#:maj', 'F#:maj', 'D#:maj', 'D#:min', 'B:maj', 'D#:min', 'N', 'D#:maj', 'B:maj', 'D#:maj', 'C#:maj', 'F#:maj', 'E:maj', 'C#:min', 'F#:maj', 'D#:mi