# Structure Representation Computation from Audio Input

## > Library importing

In [1]:
#Computation
import numpy as np
import scipy
import matplotlib.pyplot as plt

#Data Processing
import sklearn.cluster

#Audio
import librosa
from librosa import display

#System
import glob
import os

## > Loading audio

In [2]:
#Choose directory containing audiofiles
directory = '../../Music'

In [1]:
#Read all paths in specified directory
all_filepaths = []
all_names= []
for root, dirs, files in os.walk(directory):
        for name in files:
            if (('.wav' in name) or ('.aif' in name) or ('.mp3' in name)):
                filepath = os.path.join(root, name)
                all_filepaths.append(filepath)
                all_names.append(name)

#Dictionary containing all batches of matrices as described by pipeline documentation in a linearized, sequential format
X = {}

#Load all audiofiles and store in array
all_audio = []
for i in range(len(all_filepaths)):
    y, sr = librosa.load(filepath, sr=22050, mono=True)
    all_audio.append((y, sr))
    sys.stdout.write("\rLoaded %i/%s pieces." % ((i+1), str(len(all_filepaths))))
    sys.stdout.flush()
    
X["audio"] = all_audio

NameError: name 'os' is not defined

## > Self Similarity for Repetitions

### >> Single-Feature Self Similarity Matrix (no feature fusion on this script)

In [11]:
#Pipeline of primary features to use to compute self similarity
#{stft, log_power_CQT, perceptually_weighted_CQT, mel_spectrogram}

all_stft = []
all_logCQT = []
all_perCQT = []
all_melspec = []

for filepath in range(len(all_filepaths)):
    
    #STFT
    stft = librosa.stft(y=all_audio[filepath][0])
    all_stft.append(stft)

    #Log-power Constant-Q Transform
    bins_per_oct = 12*3
    n_oct = 7
    CQT = librosa.cqt(y=all_audio[filepath][0], sr=all_audio[filepath][1], bins_per_octave=bins_per_oct, n_bins=n_oct*bins_per_oct)
    all_logCQT.append(librosa.amplitude_to_db(CQT))

    #Perceptually-weighted Constant-Q Transform
    freqs = librosa.cqt_frequencies(CQT.shape[0], fmin=librosa.note_to_hz('A1'))
    all_perCQT.append(librosa.perceptual_weighting(np.abs(CQT)**2, freqs))

    #Mel-scaled Spectrogram
    all_melspec.append(librosa.feature.melspectrogram(S=np.abs(stft)**2, sr=all_audio[filepath][1]))

    sys.stdout.write("\rComputed spectral representations for %i/%s pieces." % ((filepath+1), str(len(all_filepaths))))
    sys.stdout.flush()
    
spectral_rep = {"stft":all_stft, "logCQT":all_logCQT, "perCQT":all_perCQT, "melspec":all_melspec}
X["spectral_rep"]=spectral_rep

Computed spectral representations for 8/8 pieces.

In [22]:
#Plotting
fig, axs = plt.subplots(nrows=len(all_filepaths), ncols=4)
for i in range(len(all_filepaths)):
    for j in range(4):
        axs[i, j].matshow(all_spectral_rep[i][j]])
        axs[i, j].set_title(all_names[i])
    # plt.tight_layout()

SyntaxError: unexpected EOF while parsing (<ipython-input-22-2e80232896fd>, line 7)

### >> Dimensionality Reduction

In [11]:
#Select one dimensionality reduction method out of {none, beat-synchronization, 2d-interpolation}
dim_reduction = 'beat-synchronization'

In [12]:
#Replace all spectral representations with their dimensionality reduced version
for audiofile in range(len(all_filepaths)):
    tempo, beats = librosa.beat.beat_track(y=all_audio[audiofile][0], sr=all_audio[audiofile][1], trim=False)

    if dim_reduction=='none':
        break
    if dim_reduction=='beat-synchronization':
        for spec_rep in range(len(Srep[audiofile])):
            Srep[audiofile][spec_rep] = librosa.util.sync(all_spectral_representations[audiofile][spec_rep], beats, aggregate=np.median)

    if dim_reduction=='2d_interpolation': #NEEDS TO BE EVALUATED (is this working as expected, do other methods of downsampling make more sense?)
        for spec_rep in range(len(Srep[audiofile])):
            #Compute interpolation function
            Xindex = np.linspace(0, 1, num=Srep[audiofile][spec_rep].shape[0])
            Yindex = np.linspace(0, 1, num=Srep[audiofile][spec_rep].shape[1])
            f = interp2d(Xindex, Yindex, num=Srep[audiofile][spec_rep].flatten(), kind='linear')
            #Generate new ranges
            Xindex_ds = np.linspace(0, 1, num=Srep[audiofile][spec_rep].shape[0]/50)
            Yindex_ds = np.linspace(0, 1, num=Srep[audiofile][spec_rep].shape[1]/50)
            #Resample
            Srep[audiofile][spec_rep] = np.reshape(f(Xindex_ds, Yindex_ds), (shape[0]/50, shape[1]/50))
    sys.stdout.write("\rDownlsampled %i/%s pieces." % ((audiofile+1), str(len(all_filepaths))))
    sys.stdout.flush()

NameError: name 'Srep' is not defined

In [None]:
#Plotting

### >> Short-term History Embedding of spectral representation

In [None]:
#Choose number of lag steps
n_steps = 4

In [None]:
Cstack = librosa.feature.stack_memory(Csync, 4)

### >> Weighted Recurrence Matrix

### >> Timelag filter / Short-term History Embedding(?) / Windowed Majority Vote