## Inference and Permutation Importance for GBDT

In this notebook, we run inference with a trained gradient-boosted decision tree (GBDT) classifier, then calculate and compare its permutation feature importances.

In [None]:
# Import required libraries
from features import Features
import os
import pandas as pd
import librosa
import numpy as np
from azureml.fsspec import AzureMachineLearningFileSystem

# Shared feature extractor (project-local `Features` class); its
# `make_features(y, sr)` method is called on every clip / segment in the
# data-loading cell below.
feature_generator = Features()

In [None]:
# Build the test set: read the metadata CSV, load each listed FLAC clip from
# the Azure ML filesystem, and convert it (or its fixed-length slices) into
# feature vectors with `feature_generator.make_features`.
#
# Produces: X_test (np.ndarray of features) and y_test (np.ndarray of the raw
# string labels from the 'KEY' column, one per feature vector).

# --- Configuration -----------------------------------------------------------
# Named MAX_FILES rather than `max` so the builtin `max()` is not shadowed for
# the rest of the kernel session.
MAX_FILES = 100000    # upper bound on the number of metadata rows to load
folder = "azureml:/"
slice_size = 3        # slice length in seconds
slice_audio = True    # True: one feature vector per slice; False: one per clip

fs = AzureMachineLearningFileSystem(folder)
test_dir = os.path.join(folder, 'flac_T/')

# NOTE(review): the file is called train_metadata.csv although it labels the
# clips used as test data here - confirm this is the intended metadata file.
df = pd.read_csv(folder + 'metadata/train_metadata.csv')
raw_labels = df['KEY'].to_list()[:MAX_FILES]
file_names = [file + '.flac' for file in df['FLAC_FILE_NAME'].to_list()][:MAX_FILES]

test_features = []
test_labels = []
count = 0  # number of clips processed so far

for file, label in zip(file_names, raw_labels):
    file_path = os.path.join(test_dir, file)

    # Load the clip from the Azure filesystem (fs); the decoded samples live in
    # memory afterwards, so the handle can be closed before feature extraction.
    with fs.open(file_path) as f:
        y, sr = librosa.load(f)

    if not slice_audio:
        test_features.append(feature_generator.make_features(y, sr))
        test_labels.append(label)
    else:
        segment_length_samples = int(slice_size * sr)

        # Keep only complete slices; a trailing partial slice is discarded.
        # Floor division also keeps the last slice when the clip length is an
        # exact multiple of the slice length (ceil(...) - 1 used to drop it).
        num_segments = len(y) // segment_length_samples

        for i in range(num_segments):
            start_sample = i * segment_length_samples
            end_sample = start_sample + segment_length_samples

            segment = y[start_sample:end_sample]
            test_features.append(feature_generator.make_features(segment, sr))
            test_labels.append(label)

    # Limiter: `count` is now actually advanced, so this guard is effective
    # (it is a second line of defence; the lists above are already truncated).
    count += 1
    if count >= MAX_FILES:
        break

print("loaded test audio")
X_test = np.array(test_features)
y_test = np.array(test_labels)

In [None]:
# Load the pickled classifier from the local filesystem path below.
# NOTE: unpickling can execute arbitrary code - only load model files you trust.

from pickle import load

model_path = "/home/azureuser/model.pkl"
with open(model_path, "rb") as model_file:
    booster = load(model_file)

In [None]:
# Map the string labels to integer class ids (spoof -> 1, bonafide -> 0) so
# they can be compared against the model's predictions in the next cell.
# The mapping is built from `test_labels` (the raw strings collected while
# loading) instead of from `y_test` itself, so re-running this cell is safe:
# re-mapping already-encoded integers would raise a KeyError.
d = {'spoof': 1, 'bonafide': 0}
y_test = [d[label] for label in test_labels]

In [None]:
# Predict on the test features and print scikit-learn's classification report
# for the predictions against the encoded labels.
from sklearn.metrics import classification_report

y_pred = booster.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

## Feature Importances

Graphs for slice lengths of 6, 3, and 1 seconds.

In [None]:
# Build the data container used by the importance cells below, which read its
# `X_test` / `y_test` attributes.
# NOTE(review): the upstream `xgboost` package does not appear to provide an
# `XGBooster` class, so this import presumably resolves to a project-local
# module named xgboost - confirm.

from xgboost import XGBooster

slice_length = 3      # slice length in seconds; also selects the model file loaded below
compressed = False
xg_data = XGBooster(
    data_only=True,
    splitvoice=False,
    compressed=compressed,
    rerecorded=False,
    max=1000,
    slice_size_seconds=slice_length,
)

In [None]:
# Load the pickled model whose file name matches the current slice length
# (e.g. 3.pkl) from the local filesystem.
# NOTE: unpickling can execute arbitrary code - only load model files you trust.
from pickle import load

filename = f"/home/azureuser/{slice_length}.pkl"
with open(filename, "rb") as model_file:
    model = load(model_file)

In [None]:
# Permutation importance of every feature on the held-out data:
# 30 shuffles per feature, with a fixed seed so the result is reproducible.
from sklearn.inspection import permutation_importance

X_eval = xg_data.X_test
y_eval = xg_data.y_test
r = permutation_importance(
    model,
    X_eval,
    y_eval,
    n_repeats=30,
    random_state=0,
)

In [None]:
# List the features (by column index, most important first) whose mean
# importance stays above zero after subtracting two standard deviations.
importance_means = r.importances_mean
importance_stds = r.importances_std

for i in importance_means.argsort()[::-1]:
    mean_i = importance_means[i]
    std_i = importance_stds[i]
    lower_bound = mean_i - 2 * std_i
    if lower_bound > 0:
        print(f"{i} {mean_i:.3f} +/- {std_i:.3f}")

In [None]:
# Visualize the permutation importances as a bar chart; the error bars show
# one standard deviation across the permutation repeats.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Font settings must be applied BEFORE the figure is created: text objects pick
# up rcParams when they are built, so updating them after plotting only affects
# later figures and makes this cell's output depend on execution order.
plt.rcParams.update({'font.size': 10})

# Feature labels, in the column order assumed for the feature matrix:
# 20 MFCCs, 'cr', 12 chroma bins, then 'sc', 'sb', 'rolloff', 'rms'.
# NOTE(review): the order must match Features.make_features - confirm.
mfccs = [f'mfcc{i}' for i in range(1, 21)]
chroma = [f'chroma{i}' for i in range(1, 13)]
feature_names = np.array(mfccs + ['cr'] + chroma + ['sc', 'sb', 'rolloff', 'rms'])

forest_importances = pd.Series(r.importances_mean, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=r.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")

plt.show()

In [None]:
# Larger, publication-style version of the permutation-importance bar chart
# (bigger serif font, fixed y-range), saved to a PNG file.
# Many of our features are correlated; the correlation structure itself is
# visualized in the next cell so that features with a large span can be chosen.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Apply the font settings BEFORE creating the figure. Text objects pick up
# rcParams when they are built, so setting them after plotting meant the cell
# had to be run twice before the saved figure used these fonts.
plt.rcParams.update({'font.size': 25})
plt.rcParams.update({'font.family': 'serif', 'font.serif': 'Times New Roman'})

# Feature labels, in the column order assumed for the feature matrix:
# 20 MFCCs, 'cr', 12 chroma bins, then 'sc', 'sb', 'rolloff', 'rms'.
# NOTE(review): the order must match Features.make_features - confirm.
mfccs = [f'mfcc{i}' for i in range(1, 21)]
chroma = [f'chroma{i}' for i in range(1, 13)]
feature_names = np.array(mfccs + ['cr'] + chroma + ['sc', 'sb', 'rolloff', 'rms'])

forest_importances = pd.Series(r.importances_mean, index=feature_names)

fig, ax = plt.subplots(figsize=(16, 10))
forest_importances.plot(kind='bar', yerr=r.importances_std, ax=ax)
ax.set_ylim(-0.01, 0.065)

ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")

fig.tight_layout()
fig.savefig('feature_importances_full_model_.png')
plt.show()

In [None]:
# Correlation structure of the features: a Ward-linkage dendrogram (left) and
# the Spearman correlation matrix reordered by the dendrogram leaves (right).
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

# Spearman rank correlation of the test feature matrix
corr = spearmanr(xg_data.X_test).correlation

# Force exact symmetry and a unit diagonal so that the derived distance matrix
# is accepted by squareform.
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# Turn correlation into a distance (strongly correlated -> close), then run
# hierarchical clustering with Ward's linkage on the condensed distances.
distance_matrix = 1 - np.abs(corr)
condensed_distances = squareform(distance_matrix)
dist_linkage = hierarchy.ward(condensed_distances)
dendro = hierarchy.dendrogram(
    dist_linkage,
    labels=feature_names,
    ax=ax1,
    leaf_rotation=90,
)

leaf_order = dendro["leaves"]
leaf_labels = dendro["ivl"]
dendro_idx = np.arange(len(leaf_labels))

# Reorder rows and columns of the correlation matrix to follow the dendrogram
ax2.imshow(corr[np.ix_(leaf_order, leaf_order)])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(leaf_labels, rotation="vertical")
ax2.set_yticklabels(leaf_labels)
fig.tight_layout()

In [None]:
# Re-run XGBooster restricted to a subset of the most important features,
# without saving the result (to_save=False).
# NOTE(review): presumably data_only=False is what triggers training - confirm
# in XGBooster. The subset names use 'mfcc_3' / 'mfcc_10', whereas the plot
# labels above use 'mfcc3' / 'mfcc10'; verify which spelling XGBooster expects.
xg_data_subset = XGBooster(
    subset=['mfcc_3', 'mfcc_10'],
    to_save=False,
    data_only=False,
    splitvoice=False,
    compressed=compressed,
    rerecorded=False,
    max=1000,
    slice_size_seconds=slice_length,
)