# Import

Import data, turn tags into a list of strings and define the available pitches

In [4]:
from ast import literal_eval
import pandas as pd

# The twelve pitch classes; the data holds columns "<pitch>_0" ... "<pitch>_99" for each.
pitch_symbol = "C C# D D# E F F# G G# A A# B".split()

# Load the preprocessed songs and keep only the rows that carry tags.
df = pd.read_csv("songs-with-preprocessed-tags.csv").dropna(subset=["tags"])

# Both columns are stored as strings; parse them back into lists of strings.
df = df.assign(
    artist_genres=df["artist_genres"].map(literal_eval),
    tags=df["tags"].map(literal_eval),
)
df.columns

Index(['name', 'duration', 'artist_genres', 'artist_names', 'acousticness',
       'loudness', 'energy', 'danceability', 'mode', 'instrumentalness',
       ...
       'B_90', 'B_91', 'B_92', 'B_93', 'B_94', 'B_95', 'B_96', 'B_97', 'B_98',
       'B_99'],
      dtype='object', length=1240)

In [None]:
# Preview the first 20 rows and the first 25 columns.
df.iloc[:20, :25]

In [5]:
def tolower(s):
    """Return ``s`` converted to lower case."""
    return s.lower()


def flatmap(list_of_lists):
    """Concatenate the inner lists into one flat list."""
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat


# Every tag occurrence across all tagged songs, lower-cased.
tagged_rows = df.loc[df["tags"].notna(), "tags"]
tags = pd.Series(flatmap(tagged_rows.tolist())).map(tolower)
# Relative frequency of each distinct tag.
ratios = tags.value_counts(normalize=True).tolist()

# Data Visualization

## Spotify Features

Here we plot the correlation matrix of the features we selected from the spotify api.

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Pairwise correlations of the numeric columns among the first 16 columns.
spotify_corr = df.iloc[:, :16].corr(numeric_only=True)

plt.figure(figsize=(15, 10))
sns.heatmap(spotify_corr, annot=True)
plt.title("Correlation matrix")
plt.show()

We only plot the spotify features here, since we have way too many columns.
This is because we decided to use the pitches of every note as a feature. The problem is that the number of pitches is proportional to the length
of the song. To reduce the number of features, we preprocessed the pitches down to 100 values per note. This still leaves us with around 1240 feature columns, though.


In [None]:
# Print a summary of the DataFrame.
df.info()

## Pitches

Since we have over ~100 different tags, we decided to take a closer look at a small selection of them
 to get some sense of our features.


In [None]:
def _songs_with_tag(frame, tag):
    """Return the rows of ``frame`` whose ``tags`` list contains ``tag``."""
    return frame[frame["tags"].apply(lambda song_tags: tag in song_tags)]


# Tags we take a closer look at.
closer_look_tags = [
    "rock",
    "pop",
    "indie",
    "hip-hop",
    "electronic",
    "dance",
    "classic rock",
    "alternative rock",
    "alternative",
    "80s",
]

# One sub-frame per tag. Every entry filters on the "tags" column; the "pop"
# entry previously read df["pop"] instead of df["tags"].
df_by_tags = {tag: _songs_with_tag(df, tag) for tag in closer_look_tags}

rock_df = df_by_tags["rock"]

### Rock

Here we check if the correlation matrix of just the songs tagged with `rock` is different from the correlation matrix of all songs.
But as we can see, it looks pretty much the same.

In [None]:
# Pairwise correlations of the numeric columns among the first 17 columns,
# restricted to the songs tagged "rock".
rock_corr = rock_df.iloc[:, :17].corr(numeric_only=True)

plt.figure(figsize=(15, 10))
sns.heatmap(rock_corr, annot=True)
plt.title("Correlation matrix of songs tagged 'Rock'")
plt.show()

This plot shows the average pitches of the songs tagged with `rock`.

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

# One line per pitch class: the mean of its 100 columns over all songs tagged
# "rock", plotted against the numeric segment index 0..99 so the x axis shows
# positions rather than column names.
segments = range(100)
for note in pitch_symbol:
    note_columns = [f"{note}_{segment}" for segment in segments]
    ax.plot(segments, rock_df[note_columns].mean().to_numpy(), label=note)

ax.set_xticks(range(0, 101, 10))
# Labels come from the lines themselves, so legend entries cannot get out of order.
ax.legend(loc="center right")
ax.set_title("Avg pitches for songs tagged with 'rock'")

## Plots per tag
### Pitches

In order to compare the information above with other tags, we decided to plot the pitches separately and use the tags as the legend.

And as we can see from the averages, `edm` is very different from the other tags. The `metal` tag has a very strong presence in E and B.

In [None]:
rows_per_column = len(pitch_symbol) // 2
fig, ax = plt.subplots(rows_per_column, 2, figsize=(25, 50))

segments = range(100)
for position, note in enumerate(pitch_symbol):
    # Fill the grid column by column: first half of the notes on the left,
    # second half on the right.
    cur_ax = ax[position % rows_per_column][position // rows_per_column]
    cur_ax.set_title(note)
    note_columns = [f"{note}_{segment}" for segment in segments]
    for tag, tag_df in df_by_tags.items():
        cur_ax.plot(segments, tag_df[note_columns].mean().to_numpy(), label=tag)
    # Build the legend once, after the lines exist, so each label is bound to
    # the line it describes.
    cur_ax.legend(loc="center right")

## Spotify features

Here we decided to check if we could maybe differentiate our selected tags by plotting the boxplot of the spotify features for our selected tags.
But as we can see, most of them have a high spread and overlap, so we do not think that they are very helpful.

In [None]:
spotify_features = ['duration', 'liveness', 'valence', 'danceability', 'tempo', 'loudness', 'energy',
                    'acousticness']

# Tags compared in the boxplots. The frames are filtered from ``df`` directly
# because not all of these tags have an entry in ``df_by_tags``.
boxplot_tags = ["rock", "metal", "hip-hop", "edm", "piano"]

tagged_frames = []
for tag in boxplot_tags:
    tag_df = df[df["tags"].apply(lambda song_tags, tag=tag: tag in song_tags)]
    # Skip tags without any song so they do not add empty groups.
    if not tag_df.empty:
        tagged_frames.append(tag_df.assign(tag=tag))
# A song carrying several of the tags appears once per matching tag.
spotify_feature_df = pd.concat(tagged_frames)

rows_per_column = len(spotify_features) // 2
fig, ax = plt.subplots(rows_per_column, 2, figsize=(25, 50))

for position, feature in enumerate(spotify_features):
    # Fill the grid column by column.
    cur_ax = ax[position % rows_per_column][position // rows_per_column]
    spotify_feature_df[["tag", feature]].boxplot(ax=cur_ax, by="tag")
    cur_ax.set_title(feature)

## Songs per Tag

By counting the number of times a tag appears, we can see that we have an incredibly imbalanced dataset.
We think that this will mean that our classifier will try to tag most things with pop as it is by far the tag that appears the most often.
In the plot we already decided to filter out all tags that appear less than 250 times as they add no real information to the graph.

In [None]:
def flatmap(list_of_lists):
    """Flatten one level of nesting into a single list."""
    return [item for inner in list_of_lists for item in inner]


# Count how often each tag occurs across all songs.
tags = pd.Series(flatmap(df["tags"].tolist()))
counts = tags.value_counts()

counts.info()
print("", "Top  genres", counts, sep="\n")

# Horizontal bar chart of the per-tag counts.
counts.plot.barh(figsize=(10, 20))
plt.title("Number of songs tagged with x")

# Feature Preprocessing

Transform the labels via the `MultiLabelBinarizer` to a numerical representation, scale the input data and create a held-back test set.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler

# Binarise the tag lists: one 0/1 indicator column per distinct tag.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(list(df['tags']))

# Feature matrix: every column except the labels and the text/list columns.
X = df.drop(columns=['tags', 'artist_names', 'name', "artist_genres"])

# Create the held-back test set first (70/30 split) ...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=234634754)

# ... and only then fit the scaler, on the training rows alone, so that no
# statistics of the test rows leak into the features the models are trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep ``X`` as the scaled full matrix, using the training statistics.
X = scaler.transform(X)

Models like the `DecisionTree` might perform better, or even only run at all, if they are provided with the `class_weights` of the targets.

In [13]:
import numpy as np
from collections import Counter
from sklearn.utils import class_weight
# Weights per tag name, treating every tag occurrence as one sample.
flat_labels = [label for sublist in df['tags'] for label in sublist]
label_counts = Counter(flat_labels)
unique_labels = np.unique(flat_labels)
class_weights = class_weight.compute_class_weight('balanced', classes=unique_labels, y=flat_labels)
class_weights_dict = dict(zip(unique_labels, class_weights))

# Weights per binarised label column: one {class value: weight} dict per
# column of ``y``, in column order.
# Loop-local names are used so that the per-tag ``label_counts`` and
# ``class_weights`` computed above are not overwritten by the last column.
class_weights_list = []
for label_column in y.T:
    present_classes = np.unique(label_column)
    column_weights = class_weight.compute_class_weight('balanced', classes=present_classes, y=label_column)
    class_weights_list.append(dict(zip(present_classes, column_weights)))
class_weights_list


[{0: 0.5039488017429193, 1: 63.810344827586206},
 {0: 0.5487298606306217, 1: 5.630324543610548},
 {0: 0.5104358219933799, 1: 24.455947136563875},
 {0: 198.26785714285714, 1: 0.5012641083521445},
 {0: 0.5025801195002716, 1: 97.39473684210526},
 {0: 0.75, 1: 1.5}]

# Feature Selection

Some models can't effectively use 1241 features, so for them, we need to reduce the amount of features.

Other models like MLP however can use all features, so we do not throw away the other features

In [7]:
from sklearn.decomposition import PCA

# Fit a 200-component PCA on the training split only, then project both splits.
pca = PCA(n_components=200).fit(X_train)
X_train_pca, X_test_pca = (pca.transform(split) for split in (X_train, X_test))

## KNN Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=[
        # Minkowski family on the tree-based indexes: p=1 is the Manhattan and
        # p=2 the Euclidean distance, so those need no separate metric entries.
        {
            'weights': ['uniform', 'distance'],
            'algorithm': ['ball_tree', 'kd_tree'],
            'leaf_size': [1, 3],
            'metric': ['minkowski'],
            'p': [1, 2, 4],
        },
        # Cosine distance is not a valid metric for ball_tree / kd_tree, so it
        # is searched with the brute-force algorithm instead.
        {
            'weights': ['uniform', 'distance'],
            'algorithm': ['brute'],
            'metric': ['cosine'],
        },
    ],
    n_jobs=-1
)

# Fit the grid search on the PCA-reduced training data
knn.fit(X_train_pca, y_train)
knn.best_params_

In [None]:
# Predict the binary tag indicators for the held-back test songs with the grid-searched KNN
predicted_tags = knn.predict(X_test_pca)
# Map the indicator rows back to tuples of tag names
predicted_tags_inversed = mlb.inverse_transform(predicted_tags)

In [None]:
# Show the predicted tag names per test song
predicted_tags_inversed

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, multilabel_confusion_matrix

def confusion_matrix(y_true, y_actual):
    """Plot one 2x2 confusion-matrix heatmap per label.

    Args:
        y_true: Binary indicator matrix of the true labels.
        y_actual: Binary indicator matrix of the predicted labels.

    Each heatmap follows the layout returned by
    ``multilabel_confusion_matrix``: rows are the true class and columns the
    predicted class, both ordered negative then positive. Subplot titles are
    the label names of the module-level ``mlb``.
    """
    mats = multilabel_confusion_matrix(y_true=y_true, y_pred=y_actual)
    # Label names in column order, independent of how many labels there are.
    plot_labels = mlb.classes_
    # Round up so an odd number of labels still gets an axis for the last one;
    # squeeze=False keeps ``axs`` two-dimensional even with a single row.
    n_rows = (len(mats) + 1) // 2
    fig, axs = plt.subplots(nrows=n_rows, ncols=2, figsize=(10, 15), squeeze=False)
    fig.tight_layout(pad=3)
    # Index 0 is the negative class, index 1 the positive class.
    outcome_names = ["Negative", "Positive"]
    for idx, mat in enumerate(mats):
        cur_ax = axs[idx // 2, idx % 2]
        cur_ax.set_title(plot_labels[idx])
        cur_ax.xaxis.tick_top()
        mat_df = pd.DataFrame(mat, index=outcome_names, columns=outcome_names)
        sns.heatmap(mat_df, annot=True, ax=cur_ax)
    # Hide the left-over axis when the number of labels is odd.
    for unused_ax in axs.flat[len(mats):]:
        unused_ax.set_visible(False)
    plt.show()

def print_performance_report(y_pred):
    """Print accuracy, precision, recall and F1 for ``y_pred`` and plot the confusion matrices.

    Args:
        y_pred: Binary indicator matrix of predicted labels, compared against
            the module-level ``y_test``.

    Accuracy is scikit-learn's ``accuracy_score``, which for multilabel input
    counts a sample as correct only if all of its labels match. Precision,
    recall and F1 are averaged over samples.
    """
    # Score undefined cases as 0 for all three metrics, without a warning
    # (previously only precision did this).
    sample_avg = {"average": "samples", "zero_division": 0}

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.2f}")

    pre = precision_score(y_test, y_pred, **sample_avg)
    print(f"Precision: {pre:.2f}")

    rec = recall_score(y_test, y_pred, **sample_avg)
    print(f"Recall: {rec:.2f}")

    f1 = f1_score(y_test, y_pred, **sample_avg)
    print(f"F1-score: {f1:.2f}")

    confusion_matrix(y_test, y_pred)

In [None]:
# Report the metrics of the KNN predictions against the held-back test labels
y_pred_knn = predicted_tags
print("KNN")
print_performance_report(y_pred_knn)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): 0.0001**10 is roughly 1e-40 -- presumably meant as a
# "practically zero" setting; confirm that this value is intended.
near_zero = 0.0001**10

# Search space for the decision tree; the criterion is fixed to gini.
tree_param_grid = {
    'criterion': ['gini'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 8, 16, 48],
    'min_samples_leaf': [1, 10, 30],
    'min_weight_fraction_leaf': [0.0, 0.0001, near_zero],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 100, 1000],
    'min_impurity_decrease': [0.0, 0.0001, near_zero],
    # TODO(multilabel dict): one weight dict per label column
    'class_weight': [class_weights_list],
    'ccp_alpha': [0.0, 0.0001, near_zero],
}
tree = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=[tree_param_grid], n_jobs=-1)
tree.fit(X_train_pca, y_train)
tree.best_params_

In [None]:
# Predict the binary tag indicators for the held-back test songs with the grid-searched decision tree
predicted_tags = tree.predict(X_test_pca)
# Map the indicator rows back to tuples of tag names
predicted_tags_inversed = mlb.inverse_transform(predicted_tags)

In [None]:
# Show the predicted tag names per test song
predicted_tags_inversed

In [None]:
# Report the metrics of the decision-tree predictions against the held-back test labels
y_pred_dec_tree = predicted_tags
print("Decision Tree")
print_performance_report(y_pred_dec_tree)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

# Search space for the extra-trees ensemble. min_samples_leaf and
# max_leaf_nodes are not searched. n_jobs is a single fixed value that is
# passed to every candidate through the grid.
param_grid = dict(
    n_estimators=[100, 150, 200, 300, 400],
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=[12, 24, 36, 48, 64],
    min_samples_split=[2, 8, 32],
    max_features=['sqrt', 'log2', None],
    n_jobs=[-2],
    class_weight=[class_weights_list, None],
)
etc = GridSearchCV(estimator=ExtraTreesClassifier(random_state=42), param_grid=param_grid)

# Multilabel fit, see https://scikit-learn.org/stable/modules/multiclass.html#
etc.fit(X_train_pca, y_train)

In [None]:
# Predict the binary tag indicators for the held-back test songs with the grid-searched extra-trees model
predicted_tags = etc.predict(X_test_pca)

In [None]:
# Show the parameter combination selected by the grid search
etc.best_params_

In [None]:
# Show the predicted tag names per test song
mlb.inverse_transform(predicted_tags)

In [None]:
# Report the metrics of the extra-trees predictions against the held-back test labels
y_pred_extra_trees = predicted_tags
print("ExtraTrees")
print_performance_report(y_pred_extra_trees)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest, fitted on the full (non-PCA) features.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}
# Fix the seed so the search is reproducible (as the extra-trees search does)
# and evaluate candidates in parallel like the KNN and decision-tree searches.
rfc = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1)
rfc.fit(X_train, y_train)
rfc.best_params_

In [None]:
# Predict on the held-back test features and report the random-forest metrics
y_pred_random_forest = rfc.predict(X_test)
print("RandomForest")
print_performance_report(y_pred_random_forest)

## Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier

hidden_layouts = [(150, 150, 150), (100, 100, 100), (150, 200, 150)]
param_grid = [
    # adam uses learning_rate_init, so it is searched over the learning rates.
    {
        'hidden_layer_sizes': hidden_layouts,
        'activation': ['relu', 'tanh'],
        'solver': ['adam'],
        'learning_rate_init': [0.01, 0.001, 0.0001],
    },
    # lbfgs does not use learning_rate_init; crossing it with the learning
    # rates would only fit the same configuration three times.
    {
        'hidden_layer_sizes': hidden_layouts,
        'activation': ['relu', 'tanh'],
        'solver': ['lbfgs'],
    },
]
mlp = GridSearchCV(MLPClassifier(max_iter=1000), param_grid, cv=5)
mlp.fit(X_train, y_train)
mlp.best_params_

In [None]:
# Predict on the held-back test features and report the MLP metrics.
y_pred_mlp = mlp.predict(X_test)
# The header previously said "RandomForest", mislabelling this report.
print("MLP")
print_performance_report(y_pred_mlp)