In [2]:
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Genre Prediction Based on Music Metadata and Calculated Features

In [4]:
genres_info = pd.read_csv("data/fma/genres.csv", index_col = "genre_id")

FileNotFoundError: [Errno 2] File b'data/fma/genres.csv' does not exist: b'data/fma/genres.csv'

In [None]:
genres_info

In [None]:
genres_info.sort_values("#tracks", ascending = False)

In [None]:
# Distinct ids found in the "top_level" column of the genre table.
top_level_genre_ids = genres_info["top_level"].unique()

In [None]:
top_level_genres = genres_info.loc[top_level_genre_ids]

In [None]:
# Order the top-level genres by track count (smallest first) and keep only
# the two columns used by the bar chart.
top_level_genres = (
    top_level_genres
    .sort_values(by = "#tracks", ascending = True)
    .loc[:, ["title", "#tracks"]]
)

In [None]:
len(top_level_genres)

In [None]:
# Horizontal bar chart: one bar per top-level genre, bar length = "#tracks".
fig, ax = plt.subplots(figsize = (10, 8))
ax.barh(top_level_genres["title"], top_level_genres["#tracks"])

ax.set_xlabel("Number of tracks")
ax.set_title("Number of tracks per top genre")

plt.show()

In [None]:
tracks_info = pd.read_csv("data/fma/tracks.csv", header = [0, 1])

In [None]:
# Drop the row labelled 0 and keep everything from label 1 onwards.
# NOTE(review): presumably row 0 is a leftover header line of tracks.csv (the
# frame is read with a two-row header just above) — confirm against the file.
tracks_info = tracks_info.loc[1:]

In [None]:
tracks_info

In [None]:
tracks_info.columns

In [None]:
tracks_info[("track", "genre_top")].unique()

In [None]:
tracks_info.describe().T

In [None]:
# Share of missing values per column, expressed in percent.
missing_value_counts = tracks_info.isna().sum()
percent_missing_data = missing_value_counts / len(tracks_info) * 100

In [None]:
percent_missing_data

In [None]:
columns_to_keep = percent_missing_data[percent_missing_data < 85].index

In [None]:
tracks_info[columns_to_keep]

In [None]:
track_id_to_genre = tracks_info[[("Unnamed: 0_level_0", "Unnamed: 0_level_1"), ("track", "genre_top")]]

In [None]:
tracks_info.head()

In [None]:
# Columns excluded by hand from the track table, grouped by their first-level
# label. The order of the entries is the same as in the original one-line list.
columns_to_remove = [
    # unnamed leading column
    ("Unnamed: 0_level_0", "Unnamed: 0_level_1"),
    # album
    ("album", "id"),
    ("album", "information"),
    ("album", "tags"),
    ("album", "title"),
    ("album", "producer"),
    # artist
    ("artist", "bio"),
    ("artist", "id"),
    ("artist", "latitude"),
    ("artist", "longitude"),
    ("artist", "location"),
    ("artist", "members"),
    ("artist", "name"),
    ("artist", "tags"),
    ("artist", "website"),
    # track
    ("track", "genres"),
    ("track", "genres_all"),
    ("track", "license"),
    ("track", "tags"),
    ("track", "title"),
]

In [None]:
# Keep every column that is not explicitly excluded, as a list in the
# original column order. A list (rather than a set) is used because pandas
# rejects sets as indexers in recent versions, and because set iteration
# order would make the resulting column order arbitrary.
excluded_columns = set(columns_to_remove)
columns_to_keep_manual_selection = [
    column for column in tracks_info.columns if column not in excluded_columns
]

In [None]:
tracks_info = tracks_info[columns_to_keep_manual_selection]

In [None]:
sets = tracks_info[[("set", "subset"), ("set", "split")]]

In [None]:
tracks_info = tracks_info.drop([("set", "subset"), ("set", "split")], axis = 1)

In [None]:
tracks_info

In [None]:
tracks_info.columns

In [None]:
tracks_info

In [None]:
pd.to_datetime(tracks_info[("artist", "date_created")])

In [None]:
tracks_info.columns

In [None]:
tracks_info.dtypes

In [None]:
datetime_variables = [("track", "date_created"), ("artist", "date_created"), ("artist", "active_year_begin"), ("album", "date_created"), ("album", "date_released")]

In [None]:
# Split every datetime column into two numeric columns, "<group>_<field>_year"
# and "<group>_<field>_month", collected in a separate frame.
year_and_month_columns = {}
for group_name, field_name in datetime_variables:
    parsed = pd.to_datetime(tracks_info[(group_name, field_name)])
    prefix = f"{group_name}_{field_name}"
    year_and_month_columns[prefix + "_year"] = parsed.dt.year
    year_and_month_columns[prefix + "_month"] = parsed.dt.month
datetime_variables_processed = pd.DataFrame(year_and_month_columns)

In [None]:
# Attach the derived year/month columns to the track table, joining on the
# row index of both frames (the merge adds a "key_0" column holding that key).
# The track table has two-level column labels while the derived frame has
# single-level ones; recent pandas versions refuse to merge frames whose column
# levels differ, so the left frame's labels are flattened to plain tuples first.
tracks_info_flat = tracks_info.copy()
tracks_info_flat.columns = tracks_info_flat.columns.to_flat_index()
tracks_info = pd.merge(
    left = tracks_info_flat,
    right = datetime_variables_processed,
    left_on = tracks_info_flat.index,
    right_on = datetime_variables_processed.index,
)

In [None]:
tracks_info.columns

In [None]:
tracks_info = tracks_info.drop(datetime_variables, axis = 1)

In [None]:
tracks_info = tracks_info.drop(["key_0"], axis = 1)

In [None]:
tracks_info.dtypes

In [None]:
genres = tracks_info[("track", "genre_top")]

In [None]:
track_attributes = tracks_info.drop(("track", "genre_top"), axis = 1)

In [None]:
track_attributes = pd.get_dummies(track_attributes)

In [None]:
track_attributes = track_attributes.fillna(0)

In [None]:
track_attributes.describe().T

In [None]:
genres.unique()

In [None]:
genres.groupby(genres).size().sort_values(ascending = False)

In [None]:
len(genres[genres.isna()]) / len(genres)

In [None]:
# Discard the tracks that have no top-level genre.
genres = genres.dropna()

In [None]:
track_attributes = track_attributes.loc[genres.index]

In [None]:
len(genres), len(track_attributes)

In [None]:
# Rescale every attribute column with a default MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so test-set minima/maxima influence the scaling — consider
# fitting on the training portion only.
track_attributes_scaled = MinMaxScaler().fit_transform(track_attributes)

In [None]:
# fit_transform returns a NumPy array, which has no .head(); slice the first
# five rows instead.
track_attributes_scaled[:5]

In [None]:
# Stratified split with 15% of the rows held out for testing. A fixed
# random_state makes the split — and every score computed from it —
# reproducible across kernel restarts.
track_attributes_train, track_attributes_test, genres_train, genres_test = train_test_split(
    track_attributes_scaled,
    genres,
    test_size = 0.15,
    stratify = genres,
    random_state = 42)

In [None]:
track_attributes_train.shape, track_attributes_test.shape, genres_train.shape, genres_test.shape

In [None]:
logistic_regression = LogisticRegression(C = 1, max_iter = 1000)

In [None]:
logistic_regression.fit(track_attributes_train, genres_train)

In [None]:
def display_scores(estimator, train_attributes, train_labels, test_attributes, test_labels):
    """Print the estimator's score on the training data and on the test data.

    ``estimator.score`` is called once per data set, training first, and each
    result is printed on its own line. Nothing is returned.
    """
    train_score = estimator.score(train_attributes, train_labels)
    print(f"Train score: {train_score}")
    test_score = estimator.score(test_attributes, test_labels)
    print(f"Test score: {test_score}")

In [None]:
display_scores(logistic_regression, track_attributes_train, genres_train, track_attributes_test, genres_test)

In [None]:
# Predict on the held-out rows and tabulate true vs. predicted genres.
# Passing labels = classes_ pins the row/column order of the matrix to the
# order of the tick labels used when it is plotted, even if some class is
# absent from both the test labels and the predictions.
test_predictions = logistic_regression.predict(track_attributes_test)
cm = confusion_matrix(genres_test, test_predictions, labels = logistic_regression.classes_)

In [None]:
# Heatmap of the test-set confusion matrix (absolute counts).
# Rows are the true genres and columns the predicted ones, following the
# layout returned by sklearn's confusion_matrix.
plt.figure(figsize = (10, 10))
sns.heatmap(
    cm,
    annot = True,
    fmt = "d",
    cmap = plt.cm.Blues,
    xticklabels = logistic_regression.classes_,
    yticklabels = logistic_regression.classes_)
plt.xlabel("Predicted genre")
plt.ylabel("True genre")
plt.title("Confusion matrix: test data")
# Render the figure explicitly so the cell does not echo the Axes repr.
plt.show()

In [None]:
track_audio_features = pd.read_csv("data/fma/features.csv", header = [0, 1, 2, 3])

In [None]:
track_audio_features

In [None]:
track_ids = track_audio_features[("feature", "statistics", "number", "track_id")]

In [None]:
tracks_for_modelling = track_id_to_genre[track_id_to_genre[("track", "genre_top")].isin(["Rock", "Hip-Hop", "Instrumental"])]

In [None]:
# Track ids and genre labels of the selected tracks.
# The id column is coerced to a numeric dtype: presumably it is read as
# text/object because the first data row of tracks.csv is not a real track
# (that row is dropped right after loading), and text ids would not compare
# equal to the (presumably numeric) ids of the audio-feature table in the
# isin() filter that follows. For ids that are already numeric this is a no-op.
track_ids_for_modelling = pd.to_numeric(tracks_for_modelling[("Unnamed: 0_level_0", "Unnamed: 0_level_1")])
track_genres_for_modelling = tracks_for_modelling[("track", "genre_top")]

In [None]:
track_ids_for_modelling

In [None]:
track_audio_features_for_modelling = track_audio_features[track_ids.isin(track_ids_for_modelling)]

In [None]:
track_audio_features_for_modelling = track_audio_features_for_modelling.drop([("feature", "statistics", "number", "track_id")], axis = 1)

In [None]:
# Rescale every audio feature with a default MinMaxScaler. The name is rebound
# from the DataFrame to the array returned by fit_transform.
# NOTE(review): the scaler is fitted on all selected tracks before the
# train/test split that follows, so test rows influence the scaling — consider
# fitting on the training portion only.
track_audio_features_for_modelling = MinMaxScaler().fit_transform(track_audio_features_for_modelling)

In [None]:
# Stratified split that holds out exactly 3500 tracks for testing. A fixed
# random_state makes the split — and everything fitted on it — reproducible
# across kernel restarts.
track_audio_features_train, track_audio_features_test, genres_af_train, genres_af_test = train_test_split(
    track_audio_features_for_modelling,
    track_genres_for_modelling,
    test_size = 3500,
    stratify = track_genres_for_modelling,
    random_state = 42
)

In [None]:
track_audio_features_train.shape, track_audio_features_test.shape, genres_af_train.shape, genres_af_test.shape

In [None]:
# Two components, used for the 2-D scatter plot of the training tracks.
# random_state keeps the projection reproducible when scikit-learn selects a
# randomized SVD solver; it has no effect with the exact solvers.
pca = PCA(n_components = 2, random_state = 42)

In [None]:
pca.fit(track_audio_features_train)

In [None]:
pcs = pca.transform(track_audio_features_train)

In [None]:
first_pc, second_pc = pcs[:, 0], pcs[:, 1]

In [None]:
genre_colors = LabelEncoder().fit_transform(genres_af_train)

In [None]:
# Training tracks in the plane of the first two principal components,
# coloured by their integer-encoded genre.
fig, ax = plt.subplots()
ax.scatter(first_pc, second_pc, c = genre_colors, s = 1)
ax.set_xlabel("First PC")
ax.set_ylabel("Second PC")
plt.show()

In [None]:
logistic_regression_af = LogisticRegression(max_iter = 1000)
logistic_regression_af.fit(track_audio_features_train, genres_af_train)

In [None]:
display_scores(logistic_regression_af, track_audio_features_train, genres_af_train, track_audio_features_test, genres_af_test)

In [None]:
def plot_confusion_matrix(estimator, attributes, labels, title = None):
    """Plot the confusion matrix of a fitted classifier as an annotated heatmap.

    The matrix is normalised over its grand total, so every cell shows the
    share of all samples that fall into that true/predicted combination,
    formatted as a percentage.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict`` and ``classes_``.
    attributes : feature matrix passed to ``estimator.predict``.
    labels : true labels corresponding to the rows of ``attributes``.
    title : optional figure title; no title is set when ``None``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    predictions = estimator.predict(attributes)
    # Pin the row/column order to estimator.classes_ so the tick labels below
    # always describe the right cells, even if a class is missing from both
    # ``labels`` and ``predictions``.
    cm = confusion_matrix(
        labels,
        predictions,
        labels = estimator.classes_,
        normalize = "all")

    plt.figure(figsize = (5, 5))
    if title is not None:
        plt.title(title)
    sns.heatmap(
        cm,
        annot = True,
        fmt = ".3%",
        cmap = plt.cm.Blues,
        xticklabels = estimator.classes_,
        yticklabels = estimator.classes_)
    # Rows of a scikit-learn confusion matrix are true labels, columns are
    # predicted labels.
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.show()

In [None]:
plot_confusion_matrix(logistic_regression_af, track_audio_features_train, genres_af_train, title = "CM: train data")

In [None]:
plot_confusion_matrix(logistic_regression_af, track_audio_features_test, genres_af_test, title = "CM: test data")

In [None]:
print(classification_report(genres_af_test, logistic_regression_af.predict(track_audio_features_test)))

In [None]:
# L1-regularised logistic regression (liblinear solver) over a small grid of
# regularisation strengths. The full sweep, 10 ** np.arange(-4, 5), is not
# used here — presumably to keep the search on the raw features fast.
params_lr = {
    "solver": ["liblinear"],
    "penalty": ["l1"],
    "C": [1e-2, 1e-3]
}
# random_state fixes the data shuffling done by liblinear so that repeated
# runs select the same model.
grid_search_lr = GridSearchCV(LogisticRegression(random_state = 42), params_lr, cv = 5)

In [None]:
grid_search_lr.fit(track_audio_features_train, genres_af_train)

In [None]:
grid_search_lr.best_estimator_

In [None]:
grid_search_lr.cv_results_

In [None]:
grid_search_lr.cv_results_["params"]

In [None]:
display_scores(grid_search_lr.best_estimator_, track_audio_features_train, genres_af_train, track_audio_features_test, genres_af_test)

In [None]:
# Number of non-zero coefficients in the best model found by the grid search.
best_coefficients = grid_search_lr.best_estimator_.coef_
np.count_nonzero(best_coefficients)

In [None]:
# Reduce the audio features to 20 principal components.
# random_state keeps the projection reproducible when scikit-learn selects a
# randomized SVD solver; it has no effect with the exact solvers.
pca_af = PCA(n_components = 20, random_state = 42)

In [None]:
pca_af.fit(track_audio_features_train)

In [None]:
track_audio_features_pca_train = pca_af.transform(track_audio_features_train)
track_audio_features_pca_test = pca_af.transform(track_audio_features_test)

In [None]:
# Share of the total variance captured by each of the fitted components.
plt.plot(pca_af.explained_variance_ratio_)
plt.xlabel("Principal component index")
plt.ylabel("Explained variance ratio")
plt.title("Explained variance per principal component")
# Render the figure explicitly so the cell does not echo the Line2D repr.
plt.show()

In [None]:
# Same L1/liblinear search as before, now over nine regularisation strengths
# from 1e-4 to 1e4. Note that this rebinds params_lr and grid_search_lr.
params_lr = {
    "solver": ["liblinear"],
    "penalty": ["l1"],
    "C": 10 ** np.arange(-4, 5, dtype = np.float64)
}
# random_state fixes the data shuffling done by liblinear so that repeated
# runs select the same model.
grid_search_lr = GridSearchCV(LogisticRegression(random_state = 42), params_lr, cv = 5)

In [None]:
grid_search_lr.fit(track_audio_features_pca_train, genres_af_train)

In [None]:
grid_search_lr.cv_results_

In [None]:
grid_search_lr.best_params_

In [None]:
grid_search_lr.best_estimator_.coef_

In [None]:
# Three clusters. random_state makes the centroid initialisation (and hence
# the cluster assignment) reproducible; n_init is set explicitly because its
# default value differs between scikit-learn versions.
k_means = KMeans(n_clusters = 3, n_init = 10, random_state = 42)

In [None]:
k_means.fit(track_audio_features_pca_train)

In [None]:
# Training tracks in the plane of the first two PCA components, coloured by
# the cluster that k-means assigns to each of them.
cluster_ids = k_means.predict(track_audio_features_pca_train)
plt.scatter(
    track_audio_features_pca_train[:, 0],
    track_audio_features_pca_train[:, 1],
    c = cluster_ids,
    s = 1)
plt.xlabel("First PC")
plt.ylabel("Second PC")
plt.title("K-means clusters of the training tracks")
# Render the figure explicitly so the cell does not echo the PathCollection repr.
plt.show()