In [2]:
## Preparing Datasets
import pandas as pd

# Track metadata; only the genre label and the join key are used below.
# No mid-cell preview of `tracks` here: a bare expression that is not the
# last line of a cell is evaluated and silently discarded.
tracks = pd.read_csv('datasets/fma-rock-vs-hiphop.csv')

# Track metrics (the candidate model features), joined on `track_id` below.
echonest_metrics = pd.read_json('datasets/echonest-metrics.json', precise_float = True)

# Attach each track's genre label to its metrics row. merge() defaults to an
# inner join, so only tracks present in both files are kept.
echo_tracks = echonest_metrics.merge(
    tracks[['genre_top', 'track_id']],
    on='track_id',
)

# Print a structural summary, then show the first rows as the cell output.
echo_tracks.info()
echo_tracks.head()

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/fma-rock-vs-hiphop.csv'

In [None]:
## Find strongly correlated features by creating a correlation matrix

# Correlate only the feature columns of the merged dataset that is modelled
# below. The `track_id` identifier and the non-numeric `genre_top` label are
# excluded, which is the same column selection used to build `features` for
# the PCA step.
corr_metrics = echo_tracks.drop(columns=['genre_top', 'track_id']).corr()
corr_metrics.style.background_gradient()

In [1]:
## Reduce number of features using PCA (principal component analysis)

# Define features 
features = echo_tracks.drop(columns=['genre_top', 'track_id'])

# Define labels
labels = echo_tracks['genre_top']

# Scale the features and set the values to a new variable
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
scaled_train_features = scaler.fit_transform(features)

%matplotlib inline

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Get explained variance ratios from PCA using all features
pca = PCA()
pca.fit(scaled_train_features)
exp_variance = pca.explained_variance_ratio_

# plotting
fig, ax = plt.subplots()
ax.bar(range(pca.n_components_), exp_variance)
ax.set_xlabel('Principal Component #')

NameError: name 'echo_tracks' is not defined

In [None]:
## Cumulative explained variance plot: how many components are required to explain 85% of the variance?

import numpy as np

cum_exp_variance = np.cumsum(exp_variance)

# Plot against the NUMBER of components kept (1..n) rather than the 0-based
# array index, so the x value where the curve crosses the dashed 0.85 line can
# be read directly as a component count.
components_kept = np.arange(1, len(cum_exp_variance) + 1)

fig, ax = plt.subplots()
ax.plot(components_kept, cum_exp_variance)
ax.axhline(y=0.85, linestyle='--')
ax.set_xlabel('Number of principal components')
ax.set_ylabel('Cumulative explained variance ratio')

# Component count chosen from the plot above.
n_components = 6

# Perform PCA with the chosen number of components and project the scaled
# features onto them.
pca = PCA(n_components, random_state=10)
pca.fit(scaled_train_features)
pca_projection = pca.transform(scaled_train_features)

In [None]:
## Train a decision tree to classify genre

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Hold out a test set from the PCA projection (default split proportions,
# fixed seed so the split is repeatable).
(train_features, test_features,
 train_labels, test_labels) = train_test_split(pca_projection, labels, random_state=10)

# Fit the tree on the training portion; fit() returns the estimator itself.
tree = DecisionTreeClassifier(random_state=10).fit(train_features, train_labels)

# Genre predictions for the held-out rows.
pred_labels_tree = tree.predict(test_features)

In [None]:
## Train a logistic regression to classify genre (for comparison with the decision tree model)

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# Fit on the same training split used for the tree, then predict the test split.
logreg = LogisticRegression(random_state=10).fit(train_features, train_labels)
pred_labels_logit = logreg.predict(test_features)

# Per-class precision/recall/F1 text reports for both models.
class_rep_tree = classification_report(test_labels, pred_labels_tree)
class_rep_log = classification_report(test_labels, pred_labels_logit)

for heading, report in (
    ("Decision Tree: \n", class_rep_tree),
    ("Logistic Regression: \n", class_rep_log),
):
    print(heading, report)

In [None]:
## Balance sample sizes for better performance

# Separate the merged frame by genre.
genre = echo_tracks['genre_top']
hop_only = echo_tracks[genre == 'Hip-Hop']
rock_only = echo_tracks[genre == 'Rock']

# Down-sample Rock to exactly as many rows as there are Hip-Hop rows
# (seeded, so the same rows are drawn on every run).
rock_only = rock_only.sample(n=len(hop_only), random_state=10)

# Stack the two equally sized groups: Rock rows first, then Hip-Hop rows.
rock_hop_bal = pd.concat([rock_only, hop_only])

# Rebuild features, labels and the PCA projection from the balanced frame.
# The existing `scaler` and `pca` objects are re-fitted here.
labels = rock_hop_bal['genre_top']
features = rock_hop_bal.drop(columns=['genre_top', 'track_id'])
scaled_bal_features = scaler.fit_transform(features)
pca_projection = pca.fit_transform(scaled_bal_features)

# New train/test split taken from the balanced projection.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(pca_projection, labels, random_state=10)

# Decision tree on the balanced data.
tree = DecisionTreeClassifier(random_state=10).fit(train_features, train_labels)
pred_labels_tree = tree.predict(test_features)

# Logistic regression on the balanced data.
logreg = LogisticRegression(random_state=10).fit(train_features, train_labels)
pred_labels_logit = logreg.predict(test_features)

# Side-by-side classification reports.
print("Decision Tree: \n", classification_report(test_labels, pred_labels_tree))
print("Logistic Regression: \n", classification_report(test_labels, pred_labels_logit))

In [None]:
## Using K-fold cross-validation to evaluate models

from sklearn.model_selection import KFold, cross_val_score

# Shuffle before splitting. The balanced frame is built by concatenating all
# Rock rows followed by all Hip-Hop rows, so its rows are ordered by class.
# KFold without shuffling cuts contiguous slices, which would make each test
# fold (almost) single-class and distort the scores. A fixed seed keeps the
# folds repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=10)

tree = DecisionTreeClassifier(random_state=10)
logreg = LogisticRegression(random_state=10)

# Score each model on every fold (default scorer of the estimator), then
# report the mean across folds.
# NOTE(review): `pca_projection` was fitted on all rows before cross-validation,
# so scaling/PCA statistics include the test folds; wrapping scaler + PCA +
# model in a Pipeline would avoid that. Left as-is here.
tree_score = cross_val_score(tree, pca_projection, labels, cv=kf)
logit_score = cross_val_score(logreg, pca_projection, labels, cv=kf)
print("Decision Tree:", np.mean(tree_score), "Logistic Regression:", np.mean(logit_score))