In [129]:
# Import dataset
import pandas as pd
url = 'https://raw.githubusercontent.com/enhatl/ML-Semester-Proj/main/dataset.csv'
df = pd.read_csv(url,index_col=0)

# Decision Tree

In [130]:
# Imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.calibration import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler

Data Preprocessing

In [131]:
df_dt = df.copy() #make copy so that we can prepare the data correctly
df_dt = df_dt.dropna(axis = 0) #remove na values

In [132]:
#unique variable, will not be useful to us
df_dt = df_dt.drop('track_id', axis=1)
#album name and track name are not going to be useful either
df_dt = df_dt.drop(['album_name','track_name'], axis=1)

In [136]:
#artist name can be useful, but need to get them as numerical values

label_encoder = LabelEncoder()

#We are assuming the first artist is the main artist and will give the most information.
df_dt['artist_encoded'] = df_dt['artists'].str.split(',').str[0].str.strip()  #Extract first artist
df_dt['artist_encoded'] = label_encoder.fit_transform(df_dt['artist_encoded'])  #Apply label encoding
X = df_dt.drop(['track_genre', 'artists'], axis=1) #dont need artists column anymore
y = df_dt['track_genre']

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [138]:
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_train, y_train)

In [139]:
y_pred = tree_classifier.predict(X_test)

classificationreport=classification_report(y_pred,y_test)
print(classificationreport)

                   precision    recall  f1-score   support

         acoustic       0.09      0.08      0.09       119
         afrobeat       0.27      0.25      0.26       112
         alt-rock       0.03      0.01      0.02       222
      alternative       0.07      0.06      0.07       136
          ambient       0.21      0.22      0.21       105
            anime       0.15      0.12      0.13       117
      black-metal       0.59      0.46      0.52       123
        bluegrass       0.27      0.29      0.28        90
            blues       0.13      0.10      0.11       135
           brazil       0.05      0.03      0.04       176
        breakbeat       0.31      0.34      0.33       102
          british       0.06      0.06      0.06       120
         cantopop       0.27      0.26      0.26       105
    chicago-house       0.43      0.46      0.44       101
         children       0.28      0.32      0.30        74
            chill       0.15      0.15      0.15       

In [116]:
accuracy =accuracy_score(y_pred,y_test)           # calculating accuracy
accuracy

0.22964912280701755

We want to improve that base accuracy. Lets try grid search for an optimal depth.

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [3, 5, 7, 10]}
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_max_depth = grid_search.best_params_['max_depth']

# Use the best hyperparameters to train the pruned tree
pruned_tree = DecisionTreeClassifier(max_depth=best_max_depth)
pruned_tree.fit(X_train, y_train)

In [None]:
y_pred_pruned = pruned_tree.predict(X_test)

accuracy_pruned = accuracy_score(y_test, y_pred_pruned)
confusion_mat_pruned = confusion_matrix(y_test, y_pred_pruned)
classification_rep_pruned = classification_report(y_test, y_pred_pruned)

print(f"Accuracy: {accuracy_pruned}")
print(f"Confusion Matrix:\n{confusion_mat_pruned}")
print(f"Classification Report:\n{classification_rep_pruned}")

Accuracy still isn't great. Lets try binning some of our attributes. This has a trade off of some information loss, but having the precise value of some of our attributes isn't neccessary.

Let's bin these variables : popularity, duration_ms, danceability, energy, loudness, speechiness, acousticness, instrumentalness, liveness, valence, and tempo. We set the bins based on the data description given in the readme file.

In [None]:
#make new dataframe so that we dont mess up the old one
X_new = X.copy()

In [None]:
popularity_bins = [0, 25, 50, 75, 100]
popularity_labels = ['Very Low', 'Low', 'Moderate', 'High']

duration_bins = [0, 180000, 200000, 220000, float('inf')]
duration_labels = ['Short', 'Medium', 'Long', 'Very Long']

danceability_bins = [0, 0.5, 0.7, 0.8, 1.0]
danceability_labels = ['Low', 'Moderate', 'High', 'Very High']

energy_bins = [0, 0.4, 0.6, 0.8, 1.0]
energy_labels = ['Low', 'Moderate', 'High', 'Very High']

loudness_bins = [-float('inf'), -8.0, -6.0, -4.0, 0]
loudness_labels = ['Very Low', 'Low', 'Moderate', 'High']

speechiness_bins = [0, 0.33, 0.66, 1.0]
speechiness_labels = ['Music', 'Mixed', 'Speech']

acousticness_bins = [0, 0.2, 0.4, 0.6, 1.0]
acousticness_labels = ['Low', 'Moderate', 'High', 'Very High']

instrumentalness_bins = [0, 0.2, 0.4, 0.6, 1.0]
instrumentalness_labels = ['Low', 'Moderate', 'High', 'Very High']

liveness_bins = [0, 0.2, 0.4, 0.6, 1.0]
liveness_labels = ['Low', 'Moderate', 'High', 'Very High']

valence_bins = [0, 0.25, 0.5, 0.75, 1.0]
valence_labels = ['Low', 'Moderate', 'High', 'Very High']

tempo_bins = [0, 90, 110, 130, float('inf')]
tempo_labels = ['Very Slow', 'Slow', 'Moderate', 'Fast']

X_new['popularity_bin'] = pd.cut(X_new['popularity'], bins=popularity_bins, labels=popularity_labels)
X_new['duration_bin'] = pd.cut(X_new['duration_ms'], bins=duration_bins, labels=duration_labels)
X_new['danceability_bin'] = pd.cut(X_new['danceability'], bins=danceability_bins, labels=danceability_labels)
X_new['energy_bin'] = pd.cut(X_new['energy'], bins=energy_bins, labels=energy_labels)
X_new['loudness_bin'] = pd.cut(X_new['loudness'], bins=loudness_bins, labels=loudness_labels)
X_new['speechiness_bin'] = pd.cut(X_new['speechiness'], bins=speechiness_bins, labels=speechiness_labels)
X_new['acousticness_bin'] = pd.cut(X_new['acousticness'], bins=acousticness_bins, labels=acousticness_labels)
X_new['instrumentalness_bin'] = pd.cut(X_new['instrumentalness'], bins=instrumentalness_bins, labels=instrumentalness_labels)
X_new['liveness_bin'] = pd.cut(X_new['liveness'], bins=liveness_bins, labels=liveness_labels)
X_new['valence_bin'] = pd.cut(X_new['valence'], bins=valence_bins, labels=valence_labels)
X_new['tempo_bin'] = pd.cut(X_new['tempo'], bins=tempo_bins, labels=tempo_labels)


In [None]:
X_new.drop(['popularity', 'duration_ms', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 
            'liveness', 'valence', 'tempo'], axis=1, inplace=True)
#need numerical not categorical data
label_encoder = LabelEncoder()
X_new_encoded = X_new.apply(label_encoder.fit_transform)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_new_encoded, y, test_size=0.2, random_state=42)

param_grid = {'max_depth': [3, 5, 7, 10]}
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_max_depth = grid_search.best_params_['max_depth']

# Use the best hyperparameters to train the pruned tree
pruned_binned_tree = DecisionTreeClassifier(max_depth=best_max_depth)
pruned_binned_tree.fit(X_train, y_train)

In [None]:
y_pred_pruned = pruned_binned_tree.predict(X_test)

accuracy_pruned_binned = accuracy_score(y_test, y_pred_pruned)
confusion_mat_pruned_binned = confusion_matrix(y_test, y_pred_pruned)
classification_rep_pruned_binned = classification_report(y_test, y_pred_pruned)

print(f"Accuracy: {accuracy_pruned_binned}")
print(f"Confusion Matrix:\n{confusion_mat_pruned_binned}")
print(f"Classification Report:\n{classification_rep_pruned_binned}")

Even after binning and re-prunning our tree, we still are stuck at below 25% accuracy. Since we have seemingly exhausted our options for improving our tree, we can safely assume that decision tree is not a great algorithm for our goal. Alternatively, maybe our data just is not capable of making a good prediction overall.

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

feature_names = list(X.columns)

plt.figure(figsize=(15, 10))
plot_tree(pruned_tree, filled=True, feature_names=feature_names, class_names=list(map(str, pruned_tree.classes_)))
plt.show()

# Naive Bayes

In [None]:
df_nb = df.copy()

# Logistic Regression

In [None]:
df_lr = df.copy()

# Support Vector Machine

In [None]:
df_svm = df.copy()

# Multi-Layer Perceptron

In [None]:
df_mlp = df.copy()