In [51]:
import pandas as pd
import random
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LassoCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import plot_roc_curve, roc_auc_score

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    mean_squared_error,
    accuracy_score
)

In [52]:
data = pd.read_csv("joined_data.csv")


data = data.drop(["id", "name", "release_date", "year", "mode", 'duration_ms', 'liveness', 'key'], axis = 1)
X = data.iloc[:,:-1]
y = data["new_genres"]

In [53]:
data["new_genres"].unique()

array(['classical', 'jazz', 'rock', 'country', 'pop', 'edm', 'hip hop',
       'alternative', 'metal'], dtype=object)

In [54]:
sss = StratifiedShuffleSplit(n_splits=1, test_size = 0.25, random_state = 42)

for train_index, test_index in sss.split(X, y):
    X_train = np.array(X)[train_index]
    X_test = np.array(X)[test_index]
    y_train = np.array(y)[train_index]
    y_test = np.array(y)[test_index]
    
X_train = preprocessing.scale(X_train)

In [63]:
# Log regression with Lasso penalty

clf = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
#clf = LassoCV(cv=5, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

test_pred = clf.predict(X_test)
train_pred = clf.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")

  _warn_prf(average, modifier, msg_start, len(result))


Train Accuracy: 0.4223660367589956
Train Error: 0.5776339632410044
Train Recall: 0.4223660367589956
Train Precision: 0.35918780820368224
-----------------------------------------
Test Accuracy: 0.25103729836472966
Test Error: 0.7489627016352703
Test Recall: 0.25103729836472966
Test Precision: 0.22873837863022153
-----------------------------------------


In [66]:
# Log. Regression with Cross Validation and Lasso

clf = LogisticRegressionCV(cv=5, random_state=0, penalty='l1', solver='liblinear')
#clf = LassoCV(cv=5, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

test_pred = clf.predict(X_test)
train_pred = clf.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")

  _warn_prf(average, modifier, msg_start, len(result))


Train Accuracy: 0.4106061166376983
Train Error: 0.5893938833623017
Train Recall: 0.4106061166376983
Train Precision: 0.32030464689854315
-----------------------------------------
Test Accuracy: 0.25842596906964876
Test Error: 0.7415740309303512
Test Recall: 0.25842596906964876
Test Precision: 0.2062431762427709
-----------------------------------------


In [None]:
n_classifiers = [10, 50, 100]

for n in n_classifiers:
    ada = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth = 5), 
        n_estimators = n, learning_rate = 1, random_state = 1)
    ada.fit(X_train, y_train)
    test_pred = ada.predict(X_test)
    train_pred = ada.predict(X_train)
    test_acc = (accuracy_score(test_pred, y_test))
    test_error = 1 - test_acc
    train_acc = (accuracy_score(train_pred, y_train))
    train_error = (1 - train_acc)
    test_prec = precision_score(y_test, test_pred, average='weighted')
    test_recall = recall_score(y_test, test_pred, average='weighted')
    train_prec = precision_score(y_train, train_pred, average='weighted')
    train_recall = recall_score(y_train, train_pred, average='weighted')
    print(n, "Classifiers:")
    print("Test Accuracy:", test_acc)
    print("Test Error:", test_error)
    print("Test Recall:", test_recall)
    print("Test Precision:", test_prec)
    print("-----------------------------------------")
    print("Train Accuracy:", train_acc)
    print("Train Error:", train_error)
    print("Train Recall:", train_recall)
    print("Train Precision:", train_prec)
    print("-----------------------------------------")
    print("\n")

In [56]:
# Get feature importance
n_trees = [50]#, 100, 150, 300, 500]

for n in n_trees:
    rf = RandomForestClassifier(n_estimators = n).fit(X_train, y_train)
    test_pred = rf.predict(X_test)
    train_pred = rf.predict(X_train)
    test_acc = (accuracy_score(test_pred, y_test))
    test_error = 1 - test_acc
    train_acc = (accuracy_score(train_pred, y_train))
    train_error = (1 - train_acc)
    #test_prec = precision_score(y_test, test_pred)
    #test_recall = recall_score(y_test, test_pred)
    #train_prec = precision_score(y_train, train_pred)
    #train_recall = recall_score(y_train, train_pred)
    print(n, "Estimators:")
    print("Test Accuracy:", test_acc)
    print("Test Error:", test_error)
    print("-----------------------------------------")
    print("Train Accuracy:", train_acc)
    print("Train Error:", train_error)
    print("-----------------------------------------")

    importances = rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
    indices = np.argsort(importances)[::-1]
    print("Feature ranking:")

    for f in range(X_train.shape[1]):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    print("\n")
    print(X.columns)

50 Estimators:
Test Accuracy: 0.298320353236149
Test Error: 0.701679646763851
-----------------------------------------
Train Accuracy: 0.7128360637550386
Train Error: 0.2871639362449614
-----------------------------------------
Feature ranking:
1. feature 5 (0.140858)
2. feature 0 (0.128512)
3. feature 6 (0.118112)
4. feature 1 (0.117211)
5. feature 4 (0.111341)
6. feature 2 (0.102256)
7. feature 8 (0.100891)
8. feature 7 (0.093006)
9. feature 3 (0.087813)


Index(['acousticness', 'danceability', 'energy', 'instrumentalness',
       'loudness', 'popularity', 'speechiness', 'tempo', 'valence'],
      dtype='object')


In [None]:
#np.delete(X_test, [5,6], axis = 0)
#X_test
n_trees = [50]#, 100, 150, 300, 500]

for n in n_trees:
    rf = RandomForestClassifier(n_estimators = n).fit(X_train, y_train)
    test_pred = rf.predict(X_test)
    train_pred = rf.predict(X_train)
    test_acc = (accuracy_score(test_pred, y_test))
    test_error = 1 - test_acc
    train_acc = (accuracy_score(train_pred, y_train))
    train_error = (1 - train_acc)
    print(n, "Estimators:")
    print("Test Accuracy:", test_acc)
    print("Test Error:", test_error)
    print("-----------------------------------------")
    print("Train Accuracy:", train_acc)
    print("Train Error:", train_error)
    print("-----------------------------------------")

In [None]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier().fit(X_train, y_train)

test_pred = nn.predict(X_test)
train_pred = nn.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")

In [None]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier().fit(X_train, y_train)

test_pred = nn.predict(X_test)
train_pred = nn.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")

In [67]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier().fit(X_train, y_train)

test_pred = nn.predict(X_test)
train_pred = nn.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")

  _warn_prf(average, modifier, msg_start, len(result))


Train Accuracy: 0.46833327169853184
Train Error: 0.5316667283014682
Train Recall: 0.46833327169853184
Train Precision: 0.4501132072157608
-----------------------------------------
Test Accuracy: 0.03421420488584171
Test Error: 0.9657857951141583
Test Recall: 0.03421420488584171
Test Precision: 0.0533647179980828
-----------------------------------------


In [None]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier().fit(X_train, y_train)

test_pred = nn.predict(X_test)
train_pred = nn.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")

In [None]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier().fit(X_train, y_train)

test_pred = nn.predict(X_test)
train_pred = nn.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")