In [52]:
import pandas as pd
import random
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import plot_roc_curve, roc_auc_score

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    mean_squared_error,
    accuracy_score
)

In [53]:
data = pd.read_csv("joined_data.csv")


data = data.drop(["id", "name", "release_date", "year", "mode"], axis = 1)
X = data.iloc[:,:-1]
y = data["new_genres"]

In [54]:
data["new_genres"].unique()

array(['classical', 'jazz', 'rock', 'country', 'pop', 'edm', 'hip hop',
       'alternative', 'metal'], dtype=object)

In [55]:
sss = StratifiedShuffleSplit(n_splits=1, test_size = 0.25, random_state = 42)

for train_index, test_index in sss.split(X, y):
    X_train = np.array(X)[train_index]
    X_test = np.array(X)[test_index]
    y_train = np.array(y)[train_index]
    y_test = np.array(y)[test_index]
    
X_train = preprocessing.scale(X_train)

In [63]:
clf = LogisticRegressionCV(cv = 5, random_state=0).fit(X_train, y_train)
clf.score(X_train, y_train)

test_pred = clf.predict(X_test)
train_pred = clf.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")

  _warn_prf(average, modifier, msg_start, len(result))


Train Accuracy: 0.4295477238267816
Train Error: 0.5704522761732185
Train Recall: 0.4295477238267816
Train Precision: 0.3871955114135871
-----------------------------------------
Test Accuracy: 0.025915817968004615
Test Error: 0.9740841820319954
Test Recall: 0.025915817968004615
Test Precision: 0.0006716296209507509
-----------------------------------------


In [58]:
n_classifiers = [10, 50, 100]

for n in n_classifiers:
    ada = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth = 1), 
        n_estimators = n, learning_rate = 1, random_state = 1)
    ada.fit(X_train, y_train)
    test_pred = ada.predict(X_test)
    train_pred = ada.predict(X_train)
    test_acc = (accuracy_score(test_pred, y_test))
    test_error = 1 - test_acc
    train_acc = (accuracy_score(train_pred, y_train))
    train_error = (1 - train_acc)
    test_prec = precision_score(y_test, test_pred, average='weighted')
    test_recall = recall_score(y_test, test_pred, average='weighted')
    train_prec = precision_score(y_train, train_pred, average='weighted')
    train_recall = recall_score(y_train, train_pred, average='weighted')
    print(n, "Classifiers:")
    print("Test Accuracy:", test_acc)
    print("Test Error:", test_error)
    print("Test Recall:", test_recall)
    print("Test Precision:", test_prec)
    print("-----------------------------------------")
    print("Train Accuracy:", train_acc)
    print("Train Error:", train_error)
    print("Train Recall:", train_recall)
    print("Train Precision:", train_prec)
    print("-----------------------------------------")
    print("\n")

  _warn_prf(average, modifier, msg_start, len(result))


10 Classifiers:
Test Accuracy: 0.30056136146797136
Test Error: 0.6994386385320286
Test Recall: 0.30056136146797136
Test Precision: 0.26561348850238303
-----------------------------------------
Train Accuracy: 0.39959321031026956
Train Error: 0.6004067896897305
Train Recall: 0.39959321031026956
Train Precision: 0.338863321508874
-----------------------------------------




  _warn_prf(average, modifier, msg_start, len(result))


50 Classifiers:
Test Accuracy: 0.23242139830038386
Test Error: 0.7675786016996161
Test Recall: 0.23242139830038386
Test Precision: 0.2529165382867627
-----------------------------------------
Train Accuracy: 0.4312636366998262
Train Error: 0.5687363633001739
Train Recall: 0.4312636366998262
Train Precision: 0.39378141223999774
-----------------------------------------




  _warn_prf(average, modifier, msg_start, len(result))


100 Classifiers:
Test Accuracy: 0.0668308593489982
Test Error: 0.9331691406510018
Test Recall: 0.0668308593489982
Test Precision: 0.23901465006236447
-----------------------------------------
Train Accuracy: 0.43418512628970823
Train Error: 0.5658148737102917
Train Recall: 0.43418512628970823
Train Precision: 0.3989436441934663
-----------------------------------------




In [62]:
n_trees = [50, 100, 150, 300, 500]

for n in n_trees:
    rf = RandomForestClassifier(n_estimators = n).fit(X_train, y_train)
    test_pred = rf.predict(X_test)
    train_pred = rf.predict(X_train)
    test_acc = (accuracy_score(test_pred, y_test))
    test_error = 1 - test_acc
    train_acc = (accuracy_score(train_pred, y_train))
    train_error = (1 - train_acc)
    #test_prec = precision_score(y_test, test_pred)
    #test_recall = recall_score(y_test, test_pred)
    #train_prec = precision_score(y_train, train_pred)
    #train_recall = recall_score(y_train, train_pred)
    print(n, "Estimators:")
    print("Test Accuracy:", test_acc)
    print("Test Error:", test_error)
    #print("Test Recall:", test_recall)
    #print("Test Precision:", test_prec)
    print("-----------------------------------------")
    print("Train Accuracy:", train_acc)
    print("Train Error:", train_error)
    #print("Train Recall:", train_recall)
    #print("Train Precision:", train_prec)
    print("-----------------------------------------")
    print("\n")

50 Decision Trees:
Test Accuracy: 0.2533892475981273
Test Error: 0.7466107524018727
-----------------------------------------
Train Accuracy: 0.7128878369882771
Train Error: 0.28711216301172293
-----------------------------------------


100 Decision Trees:
Test Accuracy: 0.3150724444740287
Test Error: 0.6849275555259713
-----------------------------------------
Train Accuracy: 0.712895233164454
Train Error: 0.287104766835546
-----------------------------------------


150 Decision Trees:
Test Accuracy: 0.3047771195278351
Test Error: 0.695222880472165
-----------------------------------------
Train Accuracy: 0.712895233164454
Train Error: 0.287104766835546
-----------------------------------------


300 Decision Trees:
Test Accuracy: 0.3053318245357119
Test Error: 0.694668175464288
-----------------------------------------
Train Accuracy: 0.712895233164454
Train Error: 0.287104766835546
-----------------------------------------


500 Decision Trees:
Test Accuracy: 0.30029510306419044
T

In [None]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier().fit(X_train, y_train)

test_pred = nn.predict(X_test)
train_pred = nn.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")

In [None]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier().fit(X_train, y_train)

test_pred = nn.predict(X_test)
train_pred = nn.predict(X_train)
test_acc = (accuracy_score(test_pred, y_test))
test_error = 1 - test_acc
train_acc = (accuracy_score(train_pred, y_train))
train_error = (1 - train_acc)
test_prec = precision_score(y_test, test_pred, average='weighted')
test_recall = recall_score(y_test, test_pred, average='weighted')
train_prec = precision_score(y_train, train_pred, average='weighted')
train_recall = recall_score(y_train, train_pred, average='weighted')
print("Train Accuracy:", train_acc)
print("Train Error:", train_error)
print("Train Recall:", train_recall)
print("Train Precision:", train_prec)
print("-----------------------------------------")
print("Test Accuracy:", test_acc)
print("Test Error:", test_error)
print("Test Recall:", test_recall)
print("Test Precision:", test_prec)
print("-----------------------------------------")