In [2]:
%matplotlib inline

import pandas as pd
import numpy as np

# sklearn modules
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import plot_confusion_matrix
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier

import plotly.express as px
from statistics import mean
import matplotlib.pyplot as plt
from random import randint
from tqdm import tqdm

import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this presumably also hides pandas' SettingWithCopyWarning and
# library deprecation warnings raised by later cells -- consider narrowing it.
warnings.filterwarnings('ignore')

In [3]:
# Load the genre-labelled audio-feature table and key it by track id.
# NOTE(review): absolute, machine-specific path -- the notebook only runs on a
# machine with this exact layout; consider a path relative to a data directory.
echo_genres_csv = '//Users/gaetanoantonicchio/Documents/GitHub/DataMining-2/clean_datasets/echo_genres.csv'
echo_genres = pd.read_csv(echo_genres_csv).set_index('track_id')
echo_genres.head()

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,genre_top
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,Hip-Hop
3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,Hip-Hop
5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,Hip-Hop
10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,Pop
134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,Hip-Hop


## Binary Classification Hip-Hop - Pop

In [29]:
# Keep only the two genres used for the binary task (Hip-Hop vs Pop).
# .copy() makes the subset an independent frame: later cells add/drop columns
# on it, and doing that on a bare boolean-mask selection is what raises
# pandas' SettingWithCopyWarning.
rock_pop = echo_genres[echo_genres['genre_top'].isin(['Hip-Hop','Pop'])].copy()
rock_pop.head()

Unnamed: 0_level_0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,genre_top
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,Hip-Hop
3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,Hip-Hop
5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,Hip-Hop
10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,Pop
134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,Hip-Hop


In [30]:
# check shape: (rows, columns) of the Hip-Hop/Pop subset
rock_pop.shape

(1256, 9)

In [31]:
# class balance: number of tracks per genre in the subset
rock_pop['genre_top'].value_counts()

Hip-Hop    910
Pop        346
Name: genre_top, dtype: int64

In [32]:
# encoding labels: LabelEncoder assigns integer codes in sorted class order,
# so 0 --> Hip-Hop, 1 --> Pop (the fitted mapping is kept in le.classes_)
le = preprocessing.LabelEncoder()
# Build a new frame with assign() instead of writing a column into the
# filtered selection, which is what triggers SettingWithCopyWarning.
rock_pop = rock_pop.assign(genre_top=le.fit_transform(rock_pop['genre_top']))

In [34]:
# Build the legend from the fitted encoder so it cannot drift from the actual
# encoding (the subset holds Hip-Hop and Pop tracks, not Rock).
label_legend = '\n'.join(f'{code}: {genre}' for code, genre in enumerate(le.classes_))
print(f'legend:\n{label_legend}')
rock_pop.genre_top.value_counts()

legend:
0: Rock
1: Pop


0    910
1    346
Name: genre_top, dtype: int64

In [35]:
# Separate the target from the features: genre_labels holds the encoded genre,
# rock_pop keeps only the feature columns.
genre_labels = rock_pop['genre_top']
# Rebind to a new frame rather than dropping in place, so no frame derived
# from a filtered selection is mutated.
rock_pop = rock_pop.drop(columns='genre_top')

In [36]:
# Shuffle the tracks and hold out 30% of them as the internal test set; the
# remaining 70% forms the development set. The fixed seed keeps the partition
# reproducible.
# NOTE(review): the split is not stratified although the classes are
# imbalanced -- consider stratify=genre_labels (this would change the partition).
X_develop, X_test, y_develop, y_test = train_test_split(
    rock_pop,
    genre_labels,
    test_size=0.30,
    random_state=42,
)

In [37]:
# check partition: per-class counts in the development and internal test sets.
# The legend is built from the fitted encoder so it matches the real encoding
# (the classes are Hip-Hop and Pop, not Rock).
label_legend = '\n'.join(f'{code}: {genre}' for code, genre in enumerate(le.classes_))
print(
    f"____Legend____\n{label_legend}\n\n"
    f"Development set:\n{y_develop.value_counts()}\n\n"
    f"Internal Test:\n{y_test.value_counts()}"
)

____Legend____
0: Rock
1: Pop

Development set:
0    645
1    234
Name: genre_top, dtype: int64

Internal Test:
0    265
1    112
Name: genre_top, dtype: int64
