In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.decomposition import PCA

### Preparing our dataset

In [42]:
# Track-level metadata (one row per track, including its top-level genre label).
fma_rock_vs_hiphop = pd.read_csv(filepath_or_buffer='fma-rock-vs-hiphop.csv')

In [43]:
# Show the raw metadata frame; the notebook renders a truncated view
# (first/last rows, elided middle columns).
fma_rock_vs_hiphop

Unnamed: 0,track_id,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
0,135,256000,1,,2008-11-26 01:43:26,2008-11-26 00:00:00,837,0,Rock,"[45, 58]",...,,2484,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1832,,0,,[],Father's Day
1,136,256000,1,,2008-11-26 01:43:35,2008-11-26 00:00:00,509,0,Rock,"[45, 58]",...,,1948,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1498,,0,,[],Peel Back The Mountain Sky
2,151,192000,0,,2008-11-26 01:44:55,,192,0,Rock,[25],...,,701,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,148,,4,,[],Untitled 04
3,152,192000,0,,2008-11-26 01:44:58,,193,0,Rock,[25],...,,637,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,98,,11,,[],Untitled 11
4,153,256000,0,Arc and Sender,2008-11-26 01:45:00,2008-11-26 00:00:00,405,5,Rock,[26],...,,354,en,Attribution-NonCommercial-NoDerivatives (aka M...,424,,2,,[],Hundred-Year Flood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17729,155063,320000,0,,2017-03-24 19:40:43,,283,3,Hip-Hop,"[21, 811]",...,,1283,,Attribution,1050,,4,,"['old school beats', '2017 free instrumentals'...",Been On
17730,155064,320000,0,,2017-03-24 19:40:44,,250,2,Hip-Hop,"[21, 811]",...,,1077,,Attribution,858,,2,,"['old school beats', '2017 free instrumentals'...",Send Me
17731,155065,320000,0,,2017-03-24 19:40:45,,219,3,Hip-Hop,"[21, 811]",...,,1340,,Attribution,1142,,1,,"['old school beats', '2017 free instrumentals'...",The Question
17732,155066,320000,0,,2017-03-24 19:40:47,,252,6,Hip-Hop,"[21, 811]",...,,2065,,Attribution,1474,,3,,"['old school beats', '2017 free instrumentals'...",Roy


In [44]:
# Per-track audio features; precise_float=True asks pandas to use its
# higher-precision string-to-float parser while decoding the JSON.
echonest_metrics = pd.read_json(
    path_or_buf='echonest-metrics.json',
    precise_float=True,
)

In [45]:
# Show the audio-feature frame (track_id plus eight numeric descriptors).
echonest_metrics

Unnamed: 0,track_id,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.159310,165.922,0.576661
1,3,0.374408,0.528643,0.817461,0.001851,0.105880,0.461818,126.957,0.269240
2,5,0.043567,0.745566,0.701470,0.000697,0.373143,0.124595,100.260,0.621661
3,10,0.951670,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.963590
4,134,0.452217,0.513238,0.560410,0.019443,0.096567,0.525519,114.290,0.894072
...,...,...,...,...,...,...,...,...,...
13124,124857,0.007592,0.790364,0.719288,0.853114,0.720715,0.082550,141.332,0.890461
13125,124862,0.041498,0.843077,0.536496,0.865151,0.547949,0.074001,101.975,0.476845
13126,124863,0.000124,0.609686,0.895136,0.846624,0.632903,0.051517,129.996,0.496667
13127,124864,0.327576,0.574426,0.548327,0.452867,0.075928,0.033388,142.009,0.569274


In [46]:
# Attach the genre label to each track's audio features.
# Inner join on track_id: only tracks present in BOTH sources are kept.
echo_tracks = echonest_metrics.merge(
    fma_rock_vs_hiphop.loc[:, ['track_id', 'genre_top']],
    how='inner',
    on='track_id',
)

In [47]:
# Show the merged frame: audio features with the genre_top label appended.
echo_tracks

Unnamed: 0,track_id,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,genre_top
0,2,0.416675,0.675894,0.634476,1.062807e-02,0.177647,0.159310,165.922,0.576661,Hip-Hop
1,3,0.374408,0.528643,0.817461,1.851103e-03,0.105880,0.461818,126.957,0.269240,Hip-Hop
2,5,0.043567,0.745566,0.701470,6.967990e-04,0.373143,0.124595,100.260,0.621661,Hip-Hop
3,134,0.452217,0.513238,0.560410,1.944269e-02,0.096567,0.525519,114.290,0.894072,Hip-Hop
4,153,0.988306,0.255661,0.979774,9.730057e-01,0.121342,0.051740,90.241,0.034018,Rock
...,...,...,...,...,...,...,...,...,...,...
4797,124718,0.412194,0.686825,0.849309,6.000000e-10,0.867543,0.367315,96.104,0.692414,Hip-Hop
4798,124719,0.054973,0.617535,0.728567,7.215700e-06,0.131438,0.243130,96.262,0.399720,Hip-Hop
4799,124720,0.010478,0.652483,0.657498,7.098000e-07,0.701523,0.229174,94.885,0.432240,Hip-Hop
4800,124721,0.067906,0.432421,0.764508,1.625500e-06,0.104412,0.310553,171.329,0.580087,Hip-Hop


In [48]:
# Print column dtypes and non-null counts to check the merged frame for missing values.
echo_tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4802 entries, 0 to 4801
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          4802 non-null   int64  
 1   acousticness      4802 non-null   float64
 2   danceability      4802 non-null   float64
 3   energy            4802 non-null   float64
 4   instrumentalness  4802 non-null   float64
 5   liveness          4802 non-null   float64
 6   speechiness       4802 non-null   float64
 7   tempo             4802 non-null   float64
 8   valence           4802 non-null   float64
 9   genre_top         4802 non-null   object 
dtypes: float64(8), int64(1), object(1)
memory usage: 375.3+ KB


### Pairwise relationships between continuous variables


In [49]:
# Keep only the numeric columns by removing the string-valued genre label.
echo_tracks_continuous = echo_tracks.drop(columns='genre_top')

In [50]:
# Pairwise Pearson correlation between every numeric column.
# NOTE(review): track_id is still included here; it looks like an identifier
# rather than a feature, so its correlations may not be meaningful - confirm.
echo_tracks_continuous.corr(method='pearson')

Unnamed: 0,track_id,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
track_id,1.0,-0.372282,0.049454,0.140703,-0.275623,0.048231,-0.026995,-0.025392,0.01007
acousticness,-0.372282,1.0,-0.028954,-0.281619,0.19478,-0.019991,0.072204,-0.02631,-0.013841
danceability,0.049454,-0.028954,1.0,-0.242032,-0.255217,-0.106584,0.276206,-0.242089,0.473165
energy,0.140703,-0.281619,-0.242032,1.0,0.028238,0.113331,-0.109983,0.195227,0.038603
instrumentalness,-0.275623,0.19478,-0.255217,0.028238,1.0,-0.091022,-0.366762,0.022215,-0.219967
liveness,0.048231,-0.019991,-0.106584,0.113331,-0.091022,1.0,0.041173,0.002732,-0.045093
speechiness,-0.026995,0.072204,0.276206,-0.109983,-0.366762,0.041173,1.0,0.008241,0.149894
tempo,-0.025392,-0.02631,-0.242089,0.195227,0.022215,0.002732,0.008241,1.0,0.052221
valence,0.01007,-0.013841,0.473165,0.038603,-0.219967,-0.045093,0.149894,0.052221,1.0


### Splitting our data


In [51]:
# Feature matrix: everything except the label and the track identifier.
X = echo_tracks.drop(columns=['genre_top', 'track_id'])
# Target vector: the genre label for each track.
y = echo_tracks['genre_top']

In [52]:
# Hold out 30% of the tracks for evaluation; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified on the label - consider
# stratify=y if the genre classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Normalizing the feature data

In [53]:
# Rescale every feature to the [0, 1] range.
# The scaler learns each feature's min/max from the TRAINING split only and the
# test split is then mapped with those same parameters. Re-fitting on the test
# split (fit_transform) would put the two splits on different scales and let
# test-set statistics leak into preprocessing.
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
# transform (not fit_transform): reuse the training min/max. Test values that
# fall outside the training range can land slightly outside [0, 1]; that is expected.
X_test = scaler.transform(X_test)

### Principal Component Analysis on our scaled data


In [60]:
# Fit a PCA with all components retained on the scaled training features,
# then keep the fraction of total variance explained by each component.
pca = PCA().fit(X_train)
explained_variance = pca.explained_variance_ratio_