# MALIS Project - Music Genre Classification

## Logistic Regression

The aim of this IPython notebook is to implement a logistic regression model that classifies music by genre, using the dataset we built.

In [867]:
import numpy as np
from numpy import linalg
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler


import pandas as pd
import numpy as np
import utils
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt

In [868]:
#This function selects only the rows belonging to a given set of genres.
#The first argument is the dataset DataFrame.
#The second argument is a list of strings naming the genres to keep; the available choices are:
#blues classical country disco hiphop jazz metal pop reggae rock

def data_set_select_genre(df, selected_genres):
    """Return a copy of ``df`` restricted to the rows of the requested genres.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with a ``'genre'`` column holding one genre label per row.
    selected_genres : str or iterable of str
        Genre label(s) to keep, e.g. ``['classical', 'pop']``. A single
        string is treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows whose genre is in
        ``selected_genres``. Row order and index labels are preserved and
        ``df`` itself is left untouched.

    Notes
    -----
    Rows are selected with a boolean mask rather than dropped by index label,
    so a frame with repeated index labels does not lose unrelated rows, and
    rows whose genre is missing or not one of the ten known genres are
    filtered out as well.
    """
    if isinstance(selected_genres, str):
        # Avoid iterating over the characters of a bare string.
        selected_genres = [selected_genres]
    keep = df['genre'].isin(list(selected_genres))
    return df.loc[keep].copy()

#### Two genres classification

We start by importing our dataset, rescaling the features with sklearn's StandardScaler, and shuffling and splitting the data into a training set and a testing set. We also use the function defined above to select the genres we want to classify. For this section, we classify between two genres.

In [869]:
# Load the dataset, keep two genres, standardise the features and split into train / test sets.

data_set = pd.read_csv('../csv_files/Music_data_set.csv')
genres_df = data_set_select_genre(data_set, ['classical', 'pop'])  # the two genres to separate

# Columns rescaled by StandardScaler ('flatness' is not in this list, so it keeps its raw scale).
# NOTE(review): the scaler is fitted on every row before the train/test split, so the test rows
# contribute to the scaling statistics -- consider fitting it on the training split only.
scaled_cols = ['mean_mfccs', 'mean_chroma_stft', 'tempo', 'pulse', 'contrast', 'zero_crossing']
sc = StandardScaler()
genres_df[scaled_cols] = sc.fit_transform(genres_df[scaled_cols])

# Target is the genre label; the features are every column except 'Unnamed: 0', the file name and the label.
y = genres_df['genre']
x = genres_df.drop(columns=['Unnamed: 0', 'song', 'genre'])

# 80 % / 20 % split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=27)

print(genres_df.dtypes)  # data type of each column of the table
genres_df.head()

Unnamed: 0            int64
song                 object
mean_mfccs          float64
mean_chroma_stft    float64
tempo               float64
pulse               float64
flatness            float64
contrast            float64
zero_crossing       float64
genre                object
dtype: object


Unnamed: 0.1,Unnamed: 0,song,mean_mfccs,mean_chroma_stft,tempo,pulse,flatness,contrast,zero_crossing,genre
100,100,classical.00000.wav,-0.442585,-0.9299,-1.275904,0.43754,0.000904,1.315589,-0.174364,classical
101,101,classical.00001.wav,-0.846283,-1.374626,-0.126886,0.334646,0.000324,2.055785,-0.42105,classical
102,102,classical.00002.wav,-0.24023,-0.625866,-1.081455,0.597942,0.001094,1.118003,-0.367396,classical
103,103,classical.00003.wav,-0.620142,-1.145891,0.852803,-0.408544,0.000453,2.249984,0.079313,classical
104,104,classical.00004.wav,-0.824075,-1.296391,-0.643944,0.189608,0.00074,1.985205,0.171851,classical


In [870]:
#Here, every feature enters the model linearly; you can change which features are taken into account
def prepare_data(ds, feature_cols=None):
    """Build the design matrix used by the logistic regression.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame containing (at least) the feature columns to use.
    feature_cols : list of str, optional
        Names of the columns to use as features, in the order they should
        appear in the matrix. Defaults to the seven audio features
        ``mean_mfccs, mean_chroma_stft, tempo, pulse, flatness, contrast,
        zero_crossing``, as before.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(ds), 1 + len(feature_cols))`` whose first
        column is the constant dummy feature ``x_0 = 1`` and whose remaining
        columns are the selected features (degree-1 terms only).

    Raises
    ------
    KeyError
        If one of the requested columns is missing from ``ds``.
    """
    if feature_cols is None:
        feature_cols = ['mean_mfccs', 'mean_chroma_stft', 'tempo', 'pulse',
                        'flatness', 'contrast', 'zero_crossing']

    X = ds[list(feature_cols)].to_numpy(dtype=float)

    # We add the dummy x_0: a column of ones in front of the features.
    # NOTE(review): if the downstream model also fits its own intercept, this
    # constant column is redundant with it -- confirm against utils.fit_logreg.
    x_0 = np.ones((X.shape[0], 1))
    return np.hstack([x_0, X])

In [871]:
# Build the design matrices (dummy x_0 column + features) for the training and testing splits
X_clf = prepare_data(x_train)
X_clf_test = prepare_data(x_test)


In [872]:
#Fit the classifier with fit_logreg from the utils.py file (not shown here; according to the original note it
#relies on sklearn's LogisticRegression).
#The returned object is used below as a fitted model: intercept_, coef_ and predict() are read from it.
logreg = utils.fit_logreg(X_clf, y_train)

In [873]:
# Display the fitted intercept (W_0) and weights (W).
# NOTE(review): X_clf already contains the constant column added by prepare_data, so coef_ carries a
# weight for that column in addition to intercept_ -- confirm that fitting both is intended.
print('*************** Estimated parameters: ***********************')
print('[W_0,W] : [',logreg.intercept_,',', logreg.coef_, ']' )

*************** Estimated parameters: ***********************
[W_0,W] : [ [-8.96760671] , [[-8.96760671e+00  6.01148379e+00 -8.20099850e+00 -3.19912133e+00
   2.31539670e+00  1.24021128e+03 -4.02657404e-01 -7.16354921e+00]] ]


In [874]:
def predict_and_test(model, X_test, y_test):
    '''
    Predicts with the given model and evaluates the accuracy of the predictions.

    Inputs:
        model  -- any object exposing predict(X) that returns one label per sample
        X_test -- input dataset passed unchanged to model.predict
        y_test -- corresponding targets (ground truth): list, array or pandas Series

    Returns the classification accuracy as a float in [0, 1], i.e. the fraction
    of samples whose predicted label equals the target (compared by position,
    so the index of a pandas Series is ignored).

    Raises ValueError if there are no samples, or if the number of predictions
    differs from the number of targets.
    '''
    # Convert both sides once (the targets were previously re-converted on every iteration).
    y_hat = np.array(model.predict(X_test)).ravel()
    y_true = np.array(y_test).ravel()

    if y_hat.shape[0] != y_true.shape[0]:
        raise ValueError('Number of predictions (%d) does not match number of targets (%d)'
                         % (y_hat.shape[0], y_true.shape[0]))
    if y_hat.shape[0] == 0:
        raise ValueError('Cannot compute an accuracy on an empty dataset')

    # Element-wise comparison; count the matches and normalise by the sample count.
    correct = int(np.count_nonzero(y_hat == y_true))
    acc = correct / len(y_hat)

    return acc

In [875]:
# Accuracy measured on the same data the model was fitted on (X_clf, y_train)
acc = predict_and_test(logreg, X_clf, y_train)

print('*******************  Training accuracy (genre identification) ***************************')
print('ACC: ', acc)
print('*******************************************************************************************')

*******************  Training accuracy (genre identification) ***************************
ACC:  1.0
*******************************************************************************************


In [876]:
# Accuracy on the held-out test split, which was not used to fit the classifier
print('******************  Testing accuracy *********************')
print('ACC: ', predict_and_test(logreg,X_clf_test, y_test))
print('*******************************************************************************************')

******************  Testing accuracy *********************
ACC:  0.975
*******************************************************************************************


The table below is a matrix of the accuracies obtained by running the code above for every pair ['genre1', 'genre2'].
The accuracies are generally high, ranging between 0.65 and 1.0 with an average of 0.88. We can notice, however, that some genres are more easily distinguished from the others; for example, classical music always obtains very high accuracies. On the other hand, some genres are hard to tell apart from the others, for instance rock music. This encouraged us to consider the "one versus all" case that we implemented for the neural network.


<img src="../images/logistic_regression_2genres.jpg" width=500 />

#### Multi genres classification

Here, we apply our logistic regression to more than two genres. The accuracy decreases as we add more genres, which is understandable.

In [877]:
# Load the dataset, keep four genres, standardise the features and split into train / test sets.

data_set = pd.read_csv('../csv_files/Music_data_set.csv')
genres_df = data_set_select_genre(data_set, ['classical', 'metal', 'blues', 'country'])  # genres to classify

# Columns rescaled by StandardScaler ('flatness' is not in this list, so it keeps its raw scale).
# NOTE(review): the scaler is fitted on every row before the train/test split, so the test rows
# contribute to the scaling statistics -- consider fitting it on the training split only.
scaled_cols = ['mean_mfccs', 'mean_chroma_stft', 'tempo', 'pulse', 'contrast', 'zero_crossing']
sc = StandardScaler()
genres_df[scaled_cols] = sc.fit_transform(genres_df[scaled_cols])

# Target is the genre label; the features are every column except 'Unnamed: 0', the file name and the label.
y_3 = genres_df['genre']
x_3 = genres_df.drop(columns=['Unnamed: 0', 'song', 'genre'])

# 80 % / 20 % split with a fixed seed so the split is reproducible.
x_train_3, x_test_3, y_train_3, y_test_3 = train_test_split(x_3, y_3, test_size=0.2, random_state=27)

genres_df.head()

Unnamed: 0.1,Unnamed: 0,song,mean_mfccs,mean_chroma_stft,tempo,pulse,flatness,contrast,zero_crossing,genre
0,0,blues.00000.wav,0.674759,-0.075346,-0.024087,1.448547,0.004498,-0.57633,-0.321842,blues
1,1,blues.00001.wav,0.35379,-0.175293,-0.749397,1.310258,0.002298,-0.498334,-0.949518,blues
2,2,blues.00002.wav,0.529241,0.072256,1.78919,1.20666,0.002631,0.295763,-0.478586,blues
3,3,blues.00003.wav,0.374229,0.520514,2.296907,1.095834,0.000954,-0.106795,-1.477914,blues
4,4,blues.00004.wav,-0.999112,-0.52812,0.586701,-0.043627,0.003238,-0.08588,0.105941,blues


In [881]:
# Build the design matrices (dummy x_0 column + features) for the multi-genre training and testing splits
X_clf_3 = prepare_data(x_train_3)
X_clf_test_3 = prepare_data(x_test_3)

In [882]:
#Fit the classifier with fit_logreg from the utils.py file (not shown here; according to the original note it
#relies on sklearn's LogisticRegression).
#The returned object is used below as a fitted model: intercept_, coef_ and predict() are read from it.
logreg_3 = utils.fit_logreg(X_clf_3, y_train_3)



In [883]:
# Display the fitted intercepts (W_0) and weights (W) of the multi-genre model.
# NOTE(review): X_clf_3 already contains the constant column added by prepare_data, so coef_ carries
# a weight for that column in addition to intercept_ -- confirm that fitting both is intended.
print('*************** Estimated parameters: ***********************')
print('[W_0,W] : [',logreg_3.intercept_,',', logreg_3.coef_, ']' )

*************** Estimated parameters: ***********************
[W_0,W] : [ [ 0.09444098 -1.05568233 -1.7166839  -1.51480931] , [[ 9.44409785e-02  8.46196438e-01  2.07627926e-01  1.25632316e-01
   4.62997097e-01 -2.62780825e+02  1.15771020e-01  3.95117462e-02]
 [-1.05568233e+00 -3.43510076e+00 -5.41530614e+00  3.35656783e-01
  -1.95839300e+00 -5.26720172e+02 -3.33075237e+00  5.00769457e+00]
 [-1.71668390e+00  1.08659183e+00 -1.23097989e+00 -2.68160901e-01
   8.08421427e-01  1.85378040e+02  5.56457129e-01 -2.15968282e+00]
 [-1.51480931e+00 -1.77925716e-01  4.11803726e+00  1.69914004e-01
   2.68890212e-02 -7.46444312e+01 -6.07831174e-01  1.58929297e+00]] ]


In [884]:
# Accuracy measured on the same data the model was fitted on (X_clf_3, y_train_3)
acc_3 = predict_and_test(logreg_3, X_clf_3, y_train_3)

print('*******************  Training accuracy (genre identification) ***************************')
print('ACC: ', acc_3)
print('*******************************************************************************************')

*******************  Training accuracy (genre identification) ***************************
ACC:  0.79375
*******************************************************************************************


In [885]:
# Accuracy on the held-out test split, which was not used to fit the classifier
print('******************  Testing accuracy *********************')
print('ACC: ', predict_and_test(logreg_3,X_clf_test_3, y_test_3))
print('*******************************************************************************************')

******************  Testing accuracy *********************
ACC:  0.725
*******************************************************************************************


#### Ten genres classification

Here, we apply the logistic regression algorithm to the whole dataset. We obtain an accuracy of 48%, which is not spectacular, but considering that there are ten different classes, it is at least much better than the dummy model.

In [886]:
# Load the full dataset (all genres), standardise the features and split into train / test sets.

genres_df_all = pd.read_csv('../csv_files/Music_data_set.csv')

# Columns rescaled by StandardScaler ('flatness' is not in this list, so it keeps its raw scale).
# NOTE(review): the scaler is fitted on every row before the train/test split, so the test rows
# contribute to the scaling statistics -- consider fitting it on the training split only.
scaled_cols = ['mean_mfccs', 'mean_chroma_stft', 'tempo', 'pulse', 'contrast', 'zero_crossing']
sc = StandardScaler()
genres_df_all[scaled_cols] = sc.fit_transform(genres_df_all[scaled_cols])

# Target is the genre label; the features are every column except 'Unnamed: 0', the file name and the label.
y2 = genres_df_all['genre']
x2 = genres_df_all.drop(columns=['Unnamed: 0', 'song', 'genre'])

# 80 % / 20 % split with a fixed seed so the split is reproducible.
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.2, random_state=27)

genres_df_all.head()


Unnamed: 0.1,Unnamed: 0,song,mean_mfccs,mean_chroma_stft,tempo,pulse,flatness,contrast,zero_crossing,genre
0,0,blues.00000.wav,0.498907,-0.350137,0.093269,1.249743,0.004498,0.001767,-0.493125,blues
1,1,blues.00001.wav,0.110828,-0.462482,-0.642775,1.103971,0.002298,0.070351,-1.138619,blues
2,2,blues.00002.wav,0.322963,-0.184224,1.933379,0.994767,0.002631,0.768619,-0.654318,blues
3,3,blues.00003.wav,0.135541,0.319639,2.448609,0.877943,0.000954,0.414641,-1.682014,blues
4,4,blues.00004.wav,-1.524946,-0.859077,0.713096,-0.32318,0.003238,0.433031,-0.053199,blues


In [887]:
# Build the design matrices (dummy x_0 column + features) for the ten-genre training and testing splits
X_clf2 = prepare_data(x_train2)
X_clf_test2 = prepare_data(x_test2)

# Fit the classifier on the training split with the helper from utils.py
logreg2 = utils.fit_logreg(X_clf2, y_train2)




In [888]:
def dummy_model(X, label='blues'):
    '''
    Baseline "classifier": predicts the same genre for every sample, no matter the input.

    Inputs:
        X     -- the input dataset; only its length is used
        label -- the genre to predict for every sample (defaults to 'blues')

    Returns a numpy array of len(X) copies of label.
    '''
    return np.array([label] * len(X))

In [889]:
# Baseline: the dummy model predicts the same genre for every test sample.
# Its accuracy is measured against the TRUE test labels. Scoring logreg2's predictions against the
# dummy output (as was done before) only measures how often logreg2 predicts that genre.
y_test_dummy = dummy_model(X_clf_test2)   # constant predictions, one per test sample
acc_dummy = float((y_test_dummy == np.array(y_test2)).mean())

# Accuracy of the fitted logistic regression on the same test split.
acc_test = predict_and_test(logreg2, X_clf_test2, y_test2)
print('******************  Testing accuracy *********************')
print('ACC multinomial: ', acc_test)
print('ACC dummy: ', acc_dummy)


******************  Testing accuracy *********************
ACC multinomial:  0.48
ACC dummy:  0.1
