<a href="https://colab.research.google.com/github/datascience-uniandes/classification_tutorial/blob/master/music/music.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification: Estimate the genre of a song based on some intrinsic attributes (binary)

MINE-4101: Applied Data Science  
Universidad de los Andes  
  
Last update: October, 2023

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Show up to 100 columns when rendering wide DataFrames.
pd.set_option('display.max_columns', 100)

### Reading the dataset

In [None]:
# Load the genre dataset from the local data folder into a DataFrame.
music_df = pd.read_csv(filepath_or_buffer = './data/msd_genre_dataset.txt')

In [None]:
# Dimensions of the loaded dataset as (rows, columns).
music_df.shape

In [None]:
# Data type pandas inferred for each column.
music_df.dtypes

In [None]:
# Preview the first rows of the dataset.
music_df.head()

In [None]:
# Relative frequency of each genre across the full dataset
# (normalize = True returns proportions instead of raw counts).
music_df['genre'].value_counts(normalize = True)

In [None]:
# Keep only the two genres used for the binary classification task.
genres = ['jazz and blues', 'soul and reggae']
# .copy() makes the filtered frame independent of music_df, so it can be
# modified afterwards without triggering pandas' SettingWithCopyWarning or
# depending on whether the selection returned a view.
music_df_filter = music_df.loc[music_df['genre'].isin(genres)].copy()

In [None]:
# Dimensions of the two-genre subset as (rows, columns).
music_df_filter.shape

In [None]:
# Encode the genre labels as the binary target:
#   'jazz and blues' -> 0, 'soul and reggae' -> 1.
# The encoded column is built in one pass and attached to a new frame instead
# of writing integers in place into the filtered slice, which avoids pandas'
# chained-assignment warning.
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on already-encoded data is a no-op.
# NOTE(review): newer pandas string dtypes may reject writing ints into a
# string column in place; building a new column sidesteps that. Confirm
# against the pandas version in use.
genre_codes = {'jazz and blues': 0, 'soul and reggae': 1}
music_df_filter = music_df_filter.assign(
    genre = music_df_filter['genre'].map(lambda label: genre_codes.get(label, label))
)

In [None]:
# Class balance of the encoded target within the two-genre subset
# (proportions, not raw counts).
music_df_filter['genre'].value_counts(normalize = True)

### Splitting train and test datasets

In [None]:
# Feature matrix: every column from position 4 onward. The first four columns
# are left out (presumably the label and identifier/text fields; TODO confirm
# against the dataset header).
feature_columns = music_df_filter.columns[4:]
X = music_df_filter[feature_columns]

In [None]:
# Target vector: the 'genre' column of the two-genre subset.
Y = music_df_filter['genre']

In [None]:
# Hold out 30% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both splits. The fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.3, stratify = Y, random_state = 42
)

In [None]:
# Class proportions in the training split (should mirror the full subset
# because the split is stratified).
Y_train.value_counts(normalize = True)

In [None]:
# Class proportions in the test split (should mirror the full subset
# because the split is stratified).
Y_test.value_counts(normalize = True)

### Preprocessing feature matrix

In [None]:
# Learn the per-feature mean and standard deviation from the training data
# only, then standardize the training features with those statistics.
# X_train is overwritten with the scaled array.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [None]:
# Scale the test features with the scaler already fitted on the training data
# (transform only, no re-fit), so no test-set statistics leak into training.
# NOTE: X_test is overwritten, so re-running this cell on its own would scale
# the data a second time.
X_test = scaler.transform(X_test)

### Training a Logistic Regression model

In [None]:
# Logistic regression classifier with scikit-learn's default settings.
logmodel = LogisticRegression()

In [None]:
# Fit on the scaled training features; the labels are cast to int so
# scikit-learn receives integer class labels.
logmodel.fit(X_train, Y_train.astype(int))

In [None]:
# Predicted class labels for the scaled test features.
logmodel_predictions = logmodel.predict(X_test)

In [None]:
# Confusion matrix of the logistic regression predictions on the test set.
# normalize = 'all' expresses each cell as a fraction of all test samples.
ConfusionMatrixDisplay.from_predictions(Y_test.astype(int), logmodel_predictions, normalize = 'all')

In [None]:
# Report precision, recall and F1 of the logistic regression model on the
# test set. The true labels are cast to int once and reused for every metric.
y_true = Y_test.astype(int)
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(metric_label, metric_fn(y_true, logmodel_predictions))

### Training a Neural Network model

In [None]:
# Multi-layer perceptron classifier with scikit-learn's default architecture.
# Weight initialization and batch sampling are random, so a fixed random_state
# keeps the trained model, and its reported metrics, reproducible between runs.
neuralmodel = MLPClassifier(random_state = 42)

In [None]:
# Fit on the scaled training features; the labels are cast to int so
# scikit-learn receives integer class labels.
neuralmodel.fit(X_train, Y_train.astype(int))

In [None]:
# Predicted class labels for the scaled test features.
neuralmodel_predictions = neuralmodel.predict(X_test)

In [None]:
# Confusion matrix of the neural network predictions on the test set.
# normalize = 'all' expresses each cell as a fraction of all test samples.
ConfusionMatrixDisplay.from_predictions(Y_test.astype(int), neuralmodel_predictions, normalize = 'all')

In [None]:
# Report precision, recall and F1 of the neural network model on the test
# set. The true labels are cast to int once and reused for every metric.
y_true = Y_test.astype(int)
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(metric_label, metric_fn(y_true, neuralmodel_predictions))

### Training a Support Vector Machine model

In [None]:
# Support vector classifier with scikit-learn's default settings.
svmmodel = SVC()

In [None]:
# Fit on the scaled training features; the labels are cast to int so
# scikit-learn receives integer class labels.
svmmodel.fit(X_train,Y_train.astype(int))

In [None]:
# Predicted class labels for the scaled test features.
svmmodel_predictions = svmmodel.predict(X_test)

In [None]:
# Confusion matrix of the SVM predictions on the test set.
# normalize = 'all' expresses each cell as a fraction of all test samples.
ConfusionMatrixDisplay.from_predictions(Y_test.astype(int), svmmodel_predictions, normalize = 'all')

In [None]:
# Report precision, recall and F1 of the SVM model on the test set.
# The true labels are cast to int once and reused for every metric.
y_true = Y_test.astype(int)
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
):
    print(metric_label, metric_fn(y_true, svmmodel_predictions))