# Genre Classification

## 1 Introduction

### 1.1 Environment Setup

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale

### 1.2 Loading the Data

In [2]:
# load in song data
# The data directory defaults to the original location but can be redirected
# with the GENRE_DATA_DIR environment variable, so the notebook is not tied
# to a single machine's filesystem layout.
DATA_DIR = Path(os.environ.get("GENRE_DATA_DIR", "/Users/candacemckeag/Documents/School/datasci"))
country = pd.read_csv(DATA_DIR / "country.csv")
metal = pd.read_csv(DATA_DIR / "metal.csv")
rap = pd.read_csv(DATA_DIR / "rap.csv")
# tag every row with the genre of the file it came from
country['genre'] = 'country'
metal['genre'] = 'metal'
rap['genre'] = 'rap'
# stack the three per-genre frames into one
songs = pd.concat([country, metal, rap])
# drop the text identifier columns by keyword: the positional axis form
# (drop(labels, 1)) is rejected by recent pandas releases
songs = songs.drop(columns=['title', 'artist'])

In [3]:
# preview the first five rows of the combined frame
songs.head(n=5)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,0.552,0.402,11,-7.431,1,0.0262,0.676,0.0,0.0928,0.382,103.313,country
1,0.677,0.556,9,-6.544,1,0.0306,0.263,0.0,0.105,0.521,80.009,country
2,0.631,0.44,1,-5.948,1,0.028,0.328,0.0,0.239,0.418,136.033,country
3,0.533,0.907,11,-3.793,1,0.0406,0.0292,0.0,0.386,0.7,150.99,country
4,0.375,0.889,3,-3.704,1,0.0991,0.0848,0.0,0.136,0.881,199.669,country


### 1.3 Data Cleaning

We want to convert the categorical variables `key` and `mode` to the `object` dtype.

In [4]:
# inspect the column dtypes before any conversion
songs.dtypes

danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
genre                object
dtype: object

In [5]:
# re-type the two categorical columns (key, mode) as object in a single call
songs = songs.astype({"key": 'object', "mode": 'object'})

In [6]:
# confirm that key and mode are now object-typed
songs.dtypes

danceability        float64
energy              float64
key                  object
loudness            float64
mode                 object
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
genre                object
dtype: object

In [7]:
# target: the genre label; predictors: every remaining column
y = songs['genre']
X = songs.drop(columns='genre')

## 3 Model Building

We will try the following algorithms: K-Nearest Neighbors, Random Forest, Logistic Regression, Gaussian Naive Bayes, and Gradient Boosting.

In [8]:
# hold out a test partition; the seed is fixed so the split is repeatable
# NOTE(review): the visible cells below fit and score on the full X, y rather
# than on these partitions — confirm whether the hold-out set is meant to be used.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [9]:
# empty frame that collects one column of mean CV accuracy per model
results = pd.DataFrame(data=None)

### 3.1 K-Nearest-Neighbors

#### Parameter Tuning

In [10]:
# instantiate an untuned K-nearest-neighbours classifier
knn = KNeighborsClassifier()
# list its hyperparameters to decide which ones to tune
knn.get_params(deep=True)

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [11]:
# candidate neighbourhood sizes: every k from 1 to 20
knn_grid = dict(n_neighbors=list(range(1, 21)))
# exhaustive 5-fold cross-validated search over the grid
# NOTE(review): the predictors are passed unscaled, so the distance metric is
# probably dominated by wide-range columns such as tempo — confirm whether
# standardising the features first is intended.
knn_srch = GridSearchCV(estimator=knn, param_grid=knn_grid, cv=5)
knn_srch.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
# keep the winning k for the final model
bestknn = knn_srch.best_params_['n_neighbors']
# best mean cross-validated score found by the search
print(knn_srch.best_score_)
# parameter setting that produced it
print(knn_srch.best_params_)

0.498371335504886
{'n_neighbors': 12}


In [13]:
# rebuild the classifier with the tuned neighbourhood size
knn = KNeighborsClassifier(
    n_neighbors=bestknn,
)

In [14]:
# 10-fold cross-validation of the tuned KNN on the full data set
scknn = cross_val_score(estimator=knn, X=X, y=y, cv=10)

In [15]:
# mean fold score with a two-standard-deviation spread
print(
    "Accuracy: %0.2f (+/- %0.2f)"
    % (scknn.mean(), scknn.std() * 2)
)

Accuracy: 0.50 (+/- 0.02)


In [16]:
# store the mean cross-validated score for KNN as a one-row column
results['KNN'] = [scknn.mean()]

### 3.2 Random Forest

#### Parameter Tuning

In [17]:
# instantiate an untuned random forest
# NOTE(review): no random_state is passed, so repeated runs can grow
# different forests and the search below may pick different settings.
rf = RandomForestClassifier()
# list its hyperparameters to decide which ones to tune
rf.get_params(deep=True)

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [18]:
# candidate forest sizes: 10, 20, ..., 150 trees
rf_grid = dict(n_estimators=list(range(10, 151, 10)))
# exhaustive 5-fold cross-validated search over the grid
rf_srch = GridSearchCV(estimator=rf, param_grid=rf_grid, cv=5)
rf_srch.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [None]:
# keep the winning number of trees for the final model
bestrf = rf_srch.best_params_['n_estimators']
# best mean cross-validated score found by the search
print(rf_srch.best_score_)
# parameter setting that produced it
print(rf_srch.best_params_)

0.8335801006810779
{'n_estimators': 150}


In [None]:
# rebuild the forest with the tuned number of trees; the seed is fixed
# (same value as the train/test split) so the cross-validated score recorded
# for this model is repeatable from run to run
rf = RandomForestClassifier(n_estimators=bestrf, random_state=42)

In [None]:
# 10-fold cross-validation of the tuned forest on the full data set
scrf = cross_val_score(estimator=rf, X=X, y=y, cv=10)

In [None]:
# mean fold score with a two-standard-deviation spread
print(
    "Accuracy: %0.2f (+/- %0.2f)"
    % (scrf.mean(), scrf.std() * 2)
)

Accuracy: 0.84 (+/- 0.03)


In [None]:
# store the mean cross-validated score for the random forest
results['RandomForest'] = [scrf.mean()]

### 3.3 Logistic Regression

#### Parameter Tuning

In [None]:
# instantiate a multinomial logistic regression (lbfgs solver, generous
# iteration cap)
logit = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=5000,
)
# list its hyperparameters to decide which ones to tune
logit.get_params(deep=True)

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 5000,
 'multi_class': 'multinomial',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [None]:
# candidate values of C, spanning 0.001 to 1000 in powers of ten
logit_grid = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000])
# exhaustive 5-fold cross-validated search over the grid
logit_srch = GridSearchCV(estimator=logit, param_grid=logit_grid, cv=5)
logit_srch.fit(X, y)

In [None]:
# keep the winning C for the final model
bestlogit = logit_srch.best_params_['C']
# best mean cross-validated score found by the search
print(logit_srch.best_score_)
# parameter setting that produced it
print(logit_srch.best_params_)

In [None]:
# rebuild the logistic regression with the tuned value of C
logit = LogisticRegression(
    C=bestlogit,
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=5000,
)

In [None]:
# 10-fold cross-validation of the tuned logistic regression on the full data set
sclogit = cross_val_score(estimator=logit, X=X, y=y, cv=10)

In [None]:
# mean fold score with a two-standard-deviation spread
print(
    "Accuracy: %0.2f (+/- %0.2f)"
    % (sclogit.mean(), sclogit.std() * 2)
)

In [None]:
# store the mean cross-validated score for logistic regression
results['LogisticRegression'] = [sclogit.mean()]

### 3.4 Gaussian Naive Bayes

#### Parameter Tuning

In [None]:
# instantiate an untuned Gaussian naive Bayes classifier
nb = GaussianNB()
# list its hyperparameters to decide which ones to tune
nb.get_params(deep=True)

In [None]:
# candidate variance-smoothing values, one per power of ten from 1e-09 to 1e-01
nb_grid = dict(
    var_smoothing=[
        1e-09, 1e-08, 1e-07,
        1e-06, 1e-05, 1e-04,
        1e-03, 1e-02, 1e-01,
    ]
)
# exhaustive 5-fold cross-validated search over the grid
nb_srch = GridSearchCV(estimator=nb, param_grid=nb_grid, cv=5)
nb_srch.fit(X, y)

In [None]:
# keep the winning smoothing value for the final model
bestnb = nb_srch.best_params_['var_smoothing']
# best mean cross-validated score found by the search
print(nb_srch.best_score_)
# parameter setting that produced it
print(nb_srch.best_params_)

In [None]:
# rebuild the classifier with the tuned variance smoothing
nb = GaussianNB(
    var_smoothing=bestnb,
)

In [None]:
# 10-fold cross-validation of the tuned naive Bayes model on the full data set
scnb = cross_val_score(estimator=nb, X=X, y=y, cv=10)

In [None]:
# mean fold score with a two-standard-deviation spread
print(
    "Accuracy: %0.2f (+/- %0.2f)"
    % (scnb.mean(), scnb.std() * 2)
)

In [None]:
# store the mean cross-validated score for naive Bayes
results['NaiveBayes'] = [scnb.mean()]

### 3.5 Gradient Boosting

#### Parameter Tuning

In [None]:
# create model object
# The seed is fixed (same value as the train/test split) so the grid search
# that uses this estimator gives the same result on every run.
gb = GradientBoostingClassifier(random_state=42)
# get params
gb.get_params()

In [None]:
# candidate numbers of boosting stages: 50, 60, ..., 140
# NOTE(review): range(50, 150, 10) stops at 140, whereas the random-forest
# grid includes 150 — confirm the upper bound is intended.
gb_grid = dict(n_estimators=list(range(50, 150, 10)))
# exhaustive 5-fold cross-validated search over the grid
gb_srch = GridSearchCV(estimator=gb, param_grid=gb_grid, cv=5)
gb_srch.fit(X, y)

In [None]:
# keep the winning number of boosting stages for the final model
bestgb = gb_srch.best_params_['n_estimators']
# best mean cross-validated score found by the search
print(gb_srch.best_score_)
# parameter setting that produced it
print(gb_srch.best_params_)

In [None]:
# rebuild the booster with the tuned number of stages; the seed is fixed
# (same value as the train/test split) so the cross-validated score recorded
# for this model is repeatable from run to run
gb = GradientBoostingClassifier(n_estimators=bestgb, random_state=42)

In [None]:
# 10-fold cross-validation of the tuned gradient-boosting model on the full data set
scgb = cross_val_score(estimator=gb, X=X, y=y, cv=10)

In [None]:
# mean fold score with a two-standard-deviation spread
print(
    "Accuracy: %0.2f (+/- %0.2f)"
    % (scgb.mean(), scgb.std() * 2)
)

In [None]:
# store the mean cross-validated score for gradient boosting
results['GradientBoosting'] = [scgb.mean()]

## 4 Final Model

In [None]:
# display the collected mean cross-validated scores, one column per model
results

Gradient Boosting has the highest accuracy rate, so we will use this algorithm for our model.