This nb builds a classifier to predict gender from genre using a gradient boosting model.

We look at the following encoding/embeddings:
- [ ] BOW
- [ ] TFIDF
- [ ] LSI
- [ ] LDA
- [ ] Word2Vec

In [39]:
import numpy as np
import pandas as pd

from scipy import sparse

from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import re

import os
from gensim import corpora
from gensim.corpora import MmCorpus
from gensim.models import TfidfModel, LsiModel
from gensim.matutils import corpus2dense

import json

seed = 23

For gensim we need logging:

In [2]:
import logging
# gensim reports its progress through the standard logging module.
# Each field in the format string needs the leading '%' (e.g. %(asctime)s);
# without it the literal text "(asctime)s" is emitted instead of a timestamp.
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)

Import the cleaned data:

In [3]:
%store -r now
now

'2020-05-18-10-06'

In [4]:
# Load the cleaned training features and labels. `now` is the timestamp string
# restored with %store above and selects which export is read.
# Both frames are indexed by artist name so they can be joined on it later.
X_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now), index_col = ['artist'])
y_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now), index_col = ['artist'])

In [5]:
X_train.shape, y_train.shape

((12376, 2), (12376, 1))

In [6]:
X_train.head()

Unnamed: 0_level_0,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Pablo_Holman,"['pop', 'rock', 'emo_pop']",3
Bobby_Edwards,['country'],1
La_Palabra,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4
Sherrick,"['r_and_b', 'soul']",2
Allen_Collins,['southern_rock'],1


In [7]:
y_train.head()

Unnamed: 0_level_0,gender
artist,Unnamed: 1_level_1
Pablo_Holman,male
Bobby_Edwards,male
La_Palabra,male
Sherrick,male
Allen_Collins,male


### Genre Labels -- as a string

Each value of the genre column is a _string_ of comma separated genre labels using the spotify abbreviations. This function strips the brackets and commas and quotes, but leaves it as a string.

In [8]:
"""This function takes in a string of the form
appearing in the genrelist of the dataframe.
It strips the square brackets, commas, and extra quotes."""
def genrelist(string):
    """Flatten a stringified genre list into space-separated labels.

    Takes a string of the form stored in the ``genrelist`` column, e.g.
    ``"['pop', 'rock', 'emo_pop']"``, and returns ``'pop rock emo_pop'``.
    The result is still a single string, not a list.
    """
    # Square brackets are stripped from the ends only; quotes and commas are
    # removed everywhere. The space after each comma stays as the separator.
    return string.strip("[").strip("]").replace("'", "").replace(",", "")

Now we apply it to the whole column, replacing each entry of `genrelist` with its cleaned string:

In [9]:
# Clean every entry of the column, overwriting it with the space-separated string
X_train['genrelist'] = [genrelist(raw) for raw in X_train['genrelist']]

In [10]:
X_train.head()

Unnamed: 0_level_0,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Pablo_Holman,pop rock emo_pop,3
Bobby_Edwards,country,1
La_Palabra,afro_cuban_jazz son_montuno guaracha salsa_rom...,4
Sherrick,r_and_b soul,2
Allen_Collins,southern_rock,1


In [11]:
X_train.genrelist.iloc[0]

'pop rock emo_pop'

In [12]:
# Attach the gender label to each artist's features by joining on the artist
# index; the inner join keeps only artists present in both frames.
data_train = X_train.join(y_train, how = 'inner', on = 'artist')

In [13]:
data_train.head()

Unnamed: 0_level_0,genrelist,genrelist_length,gender
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,pop rock emo_pop,3,male
Bobby_Edwards,country,1,male
La_Palabra,afro_cuban_jazz son_montuno guaracha salsa_rom...,4,male
Sherrick,r_and_b soul,2,male
Allen_Collins,southern_rock,1,male


In [14]:
# Shuffle the rows: frac = 1 draws the whole frame, and the fixed random_state
# makes the order reproducible. Note this uses 13, not the notebook-wide `seed`.
data_train = data_train.sample(frac = 1, random_state = 13)

In [15]:
data_train.head()

Unnamed: 0_level_0,genrelist,genrelist_length,gender
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Julia_Kedhammar,pop dance_pop synth_pop,3,female
Debby_Ryan,indie_pop,1,female
Gepe,pop folk electro_pop indie_folk,4,male
Nash_the_Slash,progressive_rock electronic,2,male
Dhani_Harrison,alternative_rock rock,2,male


In [16]:
# Move 'artist' out of the index into an ordinary column and number the
# shuffled rows 0..n-1; that integer index is labelled 'artist_id'.
data_train = data_train.reset_index()
data_train.index.name = 'artist_id'

In [17]:
data_train.head()

Unnamed: 0_level_0,artist,genrelist,genrelist_length,gender
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Julia_Kedhammar,pop dance_pop synth_pop,3,female
1,Debby_Ryan,indie_pop,1,female
2,Gepe,pop folk electro_pop indie_folk,4,male
3,Nash_the_Slash,progressive_rock electronic,2,male
4,Dhani_Harrison,alternative_rock rock,2,male


Switched to using full genre_list, not just that for the training set

In [18]:
# Full genre vocabulary (not only the genres seen in the training set).
# The 'Unnamed: 0' column (presumably a saved row index) is dropped on load.
genre_list = pd.read_csv(
    '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now)
).drop(['Unnamed: 0'], axis = 1)

In [19]:
genre_list.head()

Unnamed: 0,genre_list
0,chilean
1,zamba
2,afro_punk_blues
3,crunk
4,spanish_guitar


Import to DataFrame genres and their frequencies: 

In [20]:
# Genre labels and their counts for the TRAINING export, one row per genre.
genre_label_counts = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_label_counts_TRAINING_{}.csv'.format(now))
# The file is read without index_col, so the default integer row index
# is kept and serves as the genre id.
genre_label_counts.index.name = 'genre_id'
# NOTE(review): 'freqency' is misspelled, but it is the actual column name; renaming it means updating every use.
genre_label_counts.columns = ['genre','freqency']

In [21]:
genre_label_counts.head(12)

Unnamed: 0_level_0,genre,freqency
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,pop,2617
1,rock,1765
2,r_and_b,1647
3,country,1613
4,hip_hop,1114
5,folk,1046
6,soul,1023
7,jazz,962
8,alternative_rock,937
9,blues,859


Create a dictionary of {genre : genre_id}

In [22]:
# Map each genre label to its row id in genre_label_counts
genre_label_id_dict = {
    genre: genre_id
    for genre_id, genre in zip(
        genre_label_counts.index.tolist(),
        genre_label_counts.genre.values.tolist(),
    )
}

In [23]:
genre_label_id_dict['hard_rock']

11

Now create a sparse data structure encoding of the genre labels:

In [24]:
# The {genre: genre_id} dict is passed as a fixed vocabulary, so the output column
# for a genre is its genre_id (checked against genre_label_counts further down).
vec = CountVectorizer(vocabulary = genre_label_id_dict) # this implementation uses scipy.sparse.csr_matrix representation

In [25]:
# Bag-of-words encoding: one row per artist (in data_train's shuffled order),
# one column per genre id, entries are genre counts.
X_sparse = vec.fit_transform(data_train.genrelist)

In [26]:
X_sparse

<12376x1353 sparse matrix of type '<class 'numpy.int64'>'
	with 33457 stored elements in Compressed Sparse Row format>

In [27]:
X_sparse.nnz #number of stored values

33457

Let's look at the nonzero entries of a row and make sure the encoding worked properly:

In [28]:
X_sparse[677].nonzero()

(array([0, 0, 0, 0], dtype=int32), array([ 0,  5, 49, 84], dtype=int32))

Convert the column ids of a row to genre labels using the genre_label_counts DF (here for row 11000):

In [29]:
# nonzero()[1] holds the column indices of the stored entries, i.e. the genre ids
# present for artist 11000; look each one up in genre_label_counts.
[genre_label_counts.loc[genre_id] for genre_id in X_sparse[11000].nonzero()[1]]

[genre       electronic
 freqency           224
 Name: 25, dtype: object,
 genre       psychedelic
 freqency             33
 Name: 134, dtype: object]

Compare to the data_train entry:

In [30]:
data_train.loc[11000]

artist                         Nick_Nicely
genrelist           psychedelic electronic
genrelist_length                         2
gender                                male
Name: 11000, dtype: object

Checks out on some examples.

Encode labels:

In [31]:
# Fit the encoder on the two gender labels; the classes come out sorted,
# so 'female' is encoded as 0 and 'male' as 1.
le = preprocessing.LabelEncoder().fit(['male', 'female'])
le.classes_

array(['female', 'male'], dtype='<U6')

In [32]:
le.transform(['female'])

array([0])

In [33]:
le.inverse_transform([1,0,1])

array(['male', 'female', 'male'], dtype='<U6')

In [34]:
# Integer-encode the gender column, keeping data_train's row order
y = le.transform(data_train['gender'].values)

### Normalization
Create a scaler to normalize the feature values. It is fit on the sparse training matrix and can then be applied to both training and test data.

In [35]:
# MaxAbsScaler divides each column by its maximum absolute value, so zeros stay
# zero and the matrix stays sparse. (StandardScaler would need with_mean = False
# on sparse input.)
# copy is left at its default (True): X_sparse is reused further down as the
# un-normalized input, so transform() must never be allowed to rescale it in place.
transformer = preprocessing.MaxAbsScaler().fit(X_sparse)

In [36]:
transformer.scale_, transformer.max_abs_

(array([2., 1., 1., ..., 1., 1., 1.]), array([2., 1., 1., ..., 1., 1., 1.]))

Apply the scaler to the training data:

In [37]:
# Scale the training matrix with the fitted scaler.
# NOTE(review): the cells visible below fit and validate on X_sparse, not X_scaled.
X_scaled = transformer.transform(X_sparse)

In [38]:
X_scaled.shape

(12376, 1353)

## Create the model: GBT

In [55]:
# Gradient-boosted trees: 1000 boosting stages with learning rate 0.1.
# random_state = seed keeps the estimator's randomness reproducible.
model = GradientBoostingClassifier(n_estimators = 1000, learning_rate = 0.1, random_state = seed)

In [56]:
# Fit on the whole (unscaled) training matrix and report accuracy on that same
# data, i.e. an in-sample figure, not a validation score.
model.fit(X_sparse, y).score(X_sparse, y)

0.781431803490627

### Cross validation.

This function can be run on the normalized and un-normalized data:

In [57]:
def train_validate(x_data, y_data, estimator = None, baseline_pct = 69):
    """Run shuffled 10-fold cross-validation and print summary statistics.

    Parameters
    ----------
    x_data : array or sparse matrix
        Feature rows; must support row selection with an integer index
        array (``x_data[idx]``).
    y_data : array
        Labels, indexable the same way as ``x_data``.
    estimator : classifier with ``fit``/``predict``, optional
        Model to evaluate. Defaults to the notebook-level ``model``. ``fit``
        is called on it once per fold, so it is left fitted on the last
        training fold.
    baseline_pct : float, optional
        Reference accuracy in percent for the printed relative improvement.
        Defaults to 69, the value that used to be hard-coded (presumably
        the majority-class share of the labels -- confirm against the data).

    Returns
    -------
    (cvscores, cms) : tuple of lists
        Per-fold validation accuracies and per-fold confusion matrices.
    """
    if estimator is None:
        estimator = model  # fall back to the notebook-level classifier

    kf = KFold(n_splits = 10, shuffle = True, random_state = seed)

    cvscores = []
    cms = []
    for train, val in kf.split(x_data, y_data):
        estimator.fit(x_data[train], y_data[train])

        # Predict once per fold and derive both the accuracy and the
        # confusion matrix from the same predictions.
        y_val = y_data[val]
        y_pred = estimator.predict(x_data[val])
        cvscores.append(float(np.mean(y_pred == y_val)))
        cms.append(confusion_matrix(y_val, y_pred))

    mean_pct = 100*np.mean(cvscores)
    print(f'Mean accuracy is {mean_pct:.2f}% and 100*STD is {100*np.std(cvscores):.2f}%')
    print(f'This is a {100*(mean_pct-baseline_pct)/baseline_pct:.2f}% improvement over a random guess.')

    return cvscores, cms

Without normalization:

In [58]:
# Cross-validate on the raw genre counts (X_sparse), i.e. without the MaxAbsScaler scaling.
cvscores, cms = train_validate(X_sparse, y)

Mean accuracy is 73.79% and 100*STD is 1.35%
This is a 6.94% improvement over a random guess.
