This notebook builds a classifier to predict gender from genre using a gradient boosted trees (GBT) model.

We look at the following encodings/embeddings:
- [ ] BOW
- [ ] TFIDF
- [ ] LSI
- [ ] LDA
- [ ] Word2Vec

In [1]:
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

# import matplotlib.pyplot as plt
# import seaborn as sns; sns.set()

# import re

import os
# from gensim import corpora
# from gensim.corpora import MmCorpus
# from gensim.models import TfidfModel, LsiModel
# from gensim.matutils import corpus2dense

# import json

seed = 23

In [2]:
import genre_data_loader, genre_upperbound

In [3]:
# restore the stored timestamp `now`, which identifies the latest version of the data set
%store -r now

# All four model-ready CSV files sit in the same directory; only the split name
# and the dataset version stamp `now` vary.
_model_data_dir = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model'

X_path_train = f'{_model_data_dir}/wiki-kaggle_X_train_{now}.csv'
y_path_train = f'{_model_data_dir}/wiki-kaggle_y_train_{now}.csv'
X_path_test = f'{_model_data_dir}/wiki-kaggle_X_test_{now}.csv'
y_path_test = f'{_model_data_dir}/wiki-kaggle_y_test_{now}.csv'

In [4]:
now

'2020-07-07-09-58'

In [5]:
# Build three loader objects for the dataset version `now`.
# 1) combined: given both the train and the test file paths
genre_data = genre_data_loader.LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train,
                                                  X_path_test = X_path_test, y_path_test = y_path_test)
# 2) train only: given just the train file paths
genre_data_train = genre_data_loader.LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train)
# 3) test only: the test file paths are passed through the *_train keyword arguments
# NOTE(review): presumably so the loader treats the test split as its only data set -- confirm in genre_data_loader
genre_data_test = genre_data_loader.LoadGenreData(now, X_path_train = X_path_test, y_path_train = y_path_test)

In [6]:
# get the train, test and combined data sets via as_strings()
# (the rows displayed below have the columns genrelist_length, gender and genre_string)
data_train = genre_data_train.as_strings()
data_test = genre_data_test.as_strings()
data = genre_data.as_strings()
# create list of all genres, taken from the combined (train + test) loader
# NOTE(review): the same call is repeated further down as `genre_list`;
# `list_of_genres` is not used again in the visible cells
list_of_genres = genre_data.get_list_of_genres()

In [7]:
data_train.shape, data_test.shape, data.shape

((12376, 3), (3094, 3), (15470, 3))

In [8]:
data_train.head()

Unnamed: 0_level_0,genrelist_length,gender,genre_string
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,3,male,rock emo_pop pop
Bobby_Edwards,1,male,country
La_Palabra,4,male,salsa_romántica son_montuno afro_cuban_jazz gu...
Sherrick,2,male,r_and_b soul
Allen_Collins,1,male,southern_rock


Don't randomize yet!

In [9]:
# data = data.sample(frac = 1, random_state = 13)
# data_train = data_train.sample(frac = 1, random_state = 13)
# data_test = data_test.sample(frac = 1, random_state = 13)

In [10]:
# data_train.reset_index(inplace = True)
# data_train.index.name = 'artist_id'

- [ ] should we put the dict in order of frequency?

In [11]:
genre_list = genre_data.get_list_of_genres()

In [12]:
genre_list[:11]

['1960s',
 '2_step',
 '2_step_garage',
 '2_tone',
 'a_cappella',
 'aaa',
 'abakuá',
 'aboriginal_country',
 'ac',
 'acapella',
 'acid_folk']

Create dictionaries mapping {genre : genre_id} and {genre_id : genre}:

In [13]:
dict_gid = genre_data.get_dict_genre_to_id()
dict_idg = genre_data.get_dict_id_to_genre()

Now create a sparse data structure encoding of the genre labels:

In [14]:
# CountVectorizer with a fixed vocabulary: dict_gid supplies the term -> column index mapping.
# NOTE(review): `vec` is not used in the visible cells -- X_sparse below is obtained from the loader instead.
vec = CountVectorizer(vocabulary = dict_gid) # this implementation uses scipy.sparse.csr_matrix representation

In [15]:
# sparse feature matrix of the training split, built by the loader
# (displayed below as a 12376 x 1350 int64 CSR matrix)
X_sparse = genre_data_train.get_sparse_X_vector()

In [16]:
X_sparse

<12376x1350 sparse matrix of type '<class 'numpy.int64'>'
	with 33461 stored elements in Compressed Sparse Row format>

Encode labels:

In [17]:
# Encode the gender labels as integers. As the outputs below show, the fitted
# classes are ordered ['female', 'male'], so 'female' -> 0 and 'male' -> 1.
le = preprocessing.LabelEncoder()
le.fit(['male', 'female'])
le.classes_

array(['female', 'male'], dtype='<U6')

In [18]:
le.transform(['female'])

array([0])

In [19]:
le.inverse_transform([1,0,1])

array(['male', 'female', 'male'], dtype='<U6')

In [20]:
# integer-encoded training labels (0 = female, 1 = male with the encoder fitted above)
y = le.transform(data_train.gender.values)

### Normalization
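Create a scaler to normalize the feature values; it is fit on the training data and can then be applied to both training and test data. (The inputs are kept as a sparse matrix; the earlier conversion to a dense numpy array is commented out below.)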

In [21]:
# Earlier alternatives, kept for reference: stack the vectors into a dense array,
# or standardize without centering.
# X = X_train.vector.values.tolist()
# X = np.stack(X, axis = 0)
#scaler = preprocessing.StandardScaler(with_mean = False).fit(X_sparse) # need with_mean = False for sparse data
# Current approach: fit a MaxAbsScaler (scales each feature by its maximum absolute value)
# on the sparse training matrix.
# NOTE(review): copy = False requests in-place scaling where possible -- confirm whether
# transformer.transform(X_sparse) can modify X_sparse itself.
transformer = preprocessing.MaxAbsScaler(copy = False).fit(X_sparse)

In [22]:
transformer.scale_, transformer.max_abs_

(array([1., 1., 1., ..., 1., 1., 1.]), array([1., 1., 1., ..., 1., 1., 1.]))

Apply the scaler to the training data:

In [23]:
X_scaled = transformer.transform(X_sparse)

In [24]:
X_scaled.shape

(12376, 1350)

## Create the model: GBT

In [25]:
# Gradient boosted trees classifier with 1000 boosting stages; random_state is set
# to the notebook-wide `seed` so repeated runs use the same random state.
model = GradientBoostingClassifier(n_estimators = 1000, learning_rate = 0.1, random_state = seed)

In [26]:
# Fit on the whole training matrix X_sparse (not X_scaled) and score on that same data:
# the value shown is training accuracy, not an estimate of performance on unseen data.
model.fit(X_sparse,y)
model.score(X_sparse, y)

0.7816742081447964

### Cross validation.

This function can be run on the normalized and un-normalized data:

In [27]:
def train_validate(x_data, y_data, n_splits = 10, baseline_pct = 69, estimator = None):
    """Run shuffled K-fold cross-validation and print accuracy statistics.

    For every fold the estimator is fit on the training indices and
    evaluated on the held-out indices. The folds are generated by ``KFold``
    with ``shuffle = True`` and ``random_state = seed``.

    Parameters
    ----------
    x_data : indexable feature matrix
        Must support row selection with an integer index array
        (``x_data[train]``).
    y_data : indexable label vector
        Must support the same integer-array indexing as ``x_data``.
    n_splits : int, default 10
        Number of folds passed to ``KFold``.
    baseline_pct : float, default 69
        Reference accuracy in percent that the mean accuracy is compared
        against in the printed summary. Must be non-zero.
        NOTE(review): the printed text calls this "a random guess"; 69 is
        presumably the majority-class share of the labels -- confirm.
    estimator : object with ``fit`` and ``predict``, optional
        Model to cross-validate. Defaults to the notebook-level ``model``.

    Returns
    -------
    (cvscores, cms) : tuple of lists
        Per-fold accuracies and per-fold confusion matrices, in fold order.

    Notes
    -----
    The estimator is refit in place on every fold, so after this call it
    holds the fit made on the last fold's training subset, not any fit made
    before the call.
    """
    if estimator is None:
        estimator = model  # fall back to the notebook-level model

    kf = KFold(n_splits = n_splits, shuffle = True, random_state = seed)

    cvscores = []
    cms = []
    for train, val in kf.split(x_data, y_data):
        estimator.fit(x_data[train], y_data[train])

        # Predict once per fold and derive both the accuracy and the
        # confusion matrix from the same predictions.
        y_val = y_data[val]
        y_pred = estimator.predict(x_data[val])

        cvscores.append(float(np.mean(y_pred == y_val)))
        cms.append(confusion_matrix(y_val, y_pred))

    mean_accuracy = np.mean(cvscores)
    print(f'Mean accuracy is {100*mean_accuracy:.2f}% and 100*STD is {100*np.std(cvscores):.2f}%')
    print(f'This is a {100*(100*mean_accuracy-baseline_pct)/baseline_pct:.2f}% improvement over a random guess.')

    return cvscores, cms

Without normalization:

In [28]:
cvscores, cms = train_validate(X_sparse, y)

Mean accuracy is 73.59% and 100*STD is 1.16%
This is a 6.66% improvement over a random guess.


### Use tf-idf and then LSA in scikit-learn

In [None]:
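# new version, intended to also calculate an upper bound on accuracy (not implemented yet); so far it differs from the version above only in using 2 folds instead of 10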
def train_validate(x_data, y_data, n_splits = 2, baseline_pct = 69, estimator = None):
    """Run shuffled K-fold cross-validation and print accuracy statistics.

    This definition replaces any earlier ``train_validate`` in the notebook
    and defaults to 2 folds. For every fold the estimator is fit on the
    training indices and evaluated on the held-out indices. The folds are
    generated by ``KFold`` with ``shuffle = True`` and ``random_state = seed``.

    Parameters
    ----------
    x_data : indexable feature matrix
        Must support row selection with an integer index array
        (``x_data[train]``).
    y_data : indexable label vector
        Must support the same integer-array indexing as ``x_data``.
    n_splits : int, default 2
        Number of folds passed to ``KFold``.
    baseline_pct : float, default 69
        Reference accuracy in percent that the mean accuracy is compared
        against in the printed summary. Must be non-zero.
        NOTE(review): the printed text calls this "a random guess"; 69 is
        presumably the majority-class share of the labels -- confirm.
    estimator : object with ``fit`` and ``predict``, optional
        Model to cross-validate. Defaults to the notebook-level ``model``.

    Returns
    -------
    (cvscores, cms) : tuple of lists
        Per-fold accuracies and per-fold confusion matrices, in fold order.

    Notes
    -----
    The estimator is refit in place on every fold, so after this call it
    holds the fit made on the last fold's training subset, not any fit made
    before the call.
    """
    if estimator is None:
        estimator = model  # fall back to the notebook-level model

    kf = KFold(n_splits = n_splits, shuffle = True, random_state = seed)

    cvscores = []
    cms = []
    for train, val in kf.split(x_data, y_data):
        estimator.fit(x_data[train], y_data[train])

        # Predict once per fold and derive both the accuracy and the
        # confusion matrix from the same predictions.
        y_val = y_data[val]
        y_pred = estimator.predict(x_data[val])

        cvscores.append(float(np.mean(y_pred == y_val)))
        cms.append(confusion_matrix(y_val, y_pred))

    mean_accuracy = np.mean(cvscores)
    print(f'Mean accuracy is {100*mean_accuracy:.2f}% and 100*STD is {100*np.std(cvscores):.2f}%')
    print(f'This is a {100*(100*mean_accuracy-baseline_pct)/baseline_pct:.2f}% improvement over a random guess.')

    return cvscores, cms