# Switch terminology to lower bound on error!

- [ ] introduce tests to make sure all versions of data (sparse, DF, etc) align

This notebook builds a classifier to predict gender from genre using a gradient boosted trees (GBT) model.

We look at the following encoding/embeddings:
- [ ] BOW
- [ ] TFIDF
- [ ] LSI
- [ ] LDA
- [ ] Word2Vec

In [1]:
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

import genre_data_loader, genre_upperbound

# import matplotlib.pyplot as plt
# import seaborn as sns; sns.set()

# import re

# import os
# from gensim import corpora
# from gensim.corpora import MmCorpus
# from gensim.models import TfidfModel, LsiModel
# from gensim.matutils import corpus2dense

# import json

seed = 23  # shared random_state for the classifier and the KFold splitter

In [2]:
# get currrent date for latest version of data set
%store -r now

X_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now)
y_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now)
X_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_test_{}.csv'.format(now)
y_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_test_{}.csv'.format(now)

In [3]:
# combined loader: receives both the train and the test files
genre_data = genre_data_loader.LoadGenreData(
    now,
    X_path_train = X_path_train, y_path_train = y_path_train,
    X_path_test = X_path_test, y_path_test = y_path_test,
)
# train-only loader
genre_data_train = genre_data_loader.LoadGenreData(
    now, X_path_train = X_path_train, y_path_train = y_path_train
)
# test-only loader: the test files are handed over through the *_train keyword arguments
genre_data_test = genre_data_loader.LoadGenreData(
    now, X_path_train = X_path_test, y_path_train = y_path_test
)

In [4]:
# load data with genre sets
# NOTE(review): data_train and data are each assigned twice in a row, so only the second
# result is kept. The first as_strings() call is presumably still needed for a side effect:
# data_train.genre_string is read later even though data_train comes from as_lists() --
# confirm against genre_data_loader before removing either call.
data_train = genre_data_train.as_strings()
data_train = genre_data_train.as_lists()
data_test = genre_data_test.as_strings()
data = genre_data.as_strings()
data = genre_data.as_sets()

# create list of all genres (taken from the combined loader)
list_of_genres = genre_data.get_list_of_genres()

In [5]:
# sanity check: the train and test row counts should add up to the combined row count
data_train.shape, data_test.shape, data.shape

((12376, 4), (3094, 3), (15470, 4))

Don't randomize yet!

In [6]:
# data = data.sample(frac = 1, random_state = 13)
# data_train = data_train.sample(frac = 1, random_state = 13)
# data_test = data_test.sample(frac = 1, random_state = 13)

In [7]:
# data_train.reset_index(inplace = True)
# data_train.index.name = 'artist_id'

- [ ] should we put the dict in order of frequency?

Create a dictionary of {genre : genre_id}

In [8]:
# both lookups come from the combined loader (genre_data), not the train-only one
dict_gid = genre_data.get_dict_genre_to_id()  # genre -> genre_id; used as the CountVectorizer vocabulary
dict_idg = genre_data.get_dict_id_to_genre()  # genre_id -> genre; used to decode sparse column indices

Now create a sparse data structure encoding of the genre labels:

In [15]:
# Sparse bag-of-genres features (scipy.sparse.csr_matrix representation).
# The vocabulary is pinned to dict_gid, which was built by the combined loader.
# NOTE: cannot use the .get_sparse_matrix() method of genre_data_loader here:
# that will use a dict_gid built from the list of genres only from training data
vec = CountVectorizer(vocabulary = dict_gid)
X_train = vec.fit_transform(data_train.genre_string)

# Encode labels as integers.
# Usage examples: le.transform(['female']), le.inverse_transform([1, 0, 1])
le = preprocessing.LabelEncoder()
le.fit(['male', 'female'])
y_train = le.transform(data_train.gender.values)

Test alignment of data, X, y

From genre_data_loader.ipynb:

In [17]:
def decode_sparse_list(X_sparse, row_number, id_to_genre = None):
    """Return the sorted genre labels of the nonzero columns in one row of a sparse matrix.

    Input:
        X_sparse - sparse matrix whose columns are genre ids
        row_number - index of the row to decode
        id_to_genre - optional mapping {column index : genre label}; when omitted,
            the notebook-level dict_idg is used
    Output:
        alphabetically sorted list of genre labels present in the row
    """
    if id_to_genre is None:
        # fall back to the notebook-level lookup so existing two-argument calls keep working
        id_to_genre = dict_idg
    # nonzero() returns (row indices, column indices); only the columns matter for a single row
    _, cols = X_sparse[row_number].nonzero()
    return sorted(id_to_genre[ind] for ind in cols)

In [22]:
# spot-check a random training row: genres decoded from the sparse matrix vs. the genre_list column
n = np.random.randint(data_train.shape[0])
decoded_genres = sorted(decode_sparse_list(X_train, n))
listed_genres = sorted(data_train.genre_list.iloc[n])
decoded_genres, listed_genres

(['country', 'pop', 'southern_gospel'], ['country', 'pop', 'southern_gospel'])

In [38]:
# spot-check that the encoded label lines up with the gender column for a random training row
n = np.random.randint(data_train.shape[0])
y_train[n], data_train.gender.iloc[n]

(0, 'female')

### Normalization
Convert inputs to a numpy array and then create a scaler class to normalize the feature values that can be applied to training and test data.

In [15]:
# #scaler = preprocessing.StandardScaler(with_mean = False).fit(X_sparse) # need with_mean = False for sparse data
# # transformer
# transformer = preprocessing.MaxAbsScaler(copy = False).fit(X_train)
# transformer.scale_.max(), transformer.max_abs_.max()
# Apply the scaler to the training data:
# X_scaled = transformer.transform(X_sparse)

## Create the model: GBT

In [64]:
# gradient boosted trees; random_state is pinned to the notebook seed
model = GradientBoostingClassifier(
    n_estimators = 1000,
    learning_rate = 0.05,
    random_state = seed,
)

In [65]:
# fit on the full training split and report accuracy on that same data
# (training accuracy, not a held-out estimate)
model.fit(X_train,y_train)
model.score(X_train, y_train)

0.7601001939237233

### Cross validation.

In [66]:
def train_validate(x_data, y_data, data, estimator = None, baseline_pct = 69):
    """Run 5-fold cross-validation and collect the per-fold results.

    Input:
        x_data - sparse matrix representing features: genre labels
            (rows are selected with the index arrays produced by KFold)
        y_data - array of encoded genders, aligned row-for-row with x_data
        data - DataFrame of the data set; currently unused in this function
            (kept so existing calls keep working)
        estimator - classifier exposing fit/predict; defaults to the notebook-level
            `model`. It is refit in place on every fold.
        baseline_pct - reference accuracy in percent used for the printed
            relative improvement (default 69)
    Output:
        cvscores - list of validation accuracies, each rounded to 3 decimals,
        cms - list of confusion matrices (one per fold, same label order in each),
        vals - list of validation set indices.
    It also prints basic stats.
    """
    # fall back to the notebook-level classifier so existing three-argument calls keep working
    clf = model if estimator is None else estimator

    kf = KFold(n_splits = 5, shuffle = True, random_state = seed)

    # fix the label order once so every fold's confusion matrix has the same shape,
    # even if a class happens to be missing from one validation fold
    labels = np.unique(y_data)

    cvscores = []
    cms = []
    vals = []

    for train, val in kf.split(x_data, y_data):
        clf.fit(x_data[train], y_data[train])

        # predict once per fold and derive both the accuracy and the confusion matrix from it
        y_val = y_data[val]
        y_pred = clf.predict(x_data[val])

        score = float(np.mean(y_pred == y_val))
        cvscores.append(round(score,3))

        # compute confusion matrices and store them in a list
        cms.append(confusion_matrix(y_val, y_pred, labels = labels))

        vals.append(val)

    print(f'Mean accuracy is {100*np.mean(cvscores):.2f}% and 100*STD is {100*np.std(cvscores):.2f}%')
    print(f'This is a {100*(100*np.mean(cvscores)-baseline_pct)/baseline_pct:.2f}% improvement over a random guess.')

    return cvscores, cms, vals

In [67]:
# 5-fold cross-validation on the training split; the model is refit on every fold
cvscores, cms, vals = train_validate(X_train, y_train, data_train)

KeyboardInterrupt: 

Calculate upper bounds on accuracy for each validation set:

In [None]:
def upper_bounds(vals, data):
    """Create a list of the upper bounds on accuracy, one per validation set.

    Input:
        vals - list of positional index arrays, one per validation set
        data - DataFrame from which the validation rows are selected with .iloc
    Output:
        list of (1 - error) values rounded to 3 decimals, where error is the second
        item returned by genre_upperbound.UpperBound for that validation set
    """
    def _bound_for(indices):
        # UpperBound returns a pair; only its second item (the error) is used here
        _, error = genre_upperbound.UpperBound(data.iloc[indices])
        return round(1-error,3)

    return [_bound_for(indices) for indices in vals]

In [None]:
# NOTE(review): vals are positional indices from the KFold split of X_train / data_train,
# but they are applied here to `data` (the combined frame) -- confirm that the leading rows
# of `data` are the training rows in the same order, otherwise the bounds describe different rows.
uppers = upper_bounds(vals, data)

In [None]:
# per-fold upper bounds, per-fold CV accuracies, then the gap between the two
print(uppers)
print(cvscores)
gap = np.array(uppers) - np.array(cvscores)
print(gap)

In [28]:
# check alignment of genders
# n = np.random.randint(y_val0.shape[0])
# print('y_val')
# print(y_val0[n])
# print('data_val')
# print(data_val0.gender.iloc[n])

### Use tf-idf and then LSA in scikit learn