# Switch terminology to lower bound on error!

- [ ] introduce tests to make sure all versions of data (sparse, DF, etc) align

This nb builds a classifier to predict gender from genre using a random forest model.

We look at the following encoding/embeddings:
- [ ] BOW
- [ ] TFIDF
- [ ] LSI
- [ ] LDA
- [ ] Word2Vec

In [1]:
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

import genre_data_loader, genre_upperbound

# import matplotlib.pyplot as plt
# import seaborn as sns; sns.set()

# import re

# import os
# from gensim import corpora
# from gensim.corpora import MmCorpus
# from gensim.models import TfidfModel, LsiModel
# from gensim.matutils import corpus2dense

# import json

seed = 23

In [2]:
# get currrent date for latest version of data set
%store -r now

X_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now)
y_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now)
X_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_test_{}.csv'.format(now)
y_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_test_{}.csv'.format(now)

# call data loader script for training data
genre_data = genre_data_loader.LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train,
                                                  X_path_test = X_path_test, y_path_test = y_path_test)
# call it for train data
genre_data_train = genre_data_loader.LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train)
# call it for test data
genre_data_test = genre_data_loader.LoadGenreData(now, X_path_train = X_path_test, y_path_train = y_path_test)

# load data with genre sets
data_train = genre_data_train.as_strings()
data_train = genre_data_train.as_lists()
data_train = genre_data_train.as_sets()
data_test = genre_data_test.as_strings()
data = genre_data.as_strings()
data = genre_data.as_sets()

# create list of all genres
list_of_genres = genre_data.get_list_of_genres()

# Create a dictionary of {genre : genre_id}
dict_gid = genre_data.get_dict_genre_to_id()
dict_idg = genre_data.get_dict_id_to_genre()

In [3]:
data_train.shape, data_test.shape, data.shape

((12376, 5), (3094, 3), (15470, 4))

Now create a sparse data structure encoding of the genre labels:

In [4]:
vec = CountVectorizer(vocabulary = dict_gid) # uses scipy.sparse.csr_matrix representation
# create sparse matrix of training features NOTE: cannot use .get_sparse_matrix() method of genre_data_loader:
# that will use a dict_gid built from the list of genres only from training data 
X_train = vec.fit_transform(data_train.genre_string)

# Encode labels:
# le = preprocessing.LabelEncoder()
# le.fit(['male', 'female'])
# #le.classes_
# y_train = le.transform(data_train.gender.values)
# # le.transform(['female'])
# # le.inverse_transform([1,0,1])

# Encode Labels with UDF so can control encoding:
def encode_targets(row):
    if row.gender == 'female':
        return 1
    else:
        return 0
    
y_train = data_train.apply(encode_targets, axis = 1)
y_train = y_train.values

Test alignment of data, X, y

From genre_data_loader.ipynb:

In [5]:
def decode_sparse_list(X_sparse, row_number):
    zeros, cols = X_sparse[row_number].nonzero()
    cols_labels = [dict_idg[ind] for ind in cols]
    cols_labels.sort()
    return cols_labels

### Check alignment of data_train and (X_train, y_train)

In [6]:
n = np.random.randint(data_train.shape[0])
sorted(decode_sparse_list(X_train, n)), sorted(data_train.genre_list.iloc[n])

(['blues', 'jazz', 'rock', 'smooth_jazz'],
 ['blues', 'jazz', 'rock', 'smooth_jazz'])

In [7]:
n = np.random.randint(data_train.shape[0])
y_train[n], data_train.gender.iloc[n]

(0, 'male')

### Normalization
Convert inputs to a numpy array and then create a scaler class to normalize the feature values that can be applied to training and test data.

In [8]:
# #scaler = preprocessing.StandardScaler(with_mean = False).fit(X_sparse) # need with_mean = False for sparse data
# # transformer
# transformer = preprocessing.MaxAbsScaler(copy = False).fit(X_train)
# transformer.scale_.max(), transformer.max_abs_.max()
# Apply the scaler to the training data:
# X_scaled = transformer.transform(X_sparse)

## Ceate the model: GBT

In [9]:
model = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, random_state = seed)

In [10]:
model.fit(X_train,y_train)
print(f'Accuracy on the training set is {model.score(X_train, y_train)}.')
uppers, err = genre_upperbound.UpperBound(data_train)
print(f'Upper bound to accuracy on the training set is {1-err}')

Accuracy on the training set is 0.7385261797026503.
Upper bound to accuracy on the training set is 0.734762


# Caution!
The upper bound is lower than the accuracy of the model -><-

- [x] calculate score from model.predict
- [x] Create new columns in upper with model_predictions
- [ ] Create new columns in data_train with:
    - [x] model_prediction
    - [ ] upperbound prediction

Then compare.



In [11]:
model_predictions = model.predict(X_train)

In [12]:
# create column with encoded genders
data_train['gender_codes'] = data_train.gender.apply(lambda x: 0 if x == 'male' else 1)

In [13]:
model_error = np.sum(data_train.gender_codes.values != model_predictions)
(data_train.shape[0]-model_error)/data_train.shape[0]

0.7385261797026503

In [14]:
#convert uppers genre into string

def set_to_string(row):
    """This function takes in a set and produces a string."""
    L_new = list(row.genre_set_encoded)
    L_string = " ".join(L_new)
    return L_string

uppers['genre_string'] = uppers.apply(set_to_string, axis = 1)

# apply vec.fit_transform to create sparse matrix 
uppers_encoded_sets = vec.fit_transform(uppers.genre_string.values)

# apply model.predict to uppers_encoded_sets
uppers_model_predictions = model.predict(uppers_encoded_sets)
# put predictions of model into new column in uppers
uppers_model_preds = pd.DataFrame({'model_predictions':uppers_model_predictions}, index = uppers.index)

# join to uppers
uppers = uppers.join(uppers_model_preds)

# calculate model error on each set
def model_error(row):
    if row.model_predictions == 0:
        return row.female
    else:
        return row.male
    
uppers['model_errors'] = uppers.apply(model_error, axis = 1)

In [16]:
discrepency = uppers[uppers.classifier != uppers.model_predictions]

In [17]:
discrepency.shape

(1492, 9)

In [18]:
discrepency.columns

Index(['total', 'female', 'male', 'genre_set_encoded', 'classifier',
       'error_bound', 'genre_string', 'model_predictions', 'model_errors'],
      dtype='object')

In [19]:
discrepency[['total', 'female', 'male', 
         'genre_string', 'classifier',
       'model_predictions', 'model_errors', 'error_bound']]

Unnamed: 0_level_0,total,female,male,genre_string,classifier,model_predictions,model_errors,error_bound
set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7,1,0,1,jazz irish_traditional folk celtic,0,1,1,0
10,4,2,2,pop_rock new_wave,1,0,2,2
12,21,12,9,soul,1,0,12,9
16,418,215,203,pop,1,0,215,203
28,1,1,0,baroque_pop alternative_rock,1,0,1,0
...,...,...,...,...,...,...,...,...
6225,1,1,0,rock hard_rock pop_rock folk_rock,1,0,1,0
6229,1,0,1,blues_rock soul pop gospel a_cappella rock_sta...,0,1,1,0
6238,1,1,0,rock childrens pop,1,0,1,0
6244,1,1,0,future_pop hip_hop electro_pop pop_rap,1,0,1,0


In [23]:
uppers[uppers.model_errors < uppers.error_bound]

Unnamed: 0_level_0,total,female,male,genre_set_encoded,classifier,error_bound,genre_string,model_predictions,model_errors
set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


Putting predictions in data_train:

In [20]:
model_preds = pd.DataFrame({'model_predictions':model_predictions}, index = data_train.index)

In [23]:
#data_train.drop(['model_predictions'], axis = 1, inplace = True)
data_train = data_train.join(model_preds)

In [24]:
def code_to_gender(row):
    return le.transform([row.gender])[0]

data_train['gender_code'] = data_train.apply(code_to_gender, axis = 1)

In [27]:
data_train.head()

Unnamed: 0_level_0,genrelist_length,gender,genre_string,genre_list,genre_set,model_predictions,gender_code
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Pablo_Holman,3,male,rock emo_pop pop,"[rock, emo_pop, pop]","{rock, emo_pop, pop}",1,1
Bobby_Edwards,1,male,country,[country],{country},1,1
La_Palabra,4,male,afro_cuban_jazz son_montuno salsa_romántica gu...,"[afro_cuban_jazz, son_montuno, salsa_romántica...","{afro_cuban_jazz, son_montuno, salsa_romántica...",1,1
Sherrick,2,male,soul r_and_b,"[soul, r_and_b]","{soul, r_and_b}",1,1
Allen_Collins,1,male,southern_rock,[southern_rock],{southern_rock},1,1


### Cross validation.

In [66]:
def train_validate(x_data, y_data, data):
    """This function takes X,y, data and returns
    Input:
        X - sparse matrix representing features: genre labels
        y - list of genders
        data - DataFrame of the full data set used for the upperbound calculation
    Output:
        cvscores - list of cvscores, 
        cms - list of confusion matrices, 
        vals - list of validation set indices. 
    It also prints basic stats.
    """

    kf = KFold(n_splits = 5, shuffle = True, random_state = seed)

    cvscores = []
    cms = []
    vals = []
    
    for train, val in kf.split(x_data,y_data):
        X_train = x_data[train]
        y_train = y_data[train]

        model.fit(X_train, y_train);

        X_val = x_data[val]
        y_val = y_data[val]

        score = model.score(X_val, y_val)
        cvscores.append(round(score,3))

        # compute confusion matrices and store them in a list
        y_pred = model.predict(X_val)
        cms.append(confusion_matrix(y_val, y_pred))
        
        vals.append(val)

    print(f'Mean accuracy is {100*np.mean(cvscores):.2f}% and 100*STD is {100*np.std(cvscores):.2f}%')
    print(f'This is a {100*(100*np.mean(cvscores)-69)/69:.2f}% improvement over a random guess.')
        
        
    #data_val = data.iloc[val]
    #upper = genre_upperbound.UpperBound()
        
    return cvscores, cms, vals

In [27]:
cvscores, cms, vals = train_validate(X_train, y_train, data_train)

Calculate upper bounds on accuracy for each validation set:

In [None]:
def upper_bounds(vals, data):
    """Create a list of the upper bounds on accuracy for each validation set."""
    uppers = []
    for val in vals:
        data_val = data.iloc[val] 
        upper, error = genre_upperbound.UpperBound(data_val)
        uppers.append(round(1-error,3))
    return uppers

In [None]:
uppers = upper_bounds(vals, data_train)

In [None]:
print(uppers)
print(cvscores)
print(np.array(uppers)-np.array(cvscores))

In [28]:
# check alignment of genders
# n = np.random.randint(y_val0.shape[0])
# print('y_val')
# print(y_val0[n])
# print('data_val')
# print(data_val0.gender.iloc[n])

In [60]:
# convert uppers gender code to same as used in X,y
def convert_gender_code(row):
    code = row.classifier
    if code == 0:
        gender = 'male'
    else: 
        gender = 'female'
    code = le.transform([gender])[0]
    return code

### Use tf-idf and then LSA in scikit learn