This notebook builds a classifier that predicts an artist's gender from their genre labels using gradient-boosted trees. Transforming the genre lists to a sparse matrix representation, then applying TF-IDF and truncated SVD, appears to be very helpful.

- [ ] Feature Importance: ESL 10.13
- [ ] Create a further dimension reduction to 2D using most important features
- [ ] Create a further dimension reduction to 2D by recalculating truncated SVD
- [ ] Implement subset selection of features: https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/

In [1]:
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import genre_data_loader, genre_upperbound

# import matplotlib.pyplot as plt
# import seaborn as sns; sns.set()

# import re

# import os
# from gensim import corpora
# from gensim.corpora import MmCorpus
# from gensim.models import TfidfModel, LsiModel
# from gensim.matutils import corpus2dense

# import json

# Single random seed reused below (GradientBoostingClassifier random_state, KFold shuffling).
seed = 23

In [2]:
# Get the current date string that identifies the latest version of the data set.
# `now` is restored from IPython's %store database, so it must have been stored in an earlier session.
%store -r now

X_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now)
y_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now)
X_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_test_{}.csv'.format(now)
y_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_test_{}.csv'.format(now)

# Loader given both the train and the test files; it supplies the genre list and the
# genre <-> id dictionaries built further down.
genre_data = genre_data_loader.LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train,
                                                  X_path_test = X_path_test, y_path_test = y_path_test)
# Loader given the training files only.
genre_data_train = genre_data_loader.LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train)
# Loader given the test files only. NOTE: the test paths are passed through the *_train
# keyword arguments (presumably so the loader treats them as its only data set -- confirm
# in genre_data_loader).
genre_data_test = genre_data_loader.LoadGenreData(now, X_path_train = X_path_test, y_path_train = y_path_test)

# Load the frames with genre strings / lists / sets.
# NOTE(review): the as_* calls are made back to back and only the last result is kept.
# Judging by the column counts printed in the next cell (5, 3 and 4), each call presumably
# adds a column to the loader's underlying frame, so the calls are cumulative rather than
# redundant -- confirm in genre_data_loader before removing any of them.
data_train = genre_data_train.as_strings()
data_train = genre_data_train.as_lists()
data_train = genre_data_train.as_sets()
data_balanced = genre_data_train.get_balanced_sample()

data_test = genre_data_test.as_strings()

data = genre_data.as_strings()
data = genre_data.as_sets()

# create list of all genres
list_of_genres = genre_data.get_list_of_genres()

# Create a dictionary of {genre : genre_id} and its inverse {genre_id : genre}
dict_gid = genre_data.get_dict_genre_to_id()
dict_idg = genre_data.get_dict_id_to_genre()

In [3]:
data_train.shape, data_test.shape, data.shape

((12376, 5), (3094, 3), (15470, 4))

Now create a sparse data structure encoding of the genre labels:

In [4]:
def transform_to_sparse(data):
    """Encode genre strings as a sparse count matrix and genders as 0/1 targets.

    Input:
        data - DataFrame with a `genre_string` column (the text handed to the
               vectorizer) and a `gender` column

    Output:
        X - sparse count matrix (scipy.sparse.csr_matrix) whose columns follow
            the notebook-level `dict_gid` vocabulary
        y - numpy array with 1 where gender == 'female' and 0 otherwise
    """
    # NOTE: cannot use the .get_sparse_matrix() method of genre_data_loader here:
    # that would use a dict_gid built from the list of genres of the training
    # data only. A fixed vocabulary keeps the columns identical across data sets.
    vec = CountVectorizer(vocabulary = dict_gid)
    # Vectorize once; with a fixed vocabulary a second fit_transform on the
    # same column would only repeat the work.
    X = vec.fit_transform(data.genre_string)

    # Encode labels explicitly (instead of LabelEncoder) so the encoding is
    # under our control: female -> 1, everything else -> 0.
    y = (data.gender == 'female').astype('int64').values
    return X, y

In [5]:
# X_bal, y_bal = transform_to_sparse(data_balanced)
# X_train, y_train = transform_to_sparse(data_train)

### Introduce sparse -> tfidf -> truncated svd -> model.

In [6]:
def transform_to_svd(data, n_components = 100):
    """Encode genre strings as dense truncated-SVD features and genders as 0/1 targets.

    Pipeline: sparse genre counts -> TF-IDF weighting -> truncated SVD.

    Input:
        data - DataFrame with a `genre_string` column (the text handed to the
               vectorizer) and a `gender` column
        n_components - number of SVD components to keep (default 100)

    Output:
        X - dense array with one row per row of `data` and `n_components` columns
        y - numpy array with 1 where gender == 'female' and 0 otherwise

    NOTE: the TF-IDF and SVD transformers are fit on the `data` passed in, so
    matrices returned by separate calls live in different component spaces.
    """
    # NOTE: cannot use the .get_sparse_matrix() method of genre_data_loader here:
    # that would use a dict_gid built from the list of genres of the training
    # data only. A fixed vocabulary keeps the count columns identical across data sets.
    vec = CountVectorizer(vocabulary = dict_gid)
    # Vectorize once; a second fit_transform on the same column would only repeat the work.
    counts = vec.fit_transform(data.genre_string)

    weighted = TfidfTransformer().fit_transform(counts)

    # Seed the (randomized) SVD solver so the features are reproducible between runs.
    svd_transformer = TruncatedSVD(n_components = n_components, random_state = seed)
    svd_transformer.fit(weighted)
    X = svd_transformer.transform(weighted)

    # Encode labels explicitly so the encoding is under our control:
    # female -> 1, everything else -> 0.
    y = (data.gender == 'female').astype('int64').values
    return X, y

In [7]:
# Each call fits its own TF-IDF and SVD on the frame it is given, so the columns of
# X_train and X_bal are not the same components and should not be mixed.
X_train, y_train = transform_to_svd(data_train)
X_bal, y_bal = transform_to_svd(data_balanced)

Test alignment of data, X, y

From genre_data_loader.ipynb:

In [8]:
# def decode_sparse_list(X_sparse, row_number):
#     zeros, cols = X_sparse[row_number].nonzero()
#     cols_labels = [dict_idg[ind] for ind in cols]
#     cols_labels.sort()
#     return cols_labels

### Check alignment of data_train and (X_train, y_train)

In [32]:
# n = np.random.randint(data_balanced.shape[0])
# sorted(decode_sparse_list(X, n)), sorted(data_balanced.genre_list.iloc[n])
# n = np.random.randint(data_balanced.shape[0])
# y[n], data_balanced.gender.iloc[n]

### Normalization
Convert inputs to a numpy array and then create a scaler class to normalize the feature values that can be applied to training and test data.

In [33]:
# #scaler = preprocessing.StandardScaler(with_mean = False).fit(X_sparse) # need with_mean = False for sparse data
# # transformer
# transformer = preprocessing.MaxAbsScaler(copy = False).fit(X_train)
# transformer.scale_.max(), transformer.max_abs_.max()
# Apply the scaler to the training data:
# X_scaled = transformer.transform(X_sparse)

## Create the model: GBT

In [17]:
# Gradient-boosted trees classifier.
# `loss` is left at its default (the binomial deviance / log-loss): the explicit
# 'deviance' spelling was deprecated in scikit-learn 1.1 and removed in 1.3, so
# omitting it gives the same model on old and new versions.
# NOTE(review): learning_rate = 1.5 is far above the library default of 0.1 --
# worth revisiting when the grid search below is re-enabled.
model = GradientBoostingClassifier(n_estimators = 1000,
                                   learning_rate = 1.5,
                                   subsample = 1.0,
                                   max_depth = 4,
                                   min_samples_split = 2,
                                   random_state = seed)

### Naive train and score in-sample:

In [18]:
def report_in_sample(model, data, X, y):
    """Fit `model` on (X, y) and print a short in-sample accuracy report.

    Input:
        model - a model that has .fit and .score methods; it is re-fit here
        data - DataFrame with a `gender` column; also passed to
               genre_upperbound.UpperBound
        X - feature matrix
        y - targets

    Output:
        None. Prints the always-male baseline, the in-sample accuracy, the
        upper bound on accuracy, and how the accuracy compares to both.
    """
    mal = data[data.gender == 'male'].shape[0]
    fem = data[data.gender == 'female'].shape[0]
    p_mal = mal/(mal+fem)
    model.fit(X,y)
    score = model.score(X, y)
    print("_______________________________")
    print('Basic Report on In Sample Score')
    print(f'The baseline accuracy by always classifying male on the training set is {round(p_mal,2)}.')
    print(f'Accuracy on the training set is {round(score,3)}.')
    # Only the error rate (second return value) is needed here.
    _, err = genre_upperbound.UpperBound(data)
    print(f'Upper bound to accuracy on the training set is {round(1-err,3)}')
    print(f'Accuracy is {round(100*(score)/(1-err),1)}% of the upperbound.')
    # Format the percentage directly: 100*round(p_mal,2) can print float noise
    # such as 56.99999999999999.
    print(f'Accuracy is {round(100*(score-p_mal)/(1-err-p_mal),1)}% improvement over the baseline of {100*p_mal:.0f}%.')

In [20]:
# Each call re-fits `model` on the data it is given (report_in_sample calls model.fit),
# so after this cell `model` holds the fit on the full training set.
report_in_sample(model, data_balanced, X_bal, y_bal)
report_in_sample(model, data_train, X_train, y_train)

In [38]:
# Feature importances of the most recently fitted model. Assuming it was fit on
# transform_to_svd output (as in the cells above), each index is an SVD component, not a genre.
importances  = model.feature_importances_

In [49]:
# Table of importances, indexed by feature number and sorted with the largest first.
feat_import = pd.DataFrame({'importance':importances})
feat_import.index.name = 'feature'
# reset_index() is not called here: it returns a new frame instead of modifying
# feat_import, and the feature numbers are wanted as the index of the table.
feat_import = feat_import.sort_values('importance', ascending = False)
feat_import.head(10)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
0,0.513501
63,0.090733
54,0.058909
55,0.054423
24,0.052637
26,0.046937
3,0.034095
1,0.014711
7,0.010172
74,0.00536


In [43]:
feat_import

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
0,0.513501
1,0.014711
2,0.002034
3,0.034095
4,0.005066
...,...
95,0.000282
96,0.001545
97,0.000323
98,0.001570


### Grid Search with CV:

In [26]:
# # split into train and validate for the grid search
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = .3)


# tuning_parameters = [{'n_estimators': np.linspace(200, 2000, 2, dtype = 'int32').tolist(), 
#                       'learning_rate': np.linspace(.1, 2, 2).tolist()}]

# clf = GridSearchCV(GradientBoostingClassifier(), tuning_parameters) 

# clf.fit(X_train, y_train)

# # # scores on training folds
# # means = clf.cv_results_['mean_test_score']
# # std = clf.cv_results_['std_test_score']

In [None]:
# The GridSearchCV object in the cell above is named `clf` (there is no `grid_search`).
clf.best_params_

In [None]:
# X_val / y_val and clf come from the grid-search cell above, which is currently commented out.
y_true, y_pred = y_val, clf.predict(X_val)

In [None]:
# classification_report returns a formatted string; print it so the line breaks
# render as a table instead of showing up as '\n' in the cell output.
print(classification_report(y_true, y_pred))

### Cross validation.

In [61]:
# Scratch check: a vector of six ones sums to 6.0.
x = np.ones(shape = 6)
np.sum(x)

6.0

In [9]:
def train_validate(model, x_data, y_data, n_splits = 2):
    """Cross-validate a model with shuffled K-fold splits and print summary stats.

    Input:
        model - a model that has .fit and .score methods; it is re-fit on every fold
        x_data - feature matrix, indexable by an array of row indices
        y_data - numpy array of 0/1 targets aligned with x_data (1 = female)
        n_splits - number of folds

    Output:
        cvscores - list of validation accuracies (rounded to 3 decimals), one per fold
        vals - list of validation-set row-index arrays, one per fold
    It also prints basic stats.
    """

    kf = KFold(n_splits = n_splits, shuffle = True, random_state = seed)

    cvscores = []
    vals = []

    for train, val in kf.split(x_data, y_data):
        model.fit(x_data[train], y_data[train])

        score = model.score(x_data[val], y_data[val])
        cvscores.append(round(score,3))

        # keep the validation indices so callers can line the folds up with other data
        vals.append(val)

    # Percent male over the whole data set; it does not depend on the fold, so it
    # is computed once. Scale to percent BEFORE rounding: rounding the fraction to
    # one decimal would snap the figure to the nearest 10%.
    number_fem = y_data.sum()
    number_mal = y_data.shape[0]-number_fem
    percent_mal = round(100*number_mal/(number_fem+number_mal),1)

    print(f'Given that {percent_mal}% of the artists are male, a random guess would have an accuracy of {percent_mal}%.')
    print(f'For the model, the mean accuracy is {100*np.mean(cvscores):.2f}% and 100*STD is {100*np.std(cvscores):.2f}%')
    print(f'This is a {100*(100*np.mean(cvscores)-percent_mal)/percent_mal:.2f}% improvement over a random guess.')

    return cvscores, vals

In [12]:
# Fresh gradient-boosted trees classifier for 5-fold cross-validation on the balanced sample.
# `loss` is left at its default (the binomial deviance / log-loss): the explicit
# 'deviance' spelling was deprecated in scikit-learn 1.1 and removed in 1.3, so
# omitting it gives the same model on old and new versions.
model = GradientBoostingClassifier(n_estimators = 1000,
                                   learning_rate = 1.5,
                                   subsample = 1.0,
                                   max_depth = 4,
                                   min_samples_split = 2,
                                   random_state = seed)
cvscores, vals = train_validate(model, X_bal, y_bal, 5)

Given that 50.0% of the artists are male, a random guess would have an accuracy of 50.0%.
For the model, the mean accuracy is 66.10% and 100*STD is 0.78%
This is a 32.20% improvement over a random guess.


In [13]:
cvscores

[0.654, 0.654, 0.656, 0.671, 0.67]

Calculate upper bounds on accuracy for each validation set:

In [14]:
def upper_bounds(vals, data):
    """Return the accuracy upper bound (rounded to 3 decimals) of every validation set.

    Input:
        vals - iterable of positional row-index collections, one per validation set
        data - DataFrame the indices refer to (rows are picked with .iloc)

    Output:
        list with one upper bound (1 - error) per entry of vals
    """
    def _bound_for(indices):
        # UpperBound returns a pair; only the error rate is used here.
        _, err = genre_upperbound.UpperBound(data.iloc[indices])
        return round(1-err,3)

    return [_bound_for(indices) for indices in vals]

In [15]:
# `vals` are positional row indices into X_bal, which was built from data_balanced,
# so the same indices pick the matching validation rows of data_balanced.
uppers = upper_bounds(vals, data_balanced)

In [16]:
# Per-fold upper bound, per-fold CV accuracy, and the gap between them (upper - achieved).
print(uppers)
print(cvscores)
print(np.subtract(uppers, cvscores))

[0.886, 0.891, 0.882, 0.901, 0.886]
[0.654, 0.654, 0.656, 0.671, 0.67]
[0.232 0.237 0.226 0.23  0.216]


In [26]:
# check alignment of genders
# n = np.random.randint(y_val0.shape[0])
# print('y_val')
# print(y_val0[n])
# print('data_val')
# print(data_val0.gender.iloc[n])