This notebook builds a DataFrame of genre frequencies (female, male, and total) using a sparse matrix representation.


In [1]:
import genre_scripts.genre_data_loader as genre_data_loader
from genre_scripts.nested_subsets import NestedSubsets

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [2]:
# get currrent date for latest version of data set
%store -r now

X_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now)
y_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now)
X_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_test_{}.csv'.format(now)
y_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_test_{}.csv'.format(now)

In [3]:
# Build the loader for the data-set version stamped `now`, pointing it at the
# train/test CSV files defined above.
genre_data = genre_data_loader.LoadGenreData(
    now,
    X_path_train=X_path_train,
    y_path_train=y_path_train,
    X_path_test=X_path_test,
    y_path_test=y_path_test,
)

In [4]:
# Load the data frame (its `genre_list` column holds each row's genres) and
# the sparse matrix X returned by the loader; both are used throughout below.
data = genre_data.as_lists()
X = genre_data.get_sparse_X_vector()
# (disabled) create list of all genres:
# list_of_genres = genre_data.get_list_of_genres()

Get the proportions of female and male entries (as fractions between 0 and 1):

In [5]:
# Despite "percent" in the names, the recorded values (about 0.31 and 0.69)
# are fractions, not percentages.
percent_fem = genre_data.get_percent_female()
percent_mal = genre_data.get_percent_male()

In [6]:
# Display both shares as a tuple (female, male); in the recorded output they sum to 1.
percent_fem, percent_mal

(0.3138332255979315, 0.6861667744020685)

Check that each row of the sparse matrix `X` decodes to the genre list in the same row of `data`:

In [7]:
def unit_test_sparse_encoding_batch(loadgenredata_instance):
    """Check that every sparse row decodes to the genre list of the same row.

    Each row of the sparse matrix is decoded through the id -> genre
    dictionary and compared, as a set, with the ``genre_list`` entry at the
    same position in the data frame.

    Parameters
    ----------
    loadgenredata_instance : object
        Must provide ``get_sparse_X_vector()``, ``as_lists()`` (a frame with
        a ``genre_list`` column) and ``get_dict_id_to_genre()``.

    Returns
    -------
    tuple of (int, list of int)
        The number of mismatching rows and their positional row numbers.
    """
    X_sparse = loadgenredata_instance.get_sparse_X_vector()
    data = loadgenredata_instance.as_lists()
    dictidg = loadgenredata_instance.get_dict_id_to_genre()

    row_errors = []
    # Walk the column once instead of building a full row Series with
    # data.iloc[num] on every iteration.
    for num, genre_list in enumerate(data.genre_list):
        # nonzero() returns (row indices, column indices); only the columns
        # identify genres.
        _, cols = X_sparse[num].nonzero()
        # The comparison is order-insensitive, so a set is built directly
        # and no sorting is needed.
        decoded = {dictidg[ind] for ind in cols}
        if decoded != set(genre_list):
            row_errors.append(num)
    return len(row_errors), row_errors

In [8]:
# Expect (0, []): zero mismatching rows and an empty list of offending row numbers.
unit_test_sparse_encoding_batch(genre_data)

(0, [])

In [30]:
# Sum the sparse matrix down its rows: one total per column (genre id).
freq = X.sum(axis=0)

Let's check that this is the correct calculation:

In [70]:
# Lookup tables in both directions: genre id -> genre name, and genre name -> genre id.
dictidg = genre_data.get_dict_id_to_genre()
dictgid = genre_data.get_dict_genre_to_id()

The calculation from the sparse matrix:

In [71]:
# freq is a single-row matrix, so pick row 0 and the columns for 'pop' and 'rock'.
freq[0,[dictgid['pop'], dictgid['rock']]]

matrix([[3279, 2208]], dtype=int64)

In [72]:
# Turn the single-row totals into a one-column frame indexed by genre id.
genre_frequency = pd.DataFrame(freq).T
genre_frequency.columns = ['frequency']
genre_frequency.index.name = 'genre'

In [73]:
# Peek at the first rows; the index still holds integer genre ids at this point.
genre_frequency.head()

Unnamed: 0_level_0,frequency
genre,Unnamed: 1_level_1
0,1
1,2
2,2
3,6
4,19


In [74]:
# Same 'pop' and 'rock' lookup as on the raw matrix, now through the data frame.
genre_frequency.loc[[dictgid['pop'], dictgid['rock']]]

Unnamed: 0_level_0,frequency
genre,Unnamed: 1_level_1
1044,3279
1167,2208


In [75]:
# Replace the integer genre ids in the index with genre names.
# rename() returns a new frame and keeps the index name. Labels that are not
# keys of dictidg are left unchanged, so re-running this cell is a no-op
# rather than a KeyError on the already-translated names.
genre_frequency = genre_frequency.rename(index=dictidg)
genre_frequency

Unnamed: 0_level_0,frequency
genre,Unnamed: 1_level_1
1960s,1
2_step,2
2_step_garage,2
2_tone,6
a_cappella,19
...,...
yé_yé,6
zamba,1
zouk,6
zumba,1


The calculation from the dataframe:

In [None]:
# Flatten the per-row genre lists into one long list of genre labels.
genre_list_1 = [genre for genres in data.genre_list.values.tolist() for genre in genres]
genre_counts = pd.Series(genre_list_1, name = 'frequency')
# Name the counts explicitly: pandas >= 2.0 calls the value_counts() result
# 'count', which would otherwise become the column name instead of 'frequency'.
genre_stats = genre_counts.value_counts().rename('frequency').to_frame()
genre_stats.index.name = 'label'

In [76]:
# Spot-check one genre against the sparse-matrix table above (zouk: 6 in both).
genre_stats.loc['zouk']

frequency    6
Name: zouk, dtype: int64

Now the frequencies for each gender: