# Data Loader Class

This notebook develops a class for loading the genre data, with methods to put the genre info into different data types:
- [x] lists
- [x] sets
- [x] strings
- [x] scipy sparse vector
- [ ] tensorflow sparse tensor

In [1]:
# import numpy as np
# import pandas as pd

# import matplotlib.pyplot as plt
# import seaborn as sns; sns.set()

# import re

In [7]:
# Scratch check: sorting a list of strings yields alphabetical order.
mylist = sorted(['a', 'c', 'b'])
mylist

['a', 'b', 'c']

In [204]:
# imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import string

def remove_punctuation_from_word(word):
    """Return ``word`` with every '!' character removed.

    Parameters
    ----------
    word : str
        The text to clean.

    Returns
    -------
    str
        ``word`` without any '!' characters.
    """
    # A translation table whose third argument lists the characters to delete.
    table = str.maketrans('', '', '!')
    # Return the result (instead of only printing it) so callers can use it.
    return word.translate(table)

# get current date for latest version of data set
%store -r now

# Functions needed

def to_strings(string):
    """Flatten a stringified genre list into a plain space-separated string.

    Takes a string of the form appearing in the genrelist column of the
    dataframe, e.g. "['rock', 'pop']", strips the surrounding square
    brackets and deletes every single quote, comma and "!" character.

    Parameters
    ----------
    string : str
        Stringified list of genre labels.

    Returns
    -------
    str
        The labels separated by the whitespace that was already present
        between them, e.g. "rock pop".
    """
    # Delete quotes, commas and "!" in a single pass. Stripping "!" only
    # from the ends of the whole string would leave it on every label that
    # is not first or last in the list.
    unwanted = str.maketrans('', '', "',!")
    return string.strip("[").strip("]").translate(unwanted)

def to_sets(string):
    """Convert a stringified genre list into a set of genre labels.

    The input has the form appearing in the genrelist column of the
    dataframe, e.g. "['rock', 'hip hop']". Square brackets and quotes are
    stripped, the text is split on commas, spaces inside a label become
    underscores, and leading/trailing underscores and "!" are removed.
    Empty labels are discarded.

    Returns a set of strings where each string is a genre label.
    """
    inner = string.strip("[").strip("]").replace("'", "")
    cleaned = (
        piece.replace(" ", "_").lstrip("_").rstrip("_").strip("!")
        for piece in inner.split(',')
    )
    # Empty pieces arise from an empty list or stray commas; skip them.
    return {label for label in cleaned if label}


def to_lists(string):
    """Convert a stringified genre list into a list of genre labels.

    The input has the form appearing in the genrelist column of the
    dataframe, e.g. "['rock', 'hip hop']". Square brackets and quotes are
    stripped, the text is split on commas, spaces inside a label become
    underscores, and leading/trailing underscores and "!" are removed.
    Empty labels are discarded; order and duplicates are preserved.

    Returns a list of strings where each string is a genre label.
    """
    inner = string.strip("[").strip("]").replace("'", "")
    cleaned = (
        piece.replace(" ", "_").lstrip("_").rstrip("_").strip("!")
        for piece in inner.split(',')
    )
    # Empty pieces arise from an empty list or stray commas; skip them.
    return [label for label in cleaned if label]

# # For sparse vector rep of X

# genre_list = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now))
# genre_list.drop(['Unnamed: 0'], axis = 1, inplace = True)
# genre_list['genre_id'] = list(range(1,genre_list.shape[0]+1))

# #Size of the vocab:
# vocab_size = genre_list.shape[0]

# #Create a dictionary {genre_label: genre_id}
# temp = genre_list.set_index(['genre_list'])
# label_id_dict = temp['genre_id'].to_dict()

# # set genre_id to index
# genre_list.set_index(['genre_id'], inplace = True)

# #Find max length of genre lists:
# max_list_length = data.genrelist_length.max()


# Locations of the training CSVs; the placeholder is filled with `now`,
# the data-set version restored from the IPython store.
_X_PATH_TEMPLATE = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'
_Y_PATH_TEMPLATE = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'
X_path = _X_PATH_TEMPLATE.format(now)
y_path = _Y_PATH_TEMPLATE.format(now)


class LoadGenreData:
    """Load and preprocess the genre label data.

    The X and y CSV files are read with 'artist' as the index and
    inner-joined into one data frame, available as the attribute ``data``.
    The raw genre labels are read from the 'genrelist' column; the ``as_*``
    methods return copies of ``data`` in which that column is replaced by a
    converted representation, without modifying ``data`` itself.

    NOTE: "!" are removed from genre labels. This affects "oi!" and "cuidado!"
    """

    def __init__(self, X_path, y_path, date):
        """Read the two CSV files and join them on 'artist'.

        Parameters
        ----------
        X_path : path of the CSV holding the features (with 'genrelist').
        y_path : path of the CSV holding the targets.
        date : version identifier of the data set; stored as ``self.date``.
        """
        self.X_path = X_path
        self.y_path = y_path
        self.date = date

        # import from CSV
        self.X = pd.read_csv(X_path, index_col = ['artist'])
        self.y = pd.read_csv(y_path, index_col = ['artist'])

        # assemble X,y into DF
        # NOTE: this attribute is the public way to reach the joined frame.
        # A method named ``data`` would be shadowed by it on every instance,
        # so none is defined.
        self.data = self.X.join(self.y, how = 'inner', on = 'artist')

    def _with_converted_genres(self, new_column, converter):
        """Return a copy of ``data`` with 'genrelist' replaced by ``new_column``.

        ``converter`` is applied to each 'genrelist' entry. Working on a
        copy keeps ``self.data`` unchanged, so the result does not depend on
        which other ``as_*`` methods were called before.
        """
        converted = self.data['genrelist'].apply(converter)
        return self.data.drop(columns=['genrelist']).assign(**{new_column: converted})

    def as_sets(self):
        """Return a copy of data with genre labels in a set for each artist
        (column 'genre_set'); the 'genrelist' column is not included."""
        return self._with_converted_genres('genre_set', to_sets)

    def as_lists(self):
        """Return a copy of data with genre labels in a list for each artist
        (column 'genre_list'); the 'genrelist' column is not included."""
        return self._with_converted_genres('genre_list', to_lists)

    def as_strings(self):
        """Return a copy of data with genre labels as one string for each
        artist (column 'genre_string'); the 'genrelist' column is not included."""
        return self._with_converted_genres('genre_string', to_strings)

    def get_list_of_genres(self):
        """Returns a sorted list of the distinct genres in X.

        The result is also stored as ``self.list_of_genres``.
        """
        # Use a temporary Series so that self.X is left untouched.
        labels_per_artist = self.X['genrelist'].apply(to_lists)
        distinct = {label for labels in labels_per_artist for label in labels}
        self.list_of_genres = sorted(distinct)
        return self.list_of_genres

    def get_sparse_X_vector(self):
        """Return X as a sparse matrix with a nonzero entry at (row, id) if the
        artist in that row has the label with that id.

        The matrix is also stored as ``self.X_sparse``; the intermediate
        frame and string column are stored as ``self.data_genre_strings``
        and ``self.X_genre_string``.

        Notes on sparse matrix commands:
        To get the number of nonzero entries: X_sparse.nnz
        To get the nonzero entries of a row: X_sparse[n:m].nonzero() -- returns list of rows and columns with nonzero entries
        """
        dict_genre_to_id = self.get_dict_genre_to_id()
        # A fixed vocabulary pins each label to its column id.
        vec = CountVectorizer(vocabulary = dict_genre_to_id) # uses scipy.sparse.csr_matrix representation
        self.data_genre_strings = self.as_strings()
        self.X_genre_string = self.data_genre_strings['genre_string']
        self.X_sparse = vec.fit_transform(self.X_genre_string)
        return self.X_sparse

    def get_dict_genre_to_id(self):
        """Return dictionary of the form {'label':id_number}, where the id is
        the label's position in the sorted list of genres.
        """
        genres = self.get_list_of_genres()
        return {label: genre_id for genre_id, label in enumerate(genres)}

    def get_dict_id_to_genre(self):
        """Return dictionary of the form {id_number:'label'}, where the id is
        the label's position in the sorted list of genres.
        """
        return dict(enumerate(self.get_list_of_genres()))

In [205]:
# Instantiate the loader on the training CSVs for the data-set version `now`.
genre_data = LoadGenreData(X_path,y_path,now)

In [206]:
# Lookup tables between genre labels and their integer ids:
# dictgid maps label -> id, dictidg maps id -> label.
dictgid = genre_data.get_dict_genre_to_id()
dictidg = genre_data.get_dict_id_to_genre()

Create unit tests for the sparse encoding:

In [207]:
def unit_test_sparse_encoding(loadgenredata_instance, row_number):
    """Check one row of the sparse encoding against the data frame.

    The nonzero columns of row ``row_number`` are decoded back to genre
    labels with the id -> label dictionary and compared, as a set, with
    the 'genre_list' entry of the same row.

    Returns the string 'True' when they agree; otherwise prints 'False'
    and returns the sorted list of decoded labels.
    """
    sparse_matrix = loadgenredata_instance.get_sparse_X_vector()
    frame = loadgenredata_instance.as_lists()
    id_to_label = loadgenredata_instance.get_dict_id_to_genre()

    # nonzero() yields (row indices, column indices); only columns matter.
    _, col_ids = sparse_matrix[row_number].nonzero()
    decoded = sorted(id_to_label[col] for col in col_ids)
    expected = frame.iloc[row_number].genre_list

    if set(decoded) == set(expected):
        return 'True'
    print('False')
    return decoded

In [208]:
def unit_test_sparse_encoding_batch(loadgenredata_instance):
    """Check every row of the sparse encoding against the data frame.

    For each row, the nonzero columns are decoded back to genre labels
    with the id -> label dictionary and compared, as a set, with the
    'genre_list' entry of that row.

    Returns a tuple (number of mismatching rows, list of their row numbers).
    """
    sparse_matrix = loadgenredata_instance.get_sparse_X_vector()
    frame = loadgenredata_instance.as_lists()
    id_to_label = loadgenredata_instance.get_dict_id_to_genre()

    row_errors = []
    for row in range(frame.shape[0]):
        # nonzero() yields (row indices, column indices); only columns matter.
        _, col_ids = sparse_matrix[row].nonzero()
        decoded = {id_to_label[col] for col in col_ids}
        if decoded != set(frame.iloc[row].genre_list):
            row_errors.append(row)
    return len(row_errors), row_errors

In [209]:
# Run the sparse-encoding check over every row; error_list holds the
# row numbers whose decoded labels differ from the data frame.
num_errors, error_list = unit_test_sparse_encoding_batch(genre_data)

In [210]:
error_list

[]

In [211]:
def decode_sparse_list(X_sparse, row_number, id_to_genre=None):
    """Decode one row of a sparse matrix back into a sorted list of labels.

    Parameters
    ----------
    X_sparse : sparse matrix supporting row indexing and ``nonzero()``.
    row_number : int
        Index of the row to decode.
    id_to_genre : dict, optional
        Mapping {column id: label}. When omitted, the notebook-level
        ``dictidg`` dictionary is used, as before.

    Returns
    -------
    list of str
        Labels of the nonzero columns of the row, sorted alphabetically.
    """
    if id_to_genre is None:
        # Fall back to the global built earlier in the notebook.
        id_to_genre = dictidg
    # nonzero() yields (row indices, column indices); only columns matter.
    _, cols = X_sparse[row_number].nonzero()
    return sorted(id_to_genre[ind] for ind in cols)