# Data Loader Class

This nb develops a class for loading the genre data with methods to put the genre info into different data types:
- [x] lists
- [x] sets
- [x] strings
- [x] scipy sparse vector
- [x] always train; optional test set
- [x] accept DF as input instead of path
- [ ] tensorflow sparse tensor

In [1]:
# get currrent date for latest version of data set
%store -r now

X_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now)
y_path_train = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now)
X_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_test_{}.csv'.format(now)
y_path_test = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_test_{}.csv'.format(now)

In [2]:
import importlib.util
spec = importlib.util.spec_from_file_location("genre_data_loader", "/Users/Daniel/Code/Genre/project_code/analysis/genre_scripts/genre_data_loader.py")
genre_data_loader = importlib.util.module_from_spec(spec)
spec.loader.exec_module(genre_data_loader)
LoadGenreData = genre_data_loader.LoadGenreData

In [3]:
genre_data = LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train,
                          X_path_test = X_path_test, y_path_test = y_path_test)

genre_data_train = LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train)

In [4]:
data = genre_data.as_lists()

In [5]:
data

Unnamed: 0_level_0,genrelist_length,gender,genre_list
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,3,male,"[pop, rock, emo_pop]"
Bobby_Edwards,1,male,[country]
La_Palabra,4,male,"[salsa_romántica, afro_cuban_jazz, son_montuno..."
Sherrick,2,male,"[soul, r_and_b]"
Allen_Collins,1,male,[southern_rock]
...,...,...,...
Katy_Perry,3,female,"[pop, rock, disco]"
Tyler_Joseph,7,male,"[electro_pop, alternative_hip_hop, indie_pop, ..."
Delta_Goodrem,3,female,"[contemporary, pop, pop_rock]"
Chosen_Effect,4,male,"[pop, r_and_b, dance, hip_hop]"


In [6]:
data_coo = genre_data.get_coo_matrix()

In [7]:
data_coo

(array([1, 1, 1, ..., 1, 1, 1]),
 (array([    0,     0,     0, ..., 15468, 15469, 15469]),
  array([1041, 1164,  467, ...,  661, 1041,  747])))

This is the code that goes in the script:

In [31]:
# imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import string


class LoadGenreData():
    """Load and prepreocess the genre label data.
    NOTE: "!" are removed from genre labels. This affects "oi!" and "cuidado!"
    Input:
    date: used for names of data files
    df_X: optional, X data as DF; overrides paths
    df_y: optional, y data as DF; overrides paths
    train and test data paths for X,y -- only used if DF is not provided
    """
    
    
    def __repr__(self):
        return "Data Frame {}".format(self.data.iloc[:3])

    def __str__(self):
        return "Data Frame {}".format(self.__repr__())
    
    
    def __init__(self, date, df_X = None, df_y = None, X_path_train = None, y_path_train = None, X_path_test = None, y_path_test = None):
    
        self.date = date
        
        if df_X  is not None:
            self.X = df_X
            self.y = df_y
        else:
            self.X_path_train = X_path_train
            self.y_path_train = y_path_train
            self.X_path_test = X_path_test
            self.y_path_test = y_path_test

            # import from CSV
            if X_path_test is None:
                self.X = pd.read_csv(self.X_path_train, index_col = ['artist'])
            else:
                self.X_train = pd.read_csv(self.X_path_train, index_col = ['artist'])
                self.X_test = pd.read_csv(self.X_path_test, index_col = ['artist'])
                self.X = pd.concat([self.X_train,self.X_test])

            if y_path_test is None:
                self.y = pd.read_csv(self.y_path_train, index_col = ['artist'])
            else:
                self.y_train = pd.read_csv(self.y_path_train, index_col = ['artist'])
                self.y_test = pd.read_csv(self.y_path_test, index_col = ['artist'])
                self.y = pd.concat([self.y_train,self.y_test])

        # assemble X,y into DF
        self.data = self.X.join(self.y, how = 'inner', on = 'artist')

    def data(self):
        return self.data
    
    def as_sets(self):
        """Return view of data with genre labels in a set for each artist;
        'genrelist' column is not shown"""
        self.data['genre_set']= self.data['genrelist'].apply(to_sets)
        # remove old version of genre labels
        columns = self.data.columns.tolist()
        columns.remove('genrelist')
        #columns = pd.Index(columns)
        return self.data[columns]
    
    def as_lists(self):
        """Return view of data with genre labels in a list for each artist;
        'genrelist' column is not shown"""
        self.data['genre_list']= self.data['genrelist'].apply(to_lists)
        # remove old version of genre labels
        columns = self.data.columns.tolist()
        columns.remove('genrelist')
        #columns = pd.Index(columns)
        return self.data[columns]
    
    def as_strings(self):
        """Return view of data with genre labels as string for each artist;
        'genrelist' column is not shown"""
        self.data['genre_string']= self.data['genrelist'].apply(to_strings)
        # remove old version of genre labels
        columns = self.data.columns.tolist()
        columns.remove('genrelist')
        #columns = pd.Index(columns)
        return self.data[columns]
    
    # WARNING: don't add a column to self.X in this method; use a temp DF instead
    def get_list_of_genres(self):
        """Returns a sorted list of genres for the dataset provided to the instance."""
        self.X['genre_list']= self.X['genrelist'].apply(to_lists)
        self.list_of_genres = self.X['genre_list'].values.tolist()
        self.list_of_genres = [label for artists_labels in self.list_of_genres for label in artists_labels]
        self.list_of_genres = list(set(self.list_of_genres))
        self.list_of_genres.sort()
        return self.list_of_genres
    
    def get_sparse_X_vector(self):
        """Return X as a sparse vector with a 1 in the entry (row, id) if the artist has the label with id
        Notes on sparse vector commands: 
        To get the number of nonzero entries: X_sparse.nnz
        To get the nonzero entries of a row: X_sparse[n:m].nonzero() -- returns list of rows and columns with nonzero entries
        """
        dict_genre_to_id = self.get_dict_genre_to_id()
        vec = CountVectorizer(vocabulary = dict_genre_to_id) # uses scipy.sparse.csr_matrix representation
        self.data_genre_strings = self.as_strings()
        self.X_genre_string = self.data_genre_strings['genre_string']
        self.X_sparse = vec.fit_transform(self.X_genre_string)
        return self.X_sparse
    
    def get_dict_genre_to_id(self):
        """Return dictionary of the form {'label':id_number}
        """
        self.list_of_genres = self.get_list_of_genres()
        dict_genre_to_id = dict(zip(self.list_of_genres,range(len(self.list_of_genres))))
        return dict_genre_to_id
    
    def get_dict_id_to_genre(self):
        """Return dictionary of the form {id_number:'label'}
        """
        self.list_of_genres = self.get_list_of_genres()
        dict_genre_to_id = dict(zip(range(len(self.list_of_genres)),self.list_of_genres))
        return dict_genre_to_id
    
    def get_coo_matrix(self):
        """Return the (values, (rows, cols)) for a COO matrix
        of the genre sets"""
        self.as_lists()
        dict_gid = self.get_dict_genre_to_id()
        
        def coo_rows(row):
            """Get the row info for the COO sparse matrix
            version of the genre sets"""
            row = [self.data.index.get_loc(row.name) for genre in row.genre_list]
            return np.array(row)

        def coo_cols(row):
            """Get the col info for the COO sparse matrix
            version of the genre sets"""
            col = [dict_gid[genre] for genre in row.genre_list]
            return np.array(col)

        def coo_values(row):
            """Get the values info for the COO sparse matrix
            version of the genre sets"""
            values = [1 for genre in row.genre_list]
            return np.array(values)

        self.data['coorows'] = self.data.apply(coo_rows, axis = 1)
        self.data['coocols'] = self.data.apply(coo_cols, axis = 1)
        self.data['coovalues'] = self.data.apply(coo_values, axis = 1)

        rows = create_coo_list(self.data.coorows)
        cols = create_coo_list(self.data.coocols)
        values = create_coo_list(self.data.coovalues)

        coo_info = (values, (rows, cols))
        
        return coo_info
    

# Functions needed

def remove_punctuation_from_word(word):
    # remove '!'
    table = str.maketrans('', '', '!')
    stripped = word.translate(table) 
    print(stripped)

def to_strings(string):
    """This function takes in a string of the form
     appearing in the genrelist of the dataframe.
     It converts it to a list, then a set (to remove duplicates), and then a string."""
    string = string.strip("[").strip("]").replace("'","")
    L = [s for s in string.split(',')]
    L_new = []
    for x in L:
        L_new.append(x.replace(" ","_").lstrip("_").rstrip("_").strip("!").replace("+","_"))
    while (str("") in L_new):
        L_new.remove("")
    L_new = list(set(L_new))
    L_string = " ".join(L_new)
    return L_string


def to_sets(string):
    """This function takes in a string of the form
    appearing in the genrelist of the dataframe.
    It strips the square brackets and extra quotes and
    returns a set of strings where each string is a genre label."""
    string = string.strip("[").strip("]").replace("'","")
    L = [s for s in string.split(',')]
    L_new = []
    for x in L:
        L_new.append(x.replace(" ","_").lstrip("_").rstrip("_").strip("!").replace("+","_"))
    while (str("") in L_new):
        L_new.remove("")
    return set(L_new)


def to_lists(string):
    """This function takes in a string of the form
    appearing in the genrelist of the dataframe.
    It strips the square brackets and extra quotes and
    returns a list of strings where each string is a genre label."""
    string = string.strip("[").strip("]").replace("'","")
    L = [s for s in string.split(',')]
    L_new = []
    for x in L:
        L_new.append(x.replace(" ","_").lstrip("_").rstrip("_").strip("!").replace("+","_"))
    while (str("") in L_new):
        L_new.remove("")
    L_new = list(set(L_new))
    return L_new

# for getting a coo matrix

def create_coo_list(series):
    """turn series (column of df) whose values
    are numpy arrays into a list
    used as info for a coo matrix"""
    info = series.values.tolist()
    info = np.hstack(info)
    return info


In [32]:
genre_data = LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train, X_path_test = X_path_test, y_path_test = y_path_test)
#genre_data = LoadGenreData(now, X_path_train = X_path_train, y_path_train = y_path_train)

In [33]:
# data = genre_data.as_lists()

In [34]:
data

Unnamed: 0_level_0,genrelist_length,gender,genre_set,genre_list
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Pablo_Holman,3,male,"{rock, pop, emo_pop}","[rock, pop, emo_pop]"
Bobby_Edwards,1,male,{country},[country]
La_Palabra,4,male,"{guaracha, son_montuno, afro_cuban_jazz, salsa...","[guaracha, son_montuno, afro_cuban_jazz, salsa..."
Sherrick,2,male,"{soul, r_and_b}","[soul, r_and_b]"
Allen_Collins,1,male,{southern_rock},[southern_rock]
...,...,...,...,...
Katy_Perry,3,female,"{rock, pop, disco}","[rock, pop, disco]"
Tyler_Joseph,7,male,"{electro_pop, reggae, alternative_rock, rap_ro...","[electro_pop, reggae, alternative_rock, rap_ro..."
Delta_Goodrem,3,female,"{pop, contemporary, pop_rock}","[pop, contemporary, pop_rock]"
Chosen_Effect,4,male,"{hip_hop, r_and_b, pop, dance}","[hip_hop, r_and_b, pop, dance]"


In [35]:
data_coo = genre_data.get_coo_matrix()

In [36]:
data_coo

(array([1, 1, 1, ..., 1, 1, 1]),
 (array([    0,     0,     0, ..., 15468, 15469, 15469]),
  array([1164, 1041,  467, ...,  372, 1041,  747])))

Create Unit tests for the sparse encoding:

In [11]:
def unit_test_sparse_encoding(loadgenredata_instance, row_number):
    """compare the dictionary decoded sparse vector to the entry in the data frame"""
    X_sparse = loadgenredata_instance.get_sparse_X_vector()
    data = loadgenredata_instance.as_lists()
    dictidg = loadgenredata_instance.get_dict_id_to_genre()
    
    zeros, cols = X_sparse[row_number].nonzero()
    cols_labels = [dictidg[ind] for ind in cols]
    cols_labels.sort()
    if (set(cols_labels) != set(data.iloc[row_number].genre_list)): 
        print('False')
        return cols_labels
    else:
        return 'True'

In [12]:
def unit_test_sparse_encoding_batch(loadgenredata_instance):
    """compare the dictionary decoded sparse vector to the entry in the data frame"""
    X_sparse = loadgenredata_instance.get_sparse_X_vector()
    data = loadgenredata_instance.as_lists()
    dictidg = loadgenredata_instance.get_dict_id_to_genre()

    row_errors = []
    errors = 0
    for num in range(data.shape[0]):
        zeros, cols = X_sparse[num].nonzero()
        cols_labels = [dictidg[ind] for ind in cols]
        cols_labels.sort()
        if (set(cols_labels) != set(data.iloc[num].genre_list)): 
            errors += 1
            row_errors.append(num)
    return errors, row_errors

In [13]:
num_errors, error_list = unit_test_sparse_encoding_batch(genre_data)

In [14]:
error_list

[]

In [15]:
def decode_sparse_list(X_sparse, row_number):
    zeros, cols = X_sparse[row_number].nonzero()
    cols_labels = [dictidg[ind] for ind in cols]
    cols_labels.sort()
    return cols_labels

In [17]:
data = genre_data.data
X_sparse = genre_data.get_sparse_X_vector()

In [20]:
n = 456

In [21]:
data.iloc[n]

genrelist           ['mississippi_blues', 'electric_blues']
genrelist_length                                          2
gender                                                 male
genre_set               {electric_blues, mississippi_blues}
genre_string               electric_blues mississippi_blues
genre_list              [electric_blues, mississippi_blues]
Name: Eddie_Cusic, dtype: object

In [22]:
decode_sparse_list(X_sparse, n)

['electric_blues', 'mississippi_blues']