# Data Loader Class

This notebook develops a class for loading the genre data, with methods that convert the genre information into different data types:
- [ ] lists
- [ ] sets
- [ ] strings
- [ ] scipy sparse vector
- [ ] tensorflow sparse tensor

In [1]:
# import numpy as np
# import pandas as pd

# import matplotlib.pyplot as plt
# import seaborn as sns; sns.set()

# import re

In [7]:
# Quick check that sorting a list of strings gives alphabetical order.
mylist = sorted(['a', 'c', 'b'])
mylist

['a', 'b', 'c']

In [34]:
# imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


# get the current date, which identifies the latest version of the data set
%store -r now

# Functions needed


def to_strings(string):
    """Flatten a stringified genre list into a plain string of labels.

    Takes text of the form stored in the dataframe's genrelist column
    (for example "['rock', 'pop']"), trims square brackets from both ends
    and deletes every single quote and comma, giving "rock pop".
    """
    # Trim '[' from both ends first, then ']' -- the order matters for
    # unusual inputs, so it is kept as two separate strips.
    unbracketed = string.strip("[").strip("]")
    # Delete all apostrophes and commas in one pass.
    return unbracketed.translate(str.maketrans("", "", "',"))

def to_sets(string):
    """Convert a stringified genre list into a set of genre labels.

    Takes text of the form stored in the dataframe's genrelist column
    (for example "['hip hop', 'rock']"), trims the square brackets,
    removes the single quotes and splits on commas. Inside each label,
    spaces become underscores and leading/trailing underscores are
    dropped; empty labels are discarded.

    Returns:
        set of str: the distinct genre labels, e.g. {'hip_hop', 'rock'}.
        An empty list ("[]") or empty string gives an empty set.
    """
    inner = string.strip("[").strip("]").replace("'", "")
    # The space after each comma turns into a leading underscore, which
    # strip("_") then removes.
    labels = (part.replace(" ", "_").strip("_") for part in inner.split(","))
    # Filter empties in the same pass instead of repeatedly scanning a list.
    return {label for label in labels if label}


def to_lists(string):
    """Convert a stringified genre list into a list of genre labels.

    Takes text of the form stored in the dataframe's genrelist column
    (for example "['hip hop', 'rock']"), trims the square brackets,
    removes the single quotes and splits on commas. Inside each label,
    spaces become underscores and leading/trailing underscores are
    dropped; empty labels are discarded.

    Returns:
        list of str: the labels in their original order, duplicates kept,
        e.g. ['hip_hop', 'rock']. An empty list ("[]") or empty string
        gives [].
    """
    inner = string.strip("[").strip("]").replace("'", "")
    # The space after each comma turns into a leading underscore, which
    # strip("_") then removes.
    labels = (part.replace(" ", "_").strip("_") for part in inner.split(","))
    # Filter empties in the same pass instead of repeatedly scanning a list.
    return [label for label in labels if label]

# # For sparse vector rep of X

# genre_list = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now))
# genre_list.drop(['Unnamed: 0'], axis = 1, inplace = True)
# genre_list['genre_id'] = list(range(1,genre_list.shape[0]+1))

# #Size of the vocab:
# vocab_size = genre_list.shape[0]

# #Create a dictionary {genre_label: genre_id}
# temp = genre_list.set_index(['genre_list'])
# label_id_dict = temp['genre_id'].to_dict()

# # set genre_id to index
# genre_list.set_index(['genre_id'], inplace = True)

# #Find max length of genre lists:
# max_list_length = data.genrelist_length.max()


# Path templates for the training features (X) and targets (y); the `{}`
# placeholder is filled with the date stamp held in `now`.
_X_TRAIN_TEMPLATE = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'
_Y_TRAIN_TEMPLATE = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'

X_path = _X_TRAIN_TEMPLATE.format(now)
y_path = _Y_TRAIN_TEMPLATE.format(now)


class LoadGenreData():
    """Load and preprocess the genre label data.

    Reads the X and y CSV files (each indexed by its 'artist' column) and
    inner-joins them into ``self.data``. The methods expose the genre
    labels held in the 'genrelist' column as sets, lists, strings or a
    sparse count matrix, and build the label <-> id dictionaries.

    Attributes set by __init__:
        X_path, y_path: the paths the CSV files were read from.
        date: value supplied by the caller; stored only.
        X, y: DataFrames read from X_path and y_path.
        data: inner join of X and y on 'artist'.
    """

    def __init__(self, X_path, y_path, date):
        self.X_path = X_path
        self.y_path = y_path
        self.date = date

        # import from CSV
        self.X = pd.read_csv(X_path, index_col = ['artist'])
        self.y = pd.read_csv(y_path, index_col = ['artist'])

        # assemble X,y into DF
        self.data = self.X.join(self.y, how = 'inner', on = 'artist')

    # NOTE: there used to be a ``data()`` accessor method here. The instance
    # attribute ``self.data`` assigned in __init__ shadowed it, so calling
    # ``instance.data()`` could only raise TypeError. Read the ``data``
    # attribute directly instead.

    def _with_genre_column(self, column, converter):
        """Store converter(genrelist) in self.data[column]; return data without 'genrelist'."""
        self.data[column] = self.data['genrelist'].apply(converter)
        # remove old version of genre labels from the returned frame only
        columns = self.data.columns.tolist()
        columns.remove('genrelist')
        return self.data[columns]

    def as_sets(self):
        """Return the data with each artist's genre labels as a set.

        Adds (or overwrites) the 'genre_set' column on self.data and returns
        a DataFrame with every column of self.data except 'genrelist'.
        """
        return self._with_genre_column('genre_set', to_sets)

    def as_lists(self):
        """Return the data with each artist's genre labels as a list.

        Adds (or overwrites) the 'genre_list' column on self.data and returns
        a DataFrame with every column of self.data except 'genrelist'.
        """
        return self._with_genre_column('genre_list', to_lists)

    def as_strings(self):
        """Return the data with each artist's genre labels as one string.

        Adds (or overwrites) the 'genre_string' column on self.data and
        returns a DataFrame with every column of self.data except
        'genrelist'.
        """
        return self._with_genre_column('genre_string', to_strings)

    def get_list_of_genres(self):
        """Return a sorted list of the distinct genre labels in self.X.

        The result is also cached on ``self.list_of_genres``. The parsed
        labels are kept in a temporary Series so that no column is added
        to self.X.
        """
        labels_per_artist = self.X['genrelist'].apply(to_lists)
        distinct = {label for labels in labels_per_artist for label in labels}
        self.list_of_genres = sorted(distinct)
        return self.list_of_genres

    def get_sparse_X_vector(self):
        """Return the genre labels of self.data as a sparse count matrix.

        Column j counts occurrences of the label whose id is j in
        get_dict_genre_to_id(). Besides returning the matrix, this sets
        self.data_genre_strings, self.X_genre_string and self.X_sparse
        (and, through as_strings(), the 'genre_string' column of self.data).

        Notes on sparse vector commands:
        To get the number of nonzero entries: X_sparse.nnz
        To get the nonzero entries of a row: X_sparse[n:m].nonzero() -- returns list of rows and columns with nonzero entries
        """
        dict_genre_to_id = self.get_dict_genre_to_id()
        # NOTE(review): the vocabulary comes from to_lists (spaces inside a
        # label become underscores) while the documents come from to_strings
        # (spaces are kept). A raw label containing a space would therefore
        # never match its vocabulary entry -- confirm that the raw labels
        # are already underscore-joined.
        vec = CountVectorizer(vocabulary = dict_genre_to_id) # uses scipy.sparse.csr_matrix representation
        self.data_genre_strings = self.as_strings()
        self.X_genre_string = self.data_genre_strings['genre_string']
        self.X_sparse = vec.fit_transform(self.X_genre_string)
        return self.X_sparse

    def get_dict_genre_to_id(self):
        """Return dictionary of the form {'label': id_number}.

        Ids are the positions of the labels in the sorted list returned by
        get_list_of_genres(), starting at 0.
        """
        self.list_of_genres = self.get_list_of_genres()
        return {label: idx for idx, label in enumerate(self.list_of_genres)}

    def get_dict_id_to_genre(self):
        """Return dictionary of the form {id_number: 'label'}.

        This is the inverse of get_dict_genre_to_id().
        """
        self.list_of_genres = self.get_list_of_genres()
        return dict(enumerate(self.list_of_genres))

In [35]:
# Build the loader from the two CSV paths; `now` is passed as the date.
genre_data = LoadGenreData(X_path,y_path,now)

In [36]:
# Label -> id and id -> label lookup tables for the loaded genres.
dictgid = genre_data.get_dict_genre_to_id()
dictidg = genre_data.get_dict_id_to_genre()

In [39]:
# Round-trip check: id -> label -> id should give back the same id.
dictgid[dictidg[409]]

409

In [26]:
# Sparse count encoding of the genre labels (one row per artist in genre_data.data).
X_sparse = genre_data.get_sparse_X_vector()

In [57]:
# Pick a random row of the data and list the (row, column) positions of
# the nonzero entries in the matching row of X_sparse.
i = np.random.randint(genre_data.data.shape[0])
# nonzero() on the single-row slice returns (row indices, column indices);
# only the column indices are used below.
zeros, cols = X_sparse[i].nonzero()
rows = [i] * int(cols.size)
inds = list(zip(rows, cols.tolist()))
inds

[(6305, 301), (6305, 941), (6305, 1057), (6305, 1072)]

In [33]:
def unit_test_sparse_encoding(sparse_row_nonzero):
    """Compare the dictionary-decoded sparse vector to the entry in the data frame.

    Placeholder: not implemented yet. The argument is ignored and the
    function returns None.
    """

Create an id for each genre_set

In [14]:
# Registry of the distinct genre sets seen so far; a set's position in this
# list is its id.
genre_sets = []

def set_id(row):
    """Return the id of row.genre_set_encoded, registering it if unseen.

    The id is the index of the first equal entry in the module-level
    `genre_sets` list. A value not yet present is appended and receives
    the next free index.
    """
    encoded = row.genre_set_encoded
    try:
        return genre_sets.index(encoded)
    except ValueError:
        # Not registered yet: append it, so its id is the new last index.
        genre_sets.append(encoded)
        return len(genre_sets) - 1

# Assign each row the id of its encoded genre set (rows with equal sets share an id).
# NOTE(review): `data` is not defined in the cells shown here -- presumably a
# DataFrame with a 'genre_set_encoded' column built elsewhere; confirm.
data['set_id'] = data.apply(set_id, axis = 1)

In [15]:
data.head()

Unnamed: 0_level_0,artist,genrelist_length,genre_set,gender,genre_set_encoded,set_id
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Pablo_Holman,3,"{emo_pop, rock, pop}",male,"{794, 1007, 1431}",0
1,Bobby_Edwards,1,{country},male,{465},1
2,La_Palabra,4,"{son_montuno, guaracha, afro_cuban_jazz, salsa...",male,"{809, 1442, 1004, 1357}",2
3,Sherrick,2,"{soul, r_and_b}",male,"{1426, 359}",3
4,Allen_Collins,1,{southern_rock},male,{1186},4
