# Create a Bayes Classifier as a baseline for our modeling

In [9]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import re

Import the cleaned data:

In [10]:
#%ls -lt ../../data/genre_lists/data_ready_for_model/

In [11]:
# Restore `now` from IPython's %store database. It is the timestamp suffix
# interpolated into the data file names loaded below, so it must have been
# saved with `%store now` before this notebook is run.
%store -r now
now
#now = '2020-05-11-14-35'

'2020-05-18-10-06'

In [12]:
# Build both file names first (they share the timestamp suffix `now`), then load
# each frame indexed by artist.
X_train_path = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now)
y_train_path = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now)

X_train = pd.read_csv(X_train_path, index_col=['artist'])
y_train = pd.read_csv(y_train_path, index_col=['artist'])

### Genre Labels -- as a set

Each value of the `genrelist` column is a _string_ holding a bracketed, comma-separated list of genre labels (using the Spotify abbreviations), e.g. `"['pop', 'rock', 'emo_pop']"`. We want to convert it to a _set_ of strings.

In [13]:
"""This function takes in a string of the form
appearing in the genrelist of the dataframe.
It strips the square brackets and extra quotes and
returns a list of strings where each string is a genre label."""
def genrelist(string):
    """Parse the string form of a genre list into a set of genre labels.

    Parameters
    ----------
    string : str
        Text such as "['pop', 'rock', 'emo_pop']": a bracketed,
        comma-separated list whose items are wrapped in single quotes.

    Returns
    -------
    set of str
        One entry per non-empty label. Spaces inside a label are replaced by
        underscores, and leading/trailing underscores (which come from the
        padding around the commas) are removed.
    """
    # Drop the enclosing square brackets and every single quote.
    inner = string.strip("[").strip("]").replace("'", "")
    # Normalise each comma-separated piece; padding spaces turn into
    # underscores first and are then stripped from both ends.
    labels = (piece.replace(" ", "_").strip("_") for piece in inner.split(","))
    # Empty pieces (e.g. from "[]" or a trailing comma) are discarded.
    return {label for label in labels if label}

Now we
- apply it to the whole column and put the resulting sets in a new column,
- assemble X and y into a single DataFrame,
- reset the index and name it 'artist_id'.

In [14]:
# Parse every raw genrelist string into a set of labels (stored as a new column).
X_train['genre_set'] = X_train['genrelist'].apply(genrelist)

# Attach the targets by artist, turn the artist index back into a column and
# name the fresh integer index 'artist_id'.
data = (
    X_train
    .join(y_train, how='inner', on='artist')
    .reset_index()
    .rename_axis('artist_id')
)

# Number of rows in the assembled frame.
data_set_size = len(data)

In [15]:
# Preview the first rows of the joined frame (features, genre_set and gender).
data.head()

Unnamed: 0_level_0,artist,genrelist,genrelist_length,genre_set,gender
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Pablo_Holman,"['pop', 'rock', 'emo_pop']",3,"{rock, pop, emo_pop}",male
1,Bobby_Edwards,['country'],1,{country},male
2,La_Palabra,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4,"{salsa_romántica, guaracha, son_montuno, afro_...",male
3,Sherrick,"['r_and_b', 'soul']",2,"{soul, r_and_b}",male
4,Allen_Collins,['southern_rock'],1,{southern_rock},male


Next we load and compute:
- the full genre list (not just the genres that occur in the training set),
- the vocabulary dictionary and its size,
- the maximum length of the genre lists.

In [17]:
# Load the genre list and discard the unnamed column that came with the CSV.
genre_list_path = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now)
genre_list = pd.read_csv(genre_list_path).drop(columns=['Unnamed: 0'])

# Size of the vocab: one row per genre label.
vocab_size = len(genre_list)

# Number the labels 1..vocab_size in file order.
genre_list['genre_id'] = list(range(1, vocab_size + 1))

# Index by label so the id column converts directly to {genre_label: genre_id}.
genre_list = genre_list.set_index('genre_list')
label_id_dict = genre_list['genre_id'].to_dict()

# Length of the longest genre list in `data`.
max_list_length = data['genrelist_length'].max()

In [28]:
# Translate the genre labels of one row into their integer ids.
def encode_list(row):
    """Return the set of genre ids for the labels in ``row.genre_set``.

    Each label is looked up in the module-level ``label_id_dict``; a label that
    is missing from that dictionary raises ``KeyError``.
    """
    return set(label_id_dict[label] for label in row.genre_set)

# Row-wise apply: each row's genre_set is encoded into a set of ids, stored in a new column.
data['genre_set_encoded'] = data.apply(encode_list, axis = 1)

#Check that the encoding is consistent:
# n = np.random.randint(data.shape[0])
# {label_id_dict[item] for item in data.genre_set.iloc[n]} == data.genre_set_encoded.iloc[n]

# Encode the target as an integer: 1 for 'female', 0 for every other value.
# The guard keeps this cell idempotent: once the column is numeric, comparing it
# with 'female' again would silently turn every row into 0.
if not pd.api.types.is_numeric_dtype(data['gender']):
    data['gender'] = (data['gender'] == 'female').astype('int64')

In [29]:
# Preview again: gender is now 0/1 and genre_set_encoded holds the integer ids.
data.head()

Unnamed: 0_level_0,artist,genrelist,genrelist_length,genre_set,gender,genre_set_encoded
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Pablo_Holman,"['pop', 'rock', 'emo_pop']",3,"{rock, pop, emo_pop}",0,"{794, 1007, 1431}"
1,Bobby_Edwards,['country'],1,{country},0,{465}
2,La_Palabra,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4,"{salsa_romántica, guaracha, son_montuno, afro_...",0,"{809, 1442, 1004, 1357}"
3,Sherrick,"['r_and_b', 'soul']",2,"{soul, r_and_b}",0,"{1426, 359}"
4,Allen_Collins,['southern_rock'],1,{southern_rock},0,{1186}


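Doesn't work yet: group by `genre_set_encoded` and count the values. (Python sets are unhashable, so they cannot be used as groupby keys; converting each set to a `frozenset` first would make this possible.)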

In [35]:
# data.groupby(['genre_set_encoded']).count()

List of genre sets:

In [124]:
# genre_sets: the distinct genre sets, in order of first appearance.
# set_counts: {position of a genre set in genre_sets: frequency of that genre set}.
genre_sets, set_counts = [], {}

In [125]:
def set_counting(row):
    """Tally ``row.genre_set_encoded`` in the module-level accumulators.

    A genre set seen before has its entry in ``set_counts`` incremented; a new
    one is appended to ``genre_sets`` and starts with a count of 1. The key in
    ``set_counts`` is the position of the set in ``genre_sets``.
    """
    encoded = row.genre_set_encoded
    try:
        position = genre_sets.index(encoded)
    except ValueError:
        # First occurrence: register the set and open its counter.
        genre_sets.append(encoded)
        set_counts[len(set_counts)] = 1
    else:
        set_counts[position] += 1

In [126]:
# Empty the accumulators first so that re-running this cell does not double-count.
genre_sets.clear()
set_counts.clear()

# set_counting is called only for its side effects, so iterate explicitly rather
# than through DataFrame.apply (which builds a result that is thrown away, and in
# pandas releases before 1.1 could call the function twice on the first row).
for row in data.itertuples(index=False):
    set_counting(row)

In [128]:
# Show the frequency of every distinct genre set, keyed by its position in genre_sets.
set_counts

{0: 1,
 1: 720,
 2: 1,
 3: 101,
 4: 9,
 5: 18,
 6: 1,
 7: 1,
 8: 1,
 9: 1,
 10: 4,
 11: 8,
 12: 21,
 13: 1,
 14: 25,
 15: 1,
 16: 418,
 17: 163,
 18: 1,
 19: 1,
 20: 1,
 21: 118,
 22: 6,
 23: 32,
 24: 1,
 25: 3,
 26: 1,
 27: 443,
 28: 1,
 29: 6,
 30: 2,
 31: 1,
 32: 1,
 33: 6,
 34: 1,
 35: 1,
 36: 4,
 37: 1,
 38: 5,
 39: 1,
 40: 1,
 41: 2,
 42: 29,
 43: 1,
 44: 2,
 45: 1,
 46: 3,
 47: 1,
 48: 1,
 49: 40,
 50: 23,
 51: 2,
 52: 1,
 53: 1,
 54: 1,
 55: 1,
 56: 1,
 57: 13,
 58: 1,
 59: 1,
 60: 7,
 61: 56,
 62: 1,
 63: 11,
 64: 1,
 65: 1,
 66: 1,
 67: 1,
 68: 16,
 69: 176,
 70: 2,
 71: 3,
 72: 1,
 73: 1,
 74: 1,
 75: 2,
 76: 1,
 77: 1,
 78: 1,
 79: 2,
 80: 1,
 81: 2,
 82: 3,
 83: 1,
 84: 1,
 85: 53,
 86: 3,
 87: 8,
 88: 3,
 89: 9,
 90: 26,
 91: 2,
 92: 1,
 93: 1,
 94: 1,
 95: 6,
 96: 1,
 97: 1,
 98: 1,
 99: 1,
 100: 1,
 101: 1,
 102: 212,
 103: 2,
 104: 9,
 105: 1,
 106: 1,
 107: 3,
 108: 1,
 109: 2,
 110: 13,
 111: 2,
 112: 1,
 113: 1,
 114: 1,
 115: 3,
 116: 1,
 117: 1,
 118: 70,
 119: 1,