This notebook modifies the train/test data sets by replacing "&" with "\_and_" in the genre labels. It is applied to the X,y data as well as to the list of all genres (genre_list_{DATE}.csv). 

In [48]:
import numpy as np
import pandas as pd
from datetime import datetime

import re

In [49]:
#%store -r now
#now
now = '2020-05-11-14-35'

In [50]:
X_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now), index_col = ['artist'])
y_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now), index_col = ['artist'])
X_test = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_test_{}.csv'.format(now), index_col = ['artist'])
y_test = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_test_{}.csv'.format(now), index_col = ['artist'])

In [51]:
data_train = y_train.join( [X_train], how = 'outer')
data_test = y_test.join( [X_test], how = 'outer')

In [52]:
data_train.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"['pop', 'rock', 'emo_pop']",3
Bobby_Edwards,male,['country'],1
La_Palabra,male,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4
Sherrick,male,"['r&b', 'soul']",2
Allen_Collins,male,['southern_rock'],1


In [53]:
data_test.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Josh_Krajcik,male,"['blues', 'rock_soul']",2
Kai_Hansen,male,"['heavy_metal', 'power_metal', 'speed_metal']",3
Kendel_Carson,female,['country'],1
Josie_Cotton,female,"['pop_rock', 'new_wave']",2
Earl_Klugh,male,"['smooth_jazz', 'jazz_fusion']",2


### Genre Labels -- replace &

Each value of the genre column is a _string_ of comma separated genre labels using the spotify abbreviations. This function replaces '&' with "\_and_"

In [54]:
"""This function takes in a string of the form
appearing in the genrelist of the dataframe.
It strips the square brackets, commas, and extra quotes."""
def genrelist(string):
    string = string.replace("&","_and_")
    return string

Now we apply it to the whole column and put the lists in a new column:

In [55]:
data_train['genrelist']= data_train['genrelist'].apply(genrelist)
data_test['genrelist']= data_test['genrelist'].apply(genrelist)

In [56]:
data_train.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"['pop', 'rock', 'emo_pop']",3
Bobby_Edwards,male,['country'],1
La_Palabra,male,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4
Sherrick,male,"['r_and_b', 'soul']",2
Allen_Collins,male,['southern_rock'],1


In [57]:
X_train = data_train[['genrelist','genrelist_length']]
y_train = data_train[['gender']]
X_test = data_test[['genrelist','genrelist_length']]
y_test = data_test[['gender']]

In [58]:
X_test.genrelist.iloc[1]

"['heavy_metal', 'power_metal', 'speed_metal']"

Export data for further use:

In [59]:
today = datetime.today()
#now = 'temp'
now = today.strftime('%Y-%m-%d-%H-%M')
#now = '2020-05-11-14-35'
%store now

Stored 'now' (str)


In [60]:
X_train.to_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now))
y_train.to_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now))

X_test.to_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_test_{}.csv'.format(now))
y_test.to_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_test_{}.csv'.format(now))

We also respell the list of all genres (train and test):

In [62]:
X_tot = pd.concat([X_train,X_test])

In [64]:
X_tot.shape

(15470, 2)

In [65]:
X_tot.head()

Unnamed: 0_level_0,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Pablo_Holman,"['pop', 'rock', 'emo_pop']",3
Bobby_Edwards,['country'],1
La_Palabra,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4
Sherrick,"['r_and_b', 'soul']",2
Allen_Collins,['southern_rock'],1


### Genre Labels

Each value of the genre column is a _string_ of comma separated genre labels. We want to convert it to a _list_ of strings.

In [78]:
"""This function takes in a string of the form
appearing in the genrelist of the dataframe.
It strips the square brackets and extra quotes and
returns a list of strings where each string is a genre label."""
def genrelist(string):
    string = string.strip("[").strip("]").replace("'","")
    L = [s for s in string.split(',')]
    L_new = []
    for x in L:
        L_new.append(x.replace(" ","_").lstrip("_").rstrip("_"))
    while (str("") in L_new):
        L_new.remove("")
    return L_new

In [79]:
X_tot['genrelist'] = X_tot.genrelist.apply(genrelist)

In [80]:
X_tot.head()

Unnamed: 0_level_0,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Pablo_Holman,"[pop, rock, emo_pop]",3
Bobby_Edwards,[country],1
La_Palabra,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4
Sherrick,"[r_and_b, soul]",2
Allen_Collins,[southern_rock],1


### Extracting the unique genre labels:

In [81]:
genre_list = X_tot.genrelist.values.tolist()
genre_list = [x for y in genre_list for x in y]
genre_list = list(set(genre_list))

### Export the list of genres:

In [82]:
genre_list_df = pd.DataFrame({'genre_list':genre_list})

In [83]:
genre_list_df.head()

Unnamed: 0,genre_list
0,chilean
1,zamba
2,afro_punk_blues
3,crunk
4,spanish_guitar


In [84]:
genre_list_df.to_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now))

View the genre list:

In [85]:
glist = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now))

In [86]:
glist.drop(['Unnamed: 0'], axis =1, inplace = True)

In [87]:
glist = glist.sort_values('genre_list')

In [88]:
glist.head()

Unnamed: 0,genre_list
948,1960s
1056,2_step
813,2_step_garage
1338,2_tone
960,a_cappella
