Train a logistic model to classify the gender of an artist based on the list of genre labels.

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import re

Import the cleaned data:

In [2]:
%store -r now
now

'2020-05-11-14-35'

In [3]:
X_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now), index_col = ['artist'])
y_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now), index_col = ['artist'])

In [4]:
X_train.shape, y_train.shape

((12376, 2), (12376, 1))

### Genre Labels

Each value of the genre column is a _string_ of comma separated genre labels using the spotify abbreviations. We want to convert it to a _list_ of strings.

In [5]:
"""This function takes in a string of the form
appearing in the genrelist of the dataframe.
It strips the square brackets and extra quotes and
returns a list of strings where each string is a genre label."""
def genrelist(string):
    string = string.strip("[").strip("]").replace("'","")
    L = [s for s in string.split(',')]
    L_new = []
    for x in L:
        L_new.append(x.replace(" ","_").lstrip("_").rstrip("_"))
    while (str("") in L_new):
        L_new.remove("")
    return L_new

Now we apply it to the whole column and put the lists in a new column:

In [6]:
X_train['genrelist']= X_train['genrelist'].apply(genrelist)

In [7]:
X_train.head()

Unnamed: 0_level_0,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Pablo_Holman,"[pop, rock, emo_pop]",3
Bobby_Edwards,[country],1
La_Palabra,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4
Sherrick,"[r&b, soul]",2
Allen_Collins,[southern_rock],1


In [8]:
genre_list = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_training_{}.csv'.format(now))
genre_list.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [9]:
genre_list.head()

Unnamed: 0,genre_list
0,country
1,afro_cuban_jazz
2,aaa
3,mainstream_jazz
4,chicano_rock


In [60]:
genre_label_counts = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_label_counts_TRAINING_{}.csv'.format(now))
#genre_list.drop(['Unnamed: 0'], axis = 1, inplace = True)
#genre_label_counts.set_index(['Unnamed: 0'], inplace = True)
genre_label_counts.index.name = 'index'
genre_label_counts.columns = ['genre','freqency']

In [61]:
genre_label_counts

Unnamed: 0_level_0,genre,freqency
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,pop,2617
1,rock,1765
2,r&b,1647
3,country,1613
4,hip_hop,1114
...,...,...
1348,tapping,1
1349,street_artist,1
1350,euthadisco,1
1351,afro,1


In [62]:
def vec_position(row):
    i = row.name
    v = np.zeros(1353)
    v[i] = 1
    return v

In [63]:
genre_label_counts['vectors'] = genre_label_counts.apply(vec_position, axis = 1)

In [64]:
genre_label_counts.head()

Unnamed: 0_level_0,genre,freqency,vectors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,pop,2617,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,rock,1765,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,r&b,1647,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,country,1613,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,hip_hop,1114,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


In [73]:
keys = genre_label_counts.genre.values.tolist()
values = genre_label_counts.vectors.tolist()
genre_dict = dict(zip(keys, values))

In [75]:
genre_dict['r&b']

array([0., 0., 1., ..., 0., 0., 0.])

Now apply a function to the data that adds the vectors for each genre in the list

In [79]:
def genre_list_vector(x):
    v = np.zeros(1353)
    for genre in x:
        v += genre_dict[genre]
    return v

In [80]:
X_train['vector'] = X_train.genrelist.apply(genre_list_vector)

In [81]:
X_train.head()

Unnamed: 0_level_0,genrelist,genrelist_length,vector
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,"[pop, rock, emo_pop]",3,"[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Bobby_Edwards,[country],1,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
La_Palabra,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Sherrick,"[r&b, soul]",2,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
Allen_Collins,[southern_rock],1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


Now encode male/female as 0/1 for targets:

In [82]:
model = LogisticRegression(random_state = 0 )

In [84]:
y_train

Unnamed: 0_level_0,gender
artist,Unnamed: 1_level_1
Pablo_Holman,male
Bobby_Edwards,male
La_Palabra,male
Sherrick,male
Allen_Collins,male
...,...
Steve_Gaines,male
Dan_Hoerner,male
Detail,male
Billy_Woods,male


In [83]:
model.fit(X_train.vector,y_traina)

ValueError: setting an array element with a sequence.