This notebook calculates the following:
### <p><center>For each genre, what is the gender split among all artists that are labeled with that genre?</center></p> 

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
#%matplotlib notebook


import re

from functools import partial

import plotly.graph_objects as go

Import the cleaned data:

In [2]:
#%ls -lt ../../data/genre_lists/data_ready_for_model/

In [3]:
%store -r now
now
#now = '2020-05-11-14-35'

'2020-05-18-10-06'

In [4]:
X_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now), index_col = ['artist'])
y_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now), index_col = ['artist'])

### Genre Labels -- as a list

Each value of the genre column is a _string_ of comma separated genre labels using the spotify abbreviations. We want to convert it to a _list_ of strings.

In [5]:
"""This function takes in a string of the form
appearing in the genrelist of the dataframe.
It strips the square brackets and extra quotes and
returns a list of strings where each string is a genre label."""
def genrelist(string):
    string = string.strip("[").strip("]").replace("'","")
    L = [s for s in string.split(',')]
    L_new = []
    for x in L:
        L_new.append(x.replace(" ","_").lstrip("_").rstrip("_"))
    while (str("") in L_new):
        L_new.remove("")
    return L_new

Now we apply
- it to the whole column and put the lists in a new column
- assemble X,y into DF
-reset index to 'artist_id'

In [6]:
X_train['genrelist']= X_train['genrelist'].apply(genrelist)

data = X_train.join(y_train, how = 'inner', on = 'artist')

data.reset_index(inplace = True)
data.index.name = 'artist_id'
data_set_size = data.shape[0]

- Full genre_list (not just that for the training set)
- Vocab Dict and Size
- max length of lists

In [7]:
genre_list = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now))
genre_list.drop(['Unnamed: 0'], axis = 1, inplace = True)
genre_list['genre_id'] = list(range(1,genre_list.shape[0]+1))

#Size of the vocab:
vocab_size = genre_list.shape[0]

#Create a dictionary {genre_label: genre_id}
label_id_dict = genre_list.set_index(['genre_list'])['genre_id'].to_dict()
id_label_dict = genre_list.set_index(['genre_id'])['genre_list'].to_dict()



#Find max length of genre lists:
max_list_length = data.genrelist_length.max()

## What are the most common genre labels? 

(This deals only with the training data, not the test data.)

### Count the number of times that a label occurs:

In [38]:
genre_list_1 = data.genrelist.values.tolist()
genre_list_1 = [x for y in genre_list_1 for x in y]
genre_counts = pd.Series(genre_list_1)
label_value_counts = genre_counts.value_counts().to_frame()

In [39]:
label_value_counts[:5]

Unnamed: 0,0
pop,2617
rock,1765
r_and_b,1647
country,1613
hip_hop,1114


In [31]:
genre_list.head()

Unnamed: 0,genre_list,genre_id
0,chilean,1
1,zamba,2
2,afro_punk_blues,3
3,crunk,4
4,spanish_guitar,5


In [8]:
# encode labels as ints within the list
def encode_list(row):
    return [label_id_dict[item] for item in row.genrelist]

data['genres_encoded_as_list'] = data.apply(encode_list, axis = 1)

#Check that the encoding is consistent: 
# n = np.random.randint(data.shape[0])
# [label_id_dict[item] for item in data.genrelist.iloc[n]], data.genres_encoded_as_list.iloc[n]

# Encode targets. The categories still appear as strings. To see the encoding use df.column.cat.codes.
data['gender'] = data.gender.apply(lambda x: 1 if x == 'female' else 0)

In [9]:
data.head()

Unnamed: 0_level_0,artist,genrelist,genrelist_length,gender,genres_encoded_as_list
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Pablo_Holman,"[pop, rock, emo_pop]",3,0,"[1007, 1431, 794]"
1,Bobby_Edwards,[country],1,0,[465]
2,La_Palabra,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4,0,"[1442, 1357, 1004, 809]"
3,Sherrick,"[r_and_b, soul]",2,0,"[1426, 359]"
4,Allen_Collins,[southern_rock],1,0,[1186]


In [10]:
data.shape

(12376, 5)

### Count the min, mean, max of number of genres:

In [11]:
n = data.shape[0]
a,b,c = data.genrelist_length.mean(), data.genrelist_length.std(), data.genrelist_length.max()
print('Total:')
print(f'{n} Artists.')
print(f'Mean number of genre labels: {round(a,2)}.')
print(f'STD of the number of genre labels: {round(b,2)}.')
print(f'Max number of genre labels: {c}.')

Total:
12376 Artists.
Mean number of genre labels: 2.7.
STD of the number of genre labels: 1.83.
Max number of genre labels: 73.


### Count the min, mean, max of number of genres for male and female artists:

In [36]:
data_female = data[data.gender == 1]
n = data_female.shape[0]
a,b,c = data_female.genrelist_length.mean(), data_female.genrelist_length.std(), data_female.genrelist_length.max()
print('Female:')
print(f'{n} Artists.')
print(f'Mean number of genre labels: {round(a,2)}.')
print(f'STD of the number of genre labels: {round(b,2)}.')
print(f'Max number of genre labels: {c}.')

Female:
3847 Artists.
Mean number of genre labels: 2.63.
STD of the number of genre labels: 1.52.
Max number of genre labels: 11.


In [13]:
# plt.hist(data_female.num_genres, bins = 25, density = True)
# plt.show()

In [37]:
data_male = data[data.gender == 0]
m = data_male.shape[0]
a,b,c = data_male.genrelist_length.mean(), data_male.genrelist_length.std(), data_male.genrelist_length.max()
print('Male:')
print(f'{m} Artists.')
print(f'Mean number of genre labels: {round(a,2)}.')
print(f'STD of the number of genre labels: {round(b,2)}.')
print(f'Max number of genre labels: {c}.')

Male:
8529 Artists.
Mean number of genre labels: 2.74.
STD of the number of genre labels: 1.95.
Max number of genre labels: 73.


In [15]:
# plt.hist(data_male.num_genres, bins = 25, density = True)
# plt.show()

Set variables:

In [16]:
max_num_male = 73
max_num_female = 11
max_num = max(max_num_female, max_num_male)

### For each genre label, collect all artists with that label:
- [ ] do this using sparse matrices for speed up?

In [17]:
lis = ['hi','bye','chow']
if 'byes' in lis:
    print('yep')

In [18]:
def indicate(row):
    if label in row.genrelist:
        return 1
    else:
        return 0

In [19]:
label_id_dict.keys?

In [20]:
for label in list(label_id_dict.keys()):
    #print(data.genrelist.iloc[0])
    if indicate(data.iloc[0]) == 1:
        print(label)

emo_pop
pop
rock


In [22]:
data.iloc[0]

artist                            Pablo_Holman
genrelist                 [pop, rock, emo_pop]
genrelist_length                             3
gender                                       0
genres_encoded_as_list       [1007, 1431, 794]
Name: 0, dtype: object

In [24]:
label = 'pop'
data['indicator'] = data.apply(indicate, axis = 1)
label_artists = data[data.indicator == 1].drop(['indicator'])
g

In [34]:
# produce stats for each label: male, female; mean number of labels; max, min number of labels
idx_list = []
for label in label_value_counts.index[0]: # use labels ordered by their frequency of appearance
    #if indicate(data.iloc[0]) == 1:
    data['indicator'] = data.apply(indicate, axis = 1)
    label_artists = data[data.indicator == 1]
    label_value_counts['female'] = label_artists.gender.sum()

In [35]:
label_value_counts

pop                  2617
rock                 1765
r_and_b              1647
country              1613
hip_hop              1114
                     ... 
chacarera               1
german_hip_hop          1
slide_guitar            1
mardi_gras_indian       1
female                  0
Length: 1354, dtype: int64

count the numbers of male and female artists with 