This notebook examines how genre list length correlates with gender.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
#%matplotlib notebook


import re

from functools import partial

import plotly.graph_objects as go

Import the cleaned data:

In [2]:
%ls -lt ../../data/genre_lists/data_ready_for_model/

total 38072
-rw-r--r--@ 1 Daniel  staff   287510 May 31 05:35 genre_stats.html
-rw-r--r--@ 1 Daniel  staff     6110 May 29 11:30 female_skew_freq100.html
-rw-r--r--  1 Daniel  staff     9315 May 29 11:30 male_skew_freq100.html
-rw-r--r--  1 Daniel  staff     9778 May 29 11:30 femaleness_freq50.html
-rw-r--r--  1 Daniel  staff    16374 May 29 11:30 maleness_freq50.html
-rw-r--r--  1 Daniel  staff    73746 May 29 10:19 genre_stats.csv
-rw-r--r--  1 Daniel  staff    66235 May 21 11:00 promiscuity_table.csv
-rw-r--r--  1 Daniel  staff    57474 May 20 12:47 corpus.mm.index
-rw-r--r--  1 Daniel  staff   382436 May 20 12:47 corpus.mm
-rw-r--r--  1 Daniel  staff    49966 May 20 12:47 genre_dictionary.dict
drwxr-xr-x  5 Daniel  staff      160 May 20 10:59 [34mlogistic_model_data[m[m/
-rw-r--r--  1 Daniel  staff    10926 May 18 11:10 genre_label_non-lonely_TRAINING_2020-05-18-10-06.csv
-rw-r--r--  1 Daniel  staff     8664 May 18 11:09 genre_label_lonely_TRAINING_2020-05-18-10-06.

In [3]:
%store -r now
now
#now = '2020-05-11-14-35'

'2020-05-18-10-06'

In [4]:
X_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_X_train_{}.csv'.format(now), index_col = ['artist'])
y_train = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/wiki-kaggle_y_train_{}.csv'.format(now), index_col = ['artist'])

In [5]:
X_train.head()

Unnamed: 0_level_0,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Pablo_Holman,"['pop', 'rock', 'emo_pop']",3
Bobby_Edwards,['country'],1
La_Palabra,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4
Sherrick,"['r_and_b', 'soul']",2
Allen_Collins,['southern_rock'],1


In [6]:
X_train.shape, y_train.shape

((12376, 2), (12376, 1))

In [7]:
data = y_train.join( [X_train], how = 'outer')

In [8]:
data.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"['pop', 'rock', 'emo_pop']",3
Bobby_Edwards,male,['country'],1
La_Palabra,male,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4
Sherrick,male,"['r_and_b', 'soul']",2
Allen_Collins,male,['southern_rock'],1


In [9]:
data.shape, data.isnull().sum()

((12376, 3),
 gender              0
 genrelist           0
 genrelist_length    0
 dtype: int64)

### Genre Labels

Each value of the genre column is a _string_ of comma separated genre labels. We want to convert it to a _list_ of strings.

In [10]:
"""This function takes in a string of the form
appearing in the genrelist of the dataframe.
It strips the square brackets and extra quotes and
returns a list of strings where each string is a genre label."""
def genrelist(string):
    string = string.strip("[").strip("]").replace("'","")
    L = [s for s in string.split(',')]
    L_new = []
    for x in L:
        L_new.append(x.replace(" ","_").lstrip("_").rstrip("_"))
    while (str("") in L_new):
        L_new.remove("")
    return L_new

Now we apply it to the whole column and put the lists in a new column:

In [11]:
data['genrelist']= data['genrelist'].apply(genrelist)

In [12]:
data.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"[pop, rock, emo_pop]",3
Bobby_Edwards,male,[country],1
La_Palabra,male,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4
Sherrick,male,"[r_and_b, soul]",2
Allen_Collins,male,[southern_rock],1


### Extract unique genre labels: 

### This is for the training set:

In [13]:
# genre_list = data.genrelist.values.tolist()
# genre_list = [x for y in genre_list for x in y]
# genre_list = list(set(genre_list))

### Import the labels from the whole data set:

In [14]:
then = '2020-05-11-14-34'

In [15]:
genrelist_df = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now), index_col = 'Unnamed: 0')

In [16]:
genrelist_df[:5]

Unnamed: 0,genre_list
0,chilean
1,zamba
2,afro_punk_blues
3,crunk
4,spanish_guitar


In [17]:
genrelist_df.shape

(1494, 1)

In [18]:
print('There are {} artists with genre and binary-gender labels.'.format(data.shape[0]))
print('There are {} unique genre labels.'.format(genrelist_df.shape[0]))
print('Previously there were 1669 unique genre lables.')

There are 12376 artists with genre and binary-gender labels.
There are 1494 unique genre labels.
Previously there were 1669 unique genre lables.


In [19]:
data_male = data[data.gender == 'male']
data_female = data[data.gender == 'female']

In [20]:
tot = data.shape[0]
m = data_male.shape[0]
f = data_female.shape[0]
print('{} total artists'.format(tot))
print('{} female artists, or {:0.0f}%'.format(f, 100*f/(f+m)))
print('{} male artists, or {:0.0f}%'.format(m, 100*m/(f+m)))

12376 total artists
3847 female artists, or 31%
8529 male artists, or 69%


### Count the number of times that a genre label occurs:
(This deals only with the training data, not the test data.)

In [21]:
genre_list_1 = data.genrelist.values.tolist()
genre_list_1 = [x for y in genre_list_1 for x in y]
genre_counts = pd.Series(genre_list_1)
label_value_counts = genre_counts.value_counts()

In [22]:
print('In the training data there are {} unique genre labels.'.format(label_value_counts.shape[0]))

In the training data there are 1353 unique genre labels.


### Here are the frequencies of the sizes of genre lists:

In [23]:
list_length_counts = data.groupby(['genrelist_length']).count()
list_length_counts.drop(['gender'], axis = 1, inplace = True)
list_length_counts.columns = ['artist_count']

In [24]:
list_length_counts

Unnamed: 0_level_0,artist_count
genrelist_length,Unnamed: 1_level_1
1,3561
2,3067
3,2488
4,1594
5,850
6,462
7,181
8,92
9,37
10,19


And here are the frequencies of the lengths of lists by gender:

In [25]:
list_length_counts_gender = data.groupby(['genrelist_length','gender']).count()
list_length_counts_gender.columns = ['artist_count']
#list_length_counts_gender

Now include the absolute and relative frequencies by gender:

In [26]:
list_length_counts_gender.columns

Index(['artist_count'], dtype='object')

In [27]:
list_length_counts_gender.reset_index(inplace = True)
list_length_counts_gender.set_index(['genrelist_length'], inplace = True)
list_length_counts_gender = list_length_counts_gender.pivot(columns = 'gender')
list_length_counts_gender[('artist_count','female_relative')] = \
                                            round(list_length_counts_gender[('artist_count', 'female')]/f,3)
list_length_counts_gender[('artist_count','male_relative')] = \
                                            round(list_length_counts_gender[('artist_count', 'male')]/m,3)
#list_length_counts_gender.fillna(-1)
list_length_counts_gender[('artist_count','male_to_female_relative_ratio')] = \
             round(list_length_counts_gender[('artist_count', 'male_relative')]/ \
             list_length_counts_gender[('artist_count', 'female_relative')],3)
# list_length_counts_gender.apply(lambda x: round(x,3) if x.name in )

In [30]:
list_length_counts_gender.fillna(value = {('artist_count', 'female'): 0}, inplace = True)

In [31]:
list_length_counts_gender

Unnamed: 0_level_0,artist_count,artist_count,artist_count,artist_count,artist_count
gender,female,male,female_relative,male_relative,male_to_female_relative_ratio
genrelist_length,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,1031.0,2530.0,0.268,0.297,1.108
2,1030.0,2037.0,0.268,0.239,0.892
3,854.0,1634.0,0.222,0.192,0.865
4,502.0,1092.0,0.13,0.128,0.985
5,243.0,607.0,0.063,0.071,1.127
6,127.0,335.0,0.033,0.039,1.182
7,27.0,154.0,0.007,0.018,2.571
8,15.0,77.0,0.004,0.009,2.25
9,9.0,28.0,0.002,0.003,1.5
10,3.0,16.0,0.001,0.002,2.0


In [54]:
#list_length_counts_gender.to_html('/Users/Daniel/Code/Genre/visualizations/genrelist_length_likelihoods.html')

A table in plotly:

In [55]:
df = list_length_counts_gender

In [56]:
# fig = go.Figure(data=[go.Table(
#     header=dict(values=[list_length_counts_gender.index.name]+list(df.columns),
#                 fill_color='paleturquoise',
#                 align='left'),
#     cells=dict(values=[df.index, \
#                        df[('artist_count','female')], \
#                        df[('artist_count','male')], \
#                        df[('artist_count','female_relative')], \
#                        df[('artist_count','male_relative')], \
#                        df[('artist_count','male_to_female_relative_ratio')]\
#                       ], \
#                fill_color='lavender',
#                align='left'))
# ])

# fig.show()

In [57]:
#fig.write_image('/Users/Daniel/Code/Genre/visualizations/genrelist_length_likelihoods.pdf')

### Now let's count the most common genre labels for male and female... TBD