This notebook computes, for each genre label, how often it occurs in total and among female and male artists. These frequencies are then applied to the genre lists in the network communities.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
#%matplotlib notebook


import re

from functools import partial

import plotly.graph_objects as go

Import the cleaned data:

In [4]:
# Restore `now` from IPython's persistent store. It must have been saved earlier with
# `%store now` (presumably by the notebook that wrote the CSV files -- verify), so this
# cell fails on a machine where that has not been run.
%store -r now
# Echo the restored value; it is interpolated into every data file name below.
now

'2020-05-18-10-06'

In [5]:
# Directory holding the model-ready CSV files.
# NOTE: this is an absolute, machine-specific path; edit this one constant to run elsewhere.
DATA_DIR = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model'


def load_split(name):
    """Read one train/test split written for the run identified by `now`.

    Parameters
    ----------
    name : str
        Split identifier as it appears in the file name: 'X_train', 'y_train',
        'X_test' or 'y_test'.

    Returns
    -------
    pandas.DataFrame
        Contents of ``wiki-kaggle_<name>_<now>.csv`` in DATA_DIR, indexed by the
        ``artist`` column.
    """
    path = '{}/wiki-kaggle_{}_{}.csv'.format(DATA_DIR, name, now)
    return pd.read_csv(path, index_col = ['artist'])


X_train = load_split('X_train')
y_train = load_split('y_train')
X_test = load_split('X_test')
y_test = load_split('y_test')

In [6]:
# Stack the train and test splits back together (rows of train first, then test),
# once for the features and once for the labels.
X_tot, y_tot = (
    pd.concat(parts)
    for parts in ([X_train, X_test], [y_train, y_test])
)

In [7]:
X_tot.shape, y_tot.shape

((15470, 2), (15470, 1))

In [8]:
# Put the columns of y_tot and X_tot side by side in one frame, matched on their shared index.
# NOTE(review): X_tot is passed as a one-element list; before simplifying this to
# `y_tot.join(X_tot, how='outer')`, confirm that the row order of the result is unaffected.
data = y_tot.join([X_tot], how = 'outer')

In [9]:
data.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"['pop', 'rock', 'emo_pop']",3
Bobby_Edwards,male,['country'],1
La_Palabra,male,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4
Sherrick,male,"['r_and_b', 'soul']",2
Allen_Collins,male,['southern_rock'],1


In [10]:
data.shape, data.isnull().sum()

((15470, 3),
 gender              0
 genrelist           0
 genrelist_length    0
 dtype: int64)

### Genre Labels

Each value of the `genrelist` column is a _string_ representation of a list of genre labels (for example `"['pop', 'rock', 'emo_pop']"`). We want to convert it to a _list_ of strings.

In [11]:
def genrelist(string):
    """Convert the string form of a genre list into a list of genre labels.

    Parameters
    ----------
    string : str or list
        A value of the form appearing in the ``genrelist`` column of the
        dataframe, e.g. ``"['pop', 'emo pop']"``. A list (or tuple) is also
        accepted and is returned as a list with its items unchanged, so applying
        the function to an already-converted column is a no-op instead of an
        ``AttributeError``.

    Returns
    -------
    list of str
        One string per genre label: square brackets and single quotes removed,
        spaces replaced by underscores, leading/trailing underscores stripped,
        and empty labels dropped.
    """
    if isinstance(string, (list, tuple)):
        return list(string)
    # Drop the enclosing square brackets and the quotes around each label.
    stripped = string.strip("[").strip("]").replace("'", "")
    # The space following each comma turns into a leading underscore, which
    # strip("_") then removes again.
    labels = (part.replace(" ", "_").strip("_") for part in stripped.split(","))
    # Empty pieces (from "[]", doubled or trailing commas) are not labels.
    return [label for label in labels if label]

Now we apply it to the whole column, replacing the strings in `genrelist` with the resulting lists:

In [12]:
# Replace each string in the genrelist column by the list of labels parsed from it.
data['genrelist'] = data['genrelist'].map(genrelist)

In [13]:
data.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"[pop, rock, emo_pop]",3
Bobby_Edwards,male,[country],1
La_Palabra,male,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4
Sherrick,male,"[r_and_b, soul]",2
Allen_Collins,male,[southern_rock],1


### Import the genre labels from the whole data set:

In [14]:
# File listing the genre labels for the run identified by `now`; its first column
# (written without a header, hence 'Unnamed: 0') becomes the index.
genre_list_path = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now)
genrelist_df = pd.read_csv(genre_list_path, index_col = 'Unnamed: 0')

In [15]:
genrelist_df.shape

(1494, 1)

In [16]:
# Row counts of the two frames: one row per artist, one row per genre label.
n_artists, n_labels = data.shape[0], genrelist_df.shape[0]
print('There are {} artists with genre and binary-gender labels in the total data set.'.format(n_artists))
print('There are {} unique genre labels.'.format(n_labels))

There are 15470 artists with genre and binary-gender labels in the total data set.
There are 1494 unique genre labels.


### Count the number of times that a label occurs:

In [43]:
# Flatten the per-artist genre lists into one long list with an entry for every
# (artist, label) occurrence.
genre_list_1 = [label for labels in data['genrelist'].tolist() for label in labels]
genre_counts = pd.Series(genre_list_1)
# Tally the occurrences of each label into a one-column frame indexed by label.
label_value_counts = genre_counts.value_counts().to_frame('Frequency')

Dataframes for each gender

In [44]:
# Row subsets of `data` selected by the value of the gender column.
data_male = data.loc[data['gender'] == 'male']
data_female = data.loc[data['gender'] == 'female']

### Count the number of times that a label occurs for female artists:

In [45]:
# Flatten the genre lists of the female artists into one list of label occurrences.
genre_list_1_female = [label for labels in data_female['genrelist'].tolist() for label in labels]
genre_counts_female = pd.Series(genre_list_1_female)
# Tally the occurrences of each label into a one-column frame indexed by label.
label_value_counts_female = genre_counts_female.value_counts().to_frame('Frequency_female')

### Count the number of times that a label occurs for male artists:

In [46]:
# Flatten the genre lists of the male artists into one list of label occurrences.
genre_list_1_male = [label for labels in data_male['genrelist'].tolist() for label in labels]
genre_counts_male = pd.Series(genre_list_1_male)
# Tally the occurrences of each label into a one-column frame indexed by label.
label_value_counts_male = genre_counts_male.value_counts().to_frame('Frequency_male')

In [47]:
# Attach the per-gender counts to the total counts, matching rows on the genre label.
# - Start from the 'Frequency' column only, so re-running this cell does not join the
#   per-gender columns onto a frame that already contains them.
# - Join one frame at a time: pandas does not support `lsuffix` when a list of frames
#   is passed, and no suffix is needed because the three column names are distinct.
# - A label that never occurs for one gender has no row in that gender's counts and
#   comes out of the left join as NaN; fillna(0) turns that into a count of 0 (this is
#   also why the per-gender columns are float rather than int).
label_value_counts = (
    label_value_counts[['Frequency']]
    .join(label_value_counts_female, how = 'left')
    .join(label_value_counts_male, how = 'left')
    .fillna(0)
)

In [48]:
label_value_counts

Unnamed: 0,Frequency,Frequency_female,Frequency_male
pop,3281,1670.0,1611.0
rock,2209,461.0,1748.0
r_and_b,2061,966.0,1095.0
country,2027,635.0,1392.0
hip_hop,1395,248.0,1147.0
...,...,...,...
aleatoric,1,0.0,1.0
haole,1,1.0,0.0
electro_harmonica,1,0.0,1.0
maskandi,1,0.0,1.0


Export (currently disabled; uncomment the line below to write the counts to CSV):

In [19]:
# label_value_counts.to_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_label_counts_TOTAL_{}.csv'.format(now))