This notebook builds a function such that:
- input: a genre occurring in our data set
- output: (list of all genres it occurs with and the frequencies, the list sizes it occurs in as histogram, the gender breakdown for that genre)

Run all the cells leading up to the function and then you can put a genre into the function and run that cell.


This function will be turned into a web app using Streamlit for public exploration of the dataset.

In [23]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
#%matplotlib notebook


import re

from functools import partial

import plotly.graph_objects as go

Import the cleaned data:

In [24]:
# List the model-ready data directory in long format, most recently modified first,
# to see which exported files are available to load.
%ls -lt ../../data/genre_lists/data_ready_for_model/

total 39824
-rw-r--r--  1 Daniel  staff    21724 Jun  9 11:47 genre_label_counts_TOTAL_2020-05-18-10-06.csv
-rw-r--r--@ 1 Daniel  staff   287510 Jun  4 13:42 genre_stats.html
-rw-r--r--@ 1 Daniel  staff   911587 Jun  4 13:39 genre_set_counts.html
-rw-r--r--@ 1 Daniel  staff     1845 Jun  4 13:11 female_bias_freq500.html
-rw-r--r--@ 1 Daniel  staff     1459 Jun  4 13:11 male_bias_freq500.html
-rw-r--r--  1 Daniel  staff    73746 May 29 10:19 genre_stats.csv
-rw-r--r--  1 Daniel  staff    66235 May 21 11:00 promiscuity_table.csv
-rw-r--r--  1 Daniel  staff    57474 May 20 12:47 corpus.mm.index
-rw-r--r--  1 Daniel  staff   382436 May 20 12:47 corpus.mm
-rw-r--r--  1 Daniel  staff    49966 May 20 12:47 genre_dictionary.dict
drwxr-xr-x  5 Daniel  staff      160 May 20 10:59 [34mlogistic_model_data[m[m/
-rw-r--r--  1 Daniel  staff    10926 May 18 11:10 genre_label_non-lonely_TRAINING_2020-05-18-10-06.csv
-rw-r--r--  1 Daniel  staff     8664 May 18 11:09 genre_label_lonely_TR

In [25]:
# Restore the timestamp `now` that was saved with %store in an earlier session.
# It is interpolated into the CSV file names loaded below, so it selects which
# data export this notebook works on.
# NOTE(review): on a machine where `now` was never stored, the name will be undefined;
# the commented assignment below is a manual fallback, but its timestamp differs from
# the one displayed in this cell's output — confirm which export is intended.
%store -r now
now
#now = '2020-05-11-14-35'

'2020-05-18-10-06'

In [26]:
# Directory holding the model-ready CSV exports. This is an absolute, machine-specific
# path; it is defined once here so it is the only place to edit when running elsewhere.
DATA_DIR = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model'


def _read_split(split_name):
    """Read one wiki-kaggle split CSV for the export timestamp `now`.

    Parameters
    ----------
    split_name : str
        One of 'X_train', 'y_train', 'X_test', 'y_test'; it is inserted into the
        file name `wiki-kaggle_<split_name>_<now>.csv`.

    Returns
    -------
    pandas.DataFrame
        The file contents, indexed by the `artist` column.
    """
    path = '{}/wiki-kaggle_{}_{}.csv'.format(DATA_DIR, split_name, now)
    return pd.read_csv(path, index_col=['artist'])


X_train = _read_split('X_train')
y_train = _read_split('y_train')
X_test = _read_split('X_test')
y_test = _read_split('y_test')

In [27]:
# Recombine the train and test splits into one frame each: features (X) and labels (y).
X_tot = pd.concat((X_train, X_test), axis=0)
y_tot = pd.concat((y_train, y_test), axis=0)

In [28]:
# Sanity check: features and labels should have the same number of rows before joining.
X_tot.shape, y_tot.shape

((15470, 2), (15470, 1))

In [29]:
# Attach the feature columns to the gender labels, aligning on the shared `artist` index.
# The outer join keeps artists that appear in only one of the two frames; such rows
# would show up with missing values in the other frame's columns.
data = y_tot.join([X_tot], how = 'outer')

In [30]:
data.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"['pop', 'rock', 'emo_pop']",3
Bobby_Edwards,male,['country'],1
La_Palabra,male,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4
Sherrick,male,"['r_and_b', 'soul']",2
Allen_Collins,male,['southern_rock'],1


In [31]:
# Confirm the join kept the expected number of rows and left no missing values in any column.
data.shape, data.isnull().sum()

((15470, 3),
 gender              0
 genrelist           0
 genrelist_length    0
 dtype: int64)

### Genre Labels

Each value of the `genrelist` column is a _string_ of comma-separated genre labels. We want to convert it to a _list_ of strings.

In [32]:
def genrelist(string):
    """Convert a stringified genre list into a list of genre labels.

    The input has the form found in the `genrelist` column of the dataframe,
    e.g. "['pop', 'rock', 'emo_pop']". Square brackets and single quotes are
    removed, the remainder is split on commas, spaces inside each label become
    underscores, leading/trailing underscores are trimmed, and empty labels
    are dropped.

    Parameters
    ----------
    string : str or list or tuple
        The raw string to parse. A list/tuple of labels is also accepted so the
        function can be applied again to already-parsed values; its elements
        get the same space/underscore normalisation (no bracket or quote
        stripping).

    Returns
    -------
    list of str
        One entry per non-empty genre label, in the original order.
    """
    if isinstance(string, (list, tuple)):
        # Already split into labels: only normalise each one.
        pieces = list(string)
    else:
        pieces = string.strip("[").strip("]").replace("'", "").split(",")
    # " emo pop" -> "_emo_pop" -> "emo_pop"; whitespace-only pieces collapse to "".
    labels = (piece.replace(" ", "_").strip("_") for piece in pieces)
    return [label for label in labels if label]

Now we apply it to the whole column, replacing each string in the `genrelist` column with the corresponding list:

In [33]:
# Parse each raw genre string into a list of labels, overwriting the column in place.
# Values that are already lists are passed through untouched, so re-running this cell
# does not re-parse them.
data['genrelist'] = data['genrelist'].apply(
    lambda value: value if isinstance(value, list) else genrelist(value)
)

In [34]:
data.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"[pop, rock, emo_pop]",3
Bobby_Edwards,male,[country],1
La_Palabra,male,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4
Sherrick,male,"[r_and_b, soul]",2
Allen_Collins,male,[southern_rock],1


### Import the genre labels from the whole data set:

In [35]:
# Load the genre-label table for the export timestamp `now`. The CSV's first column
# (written without a header, hence 'Unnamed: 0') is used as the index.
# NOTE(review): the path is absolute and machine-specific.
genrelist_df = pd.read_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now), index_col = 'Unnamed: 0')

In [36]:
genrelist_df.shape

(1494, 1)

In [37]:
# Report the size of the data set: one row per artist, one row per unique genre label.
n_artists, n_labels = len(data), len(genrelist_df)
print('There are {} artists with genre and binary-gender labels in the total data set.'.format(n_artists))
print('There are {} unique genre labels.'.format(n_labels))

There are 15470 artists with genre and binary-gender labels in the total data set.
There are 1494 unique genre labels.


Basic gender stats on full data set

In [38]:
# Split the artists by gender label.
data_male = data.loc[data.gender == 'male']
data_female = data.loc[data.gender == 'female']

# Head counts: everyone, male, female.
tot = len(data)
m, f = len(data_male), len(data_female)

# Percentages are taken over the male + female total rather than over `tot`.
fem, mal = (100*count/(f+m) for count in (f, m))

print('{} total artists'.format(tot))
print('{} female artists, or {:0.0f}%'.format(f, fem))
print('{} male artists, or {:0.0f}%'.format(m, mal))

15470 total artists
4855 female artists, or 31%
10615 male artists, or 69%


### Count the number of times that a label occurs:

In [43]:
# Flatten every artist's genre list into one long list of labels.
genre_list_1 = [label for labels in data.genrelist.values.tolist() for label in labels]
# Count how often each label occurs; value_counts sorts by count, most frequent first.
genre_counts = pd.Series(genre_list_1)
label_value_counts = genre_counts.value_counts().to_frame('Frequency')

In [44]:
label_value_counts[:10]

Unnamed: 0,Frequency
pop,3281
rock,2209
r_and_b,2061
country,2027
hip_hop,1395
soul,1305
folk,1288
jazz,1173
alternative_rock,1152
blues,1067


Export:

In [19]:
# label_value_counts.to_csv('/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_label_counts_TOTAL_{}.csv'.format(now))

## Co-Occurrence

Now we produce a Series with the counts of the appearances of genre labels in the lists that include QueryGenre

In [20]:
def coocurr(QueryGenre, df=None):
    """Count the genre labels that appear alongside `QueryGenre`.

    Every artist whose genre list contains `QueryGenre` is selected, all labels
    on those lists are pooled, and the occurrences of each label are counted.
    `QueryGenre` itself is excluded from the result.

    Parameters
    ----------
    QueryGenre : str
        The genre label to look up.
    df : pandas.DataFrame, optional
        Table with a `genrelist` column holding one list of labels per artist.
        Defaults to the notebook-level `data` frame.

    Returns
    -------
    pandas.Series
        Counts of co-occurring labels, most frequent first, with the index
        named 'genres'. Empty if `QueryGenre` does not occur in the data or
        never appears together with another label.
    """
    if df is None:
        df = data  # fall back to the notebook's global artist table

    # Pool the labels from every genre list that contains the query genre.
    co_labels = [
        label
        for labels in df.genrelist.values.tolist()
        if QueryGenre in labels
        for label in labels
    ]
    # An explicit dtype for the empty case avoids pandas guessing one for an empty list.
    pooled = pd.Series(co_labels, dtype=None if co_labels else object)
    counts = pooled.value_counts()
    # errors='ignore': a genre absent from the data yields an empty result instead of a KeyError.
    counts = counts.drop(QueryGenre, errors='ignore')
    counts.index.name = 'genres'
    return counts

In [21]:
coocurr('rap')

genres
hip_hop               23
r_and_b               13
pop                   12
dance                  5
soul                   2
alternative            2
latin_trap             1
electronica            1
east_coast_hip_hop     1
soft_rock              1
gangsta_rap            1
mexitón                1
funk                   1
soca                   1
trap                   1
filk                   1
reggae                 1
latin_pop              1
hardcore_rap           1
house                  1
ragga                  1
spoken_word            1
homo_hop               1
big_room_house         1
west_coast_hip_hop     1
future_house           1
k_pop                  1
industrial_rock        1
latin                  1
electro_house          1
parody                 1
progressive_house      1
future_bass            1
dtype: int64

Needs to be modified:

In [22]:
# fig, ax = plt.subplots()

# # hide axes
# fig.patch.set_visible(False)
# ax.axis('off')
# ax.axis('tight')

# ax.table(cellText = coocurr('rap').values, rowLabels = coocurr('rap').index, loc = 'center')

# fig.tight_layout()

# plt.show()