This notebook computes, for each genre label, how many artists carry it in total and broken down by gender (female / male). These frequency counts are then joined onto the genre lists in the network community tables.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
#%matplotlib notebook


import re

from functools import partial

import plotly.graph_objects as go

Import the cleaned data:

In [2]:
# Restore the variable `now` from IPython's %store so it does not have to be retyped.
# Its value is inserted into the data file names read in the cells below.
# NOTE(review): `now` must already have been saved with `%store now` elsewhere —
# confirm which notebook does that, otherwise this cell fails on a fresh machine.
%store -r now
now

'2020-05-18-10-06'

In [3]:
# The four train/test splits live in one directory and differ only in the split
# name, so the location is stated once here: to run the notebook on another
# machine, change DATA_DIR only.
DATA_DIR = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model'
SPLIT_PATH = DATA_DIR + '/wiki-kaggle_{}_{}.csv'  # filled with (split name, timestamp `now`)

# Every split is indexed by its `artist` column so the frames can be aligned later.
X_train, y_train, X_test, y_test = (
    pd.read_csv(SPLIT_PATH.format(split, now), index_col=['artist'])
    for split in ('X_train', 'y_train', 'X_test', 'y_test')
)

In [4]:
# Stack the train and test splits row-wise: the frequency counts below are taken
# over the complete data set, not over one split.
X_tot = pd.concat((X_train, X_test), axis=0)
y_tot = pd.concat((y_train, y_test), axis=0)

In [5]:
X_tot.shape, y_tot.shape

((15470, 2), (15470, 1))

In [6]:
# Combine the label frame (y_tot) and the feature frame (X_tot) into one table,
# aligned on their shared `artist` index.
# NOTE(review): X_tot is deliberately left wrapped in a list — presumably pandas
# treats a list of frames differently from a single frame (row order in
# particular); confirm before simplifying this call.
data = y_tot.join([X_tot], how = 'outer')

In [7]:
data.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"['pop', 'rock', 'emo_pop']",3
Bobby_Edwards,male,['country'],1
La_Palabra,male,"['afro_cuban_jazz', 'son_montuno', 'guaracha',...",4
Sherrick,male,"['r_and_b', 'soul']",2
Allen_Collins,male,['southern_rock'],1


In [8]:
data.shape, data.isnull().sum()

((15470, 3),
 gender              0
 genrelist           0
 genrelist_length    0
 dtype: int64)

### Genre Labels

Each value of the genre column is a _string_ of comma separated genre labels. We want to convert it to a _list_ of strings.

In [9]:
"""This function takes in a string of the form
appearing in the genrelist of the dataframe.
It strips the square brackets and extra quotes and
returns a list of strings where each string is a genre label."""
def genrelist(string):
    """Parse the text form of a genre list into a list of genre labels.

    The input is a value of the ``genrelist`` column as read from CSV, i.e. a
    list rendered as text such as ``"['pop', 'rock', 'emo_pop']"``.

    Parameters
    ----------
    string : str
        Bracketed, comma-separated, quoted genre labels.

    Returns
    -------
    list of str
        One entry per label, with spaces replaced by underscores, leading and
        trailing underscores removed, and empty entries dropped.
    """
    # Remove the enclosing brackets and all quote characters. Double quotes are
    # removed as well: a label containing an apostrophe is rendered with double
    # quotes, which would otherwise end up inside the label itself.
    stripped = string.strip("[").strip("]").replace("'", "").replace('"', "")
    # The space following each comma becomes a leading underscore, hence the strip.
    labels = (part.replace(" ", "_").strip("_") for part in stripped.split(","))
    # Empty pieces (e.g. from "[]" or a trailing comma) are not labels.
    return [label for label in labels if label]

Now we apply it to the whole column, replacing each string in `genrelist` with the corresponding list:

In [10]:
# Turn every stringified list in the column into a real list of labels.
# Values that are already lists are passed through unchanged so that running
# this cell a second time does not fail; anything else still goes through the
# parser (and raises there if it is not a string).
data['genrelist'] = data['genrelist'].apply(
    lambda value: value if isinstance(value, list) else genrelist(value)
)

In [11]:
data.head()

Unnamed: 0_level_0,gender,genrelist,genrelist_length
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Pablo_Holman,male,"[pop, rock, emo_pop]",3
Bobby_Edwards,male,[country],1
La_Palabra,male,"[afro_cuban_jazz, son_montuno, guaracha, salsa...",4
Sherrick,male,"[r_and_b, soul]",2
Allen_Collins,male,[southern_rock],1


### Import the genre labels from the whole data set:

In [12]:
# Table of genre labels for the whole data set, taken from the same timestamped
# export as the splits above. The file's first, unnamed column is used as index.
genre_list_path = '/Users/Daniel/Code/Genre/data/genre_lists/data_ready_for_model/genre_list_{}.csv'.format(now)
genrelist_df = pd.read_csv(genre_list_path, index_col='Unnamed: 0')

In [13]:
genrelist_df.shape

(1494, 1)

In [14]:
# Report the size of the data set: one row per artist in `data`, one row per
# genre label in `genrelist_df`.
print(
    'There are {} artists with genre and binary-gender labels in the total data set.'.format(len(data)),
    'There are {} unique genre labels.'.format(len(genrelist_df)),
    sep='\n',
)

There are 15470 artists with genre and binary-gender labels in the total data set.
There are 1494 unique genre labels.


### Dataframes for each gender

In [15]:
# One sub-frame per gender label, selected with a boolean mask on the `gender` column.
data_male = data.loc[data['gender'] == 'male']
data_female = data.loc[data['gender'] == 'female']

### Count the frequency of genres for tot, female, male:

In [23]:
def freq_count(df, suffix):
    """Count how often each genre label occurs in ``df.genrelist``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``genrelist`` column whose values are lists of labels.
    suffix : str
        Appended to ``Frequency_`` to name the single output column.

    Returns
    -------
    pandas.DataFrame
        Indexed by genre label, with one int64 column ``Frequency_<suffix>``
        holding the number of occurrences, most frequent label first.
    """
    # Flatten the per-artist lists into one long list of labels.
    flat_labels = []
    for labels in df.genrelist.values.tolist():
        flat_labels.extend(labels)

    counts = pd.Series(flat_labels).value_counts()
    frame = counts.to_frame()
    frame.columns = [f'Frequency_{suffix}']
    return frame.astype('int64')

Create counts for total, female, male:

In [24]:
# Same count three times: over all artists, then over each gender's sub-frame.
# The suffix names the resulting column (Frequency_total / _female / _male).
label_value_counts = freq_count(df=data, suffix='total')
label_value_counts_female = freq_count(df=data_female, suffix='female')
label_value_counts_male = freq_count(df=data_male, suffix='male')

Join them and reset dtype:

In [25]:
# Put the female and male counts next to the total counts, one row per genre.
# The join starts from the Frequency_total column only, so re-running this cell
# does not attempt to join the gender columns a second time.
label_value_counts = (
    label_value_counts[['Frequency_total']]
    .join([label_value_counts_female, label_value_counts_male], how='left')
    .fillna(0)        # a genre with no artists of one gender has no row in that gender's counts
    .astype('int64')  # the missing values turned the columns to float; restore integer counts
)

In [27]:
label_value_counts

Unnamed: 0,Frequency_total,Frequency_female,Frequency_male
pop,3281,1670,1611
rock,2209,461,1748
r_and_b,2061,966,1095
country,2027,635,1392
hip_hop,1395,248,1147
...,...,...,...
fringe_folk,1,0,1
vaudeville_blues,1,0,1
roots_pop,1,1,0
sami,1,1,0


### The community network CSVs write "&" where the labels here use "\_and\_". Replace "\_and\_" with "&" in the index of label_value_counts so that the two can be joined.

In [43]:
"""This function replaces _and_ with &"""
def genrelist(string):
    """Return ``string`` with every occurrence of "_and_" replaced by "&".

    Used to bring genre labels such as "r_and_b" into the "r&b" spelling.

    NOTE: this definition reuses the name ``genrelist`` of the list-parsing
    helper defined earlier in the notebook and replaces it from this cell
    onwards. Re-running the earlier parsing cell after this one would call
    this function instead.
    """
    return string.replace("_and_", "&")

In [49]:
# Copy of the frequency table whose genre labels use "&" in place of "_and_".
# The labels are moved out of the index into a column named `index`, rewritten
# there, and then made the index again.
lvc = (
    label_value_counts
    .reset_index()
    .assign(index=lambda frame: frame['index'].map(genrelist))
    .set_index(['index'])
)

In [29]:
%ls

network_gender_frequency_by_genre.ipynb
[31mwiki_artists+genres_community_data_5.csv[m[m*
[31mwiki_corpus_full_partition_5-0.100000.csv[m[m*
[31mwiki_corpus_full_partition_5-1.000000.csv[m[m*
[31mwiki_corpus_full_partition_50-0.100000.csv[m[m*
[31mwiki_corpus_full_partition_50-1.000000.csv[m[m*
[31mwiki_genres_community_data_50.csv.csv[m[m*


### Add these gender frequency counts to the genres in the community tables from Tom

In [64]:
# Read the four partition tables from the working directory. The numbered names
# are kept because the cells below refer to wcfp1 ... wcfp4 individually.
wcfp1, wcfp2, wcfp3, wcfp4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './wiki_corpus_full_partition_50-1.000000.csv',
        './wiki_corpus_full_partition_50-0.100000.csv',
        './wiki_corpus_full_partition_5-1.000000.csv',
        './wiki_corpus_full_partition_5-0.100000.csv',
    )
)

In [65]:
# Attach the three frequency columns to each partition table by looking up the
# table's `genre` column in the index of lvc. Rows whose genre has no entry in
# lvc keep NaN in those columns (left join).
# Frequency columns left over from an earlier run of this cell are dropped
# first, so the cell can be re-run without a column-overlap error.
wcfp1 = wcfp1.drop(columns=lvc.columns, errors='ignore').join(lvc, on='genre', how='left')
wcfp2 = wcfp2.drop(columns=lvc.columns, errors='ignore').join(lvc, on='genre', how='left')
wcfp3 = wcfp3.drop(columns=lvc.columns, errors='ignore').join(lvc, on='genre', how='left')
wcfp4 = wcfp4.drop(columns=lvc.columns, errors='ignore').join(lvc, on='genre', how='left')

In [66]:
# Save each enriched table next to its source file, with "_with_frequencies"
# appended to the original file name.
for frame, out_path in (
    (wcfp1, './wiki_corpus_full_partition_50-1.000000_with_frequencies.csv'),
    (wcfp2, './wiki_corpus_full_partition_50-0.100000_with_frequencies.csv'),
    (wcfp3, './wiki_corpus_full_partition_5-1.000000_with_frequencies.csv'),
    (wcfp4, './wiki_corpus_full_partition_5-0.100000_with_frequencies.csv'),
):
    frame.to_csv(out_path)

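Next type of file: the community data tables, which hold one row per node with its modularity class and centrality measures.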

In [68]:
# Read the two community data tables. The second file name really does end in
# ".csv.csv" — that is how the file is named in the working directory.
wcfp5, wcfp6 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './wiki_artists+genres_community_data_5.csv',
        './wiki_genres_community_data_50.csv.csv',
    )
)

In [69]:
wcfp5

Unnamed: 0,Id,Label,timeset,frequency,type,modularity_class,pageranks,eigencentrality
0,Vinnie_Mele,Vinnie_Mele,,3,Source,47,0.000042,0.019026
1,ccm,ccm,,56,; Target,62,0.000673,0.011586
2,Kevin_Jonas,Kevin_Jonas,,5,Source,62,0.000055,0.012145
3,Vinnie_Bell,Vinnie_Bell,,1,Source,62,0.000020,0.012594
4,Jenny_Vincent,Jenny_Vincent,,2,Source,10,0.000020,0.004547
...,...,...,...,...,...,...,...,...
15818,Baz_Warne,Baz_Warne,,2,Source,10,0.000027,0.001607
15819,Sheryl_Crow,Sheryl_Crow,,7,Source,10,0.000081,0.042476
15820,extreme_metal,extreme_metal,,13,; Target,31,0.000139,0.001925
15821,Julia_Easterlin,Julia_Easterlin,,3,Source,149,0.000038,0.004900


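The joins are more complicated for these tables because the `Id` column holds an artist name in some rows and a genre label in others, so a single join key does not fit every row.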

In [71]:
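# TODO: not enabled yet. In wcfp5 the Id column mixes artists ("Source" rows) and
# genres ("; Target" rows), so each join has to be restricted to the matching rows.
# wcfp5 = wcfp5.join(data, on = 'Id', how = 'left')
# wcfp6 = wcfp6.join(lvc, on = 'genre', how = 'left')   # was wcfp2.join(...), presumably a copy-paste slip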

In [66]:
# wcfp5.to_csv('./')
# wcfp6.to_csv('./')