<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#extract-gender" data-toc-modified-id="extract-gender-1">extract gender</a></span></li><li><span><a href="#Get-lyrics-reading-ease-score" data-toc-modified-id="Get-lyrics-reading-ease-score-2">Get lyrics reading ease score</a></span></li><li><span><a href="#Save-as-csv" data-toc-modified-id="Save-as-csv-3">Save as csv</a></span></li></ul></div>

In [79]:
import pandas as pd
import os
import textstat

In [80]:
# Gender lookup table: the first CSV column becomes the index
# (presumably the artist name, since it is queried by name later on).
singer_gender = pd.read_csv("../data/singer_gender.csv", encoding="latin", index_col=0)
# Tab-separated track table; the "Unnamed: 0" column is dropped right away
# (presumably an index written out by an earlier export -- TODO confirm).
df = pd.read_csv("../data/2016-2019_with_lyrics_bio.csv", sep="\t").drop(columns="Unnamed: 0")

### extract gender

In [81]:
def get_gender(name):
    """Return a one-letter gender code for an artist, or "X" when unknown.

    Looks ``name`` up as a row label in the module-level ``singer_gender``
    frame and returns the first character of the upper-cased ``gender``
    value.

    Parameters
    ----------
    name
        Row label to look up in ``singer_gender`` (an artist name).

    Returns
    -------
    str
        The upper-cased first letter of the gender, or "X" when the label
        (or the ``gender`` column) is not found, the label occurs more than
        once, or the stored gender is missing or empty.
    """
    try:
        gender = singer_gender.loc[name, "gender"]
    except (KeyError, TypeError):
        # Only lookup failures mean "unknown"; anything else (for example a
        # KeyboardInterrupt) must propagate instead of being swallowed.
        return "X"
    # Duplicate labels yield a Series and missing values a float NaN;
    # neither is a usable gender string.
    if not isinstance(gender, str) or not gender:
        return "X"
    return gender.upper()[0]

In [82]:
# Tag every track with the gender code get_gender returns for its artist.
df['artist.gender'] = df['artist'].apply(get_gender)

In [None]:
# Keep only the tracks whose gender code is not the "X" placeholder,
# and leave the artist biography column behind.
with_gender = df.loc[df['artist.gender'] != "X"].drop(columns=['artist.bio'])

### Get lyrics reading ease score

In [63]:
def get_lyrics_reading_ease_score(lyrics):
    """Return the Flesch reading-ease score of a song's lyrics.

    The lyrics are split into lines; blank lines and section tags such as
    ``[Verse 1]`` or ``[Pre-Chorus]`` are removed, and the remaining lines
    are joined with ". " before scoring.
    See https://pypi.org/project/textstat/ for the reading-ease definition.

    Parameters
    ----------
    lyrics
        Raw lyrics text, one lyric line per text line. Non-missing values
        that are not strings are converted with ``str``.

    Returns
    -------
    float
        ``textstat.flesch_reading_ease`` of the cleaned text, or NaN when
        ``lyrics`` is missing (``None`` or a float NaN).
    """
    # Missing lyrics would otherwise be scored as the literal text
    # "None" / "nan"; `x != x` is true only for NaN.
    if lyrics is None or (isinstance(lyrics, float) and lyrics != lyrics):
        return float("nan")

    # Strip each line so trailing spaces or "\r" do not hide a closing "]".
    stripped_lines = (line.strip() for line in str(lyrics).splitlines())
    # A tag is a whole line wrapped in brackets. Lines that merely start
    # with "[" or end with "]" (e.g. "la la [x2]") are real lyrics and stay.
    lyric_lines = [
        line
        for line in stripped_lines
        if line and not (line.startswith("[") and line.endswith("]"))
    ]
    return textstat.flesch_reading_ease(". ".join(lyric_lines))

In [68]:
# Score every track's lyrics with get_lyrics_reading_ease_score.
with_gender['lyrics.ease.score'] = with_gender['track.lyrics'].apply(get_lyrics_reading_ease_score)
# The raw lyrics are no longer needed once the score has been computed.
with_gender = with_gender.drop(columns="track.lyrics")

### Save as csv

In [72]:
# Write the processed table to disk, leaving the row index out of the file.
with_gender.to_csv(path_or_buf="../data/processed_dataset.csv", index=False)

In [75]:
# Read the saved file back and preview its first five rows.
new = pd.read_csv(filepath_or_buffer="../data/processed_dataset.csv")
new.head(n=5)

Unnamed: 0,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,artist.gender,lyrics.ease.score
0,Love Yourself,Justin Bieber,canadian pop,2016,100,38,61,-10,28,52,234,84,44,83,M,103.63
1,Into You,Ariana Grande,dance pop,2016,108,73,62,-6,14,37,244,2,11,80,F,85.59
2,This Is What You Came For,Calvin Harris,dance pop,2016,124,93,63,-3,15,47,222,20,3,80,M,107.38
3,Million Reasons,Lady Gaga,dance pop,2016,130,42,67,-8,11,15,205,49,4,80,F,97.7
4,Needed Me,Rihanna,barbadian pop,2016,111,31,67,-8,8,30,192,11,24,80,F,98.11


In [78]:
# Per-year count of the values in every remaining column.
new.groupby(by="year").count()

Unnamed: 0_level_0,title,artist,top genre,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop,artist.gender,lyrics.ease.score
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51
2017,27,27,27,27,27,27,27,27,27,27,27,27,27,27,27
2018,36,36,36,36,36,36,36,36,36,36,36,36,36,36,36
2019,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12
