In [1]:
import numpy as np
import pandas as pd

from ast import literal_eval
from collections import Counter
import re

In [2]:
df = pd.read_csv('songs_dataset.csv', usecols=['Singer', 'Song', 'Lyrics', 'Genre'])
df['Genre'] = df['Genre'].apply(literal_eval)
print(df.shape)
df.head()

(253678, 4)


Unnamed: 0,Singer,Song,Genre,Lyrics
0,$UICIDEBOY$,40 Oz. & Sober,"[Alternative/Indie, Hip-Hop/Rap]","\n\n[Hook: Gangsta Pat]\nKiller, killer, kille..."
1,$UICIDEBOY$,7th or St. Tammany,[Hip-Hop/Rap],"\n\nUh\n*59, yah!\nUhhhh, [?]\n*59 motherfucke..."
2,$UICIDEBOY$,Dead Batteries,[Hip-Hop/Rap],\n\n[Produced by Budd Dwyer]\n\n[Intro: Frayse...
3,$UICIDEBOY$,Drugs/Hoes/Money/Etc.,[Hip-Hop/Rap],\n\n[Verse 1: YUNG $NOW]\nCounting sheep until...
4,$UICIDEBOY$,I’ll Pay for It (If I Want It),[Hip-Hop/Rap],\n\n[Verse 1: RUBY DA CHERRY]\nFucking Ruby go...


In [3]:
%%time
# for better tokenization 
def format_str(str_):
    str_ = re.sub('[\[\]*.,!:?\"\'«»]', '', str_)
    str_ = re.sub('[-–——]+', ' ', str_)
    str_ = str_.strip()
    str_ = str_.lower()
    return str_

df['Lyrics'] = df['Lyrics'].apply(format_str)

Wall time: 9.08 s


In [4]:
%%time
all_genres = set([genre for genres in df['Genre'] for genre in genres])

for genre in all_genres:
    df[f'is_{genre}'] = df['Genre'].apply(lambda tags: int(not bool(set(tags).isdisjoint([genre]))))
df.drop(['Genre'], axis=1, inplace=True)
df.head()

Wall time: 1.44 s


Unnamed: 0,Singer,Song,Lyrics,is_Country,is_Folk,is_R&B/Soul,is_Pop,is_Rock,is_Metal,is_Hip-Hop/Rap,is_Punk,is_Alternative/Indie,is_Blues/Jazz
0,$UICIDEBOY$,40 Oz. & Sober,hook gangsta pat\nkiller killer killer\nwhat t...,0,0,0,0,0,0,1,0,1,0
1,$UICIDEBOY$,7th or St. Tammany,uh\n59 yah\nuhhhh \n59 motherfucker\nwest bank...,0,0,0,0,0,0,1,0,0,0
2,$UICIDEBOY$,Dead Batteries,produced by budd dwyer\n\nintro frayser boy\nh...,0,0,0,0,0,0,1,0,0,0
3,$UICIDEBOY$,Drugs/Hoes/Money/Etc.,verse 1 yung $now\ncounting sheep until im sou...,0,0,0,0,0,0,1,0,0,0
4,$UICIDEBOY$,I’ll Pay for It (If I Want It),verse 1 ruby da cherry\nfucking ruby got a blu...,0,0,0,0,0,0,1,0,0,0


In [5]:
%%time
lyrics_features = []

def get_words_features(words):
    words = pd.Series(words)
    words_features = words.agg([pd.Series.count, pd.Series.nunique]).values.tolist()
    words_features.append(words_features[1]/words_features[0])

    words_len = words.apply(len)
    words_len_features = words_len.agg([pd.Series.count, pd.Series.nunique, np.min, np.max, 
                                        np.mean, np.median, np.std, np.var, np.sum]).values.tolist()
    return [*words_features, *words_len_features]


def get_lyrics_features(lyrics):
    all_words = lyrics.split()
    all_words_features = get_words_features(all_words)

    rows = list(map(lambda row: row.split(), lyrics.split('\n')))
    rows_features = [get_words_features(row) for row in rows if row]
    rows_features = (pd.DataFrame(rows_features)
                     .agg([np.min, np.max, np.mean, np.median, np.std, np.var])
                     .values.flatten().tolist())

    lyrics_features.append([*all_words_features, *rows_features])

df['Lyrics'].apply(get_lyrics_features)
lyrics_features = pd.DataFrame(lyrics_features)
lyrics_features.replace({np.nan: 0, np.inf: 0, -np.inf: 0}, inplace=True)
print(lyrics_features.shape)

(253678, 84)
Wall time: 5h 20min 6s


In [6]:
lyrics_features_ = pd.concat([df.iloc[:, :2], lyrics_features, df.iloc[:, -10:]], axis=1)
lyrics_features_.head()

Unnamed: 0,Singer,Song,0,1,2,3,4,5,6,7,...,is_Country,is_Folk,is_R&B/Soul,is_Pop,is_Rock,is_Metal,is_Hip-Hop/Rap,is_Punk,is_Alternative/Indie,is_Blues/Jazz
0,$UICIDEBOY$,40 Oz. & Sober,427,159,0.372365,427.0,11.0,1.0,12.0,4.096019,...,0,0,0,0,0,0,1,0,1,0
1,$UICIDEBOY$,7th or St. Tammany,86,25,0.290698,86.0,7.0,2.0,12.0,3.5,...,0,0,0,0,0,0,1,0,0,0
2,$UICIDEBOY$,Dead Batteries,407,156,0.383292,407.0,10.0,1.0,10.0,3.960688,...,0,0,0,0,0,0,1,0,0,0
3,$UICIDEBOY$,Drugs/Hoes/Money/Etc.,240,146,0.608333,240.0,9.0,1.0,9.0,3.85,...,0,0,0,0,0,0,1,0,0,0
4,$UICIDEBOY$,I’ll Pay for It (If I Want It),246,152,0.617886,246.0,12.0,1.0,13.0,3.918699,...,0,0,0,0,0,0,1,0,0,0


In [7]:
lyrics_features_.to_csv('lyrics_features.csv')