In [1]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval

import warnings
warnings.filterwarnings('ignore')

# Compute per-singer features

In [2]:
# Load the raw songs table; later cells use its `Singer`, `Album` and
# `Lyrics` columns.
songs = pd.read_csv(filepath_or_buffer='songs_dataset.csv')
print(songs.shape)  # (n_rows, n_columns)
songs.head(n=5)

(253046, 7)


Unnamed: 0,Singer,Album,Song,Date,Featuring,Tags,Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,2015-04-02,[],"['Alternative', 'Rap']","\n\n[Hook: Gangsta Pat]\nKiller, killer, kille..."
1,$UICIDEBOY$,7th or St. Tammany,7th or St. Tammany,2015-04-02,[],['Rap'],"\n\nUh\n*59, yah!\nUhhhh, [?]\n*59 motherfucke..."
2,$UICIDEBOY$,7th or St. Tammany,Dead Batteries,2015-03-19,[],['Rap'],\n\n[Produced by Budd Dwyer]\n\n[Intro: Frayse...
3,$UICIDEBOY$,7th or St. Tammany,Drugs/Hoes/Money/Etc.,2015-04-02,[],['Rap'],\n\n[Verse 1: YUNG $NOW]\nCounting sheep until...
4,$UICIDEBOY$,7th or St. Tammany,I’ll Pay for It (If I Want It),2015-04-02,[],['Rap'],\n\n[Verse 1: RUBY DA CHERRY]\nFucking Ruby go...


In [3]:
def format_lyrics(lyrics):
    """Normalise one lyrics string for word-level statistics.

    Steps, in order:
      1. delete the punctuation characters ``* . , ! : ? " ' « »``;
      2. collapse every run of hyphens, dashes and spaces into one space;
      3. strip leading/trailing whitespace;
      4. lower-case the text.

    Line breaks inside the text are not touched by steps 1-2, so the result
    can still be split on ``'\\n'`` to count lines.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text.

    Returns
    -------
    str
        The normalised text.
    """
    without_punctuation = re.sub('[*.,!:?\"\'«»]', '', lyrics)
    single_spaced = re.sub('[-–—— ]+', ' ', without_punctuation)
    return single_spaced.strip().lower()

In [4]:
# Names of the pandas aggregations computed for every per-singer statistic.
agg_funcs = ['max', 'mean', 'median', 'std', 'var']


def get_agg_feats_names(base_name):
    """Build the feature column names for one aggregated statistic.

    Parameters
    ----------
    base_name : str
        Prefix shared by all generated names.

    Returns
    -------
    list of str
        ``'<base_name>_<func>'`` for each entry of ``agg_funcs``, in the
        same order as ``agg_funcs``.
    """
    names = []
    for func_name in agg_funcs:
        names.append(f'{base_name}_{func_name}')
    return names

In [5]:
%%time
# Aggregations applied down the (songs x word-position) matrix of word
# lengths; every resulting row is then reduced again with ``agg_funcs``.
words_len_row_funcs = agg_funcs + ['min']

# The column names depend only on the aggregation lists, so they are built
# once up front instead of being read from variables left over from the last
# loop iteration (which raised NameError when ``songs`` had no groups).
# Order matches the row-major flattening done in ``_words_len_features``.
words_len_feats_names = [f'{col}_from_{ind}_words_len' for ind in words_len_row_funcs
                                                       for col in agg_funcs]
uniq_words_prop_feats_names = get_agg_feats_names('uniq_words_prop')


def _words_len_features(words):
    """Summarise word lengths for one singer.

    Parameters
    ----------
    words : pandas.Series
        One list of word strings per song.

    Returns
    -------
    list of float
        ``len(words_len_row_funcs) * len(agg_funcs)`` values.  A matrix with
        one row per song and one column per word position is aggregated
        column-wise with ``words_len_row_funcs``; each of those rows is then
        aggregated across positions with ``agg_funcs`` and the result is
        flattened row by row.  If no song contains any word, every value is
        NaN.
    """
    # Build the ragged matrix in one constructor call (short songs are padded
    # with NaN).  Creating a pd.Series per song through .apply gives the same
    # frame but is far slower.
    word_lengths = pd.DataFrame(
        [[len(word) for word in song_words] for song_words in words],
        index=words.index,
    )
    if word_lengths.empty:
        # No word positions at all: nothing to aggregate.
        return [np.nan] * (len(words_len_row_funcs) * len(agg_funcs))
    per_position = word_lengths.agg(words_len_row_funcs)
    return per_position.agg(agg_funcs, axis=1).values.flatten().tolist()


singers_features = []
for singer, singer_df in songs.groupby('Singer'):
    songs_count = len(singer_df)
    albums_count = singer_df['Album'].nunique()

    # Keep the cleaned lyrics in a local Series rather than assigning back
    # into the group frame handed out by groupby.
    lyrics = singer_df['Lyrics'].apply(format_lyrics)
    words = lyrics.apply(str.split)

    words_len_feats = _words_len_features(words)

    # Share of distinct words per song; a song with no words gives 0/0 = NaN.
    uniq_words_prop = words.apply(lambda song_words: len(set(song_words))) / words.apply(len)
    uniq_words_prop_feats = uniq_words_prop.agg(agg_funcs).values.tolist()

    # Number of lines per song, counted on the cleaned text.
    lyrics_rows_count = lyrics.apply(lambda text: len(text.split('\n')))
    lyrics_rows_count_feats = lyrics_rows_count.agg(agg_funcs).values.tolist()

    singers_features.append([singer, songs_count, albums_count,
                             *words_len_feats,
                             *uniq_words_prop_feats,
                             *lyrics_rows_count_feats,
                            ])

singers = pd.DataFrame(singers_features, columns=['Singer', 'songs_count', 'albums_count',
                                                  *words_len_feats_names,
                                                  *uniq_words_prop_feats_names,
                                                  *get_agg_feats_names('lyrics_rows_count'),
                                                 ])

Wall time: 25min 20s


In [6]:
# Quick look at the assembled per-singer feature table.
print(singers.shape)  # (n_rows, n_columns)
singers.head(n=5)

(1947, 43)


Unnamed: 0,Singer,songs_count,albums_count,max_from_max_words_len,mean_from_max_words_len,median_from_max_words_len,std_from_max_words_len,var_from_max_words_len,max_from_mean_words_len,mean_from_mean_words_len,...,uniq_words_prop_max,uniq_words_prop_mean,uniq_words_prop_median,uniq_words_prop_std,uniq_words_prop_var,lyrics_rows_count_max,lyrics_rows_count_mean,lyrics_rows_count_median,lyrics_rows_count_std,lyrics_rows_count_var
0,$UICIDEBOY$,226,40,30.0,8.636812,8.0,3.218914,10.361407,7.0,4.006811,...,0.76506,0.538746,0.556607,0.115719,0.013391,168.0,50.70354,47.0,24.22039,586.627276
1,...And You Will Know Us by the Trail of Dead,85,10,15.0,7.710027,8.0,2.753443,7.581448,10.0,3.894231,...,0.818182,0.530128,0.507692,0.120192,0.014446,81.0,32.152941,30.0,14.086895,198.440616
2,.Otrix,121,18,20.0,9.512671,10.0,3.020169,9.121421,8.0,4.181177,...,0.831169,0.626606,0.647059,0.121055,0.014654,121.0,50.586777,48.0,21.485526,461.627824
3,10cc,140,13,29.0,7.962416,8.0,3.051737,9.313102,12.666667,4.003075,...,0.621622,0.434011,0.436894,0.101018,0.010205,130.0,46.535714,43.0,17.353461,301.1426
4,116,58,5,15.0,8.123606,8.0,2.483164,6.166103,10.0,3.94904,...,0.770492,0.444604,0.432884,0.099798,0.00996,128.0,77.948276,78.5,26.080457,680.19026


In [7]:
# Persist the feature table. The row index is written as the first (unnamed)
# CSV column, so read it back with `index_col=0`.
singers.to_csv(path_or_buf='singers_dataset.csv', index=True)