In [18]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval

import warnings
warnings.filterwarnings('ignore')

In [33]:
songs = pd.read_csv('songs_dataset.csv', index_col=0)
songs.head()

Unnamed: 0,Singer,Album,Song,Date,Featuring,Tags,Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,2015-04-02,[],"['Alternative', 'Rap']","\n\n[Hook: Gangsta Pat]\nKiller, killer, kille..."
1,$UICIDEBOY$,7th or St. Tammany,7th or St. Tammany,2015-04-02,[],['Rap'],"\n\nUh\n*59, yah!\nUhhhh, [?]\n*59 motherfucke..."
2,$UICIDEBOY$,7th or St. Tammany,Dead Batteries,2015-03-19,[],['Rap'],\n\n[Produced by Budd Dwyer]\n\n[Intro: Frayse...
3,$UICIDEBOY$,7th or St. Tammany,Drugs/Hoes/Money/Etc.,2015-04-02,[],['Rap'],\n\n[Verse 1: YUNG $NOW]\nCounting sheep until...
4,$UICIDEBOY$,7th or St. Tammany,I’ll Pay for It (If I Want It),2015-04-02,[],['Rap'],\n\n[Verse 1: RUBY DA CHERRY]\nFucking Ruby go...


In [34]:
songs.shape

(253046, 7)

In [20]:
tags = songs['Tags'].apply(lambda x: literal_eval(x)).values.tolist()

In [25]:
tags = list(map(literal_eval, set(songs['Tags'].values)))

In [26]:
flat_list = [item for sublist in tags for item in sublist]
flat_list = list(set(flat_list))

In [28]:
len(flat_list)

649

In [29]:
flat_list

['Economics',
 'Ska Punk',
 'Україна (Ukraine)',
 'Adult Swim',
 'Россия (Russia)',
 'Afrika',
 'Trip-Hop',
 'Polski Rap',
 'Nation Of Islam',
 'House',
 'Eurodance',
 'Retro',
 'Hardcore Punk',
 'Chillout',
 'Disney',
 'Vaporwave',
 'Flamenco',
 'Traduction Française',
 'Video Game',
 'Singer-Songwriter',
 'Speed Metal',
 'Jump Blues',
 'Eighties',
 'Christian Pop',
 'Estonia',
 'Metal',
 'Progressive House',
 'Norsk Rap',
 'Speeches',
 'Cinematic',
 'Indie Monday',
 'Portugal',
 'West Coast',
 'Tradução',
 'Scandinavia',
 'Conscious Hip-Hop',
 'Art-Punk',
 'Bachata',
 'Art Rock',
 'Folk Rock',
 'Memes',
 'Musicals',
 'Norsk',
 'Classical Music',
 'Drone',
 'Slang',
 'Deutscher Rock',
 'Fantasy (Lit)',
 'Anime',
 'Melodic Death Metal',
 'Science ',
 'Melodic Metalcore',
 'Climate Change',
 'Motown',
 'Progressive Death Metal',
 'Interview',
 'Proto-Punk',
 'Dubstep',
 'Русский трэп (Russian Trap)',
 'Interlude',
 'Moroccan Rap  | راب مغربي',
 'Christmas Rap',
 'Roots',
 'Electronic Tr

# Initial author features

In [2]:
songs = pd.read_csv('songs_dataset.csv', index_col=0)
songs.head()

Unnamed: 0,Singer,Album,Song,Featuring,Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[],"\n\n[Hook: Gangsta Pat]\nKiller, killer, kille..."
1,$UICIDEBOY$,7th or St. Tammany,7th or St. Tammany,[],"\n\nUh\n*59, yah!\nUhhhh, [?]\n*59 motherfucke..."
2,$UICIDEBOY$,7th or St. Tammany,Dead Batteries,[],\n\n[Produced by Budd Dwyer]\n\n[Intro: Frayse...
3,$UICIDEBOY$,7th or St. Tammany,Drugs/Hoes/Money/Etc.,[],\n\n[Verse 1: YUNG $NOW]\nCounting sheep until...
4,$UICIDEBOY$,7th or St. Tammany,I’ll Pay for It (If I Want It),[],\n\n[Verse 1: RUBY DA CHERRY]\nFucking Ruby go...


In [3]:
def format_lyrics(lyrics):
    lyrics = re.sub('[*.,!:?\"\'«»]', '', lyrics)
    lyrics = re.sub('[-–—— ]+', ' ', lyrics)
    lyrics = lyrics.strip()
    lyrics = lyrics.lower()
    return lyrics

In [4]:
agg_funcs = ['max', 'mean', 'median', 'std', 'var']
def get_agg_feats_names(base_name):
    return [f'{base_name}_{agg_func}' for agg_func in agg_funcs]

In [5]:
%%time
authors_features = []
for author, author_df in songs.groupby('Singer'):
    songs_count = author_df.shape[0]
    albums_count = author_df['Album'].nunique()
    
    author_df['Lyrics'] = author_df['Lyrics'].apply(format_lyrics)
    
    words = author_df['Lyrics'].apply(lambda x: x.split())
    words_len = words.apply(lambda x: pd.Series(map(len, x)))
    words_len_feats = words_len.agg(agg_funcs+['min'])
    words_len_feats = words_len_feats.agg(agg_funcs, axis=1)
    words_len_feats_names = [f'{col}_from_{ind}_words_len' for ind in words_len_feats.index 
                                                           for col in words_len_feats.columns]
    words_len_feats = words_len_feats.values.flatten().tolist()

    words_count = words.apply(len)
    unique_words_count = words.apply(lambda x: len(list(set(x))))
    uniq_words_prop = unique_words_count / words_count
    uniq_words_prop_feats = uniq_words_prop.agg(agg_funcs).values.tolist()
    uniq_words_prop_feats_names = get_agg_feats_names(f'uniq_words_prop')
    
    featuring_count = author_df['Featuring'].apply(lambda x: len(ast.literal_eval(x)))
    featuring_count = featuring_count.agg(agg_funcs)
    
    lyrics_rows_count = author_df['Lyrics'].apply(lambda x: len(x.split('\n')))
    lyrics_rows_count = lyrics_rows_count.agg(agg_funcs)
    
    features = [author, songs_count, albums_count, 
                *words_len_feats, 
                *uniq_words_prop_feats, 
                *featuring_count, 
                *lyrics_rows_count]
    authors_features.append(features)

authors = pd.DataFrame(authors_features, columns=['Singer', 'songs_count', 'albums_count',
                                                  *words_len_feats_names,
                                                  *uniq_words_prop_feats_names,
                                                  *get_agg_feats_names('feats_count'),
                                                  *get_agg_feats_names('lyrics_rows_count')])
print(authors.shape)

(1947, 48)
Wall time: 22min 8s


(492, 48)  
Wall time: 7min 30s

In [8]:
authors.head()

Unnamed: 0,Singer,songs_count,albums_count,max_from_max_words_len,mean_from_max_words_len,median_from_max_words_len,std_from_max_words_len,var_from_max_words_len,max_from_mean_words_len,mean_from_mean_words_len,...,feats_count_max,feats_count_mean,feats_count_median,feats_count_std,feats_count_var,lyrics_rows_count_max,lyrics_rows_count_mean,lyrics_rows_count_median,lyrics_rows_count_std,lyrics_rows_count_var
0,$UICIDEBOY$,227,40,30.0,8.685691,8.0,3.213507,10.326625,8.0,4.008446,...,3.0,0.15859,0.0,0.432582,0.187127,165.0,50.678414,47.0,24.075244,579.617364
1,...And You Will Know Us by the Trail of Dead,85,10,15.0,7.710027,8.0,2.753443,7.581448,10.0,3.894231,...,0.0,0.0,0.0,0.0,0.0,81.0,32.152941,30.0,14.086895,198.440616
2,.Otrix,113,17,20.0,9.479532,10.0,3.004565,9.027412,8.0,4.190111,...,2.0,0.123894,0.0,0.35689,0.12737,121.0,49.318584,47.0,20.675324,427.469027
3,10cc,136,14,29.0,7.962416,8.0,3.051737,9.313102,12.666667,4.00339,...,0.0,0.0,0.0,0.0,0.0,130.0,46.801471,43.5,17.487697,305.819553
4,116,56,5,15.0,8.173695,8.0,2.415393,5.834121,10.0,3.881165,...,7.0,2.107143,2.0,1.580317,2.497403,128.0,77.410714,78.5,25.803296,665.810065


In [9]:
authors.to_csv('artist_dataset.csv')

# Author features from parts

In [None]:
parts = pd.read_csv('parts_dataset.csv', index_col=0)
all_singers = parts['Singer'].unique().tolist()
parts.head()

In [None]:
parts['Part Name'].unique()

In [None]:
def format_lyrics(lyrics):
    lyrics = re.sub('[*.’,!:?\"\'«»]', '', lyrics)
    lyrics = re.sub('[-–—— \s]+', ' ', lyrics)
    lyrics = lyrics.strip()
    lyrics = lyrics.lower()
    return lyrics

In [None]:
def get_part_features(part_df, part_name):
    part_df['Part Lyrics'] = part_df['Part Lyrics'].apply(format_lyrics)

    words = part_df['Part Lyrics'].apply(lambda x: x.split())
    words_len = words.apply(lambda x: pd.Series(map(len, x)))
#     print(words_len)
    words_len_feats = words_len.agg(agg_funcs+['min'])
    print(words_len_feats)
    words_len_feats = words_len_feats.agg(agg_funcs, axis=1)
#     print(words_len_feats)
    words_len_feats_names = [f'{part_name}_{col}_from_{ind}_words_len' for ind in words_len_feats.index 
                                                                       for col in words_len_feats.columns]
    words_len_feats = words_len_feats.values.flatten().tolist()
    
    words_count = words.apply(len)
    unique_words_count = words.apply(lambda x: len(list(set(x))))
    uniq_words_prop = unique_words_count / words_count
    uniq_words_prop_feats = uniq_words_prop.agg(agg_funcs).values.tolist()
    uniq_words_prop_feats_names = get_agg_feats_names(f'{part_name}_uniq_words_prop')
#     print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    return words_len_feats + uniq_words_prop_feats, words_len_feats_names + uniq_words_prop_feats_names


singers_features = []

for singer, singer_df in parts.iloc[:1000, :].groupby('Part Singer'):
    if singer not in all_singers:
        continue
    print(singer)
    parts_freq = singer_df['Part Name'].value_counts(normalize=True, dropna=False)
    parts_freq_feats = parts_freq[['Verse', 'Chorus', 'Intro', 'Hook', 
                                   'Outro', 'Bridge', 'Refrain', 'Skit', 'Other', np.nan]].fillna(0).values[:-2]
    verse_features, verse_features_names = get_part_features(singer_df[singer_df['Part Name'] == 'Verse'], 'Verse')
#     chorus_features, chorus_features_names = get_part_features(singer_df[singer_df['Part Name'] == 'Chorus'], 'Chorus')

#     print(singer)
#     singers_features.append()

In [None]:
temp = parts[(parts['Part Singer'] == 'Chris Brown') & (parts['Part Name'] == 'Verse')]
temp.head()

In [None]:
temp['Part Lyrics'] = temp['Part Lyrics'].apply(format_lyrics)

In [None]:
temp_ = temp['Part Lyrics'].apply(lambda x: x.split()).apply(lambda x: pd.Series(map(len, x))).agg(agg_funcs+['min']).agg(agg_funcs, axis=1)
temp_

In [None]:
temp['Part Lyrics'].apply(lambda x: x.split()).apply(lambda x: len(list(set(x))))

In [None]:
[f'{col}_from_{ind}_words_len' for ind in temp_.index for col in temp_.columns]

In [None]:
temp_.values.flatten().tolist()