In [1]:
import pandas as pd
import numpy as np
import re
import ast

# Initial author features

In [None]:
songs = pd.read_csv('songs_dataset.csv', index_col=0)
songs.head()

In [None]:
def format_lyrics(lyrics):
    lyrics = re.sub('[*.,!:?\"\'«»]', '', lyrics)
    lyrics = re.sub('[-–—— ]+', ' ', lyrics)
    lyrics = lyrics.strip()
    lyrics = lyrics.lower()
    return lyrics

In [2]:
agg_funcs = ['max', 'mean', 'median', 'std', 'var']
def get_agg_feats_names(base_name):
    return [f'{base_name}_{agg_func}' for agg_func in agg_funcs]

In [None]:
authors_features = []
for author, author_df in songs.groupby('Singer'):
    songs_count = author_df.shape[0]
    albums_count = author_df['Album'].nunique()
    
#     author_df['Lyrics'] = author_df['Lyrics'].apply(format_lyrics)
    
#     all_words = author_df['Lyrics'].apply(lambda x: x.split())
#     all_words_len = all_words.apply(len)
#     all_words_len = all_words_len.agg(agg_funcs)
    
#     all_words_count = all_words.apply(len)
#     unique_words_count = author_df['Lyrics'].apply(lambda x: len(list(set(x.split()))))
#     uniq_words = unique_words_count / all_words_count
#     uniq_words = uniq_words.agg(agg_funcs)
    
    featuring_count = author_df['Featuring'].apply(lambda x: len(ast.literal_eval(x)))
    featuring_count = featuring_count.agg(agg_funcs)
    
#     lyrics_symbols_count = author_df['Lyrics'].apply(len)
#     lyrics_symbols_count = lyrics_symbols_count.agg(agg_funcs)
    
    lyrics_rows_count = author_df['Lyrics'].apply(lambda x: len(x.split('\n')))
    lyrics_rows_count = lyrics_rows_count.agg(agg_funcs)
    
    features = [author, songs_count, albums_count, 
#                 *all_words_len, 
#                 *uniq_words, 
                *featuring_count, 
#                 *lyrics_symbols_count, 
                *lyrics_rows_count]
    authors_features.append(features)

authors = pd.DataFrame(authors_features, columns=['Singer', 'songs_count', 'albums_count',
#                                                   *get_agg_feats_names('words_len'),
#                                                   *get_agg_feats_names('uniq_words'),
                                                  *get_agg_feats_names('feats_count'),
#                                                   *get_agg_feats_names('lyrics_symbols_count'),
                                                  *get_agg_feats_names('lyrics_rows_count')])
authors.head()

In [None]:
authors.shape

In [None]:
authors.to_csv('artist_dataset.csv')

# Author features from parts

In [14]:
parts = pd.read_csv('parts_dataset.csv', index_col=0)
all_singers = parts['Singer'].unique().tolist()
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
0,$UICIDEBOY$,7th or St. Tammany,That’s Very Minimalist of You,['$UICIDEBOY$'],$UICIDEBOY$,Verse,"Ruby casting shadows, looting fucking castles\..."
1,$UICIDEBOY$,7th or St. Tammany,That’s Very Minimalist of You,['$UICIDEBOY$'],$UICIDEBOY$,Verse,"Ridin' shotty with my shotty, smokin' next to ..."
2,$UICIDEBOY$,7th or St. Tammany,Dead Batteries,['$UICIDEBOY$'],$UICIDEBOY$,Intro,"Hit the, hit the, hit the—\nHit the tone, cock..."
3,$UICIDEBOY$,7th or St. Tammany,Dead Batteries,['$UICIDEBOY$'],$UICIDEBOY$,Verse,Bitches know I got the vendetta\nRollin' up th...
4,$UICIDEBOY$,7th or St. Tammany,Dead Batteries,['$UICIDEBOY$'],$UICIDEBOY$,Hook,"Hit the tone, cock it back, ho, I gotta holla ..."


In [4]:
parts['Part Name'].unique()

array(['Verse', 'Intro', 'Hook', 'Outro', 'Chorus', 'Other', 'Bridge',
       'Refrain', nan, 'Skit'], dtype=object)

In [5]:
def format_lyrics(lyrics):
    lyrics = re.sub('[*.’,!:?\"\'«»]', '', lyrics)
    lyrics = re.sub('[-–—— \s]+', ' ', lyrics)
    lyrics = lyrics.strip()
    lyrics = lyrics.lower()
    return lyrics

In [18]:
def get_part_features(part_df, part_name):
    part_df['Part Lyrics'] = part_df['Part Lyrics'].apply(format_lyrics)

    words = part_df['Part Lyrics'].apply(lambda x: x.split())
    words_len = words.apply(lambda x: pd.Series(map(len, x)))
#     print(words_len)
    words_len_feats = words_len.agg(agg_funcs+['min'])
    print(words_len_feats)
    words_len_feats = words_len_feats.agg(agg_funcs, axis=1)
#     print(words_len_feats)
    words_len_feats_names = [f'{part_name}_{col}_from_{ind}_words_len' for ind in words_len_feats.index 
                                                                       for col in words_len_feats.columns]
    words_len_feats = words_len_feats.values.flatten().tolist()
    words_count = words.apply(len)
    unique_words_count = words.apply(lambda x: len(list(set(x))))
    uniq_words_prop = unique_words_count / words_count
    uniq_words_prop_feats = uniq_words_prop.agg(agg_funcs).values.tolist()
    uniq_words_prop_feats_names = get_agg_feats_names(f'{part_name}_uniq_words_prop')
#     print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    return words_len_feats + uniq_words_prop_feats, words_len_feats_names + uniq_words_prop_feats_names


singers_features = []

for singer, singer_df in parts.iloc[:1000, :].groupby('Part Singer'):
    if singer not in all_singers:
        continue
    print(singer)
    parts_freq = singer_df['Part Name'].value_counts(normalize=True, dropna=False)
    parts_freq_feats = parts_freq[['Verse', 'Chorus', 'Intro', 'Hook', 
                                   'Outro', 'Bridge', 'Refrain', 'Skit', 'Other', np.nan]].fillna(0).values[:-2]
    verse_features, verse_features_names = get_part_features(singer_df[singer_df['Part Name'] == 'Verse'], 'Verse')
#     chorus_features, chorus_features_names = get_part_features(singer_df[singer_df['Part Name'] == 'Chorus'], 'Chorus')

#     print(singer)
#     singers_features.append()

$UICIDEBOY$


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


              0          1          2          3          4          5    \
max     12.000000  13.000000  30.000000  13.000000  13.000000  13.000000   
mean     4.273684   3.633684   4.094737   4.155789   4.153684   4.147368   
median   4.000000   3.000000   4.000000   4.000000   4.000000   4.000000   
std      1.974746   1.898401   2.460119   2.207807   2.026086   2.216016   
var      3.899622   3.603926   6.052187   4.874413   4.105023   4.910726   
min      1.000000   1.000000   1.000000   1.000000   1.000000   1.000000   

              6          7          8          9    ...  243  244  245  246  \
max     13.000000  13.000000  13.000000  10.000000  ...  4.0  2.0  4.0  5.0   
mean     4.044211   3.943158   3.972632   4.027368  ...  4.0  2.0  4.0  5.0   
median   4.000000   4.000000   4.000000   4.000000  ...  4.0  2.0  4.0  5.0   
std      2.008459   1.991260   1.836482   1.877383  ...  NaN  NaN  NaN  NaN   
var      4.033906   3.965117   3.372667   3.524566  ...  NaN  NaN  NaN  

ValueError: No axis named 1 for object type <class 'type'>

In [19]:
temp = parts[(parts['Part Singer'] == 'Chris Brown') & (parts['Part Name'] == 'Verse')]
temp.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
8059,50 Cent,Kanan Reloaded,No Romeo No Juliet,"['50 Cent', 'Chris Brown']",Chris Brown,Verse,"P.I.M.P., who the shotta them?\nI'm the girl d..."
8065,50 Cent,Kanan Reloaded,I’m the Man (Remix),"['50 Cent', 'Chris Brown']",Chris Brown,Verse,Came in the game gettin' money\nI fuck with al...
9947,A Boogie wit da Hoodie,The Bigger Artist,Fucking & Kissing,"['A Boogie wit da Hoodie', 'Chris Brown']",Chris Brown,Verse,Would I be trippin'\nIf I DM your friends and ...
10578,A$AP Ferg,Always Strive and Prosper,I Love You,"['A$AP Ferg', 'Ty Dolla $ign', 'Chris Brown']",Chris Brown,Verse,"Look, house on the water like a casino\nVacati..."
16994,Akon,,Take It Down Low,"['Akon', 'Chris Brown']",Chris Brown,Verse,"She on the pole (What?), watch her break it do..."


In [None]:
temp['Part Lyrics'] = temp['Part Lyrics'].apply(format_lyrics)

In [None]:
temp_ = temp['Part Lyrics'].apply(lambda x: x.split()).apply(lambda x: pd.Series(map(len, x))).agg(agg_funcs+['min']).agg(agg_funcs, axis=1)
temp_

In [None]:
temp['Part Lyrics'].apply(lambda x: x.split()).apply(lambda x: len(list(set(x))))

In [None]:
[f'{col}_from_{ind}_words_len' for ind in temp_.index for col in temp_.columns]

In [None]:
temp_.values.flatten().tolist()