In [1]:
import pandas as pd
import numpy as np
import re
import ast

# Initial author features

In [2]:
songs = pd.read_csv('songs_dataset.csv', index_col=0)
songs.head()

Unnamed: 0,Author,Album,Song,Featuring,Lyrics
0,$uicideBoy$,7th or St. Tammany,7th or St. Tammany,[],"\n\nUh\n*59, yah!\nUhhhh, [?]\n*59 motherfucke..."
1,$uicideBoy$,7th or St. Tammany,Dead Batteries,[],\n\n[Produced by Budd Dwyer]\n\n[Intro: Frayse...
2,$uicideBoy$,7th or St. Tammany,I’ll Pay for It (If I Want It),[],\n\n[Verse 1: RUBY DA CHERRY]\nFucking Ruby go...
3,$uicideBoy$,7th or St. Tammany,That’s Very Minimalist of You,[],\n\n[Verse 1: RUBY DA CHERRY]\nRuby casting sh...
4,$uicideBoy$,7th or St. Tammany,Romulus,[],\n\n[Intro: YUNG HANK MOODY]\nGrey*59\nGrey*59...


In [3]:
def format_lyrics(lyrics):
    lyrics = re.sub('[*.,!:?\"\'«»]', '', lyrics)
    lyrics = re.sub('[-–—— ]+', ' ', lyrics)
    lyrics = lyrics.strip()
    lyrics = lyrics.lower()
    return lyrics

In [4]:
agg_funcs = ['max', 'mean', 'median', 'std', 'var']
def get_agg_feats_names(base_name):
    return [f'{base_name}_{agg_func}' for agg_func in agg_funcs]

In [12]:
authors_features = []
for author, author_df in songs.groupby('Author'):
    songs_count = author_df.shape[0]
    albums_count = author_df['Album'].nunique()
    
    author_df['Lyrics'] = author_df['Lyrics'].apply(format_lyrics)
    
    all_words = author_df['Lyrics'].apply(lambda x: x.split())
    all_words_len = all_words.apply(len)
    all_words_len = all_words_len.agg(agg_funcs)
    
    all_words_count = all_words.apply(len)
    unique_words_count = author_df['Lyrics'].apply(lambda x: len(list(set(x.split()))))
    uniq_words = unique_words_count / all_words_count
    uniq_words = uniq_words.agg(agg_funcs)
    
    featuring_count = author_df['Featuring'].apply(lambda x: len(ast.literal_eval(x)))
    featuring_count = featuring_count.agg(agg_funcs)
    
    lyrics_symbols_count = author_df['Lyrics'].apply(len)
    lyrics_symbols_count = lyrics_symbols_count.agg(agg_funcs)
    
    lyrics_rows_count = author_df['Lyrics'].apply(lambda x: len(x.split('\n')))
    lyrics_rows_count = lyrics_rows_count.agg(agg_funcs)
    
    
    features = [author, songs_count, albums_count, *all_words_len, *uniq_words, *featuring_count, 
                *lyrics_symbols_count, *lyrics_rows_count]
    authors_features.append(features)

authors = pd.DataFrame(authors_features, columns=['Author', 'songs_count', 'albums_count',
                                                  *get_agg_feats_names('words_len'),
                                                  *get_agg_feats_names('uniq_words'),
                                                  *get_agg_feats_names('feats_count'),
                                                  *get_agg_feats_names('lyrics_symbols_count'),
                                                  *get_agg_feats_names('lyrics_rows_count')])
authors.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Author,songs_count,albums_count,words_len_max,words_len_mean,words_len_median,words_len_std,words_len_var,uniq_words_max,uniq_words_mean,...,lyrics_symbols_count_max,lyrics_symbols_count_mean,lyrics_symbols_count_median,lyrics_symbols_count_std,lyrics_symbols_count_var,lyrics_rows_count_max,lyrics_rows_count_mean,lyrics_rows_count_median,lyrics_rows_count_std,lyrics_rows_count_var
0,$uicideBoy$,191,30,1209.0,357.209424,313.0,178.110191,31723.240121,0.76506,0.54304,...,6242.0,1807.298429,1621.0,903.860335,816963.5,165.0,49.298429,47.0,23.381953,546.715734
1,2 Chainz,207,21,1700.0,490.338164,483.0,200.987711,40396.059847,0.882353,0.442808,...,10158.0,2391.913043,2357.0,1028.882164,1058599.0,178.0,69.21256,71.0,27.642012,764.080812
2,21 Savage,65,6,1093.0,561.492308,535.0,170.026388,28908.972596,0.495902,0.370608,...,5094.0,2771.861538,2642.0,801.863472,642985.0,144.0,79.538462,73.0,21.244909,451.346154
3,2Pac,386,29,2387.0,637.448187,706.5,350.111238,122577.879127,0.818182,0.459149,...,11941.0,3154.5,3493.0,1725.341921,2976805.0,199.0,84.639896,96.0,43.923574,1929.280378
4,3 Doors Down,77,7,401.0,270.818182,274.0,54.106554,2927.519139,0.552632,0.353287,...,1844.0,1298.415584,1329.0,248.048719,61528.17,82.0,50.571429,50.0,10.952563,119.958647


In [13]:
authors.shape

(432, 28)

In [14]:
authors.to_csv('authors_dataset.csv')

# Author features from parts

In [None]:
parts = pd.read_csv('parts_dataset.csv', index_col=0)
parts.head()

In [None]:
parts['Part Name'].unique()

In [None]:
def format_lyrics(lyrics):
    lyrics = re.sub('[*.’,!:?\"\'«»]', '', lyrics)
    lyrics = re.sub('[-–—— \s]+', ' ', lyrics)
    lyrics = lyrics.strip()
    lyrics = lyrics.lower()
    return lyrics

In [None]:
authors_features = []

for author, author_df in parts.groupby('Part Singer'):
    parts_freq = author_df['Part Name'].value_counts(normalize=True, dropna=False)
    parts_freq_feats = parts_freq[['Verse', 'Chorus', 'Intro', 'Hook', 
                                   'Outro', 'Bridge', 'Refrain', 'Other', np.nan]].fillna(0).values
    print(author, parts_freq_feats)

In [None]:
parts[parts['Author'] == 'Eminem']

In [None]:
temp = parts[parts['Author'] == 'Feduk']['Part Name'].value_counts(normalize=True, dropna=False)
temp