In [1]:
import pandas as pd
import numpy as np
import ast
import json
import os
import re

# Get songs dataset from parsed data

In [2]:
def correct_non_eng_authors(name):
    if '(' in name:
        return name.split('(')[0][:-1]
    else:
        return name.strip(u'\u200b')

In [5]:
%%time
data = pd.read_csv('res_all.csv', index_col=0)
# data.drop('album_url', axis=1, inplace=True)
data.drop_duplicates(subset='text', keep='last', inplace=True)
data = data[data['text'].apply(len) > 300]
data = data[['artist', 'album', 'song', 'featuring', 'text']]
data.columns = ['Singer', 'Album', 'Song', 'Featuring', 'Lyrics']
data['Singer'] = data['Singer'].apply(correct_non_eng_authors)
data['Featuring'] = data['Featuring'].apply(lambda x: ast.literal_eval(x)).apply(lambda x: list(map(correct_non_eng_authors, x)))
data = data[data['Song'].apply(lambda x: '(Skit)' not in x and '(Live' not in x)]
data = data[data['Singer'].isin((data.groupby('Singer')['Song'].nunique() >= 10).replace({False: np.nan}).dropna().index)]
data.sort_values(by=['Singer', 'Album'], inplace=True)
data.reset_index(drop=True, inplace=True)
print(data.shape)

(248325, 5)
Wall time: 6.05 s


(79965, 5)  
Wall time: 2.46 s

In [6]:
data.head()

Unnamed: 0,Singer,Album,Song,Featuring,Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[],"\n\n[Hook: Gangsta Pat]\nKiller, killer, kille..."
1,$UICIDEBOY$,7th or St. Tammany,7th or St. Tammany,[],"\n\nUh\n*59, yah!\nUhhhh, [?]\n*59 motherfucke..."
2,$UICIDEBOY$,7th or St. Tammany,Dead Batteries,[],\n\n[Produced by Budd Dwyer]\n\n[Intro: Frayse...
3,$UICIDEBOY$,7th or St. Tammany,Drugs/Hoes/Money/Etc.,[],\n\n[Verse 1: YUNG $NOW]\nCounting sheep until...
4,$UICIDEBOY$,7th or St. Tammany,I’ll Pay for It (If I Want It),[],\n\n[Verse 1: RUBY DA CHERRY]\nFucking Ruby go...


In [7]:
data.to_csv('songs_dataset.csv', encoding='utf-8')

# Get parts dataset from songs

In [8]:
songs = pd.read_csv('songs_dataset.csv', index_col=0, encoding='utf-8')

In [9]:
# Write to file all singers from dataset
with open ('singers.txt', 'w', encoding='utf-8') as file:
    file.write("\n".join(songs['Singer'].unique().tolist()))

In [10]:
# Get lyrics from part test i.e. remove first row with part name and singer's name
def get_part_lyrics(part):
    part = part[1:]
    return '\n'.join(part)

In [11]:
# Get info about song author, song album, song name, all singres of song, 
# singer of part, part name, some debug info (first row of part text) and part lyrics
def get_part_info(part, all_singers, author, album, song):
    singer = None
    brackets = part[0]
    brackets_parts = re.split(':|—', brackets[1:-1])
#     brackets_parts = brackets[1:-1].split(':')
    if len(brackets_parts) == 1:
        singer = all_singers[0]
        part_name = brackets_parts[0]
    else:
        part_singers = brackets_parts[1]
        part_singers = list(map(str.strip, re.split(',|&', part_singers)))
        for part_singer in part_singers:
            if part_singer in all_singers:
                singer = part_singer
                break
        part_name = brackets_parts[0]
    part_lyrics = get_part_lyrics(part)
    if not singer:
        singer = author
    if part_lyrics == '':
        return None
    return [author, album, song, all_singers, singer, part_name, brackets[1:-1], part_lyrics]

In [12]:
# We divide song text by parts via \n's and brackets and then get info from them separately
def get_parts_from_song(song):
    feats = ast.literal_eval(song['Featuring'])
    author, album, song, _, lyrics = song.values
    all_singers = list(set([author] + feats))
#     print(author, '-', album, '-', song, '-', feats, '-', all_singers)
    
    indices = [i for i, ltr in enumerate(lyrics) if ltr == '[' and 
                                                    lyrics[i-1] == '\n' and 
                                                    lyrics[i-2] == '\n'] + [len(lyrics)]
    song_parts = [lyrics[indices[i]:indices[i+1]].strip().split('\n') for i, index in enumerate(indices[:-1])]
    song_parts_info = [get_part_info(song_part, all_singers, author, album, song) for song_part in song_parts]
    song_parts_info = list(filter(None, song_parts_info))

    df = pd.DataFrame(song_parts_info, columns=['Singer', 'Album', 'Song', 'All Singers', 
                                                'Part Singer', 'Part Name', 'Part Debug', 'Part Lyrics'])
    return df

In [13]:
%%time
# Get parts dataframe
parts = songs.apply(get_parts_from_song, axis=1)
parts = pd.concat(parts.tolist(), axis=0)
parts.reset_index(drop=True, inplace=True)
print(parts.shape)

(812842, 8)
Wall time: 6min 26s


(375992, 8)  
Wall time: 1min 51s

In [14]:
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer, killer\nWhat the fuck you gon'..."
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Verse 1,Verse 1: YUNG $NOW,"Bitch, I be the $nowmane\nFifty nine shades of..."
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'..."
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Verse 2,Verse 2: ANTHONY MARS,"Yesterday, I rode my bike to the moon\nOr thou..."
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'..."


In [15]:
# Save raw parts dataset 
parts.to_csv('raw_parts_dataset.csv')

# Raw parts data processing

In [16]:
parts = pd.read_csv('raw_parts_dataset.csv', index_col=0)
# Some artifacts fix
parts['Part Name'] = parts['Part Name'].replace(to_replace={np.nan: 'Verse'})
parts['Part Debug'] = parts['Part Debug'].replace(to_replace={np.nan: ''})
parts['All Singers'] = parts['All Singers'].apply(lambda x: x.replace('\\xa0', ' '))
parts['Part Singer'] = parts['Part Singer'].apply(lambda x: x.replace('\\xa0', ' '))
parts['Part Debug'] = parts['Part Debug'].apply(lambda x: x.replace('\\xa0', ' '))
# Drop one-liners
indices_to_drop = parts[parts['Part Lyrics'].apply(lambda x: len(x.split('\n'))) == 1].index
parts.drop(indices_to_drop, inplace=True)
parts.reset_index(drop=True, inplace=True)
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer, killer\nWhat the fuck you gon'..."
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 1,Verse 1: YUNG $NOW,"Bitch, I be the $nowmane\nFifty nine shades of..."
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'..."
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 2,Verse 2: ANTHONY MARS,"Yesterday, I rode my bike to the moon\nOr thou..."
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'..."


In [17]:
# After dropping one-liners
parts.shape

(782239, 8)

(363932, 8)

In [18]:
# Format part name for further generalization
def format_part_name(part_name):
    if pd.isna(part_name):
        return ''
    part_name = re.sub('[-–—— ]+', ' ', part_name)
    part_name = part_name.strip()
    part_name = part_name.lower()
    return part_name

In [19]:
parts['Part Name 2'] = parts['Part Name'].apply(lambda x: format_part_name(x))
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics,Part Name 2
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer, killer\nWhat the fuck you gon'...",hook
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 1,Verse 1: YUNG $NOW,"Bitch, I be the $nowmane\nFifty nine shades of...",verse 1
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'...",hook
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 2,Verse 2: ANTHONY MARS,"Yesterday, I rode my bike to the moon\nOr thou...",verse 2
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'...",hook


In [20]:
# Generalize different parts names, mistakes and strange things
def generalize_part_name(in_part_name):
    other_ = ['collision', 'spoken word', 'drop', 'spoken', 'talking', 'solo', '?', 'interview', 'inaudioble',
              'break', 'breakdown', 'sample', 'all', 'pre verse', 'both', 'instrumental', 'voicemail', 
              'dialogue', 'ad libs', 'singer', 'poet', 'man', 'instrumentale', 'pause', 'refarin',
              'pont', 'Other']
    intro_ = ['интро', 'интерлюдия', 'intro', 'into', 'interlude', 'introduction', 'Intro']
    outro_ = ['аутро', 'завершение', 'outro', 'Outro']
    bridge_ = ['бридж', 'переход', 'puente', 'bridge', 'brige', 'Bridge']
    hook_ = ['хук', 'hook', 'Hook']
    skit_ = ['скит', 'skit', 'Skit']
    refrain_ = ['рефрен', 'refrain', 'Refrain']
    chorus_ = ['припев', 'estribillo', 'ritornello', 'coro', 'chorus', 'chrous', 'Chorus']
    verse_ = ['куплет', 'strophe', 'couplet', 'verso', 'verse', 'Verse']
    
    part_name_variations = {'Other': other_, 
                            'Intro': intro_, 
                            'Outro': outro_, 
                            'Bridge': bridge_, 
                            'Hook': hook_, 
                            'Skit': skit_, 
                            'Refrain': refrain_, 
                            'Chorus': chorus_, 
                            'Verse': verse_}
    out_part_name = None
    for part_name in part_name_variations:
        if any([variation in in_part_name for variation in part_name_variations[part_name]]):
            out_part_name = part_name
    return out_part_name

In [21]:
parts['Part Name 3'] = parts['Part Name 2'].apply(generalize_part_name)
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics,Part Name 2,Part Name 3
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer, killer\nWhat the fuck you gon'...",hook,Hook
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 1,Verse 1: YUNG $NOW,"Bitch, I be the $nowmane\nFifty nine shades of...",verse 1,Verse
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'...",hook,Hook
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 2,Verse 2: ANTHONY MARS,"Yesterday, I rode my bike to the moon\nOr thou...",verse 2,Verse
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'...",hook,Hook


In [22]:
# Attempts to process incorrect names
def correct_part_name(parts):
    all_singers = parts['All Singers']
    part_singer = parts['Part Singer']
    part_name = parts['Part Name']
    new_part_singer = part_singer
    new_part_name = part_name 
    if part_name in all_singers:
        new_part_singer = part_name
        new_part_name = 'Verse'
    elif part_singer in part_name:
        new_part_singer = part_singer
        new_part_name = 'Verse'
    return new_part_singer, new_part_name

In [23]:
parts['Part Name 3'].value_counts(dropna=False)

Verse      293938
Chorus     261102
Hook        60150
Bridge      42192
Outro       38581
Intro       35709
NaN         32065
Refrain     10392
Other        7533
Skit          577
Name: Part Name 3, dtype: int64

In [24]:
null_index = parts[pd.isna(parts['Part Name 3'])].index
res = parts.loc[null_index].apply(correct_part_name, axis=1)
parts.loc[null_index, 'Part Singer'] = res.apply(lambda x: x[0])
parts.loc[null_index, 'Part Name 3'] = res.apply(lambda x: x[1])

parts['Part Name 3'] = parts['Part Name 3'].apply(generalize_part_name)
parts['Part Name'] = parts['Part Name 3']
parts.drop(['Part Name 2', 'Part Name 3'], axis=1, inplace=True)

In [25]:
parts['Part Name'].value_counts(dropna=False)

Verse      304456
Chorus     261102
Hook        60150
Bridge      42192
Outro       38581
Intro       35709
NaN         21547
Refrain     10392
Other        7533
Skit          577
Name: Part Name, dtype: int64

In [26]:
null_index = parts[pd.isna(parts['Part Name'])].index
parts['Part Debug'] = parts['Part Debug'].apply(lambda x: format_part_name(x))
parts['Part Name 2'] = parts['Part Debug'].apply(generalize_part_name)
null_index = parts[pd.isna(parts['Part Name']) & pd.notna(parts['Part Name 2'])].index
parts['Part Name'].loc[null_index] = parts['Part Name 2'].loc[null_index]

parts = parts[['Singer', 'Album', 'Song', 'All Singers', 'Part Singer', 'Part Name', 'Part Lyrics']]

In [27]:
parts['Part Name'].value_counts(dropna=False)

Verse      304549
Chorus     261142
Hook        60160
Bridge      42197
Outro       38586
Intro       35720
NaN         21074
Refrain     10392
Other        7842
Skit          577
Name: Part Name, dtype: int64

In [28]:
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,"Killer, killer, killer\nWhat the fuck you gon'..."
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse,"Bitch, I be the $nowmane\nFifty nine shades of..."
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,"Killer, killer\nWhat the fuck you gon' do?\nI'..."
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse,"Yesterday, I rode my bike to the moon\nOr thou..."
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,"Killer, killer\nWhat the fuck you gon' do?\nI'..."


In [29]:
parts.shape

(782239, 7)

(363932, 7)

In [30]:
parts.drop_duplicates(subset='Part Lyrics', keep='last', inplace=True)
parts.sort_values(by=['Singer', 'Album'], inplace=True)
parts.reset_index(drop=True, inplace=True)
parts.shape

(664329, 7)

(304639, 7)

In [31]:
parts.isnull().sum()

Singer             0
Album          88382
Song               0
All Singers        0
Part Singer        0
Part Name      20008
Part Lyrics        0
dtype: int64

In [32]:
parts.to_csv('parts_dataset.csv')