In [1]:
import pandas as pd
import numpy as np
import ast
import os
import re

In [2]:
songs = pd.read_csv('songs_dataset.csv', index_col=0)
songs.head()

Unnamed: 0,Author,Album,Song,Featuring,Lyrics
0,$uicideBoy$,7th or St. Tammany,7th or St. Tammany,[],"\n\nUh\n*59, yah!\nUhhhh, [?]\n*59 motherfucke..."
1,$uicideBoy$,7th or St. Tammany,Dead Batteries,[],\n\n[Produced by Budd Dwyer]\n\n[Intro: Frayse...
2,$uicideBoy$,7th or St. Tammany,I’ll Pay for It (If I Want It),[],\n\n[Verse 1: RUBY DA CHERRY]\nFucking Ruby go...
3,$uicideBoy$,7th or St. Tammany,That’s Very Minimalist of You,[],\n\n[Verse 1: RUBY DA CHERRY]\nRuby casting sh...
4,$uicideBoy$,7th or St. Tammany,Romulus,[],\n\n[Intro: YUNG HANK MOODY]\nGrey*59\nGrey*59...


In [3]:
songs.shape

(51343, 5)

In [5]:
with open ('singers_.txt', 'w') as file:
    file.write("\n".join(songs['Author'].unique().tolist()))

In [6]:
def get_part_lyrics(part):
    part = part[1:]
    return '\n'.join(part)

In [7]:
def get_part_info(part, all_singers, author, album, song):
    singer = None
    brackets = part[0]
    brackets_parts = re.split(':|—', brackets[1:-1])
#     brackets_parts = brackets[1:-1].split(':')
    if len(brackets_parts) == 1:
        singer = all_singers[0]
        part_name = brackets_parts[0]
    else:
        part_singers = brackets_parts[1]
        part_singers = list(map(str.strip, re.split(',|&', part_singers)))
        for part_singer in part_singers:
            if part_singer in all_singers:
                singer = part_singer
                break
        part_name = brackets_parts[0]
    part_lyrics = get_part_lyrics(part)
    if not singer:
        singer = author
    if part_lyrics == '':
        return None
    return [author, album, song, all_singers, singer, part_name, brackets[1:-1], part_lyrics]

In [8]:
def get_parts_from_song(song):
    feats = ast.literal_eval(song['Featuring'])
    author, album, song, _, lyrics = song.values
    all_singers = [author] + feats
#     print(author, '-', album, '-', song, '-', feats, '-', all_singers)
    
    indices = [i for i, ltr in enumerate(lyrics) if ltr == '[' 
                                                and lyrics[i-1] == '\n'
                                                and lyrics[i-2] == '\n'] + [len(lyrics)]
    song_parts = [lyrics[indices[i]:indices[i+1]].strip().split('\n') for i, index in enumerate(indices[:-1])]
    song_parts_info = [get_part_info(song_part, all_singers, author, album, song) for song_part in song_parts]
    song_parts_info = list(filter(None, song_parts_info))

    df = pd.DataFrame(song_parts_info, columns=['Author', 'Album', 'Song', 'All Singers', 
                                                'Part Singer', 'Part Name', 'Part Debug', 'Part Lyrics'])
    return df

In [9]:
%%time
parts = songs.apply(get_parts_from_song, axis=1)
parts = pd.concat(parts.tolist(), axis=0)
parts.reset_index(drop=True, inplace=True)

Wall time: 1min 4s


In [10]:
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics
0,$uicideBoy$,7th or St. Tammany,Dead Batteries,[$uicideBoy$],$uicideBoy$,Intro,Intro: Frayser Boy,"Hit the, hit the, hit the—\nHit the tone, cock..."
1,$uicideBoy$,7th or St. Tammany,Dead Batteries,[$uicideBoy$],$uicideBoy$,Verse 1,Verse 1: YUNG DEATH/LIL LIFE,Bitches know I got the vendetta\nRollin' up th...
2,$uicideBoy$,7th or St. Tammany,Dead Batteries,[$uicideBoy$],$uicideBoy$,Hook,Hook: Frayser Boy,"Hit the tone, cock it back, ho, I gotta holla ..."
3,$uicideBoy$,7th or St. Tammany,Dead Batteries,[$uicideBoy$],$uicideBoy$,Verse 2,Verse 2: ODDY NUFF DA $NOW LEOPARD,Here I come to paint it grey\nWhip look like I...
4,$uicideBoy$,7th or St. Tammany,Dead Batteries,[$uicideBoy$],$uicideBoy$,Hook,Hook: Frayser Boy,"Hit the tone, cock it back, ho, I gotta holla ..."


In [11]:
parts.shape

(264227, 8)

In [12]:
parts.to_csv('parts_dataset.csv')

##### "bad" text will convert into np.nan by itself while saving to csv so we can handle it

In [13]:
parts = pd.read_csv('parts_dataset.csv', index_col=0)
parts['Part Name'] = parts['Part Name'].replace(to_replace={np.nan: 'Verse'})
parts['Part Debug'] = parts['Part Debug'].replace(to_replace={np.nan: ''})
indices_to_drop = parts[parts['Part Lyrics'].apply(lambda x: len(x.split('\n'))) == 1].index
parts.drop(indices_to_drop, inplace=True)
parts.reset_index(drop=True, inplace=True)
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics
0,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Intro,Intro: Frayser Boy,"Hit the, hit the, hit the—\nHit the tone, cock..."
1,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Verse 1,Verse 1: YUNG DEATH/LIL LIFE,Bitches know I got the vendetta\nRollin' up th...
2,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Hook,Hook: Frayser Boy,"Hit the tone, cock it back, ho, I gotta holla ..."
3,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Verse 2,Verse 2: ODDY NUFF DA $NOW LEOPARD,Here I come to paint it grey\nWhip look like I...
4,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Hook,Hook: Frayser Boy,"Hit the tone, cock it back, ho, I gotta holla ..."


In [14]:
parts.shape

(255930, 8)

In [15]:
def format_part_name(part_name):
    if pd.isna(part_name):
        return ''
    part_name = re.sub('[-–—— ]+', ' ', part_name)
    part_name = part_name.strip()
    part_name = part_name.lower()
    return part_name

parts['Part Name 2'] = parts['Part Name'].apply(lambda x: format_part_name(x))
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics,Part Name 2
0,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Intro,Intro: Frayser Boy,"Hit the, hit the, hit the—\nHit the tone, cock...",intro
1,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Verse 1,Verse 1: YUNG DEATH/LIL LIFE,Bitches know I got the vendetta\nRollin' up th...,verse 1
2,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Hook,Hook: Frayser Boy,"Hit the tone, cock it back, ho, I gotta holla ...",hook
3,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Verse 2,Verse 2: ODDY NUFF DA $NOW LEOPARD,Here I come to paint it grey\nWhip look like I...,verse 2
4,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Hook,Hook: Frayser Boy,"Hit the tone, cock it back, ho, I gotta holla ...",hook


In [16]:
def generalize_part_name(in_part_name):
    other_ = ['collision', 'spoken word', 'drop', 'spoken', 'talking', 'solo', '?', 'interview', 'inaudioble',
              'break', 'breakdown', 'sample', 'all', 'pre verse', 'both', 'instrumental', 'voicemail', 
              'dialogue', 'ad libs', 'singer', 'poet', 'man', 'instrumentale', 'pause', 'refarin',
              'pont', 'Other']
    intro_ = ['интро', 'интерлюдия', 'intro', 'into', 'interlude', 'introduction', 'Intro']
    outro_ = ['аутро', 'завершение', 'outro', 'Outro']
    bridge_ = ['бридж', 'переход', 'puente', 'bridge', 'brige', 'Bridge']
    hook_ = ['хук', 'hook', 'Hook']
    skit_ = ['скит', 'skit', 'Skit']
    refrain_ = ['рефрен', 'refrain', 'Refrain']
    chorus_ = ['припев', 'estribillo', 'ritornello', 'coro', 'chorus', 'chrous', 'Chorus']
    verse_ = ['куплет', 'strophe', 'couplet', 'verso', 'verse', 'Verse']
    
    part_names_variations = {'Other': other_, 
                             'Intro': intro_, 
                             'Outro': outro_, 
                             'Bridge': bridge_, 
                             'Hook': hook_, 
                             'Skit': skit_, 
                             'Refrain': refrain_, 
                             'Chorus': chorus_, 
                             'Verse': verse_}
    out_part_name = None
    for part_name in part_names_variations:
        if any([variation in in_part_name for variation in part_names_variations[part_name]]):
            out_part_name = part_name
    return out_part_name

parts['Part Name 3'] = parts['Part Name 2'].apply(generalize_part_name)
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics,Part Name 2,Part Name 3
0,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Intro,Intro: Frayser Boy,"Hit the, hit the, hit the—\nHit the tone, cock...",intro,Intro
1,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Verse 1,Verse 1: YUNG DEATH/LIL LIFE,Bitches know I got the vendetta\nRollin' up th...,verse 1,Verse
2,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Hook,Hook: Frayser Boy,"Hit the tone, cock it back, ho, I gotta holla ...",hook,Hook
3,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Verse 2,Verse 2: ODDY NUFF DA $NOW LEOPARD,Here I come to paint it grey\nWhip look like I...,verse 2,Verse
4,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Hook,Hook: Frayser Boy,"Hit the tone, cock it back, ho, I gotta holla ...",hook,Hook


In [17]:
def correct_part_name(parts):
    all_singers = parts['All Singers']
    part_singer = parts['Part Singer']
    part_name = parts['Part Name']
    new_part_singer = part_singer
    new_part_name = part_name 
    if part_name in all_singers:
        new_part_singer = part_name
        new_part_name = 'Verse'
    elif part_singer in part_name:
        new_part_singer = part_singer
        new_part_name = 'Verse'
    return new_part_singer, new_part_name

In [18]:
parts['Part Name 3'].value_counts(dropna=False)

Verse      92971
Chorus     85965
Hook       23321
Bridge     14332
Intro      14060
Outro      12663
NaN         7635
Other       2534
Refrain     2176
Skit         273
Name: Part Name 3, dtype: int64

In [19]:
null_index = parts[pd.isna(parts['Part Name 3'])].index
res = parts.loc[null_index].apply(correct_part_name, axis=1)
parts.loc[null_index, 'Part Singer'] = res.apply(lambda x: x[0])
parts.loc[null_index, 'Part Name 3'] = res.apply(lambda x: x[1])

parts['Part Name 3'] = parts['Part Name 3'].apply(generalize_part_name)
parts['Part Name'] = parts['Part Name 3']
parts.drop(['Part Name 2', 'Part Name 3'], axis=1, inplace=True)

In [20]:
parts['Part Name'].value_counts(dropna=False)

Verse      96763
Chorus     85965
Hook       23321
Bridge     14332
Intro      14060
Outro      12663
NaN         3843
Other       2534
Refrain     2176
Skit         273
Name: Part Name, dtype: int64

In [21]:
null_index = parts[pd.isna(parts['Part Name'])].index
parts['Part Debug'] = parts['Part Debug'].apply(lambda x: format_part_name(x))
parts['Part Name 2'] = parts['Part Debug'].apply(generalize_part_name)
null_index = parts[pd.isna(parts['Part Name']) & pd.notna(parts['Part Name 2'])].index
parts['Part Name'].loc[null_index] = parts['Part Name 2'].loc[null_index]

parts = parts[['Author', 'Album', 'Song', 'All Singers', 'Part Singer', 'Part Name', 'Part Lyrics']]

In [22]:
parts['Part Name'].value_counts(dropna=False)

Verse      96779
Chorus     85977
Hook       23325
Bridge     14334
Intro      14062
Outro      12665
NaN         3781
Other       2558
Refrain     2176
Skit         273
Name: Part Name, dtype: int64

In [23]:
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
0,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Intro,"Hit the, hit the, hit the—\nHit the tone, cock..."
1,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Verse,Bitches know I got the vendetta\nRollin' up th...
2,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Hook,"Hit the tone, cock it back, ho, I gotta holla ..."
3,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Verse,Here I come to paint it grey\nWhip look like I...
4,$uicideBoy$,7th or St. Tammany,Dead Batteries,['$uicideBoy$'],$uicideBoy$,Hook,"Hit the tone, cock it back, ho, I gotta holla ..."


In [24]:
parts.shape

(255930, 7)

In [29]:
parts[(parts['Author'] == 'Monetochka') & (True)]

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
156331,Monetochka,Раскраски для взрослых (Adult Coloring Books),Русский ковчег (Russian Ark),['Monetochka'],Monetochka,Chorus,Разбитый ковчег всплывает со дна\nИ вечно жить...
156332,Monetochka,Раскраски для взрослых (Adult Coloring Books),Русский ковчег (Russian Ark),['Monetochka'],Monetochka,Verse,В аромате хмельном джентльмены пускают дым\nИх...
156333,Monetochka,Раскраски для взрослых (Adult Coloring Books),Русский ковчег (Russian Ark),['Monetochka'],Monetochka,Chorus,Разбитый ковчег всплывает со дна\nИ вечно жить...
156334,Monetochka,Раскраски для взрослых (Adult Coloring Books),Русский ковчег (Russian Ark),['Monetochka'],Monetochka,Verse,"Говорят, что причала нет, мы обречены, и где-т..."
156335,Monetochka,Раскраски для взрослых (Adult Coloring Books),Русский ковчег (Russian Ark),['Monetochka'],Monetochka,Skit,Россия! Россия! Россия!\nУкраина! Украина!
156336,Monetochka,Раскраски для взрослых (Adult Coloring Books),Русский ковчег (Russian Ark),['Monetochka'],Monetochka,Bridge,Сердце не требует перемен\nДремлет в каюте уст...
156337,Monetochka,Раскраски для взрослых (Adult Coloring Books),Русский ковчег (Russian Ark),['Monetochka'],Monetochka,Chorus,Разбитый ковчег всплывает со дна\nИ вечно жить...
156338,Monetochka,Раскраски для взрослых (Adult Coloring Books),Каждый раз (Every Time),['Monetochka'],Monetochka,Chorus,"Если б мне платили каждый раз\nКаждый раз, ког..."
156339,Monetochka,Раскраски для взрослых (Adult Coloring Books),Каждый раз (Every Time),['Monetochka'],Monetochka,Verse,"«Где ты, когда ты не онлайн, когда я жду тебя ..."
156340,Monetochka,Раскраски для взрослых (Adult Coloring Books),Каждый раз (Every Time),['Monetochka'],Monetochka,Chorus,"Если б мне платили каждый раз\nКаждый раз, ког..."


In [26]:
# parts.to_csv('parts_dataset.csv')