In [1]:
import pandas as pd
import numpy as np
import json
import ast
import os
import re

In [None]:
songs = pd.read_csv('songs_dataset.csv', index_col=0)
songs.head()

In [None]:
songs.shape

In [None]:
def get_part_lyrics(part):
    part = part[1:]
    return '\n'.join(part)

In [None]:
def get_part_info(part, all_singers, author, album, song):
    singer = None
    brackets = part[0]
    brackets_parts = re.split(':|—', brackets[1:-1])
    brackets_parts = brackets[1:-1].split(':')
    if len(brackets_parts) == 1:
        singer = all_singers[0]
        part_name = brackets_parts[0]
    else:
        part_singers = brackets_parts[1]
        part_singers = list(map(str.strip, re.split(',|&', part_singers)))
        for part_singer in part_singers:
            if part_singer in all_singers:
                singer = part_singer
                break
        part_name = brackets_parts[0]
    part_lyrics = get_part_lyrics(part)
    if not singer:
        singer = author
    if part_lyrics == '':
        return None
    return [author, album, song, all_singers, singer, part_name, part_lyrics]

In [None]:
def get_parts_from_song(song):
    feats = ast.literal_eval(song['Featuring'])
    author, album, song, _, lyrics = song.values
    all_singers = [author] + feats
#     print(author, '-', album, '-', song, '-', feats, '-', all_singers)
    
    indices = [i for i, ltr in enumerate(lyrics) if ltr == '[' 
                                                and lyrics[i-1] == '\n'
                                                and lyrics[i-2] == '\n'] + [len(lyrics)]
    song_parts = [lyrics[indices[i]:indices[i+1]].strip().split('\n') for i, index in enumerate(indices[:-1])]
    song_parts_info = [get_part_info(song_part, all_singers, author, album, song) for song_part in song_parts]
    song_parts_info = list(filter(None, song_parts_info))

    df = pd.DataFrame(song_parts_info, columns=['Author', 'Album', 'Song', 'All Singers', 
                                                'Part Singer', 'Part Name', 'Part Lyrics'])
    return df

In [None]:
%%time
parts = songs.apply(get_parts_from_song, axis=1)
parts = pd.concat(parts.tolist(), axis=0)
parts.reset_index(drop=True, inplace=True)

In [None]:
parts.head()

In [None]:
parts.to_csv('parts_dataset.csv')

In [28]:
parts = pd.read_csv('parts_dataset.csv', index_col=0)
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
0,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Interlude,"""They changed so much, you know what I'm sayin..."
1,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse 1,"Yeah, I was the wave, but now I feel the tide ..."
2,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse 2,Two cups and a bad bitch\nThen I pop me a Xana...
3,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Refrain,"4 AM, praying, ""can I get some rest?""\nDreams ..."
4,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Outro,"""Happening now, streets blocked off as New Orl..."


In [29]:
parts.dropna(inplace=True)

In [30]:
parts.to_csv('parts_dataset.csv')

In [31]:
parts = pd.read_csv('parts_dataset.csv', index_col=0)
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
0,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Interlude,"""They changed so much, you know what I'm sayin..."
1,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse 1,"Yeah, I was the wave, but now I feel the tide ..."
2,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse 2,Two cups and a bad bitch\nThen I pop me a Xana...
3,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Refrain,"4 AM, praying, ""can I get some rest?""\nDreams ..."
4,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Outro,"""Happening now, streets blocked off as New Orl..."


In [32]:
def format_part_name(part_name):
    part_name = re.sub('[-–—— ]+', ' ', part_name)
    part_name = part_name.strip()
    part_name = part_name.lower()
    return part_name

In [33]:
parts['Part Name 2'] = parts['Part Name'].apply(lambda x: format_part_name(x))

In [34]:
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics,Part Name 2
0,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Interlude,"""They changed so much, you know what I'm sayin...",interlude
1,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse 1,"Yeah, I was the wave, but now I feel the tide ...",verse 1
2,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse 2,Two cups and a bad bitch\nThen I pop me a Xana...,verse 2
3,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Refrain,"4 AM, praying, ""can I get some rest?""\nDreams ...",refrain
4,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Outro,"""Happening now, streets blocked off as New Orl...",outro


In [35]:
def fix_part_name(in_part_name):
    other_ = ['collision', 'spoken word', 'drop', 'spoken', 'talking', 'solo', '?', 'interview', 'inaudioble',
              'break', 'breakdown', 'sample', 'all', 'pre verse', 'both', 'instrumental', 'voicemail', 'Other']
    intro_ = ['интро', 'интерлюдия', 'intro', 'interlude', 'introduction', 'Intro']
    outro_ = ['аутро', 'завершение', 'outro', 'Outro']
    bridge_ = ['бридж', 'переход', 'puente', 'bridge', 'brige', 'Bridge']
    hook_ = ['хук', 'hook', 'Hook']
    skit_ = ['скит', 'skit', 'Skit']
    refrain_ = ['рефрен', 'refrain', 'Refrain']
    chorus_ = ['припев', 'estribillo', 'ritornello', 'coro', 'chorus', 'chrous', 'Chorus']
    verse_ = ['куплет', 'strophe', 'couplet', 'verso', 'verse', 'Verse']
    
    part_names_variations = {'Other': other_, 
                             'Intro': intro_, 
                             'Outro': outro_, 
                             'Bridge': bridge_, 
                             'Hook': hook_, 
                             'Skit': skit_, 
                             'Refrain': refrain_, 
                             'Chorus': chorus_, 
                             'Verse': verse_}
    out_part_name = None
    for part_name in part_names_variations:
        if any([variation in in_part_name for variation in part_names_variations[part_name]]):
            out_part_name = part_name
    return out_part_name

In [36]:
parts['Part Name 2'] = parts['Part Name 2'].apply(fix_part_name)

In [37]:
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics,Part Name 2
0,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Interlude,"""They changed so much, you know what I'm sayin...",Intro
1,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse 1,"Yeah, I was the wave, but now I feel the tide ...",Verse
2,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse 2,Two cups and a bad bitch\nThen I pop me a Xana...,Verse
3,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Refrain,"4 AM, praying, ""can I get some rest?""\nDreams ...",Refrain
4,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Outro,"""Happening now, streets blocked off as New Orl...",Outro


In [38]:
drop_indices = parts[pd.isna(parts['Part Name 2']) & 
                     (parts['Part Lyrics'].apply(lambda x: len(x.split('\n'))) == 1)].index
parts.drop(drop_indices, inplace=True)
parts.reset_index(drop=True, inplace=True)
parts.shape

(190261, 8)

In [39]:
parts[pd.isna(parts['Part Name 2'])]

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics,Part Name 2
1283,2 Chainz,Trapavelli Tre,Each Erry One Of Em,"['2 Chainz', 'Skooly', 'Cap-1']",2 Chainz,2 Chainz],I’m a pull my whips out\nEach and erry one of ...,
1332,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,C-White,Strap in this bitch fucking up\nAli in this bi...,
1333,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,2 Chainz,I'm in this bitch fucking up\nI wish Testaross...,
1334,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Young Dolph,Y'all playing we fuck it up\nMedecine in my do...,
1335,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Skooly,I'm in Magic City drunk as fuck\nAfter that we...,
1336,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Skooly,I'm in this bitch and I'm geeked up\nAll about...,
1337,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Young Dolph,Yo' hoe up in here choosing us\nShe only want ...,
1338,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Skooly,Took a 20 turned it to a 50\nI'm a menace all ...,
1339,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Greazy,I put your ho on the boot\nShe sucking and fuc...,
1340,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Juju,"You playing ? We fucking it up !\nFuck it up, ...",


In [40]:
parts.iloc[[1283], :]

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics,Part Name 2
1283,2 Chainz,Trapavelli Tre,Each Erry One Of Em,"['2 Chainz', 'Skooly', 'Cap-1']",2 Chainz,2 Chainz],I’m a pull my whips out\nEach and erry one of ...,


In [41]:
parts.iloc[1332:1342, :]

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics,Part Name 2
1332,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,C-White,Strap in this bitch fucking up\nAli in this bi...,
1333,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,2 Chainz,I'm in this bitch fucking up\nI wish Testaross...,
1334,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Young Dolph,Y'all playing we fuck it up\nMedecine in my do...,
1335,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Skooly,I'm in Magic City drunk as fuck\nAfter that we...,
1336,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Skooly,I'm in this bitch and I'm geeked up\nAll about...,
1337,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Young Dolph,Yo' hoe up in here choosing us\nShe only want ...,
1338,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Skooly,Took a 20 turned it to a 50\nI'm a menace all ...,
1339,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Greazy,I put your ho on the boot\nShe sucking and fuc...,
1340,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Juju,"You playing ? We fucking it up !\nFuck it up, ...",
1341,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,2 Chainz,I'm in this bitch fucking paper up\nYou didn't...,


In [42]:
def correct_part_name(parts):
    all_singers = parts['All Singers']
    part_singer = parts['Part Singer']
    part_name = parts['Part Name']
    new_part_singer = part_singer
    new_part_name = part_name 
    if part_name in all_singers:
        new_part_singer = part_name
        new_part_name = 'Chorus'
    elif part_singer in part_name:
        new_part_singer = part_singer
        new_part_name = 'Chorus'
    return new_part_singer, new_part_name

In [43]:
parts['Part Name 2'].value_counts(dropna=False)

Verse      65973
Chorus     58541
Hook       20611
Intro      13845
Outro      10375
Bridge      9591
NaN         7269
Other       1965
Refrain     1815
Skit         276
Name: Part Name 2, dtype: int64

In [44]:
null_index = parts[pd.isna(parts['Part Name 2'])].index
null_index

Int64Index([  1283,   1332,   1333,   1334,   1335,   1336,   1337,   1338,
              1339,   1340,
            ...
            190188, 190189, 190190, 190191, 190192, 190193, 190194, 190217,
            190234, 190256],
           dtype='int64', length=7269)

In [45]:
res = parts.loc[null_index].apply(correct_part_name, axis=1)
parts.loc[null_index, 'Part Singer'] = res.apply(lambda x: x[0])
parts.loc[null_index, 'Part Name 2'] = res.apply(lambda x: x[1])

In [46]:
parts['Part Name 2'] = parts['Part Name 2'].apply(fix_part_name)
parts['Part Name'] = parts['Part Name 2']
parts.drop(['Part Name 2'], axis=1, inplace=True)
parts.head()

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
0,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Intro,"""They changed so much, you know what I'm sayin..."
1,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse,"Yeah, I was the wave, but now I feel the tide ..."
2,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Verse,Two cups and a bad bitch\nThen I pop me a Xana...
3,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Refrain,"4 AM, praying, ""can I get some rest?""\nDreams ..."
4,$uicideBoy$,I Want to Die in New Orleans,King Tulip,['$uicideBoy$'],$uicideBoy$,Outro,"""Happening now, streets blocked off as New Orl..."


In [47]:
parts['Part Name'].value_counts(dropna=False)

Verse      65973
Chorus     62462
Hook       20611
Intro      13845
Outro      10375
Bridge      9591
NaN         3348
Other       1965
Refrain     1815
Skit         276
Name: Part Name, dtype: int64

In [48]:
parts.iloc[[1283], :]

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
1283,2 Chainz,Trapavelli Tre,Each Erry One Of Em,"['2 Chainz', 'Skooly', 'Cap-1']",2 Chainz,Chorus,I’m a pull my whips out\nEach and erry one of ...


In [49]:
parts.iloc[1332:1342, :]

Unnamed: 0,Author,Album,Song,All Singers,Part Singer,Part Name,Part Lyrics
1332,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,,Strap in this bitch fucking up\nAli in this bi...
1333,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Chorus,I'm in this bitch fucking up\nI wish Testaross...
1334,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",Young Dolph,Chorus,Y'all playing we fuck it up\nMedecine in my do...
1335,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",Skooly,Chorus,I'm in Magic City drunk as fuck\nAfter that we...
1336,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",Skooly,Chorus,I'm in this bitch and I'm geeked up\nAll about...
1337,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",Young Dolph,Chorus,Yo' hoe up in here choosing us\nShe only want ...
1338,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",Skooly,Chorus,Took a 20 turned it to a 50\nI'm a menace all ...
1339,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",Greazy,Chorus,I put your ho on the boot\nShe sucking and fuc...
1340,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,,"You playing ? We fucking it up !\nFuck it up, ..."
1341,2 Chainz,T.R.U. Jack City,We Fukin It Up,"['2 Chainz', 'Money Crew Ju', 'Bankroll Fresh'...",2 Chainz,Chorus,I'm in this bitch fucking paper up\nYou didn't...
