In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import json
import os
import re
from collections import Counter

# Get songs dataset from parsed data

In [2]:
# Quick preview of the raw scraped songs: print (rows, columns) and show the first rows.
# The 'date' column is parsed as dates and the first CSV column is used as the index.
data = pd.read_csv('all_parsed_songs.csv', index_col=0, parse_dates=['date'])
print(data.shape)
data.head()

(292623, 9)


Unnamed: 0,artist,album,song,date,produced by,written by,featuring,tags,text
0,EIGHTEEN,DELIGHT,Анна (Ann),2018-11-23,"['Найкер (Nick Niker)', 'Mike Ty', 'BOL', 'EIG...",['EIGHTEEN'],[],"['Русское аренби (Russian R&B)', 'R&B', 'Русск...",\n\n[Текст песни “Анна”]\n\n[Куплет 1]\nВечери...
1,EIGHTEEN,DELIGHT,Вода (Water),2018-11-16,"['Mike Ty', 'EIGHTEEN', 'BOL', 'Найкер (Nick N...",['EIGHTEEN'],[],"['Россия (Russia)', 'Русский рэп (Russian Rap)...",\n\n[Текст песни “Вода”]\n\n[Интро]\nМои руки ...
2,EIGHTEEN,DELIGHT,Ещё (More),2018-11-23,"['Mike Ty', 'EIGHTEEN', 'BOL', 'Найкер (Nick N...",['EIGHTEEN'],[],"['Русский рэп (Russian Rap)', 'Россия (Russia)...",\n\n[Текст песни “Ещё”]\n\n[Припев]\nТы привык...
3,EIGHTEEN,,Земной шар (Globe),2018-03-27,"['Найкер (Nick Niker)', 'BOL', 'Mike Ty', 'EIG...",['EIGHTEEN'],[],"['Dance-Pop', 'R&B']","\n\n[Припев]\nЯ чувствую, когда ты танцуешь кл..."
4,EIGHTEEN,DELIGHT,Земной шар+ (Globe Plus),2018-11-23,"['BOL', 'Найкер (Nick Niker)', 'EIGHTEEN', 'Mi...",['EIGHTEEN'],[],"['Русский рэп (Russian Rap)', 'Россия (Russia)...",\n\n[Текст песни “Земной шар+”]\n\n[Припев]\nА...


In [3]:
def correct_non_eng_authors(name):
    """Return the primary artist name without its parenthesised alias.

    Names may look like ``'Найкер (Nick Niker)'``; only the text before the
    first ``'('`` is kept. Zero-width spaces (U+200B) around the name are
    removed in every case.

    Args:
        name: Raw artist name (str).

    Returns:
        The cleaned name. When nothing precedes the parenthesis
        (e.g. ``'(hed) p.e.'``) the whole name is kept instead of
        returning an empty string.
    """
    cleaned = name.strip('\u200b')
    if '(' not in cleaned:
        return cleaned
    # rstrip() rather than the former [:-1]: the fixed-width slice removed a
    # real character whenever no space preceded the parenthesis
    # ('Foo(Bar)' -> 'Fo').
    primary = cleaned.split('(', 1)[0].rstrip().strip('\u200b')
    return primary or cleaned

def format_str(str_):
    """Normalise a tag-like string for comparison.

    Removes the punctuation characters ``* . , ! : ? " ' « »``, collapses every
    run of hyphens, dashes and spaces into one space, trims both ends and
    lower-cases the result.
    """
    without_punctuation = re.sub('[*.,!:?\"\'«»]', '', str_)
    single_spaced = re.sub('[-–—— ]+', ' ', without_punctuation)
    return single_spaced.strip().lower()

In [4]:
%%time
# Load the raw scrape and build the cleaned, song-level table.
data = pd.read_csv('all_parsed_songs.csv', index_col=0, parse_dates=['date'])
data = data.drop_duplicates(subset='text', keep='last')
# Keep songs whose lyrics are longer than 300 characters. `.str.len()` yields
# NaN for missing lyrics, which compares False, so such rows are dropped
# instead of raising inside `len`.
data = data[data['text'].str.len() > 300]
data = data.rename(columns={
    'artist': 'Singer', 'album': 'Album', 'song': 'Song', 'date': 'Date',
    'featuring': 'Featuring', 'text': 'Lyrics', 'tags': 'Tags',
    'produced by': 'Producers', 'written by': 'Writers',
})[['Singer', 'Album', 'Song', 'Date', 'Featuring', 'Lyrics', 'Tags', 'Producers', 'Writers']].copy()
# Blank out dates earlier than 1700. The assignment goes through one `.loc`
# call on the frame itself: the chained form `data['date'].loc[...] = ...`
# writes to an intermediate object and is lost under pandas copy-on-write.
data.loc[data['Date'] < '1700-01-01', 'Date'] = np.nan

data['Singer'] = data['Singer'].apply(correct_non_eng_authors)
# These columns hold the string repr of a list of names: parse, then clean each name.
for column in ['Producers', 'Writers', 'Featuring']:
    data[column] = data[column].apply(
        lambda raw: [correct_non_eng_authors(name) for name in literal_eval(raw)])
data['Tags'] = data['Tags'].apply(literal_eval)

# Drop skits and live recordings (plain substring match; a missing title is kept).
is_skit_or_live = (data['Song'].str.contains('(Skit)', regex=False, na=False)
                   | data['Song'].str.contains('(Live', regex=False, na=False))
data = data[~is_skit_or_live]

# Keep only singers that still have at least 10 distinct song titles.
songs_per_singer = data.groupby('Singer')['Song'].nunique()
data = data[data['Singer'].isin(songs_per_singer[songs_per_singer >= 10].index)]
data = data.sort_values(by=['Singer', 'Album']).reset_index(drop=True)

CPU times: user 9.36 s, sys: 268 ms, total: 9.62 s
Wall time: 9.68 s


In [5]:
# introduce "Genre" field

# Normalise every tag, count tag occurrences over the whole dataset and keep
# only the tags seen at least 30 times; main genres are derived from those.
data['Tags_fmt'] = data['Tags'].apply(lambda tags_list: [format_str(tag) for tag in tags_list])
tags_dict = Counter(tag for song_tags in data['Tags_fmt'] for tag in song_tags)
tags_dict = Counter({tag: count for tag, count in tags_dict.items() if count >= 30})
all_tags = list(tags_dict)


def _tags_with(*keywords):
    """Frequent tags (from `all_tags`) containing at least one keyword as a substring."""
    return [tag for tag in all_tags if any(keyword in tag for keyword in keywords)]


# Main genre -> formatted tags that imply it. The key order fixes the order of
# the labels inside each song's 'Genre' list.
sub_genres = {
    'Metal': _tags_with('metal') + ['neue deutsche härte', 'mathcore', 'deathcore'],
    'Folk': _tags_with('folk'),
    'Rock': _tags_with('rock'),
    'Punk': _tags_with('punk') + ['post hardcore', 'melodic hardcore'],
    'Alternative/Indie': _tags_with('indie', 'alternative') + ['grunge', 'post grunge'],
    'Pop': _tags_with('pop'),
    'Country': _tags_with('country') + ['rockabilly', 'bluegrass'],
    'R&B/Soul': _tags_with('r&b', 'soul', 'funk'),
    'Blues/Jazz': _tags_with('blues', 'jazz') + ['gospel'],
    'Hip-Hop/Rap': _tags_with('hip hop', 'rap') + [
        'trap', 'west coast', 'east coast', 'drill',
        'freestyle', 'dirty south', 'chicago drill',
        'beef', 'horrorcore', 'latin trap',
        'crunk', 'nerdcore', 'afro trap',
        'uk drill', 'reggaeton', 'reggae',
        'grime', 'русский трэп (russian trap)',
        'русский грайм (russian grime)',
        'american underground',
    ],
}


def _main_genres(formatted_tags):
    """Main genres (in `sub_genres` order) that share at least one tag with the song."""
    song_tags = set(formatted_tags)
    return [main_genre for main_genre, genre_tags in sub_genres.items()
            if not song_tags.isdisjoint(genre_tags)]


data['Genre'] = data['Tags_fmt'].apply(_main_genres)
data = data[['Singer', 'Album', 'Song', 'Date', 'Featuring', 'Genre', 'Lyrics', 'Tags', 'Producers', 'Writers']]
print(data.shape)
data.head()

(253678, 10)


Unnamed: 0,Singer,Album,Song,Date,Featuring,Genre,Lyrics,Tags,Producers,Writers
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,2015-04-02,[],"[Alternative/Indie, Hip-Hop/Rap]","\n\n[Hook: Gangsta Pat]\nKiller, killer, kille...","[Alternative, Rap]",[Jck Ruby],"[Scott Arceneaux Jr., Aristos Petrou]"
1,$UICIDEBOY$,7th or St. Tammany,7th or St. Tammany,2015-04-02,[],[Hip-Hop/Rap],"\n\nUh\n*59, yah!\nUhhhh, [?]\n*59 motherfucke...",[Rap],[Budd Dwyer],"[Scott Arceneaux Jr., Aristos Petrou, Master P..."
2,$UICIDEBOY$,7th or St. Tammany,Dead Batteries,2015-03-19,[],[Hip-Hop/Rap],\n\n[Produced by Budd Dwyer]\n\n[Intro: Frayse...,[Rap],[Budd Dwyer],"[Scott Arceneaux Jr., Aristos Petrou]"
3,$UICIDEBOY$,7th or St. Tammany,Drugs/Hoes/Money/Etc.,2015-04-02,[],[Hip-Hop/Rap],\n\n[Verse 1: YUNG $NOW]\nCounting sheep until...,[Rap],[Budd Dwyer],"[Scott Arceneaux Jr., Aristos Petrou]"
4,$UICIDEBOY$,7th or St. Tammany,I’ll Pay for It (If I Want It),2015-04-02,[],[Hip-Hop/Rap],\n\n[Verse 1: RUBY DA CHERRY]\nFucking Ruby go...,[Rap],[Budd Dwyer],"[Scott Arceneaux Jr., Aristos Petrou]"


In [6]:
# Persist the song-level table as UTF-8 CSV without the index column.
# List-valued columns come back as text when this file is re-read (see the preview in the next cell).
data.to_csv('songs_dataset.csv', encoding='utf-8', index=False)

# Get parts dataset from songs

In [7]:
# Reload the saved songs table; this is the input of the part-splitting cells below.
songs = pd.read_csv('songs_dataset.csv', encoding='utf-8')
songs.head()

Unnamed: 0,Singer,Album,Song,Date,Featuring,Genre,Lyrics,Tags,Producers,Writers
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,2015-04-02,[],"['Alternative/Indie', 'Hip-Hop/Rap']","\n\n[Hook: Gangsta Pat]\nKiller, killer, kille...","['Alternative', 'Rap']",['Jck Ruby'],"['Scott Arceneaux Jr.', 'Aristos Petrou']"
1,$UICIDEBOY$,7th or St. Tammany,7th or St. Tammany,2015-04-02,[],['Hip-Hop/Rap'],"\n\nUh\n*59, yah!\nUhhhh, [?]\n*59 motherfucke...",['Rap'],['Budd Dwyer'],"['Scott Arceneaux Jr.', 'Aristos Petrou', 'Mas..."
2,$UICIDEBOY$,7th or St. Tammany,Dead Batteries,2015-03-19,[],['Hip-Hop/Rap'],\n\n[Produced by Budd Dwyer]\n\n[Intro: Frayse...,['Rap'],['Budd Dwyer'],"['Scott Arceneaux Jr.', 'Aristos Petrou']"
3,$UICIDEBOY$,7th or St. Tammany,Drugs/Hoes/Money/Etc.,2015-04-02,[],['Hip-Hop/Rap'],\n\n[Verse 1: YUNG $NOW]\nCounting sheep until...,['Rap'],['Budd Dwyer'],"['Scott Arceneaux Jr.', 'Aristos Petrou']"
4,$UICIDEBOY$,7th or St. Tammany,I’ll Pay for It (If I Want It),2015-04-02,[],['Hip-Hop/Rap'],\n\n[Verse 1: RUBY DA CHERRY]\nFucking Ruby go...,['Rap'],['Budd Dwyer'],"['Scott Arceneaux Jr.', 'Aristos Petrou']"


In [8]:
# Dump every distinct singer of the dataset to a text file, one name per line
unique_singers = songs['Singer'].unique().tolist()
with open('singers.txt', 'w', encoding='utf-8') as singers_file:
    singers_file.write("\n".join(unique_singers))

In [9]:
# Get lyrics from part text, i.e. drop the first row holding the part name and singer's name
def get_part_lyrics(part):
    """Join every line of `part` except the first (the header) with newlines."""
    return '\n'.join(part[1:])

In [10]:
# Get info about song author, album, song name,
# singer of part, part name, some debug info (first row of part text) and part lyrics
def get_part_info(part, all_singers, author, album, song):
    """Describe one song part.

    Args:
        part: Lines of the part; the first line is the bracketed header,
            e.g. ``'[Verse 1: Name & Other]'``.
        all_singers: Names of the performers of the song.
        author: Main singer of the song.
        album: Album name.
        song: Song title.

    Returns:
        ``[author, album, song, all_singers, singer, part_name, header, lyrics]``
        or ``None`` when the part has no lyric lines. ``header`` is the first
        line without its first and last character (the brackets).
    """
    part_lyrics = get_part_lyrics(part)
    if part_lyrics == '':
        return None

    # strip() so that trailing whitespace / '\r' does not end up being the
    # character removed in place of the closing bracket.
    header = part[0].strip()[1:-1]
    header_fields = re.split(':|—', header)
    part_name = header_fields[0]

    singer = None
    if len(header_fields) > 1:
        # The header names its performers; take the first one known for this song.
        candidates = [candidate.strip() for candidate in re.split(',|&', header_fields[1])]
        singer = next((candidate for candidate in candidates if candidate in all_singers), None)
    # No (known) performer in the header: credit the song's author. This used
    # to be all_singers[0], which is an arbitrary performer when the caller
    # builds the list from a set.
    if not singer:
        singer = author
    return [author, album, song, all_singers, singer, part_name, header, part_lyrics]

In [11]:
# We divide song text by parts via \n's and brackets and then get info from them separately
def get_parts_from_song(song):
    """Split one song's lyrics into bracket-headed parts.

    Args:
        song: One row of the songs table with 'Singer', 'Album', 'Song',
            'Lyrics' and 'Featuring' (the string repr of a list of names).

    Returns:
        DataFrame with one row per part that has lyric lines; columns are
        'Singer', 'Album', 'Song', 'All Singers', 'Part Singer', 'Part Name',
        'Part Debug' and 'Part Lyrics'.
    """
    feats = literal_eval(song['Featuring'])
    author, album, title, lyrics = song[['Singer', 'Album', 'Song', 'Lyrics']].values
    # dict.fromkeys removes duplicates while keeping order (author first);
    # going through a set made the order of 'All Singers' vary between runs.
    all_singers = list(dict.fromkeys([author] + feats))

    # A part starts at a '[' that opens the lyrics or directly follows a blank
    # line ('\n\n'). The look-behind cannot wrap around to the end of the text,
    # which the former negative indices lyrics[i-1] / lyrics[i-2] did for i < 2.
    starts = [match.start() for match in re.finditer(r'(?:\A|(?<=\n\n))\[', lyrics)]
    bounds = starts + [len(lyrics)]
    song_parts = [lyrics[begin:end].strip().split('\n')
                  for begin, end in zip(bounds, bounds[1:])]
    song_parts_info = [get_part_info(song_part, all_singers, author, album, title)
                       for song_part in song_parts]
    song_parts_info = list(filter(None, song_parts_info))  # drop parts without lyrics

    return pd.DataFrame(song_parts_info, columns=['Singer', 'Album', 'Song', 'All Singers',
                                                  'Part Singer', 'Part Name', 'Part Debug', 'Part Lyrics'])

In [12]:
%%time
# Get parts dataframe: one small frame per song, stacked into a single table
# with a fresh 0..n-1 index
parts = pd.concat(songs.apply(get_parts_from_song, axis=1).tolist(), axis=0, ignore_index=True)
print(parts.shape)

(847582, 8)
CPU times: user 6min 49s, sys: 1.91 s, total: 6min 51s
Wall time: 6min 47s


In [13]:
# Preview the first extracted parts ('Part Debug' holds the raw header text)
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer, killer\nWhat the fuck you gon'..."
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Verse 1,Verse 1: YUNG $NOW,"Bitch, I be the $nowmane\nFifty nine shades of..."
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'..."
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Verse 2,Verse 2: ANTHONY MARS,"Yesterday, I rode my bike to the moon\nOr thou..."
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,[$UICIDEBOY$],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'..."


In [14]:
# Save raw parts dataset (the index is written as the first CSV column;
# the next section reads it back with index_col=0)
parts.to_csv('raw_parts_dataset.csv')

# Raw parts data processing

In [15]:
parts = pd.read_csv('raw_parts_dataset.csv', index_col=0)
# Some artifacts fix: missing part names / headers are NaN after the CSV round-trip
parts['Part Name'] = parts['Part Name'].fillna('Verse')
parts['Part Debug'] = parts['Part Debug'].fillna('')
# Turn non-breaking spaces into regular spaces. 'All Singers' is the text repr
# of a list, where Python spells U+00A0 as the four characters \xa0, while the
# plain-string columns contain the real character; both spellings are replaced
# (the former code only matched the escaped one). `.str.replace` also leaves
# missing values alone instead of raising.
for column in ['All Singers', 'Part Singer', 'Part Debug']:
    parts[column] = (parts[column]
                     .str.replace('\\xa0', ' ', regex=False)
                     .str.replace('\xa0', ' ', regex=False))
# Drop one-liners (missing lyrics count as a single empty line and go too)
line_counts = parts['Part Lyrics'].fillna('').str.count('\n') + 1
parts = parts[line_counts > 1].reset_index(drop=True)
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer, killer\nWhat the fuck you gon'..."
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 1,Verse 1: YUNG $NOW,"Bitch, I be the $nowmane\nFifty nine shades of..."
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'..."
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 2,Verse 2: ANTHONY MARS,"Yesterday, I rode my bike to the moon\nOr thou..."
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'..."


In [16]:
# (rows, columns) of the parts table after dropping one-liners
parts.shape

(815996, 8)

In [17]:
# Format part name for further generalization
def format_part_name(part_name):
    """Normalise a part name: dashes/spaces -> single space, trimmed, lower-cased.

    A missing value (anything `pd.isna` reports as NA) becomes ''.
    """
    if pd.isna(part_name):
        return ''
    return re.sub('[-–—— ]+', ' ', part_name).strip().lower()

In [18]:
# 'Part Name 2' = normalised (lower-cased, single-spaced) version of the raw part name
parts['Part Name 2'] = parts['Part Name'].apply(format_part_name)
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics,Part Name 2
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer, killer\nWhat the fuck you gon'...",hook
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 1,Verse 1: YUNG $NOW,"Bitch, I be the $nowmane\nFifty nine shades of...",verse 1
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'...",hook
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 2,Verse 2: ANTHONY MARS,"Yesterday, I rode my bike to the moon\nOr thou...",verse 2
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'...",hook


In [19]:
# Generalize different parts names, mistakes and strange things.
# (canonical name, substrings that identify it), highest precedence first:
# when a name matches several groups, the one listed first here wins.
# Built once at definition time instead of on every call.
_PART_NAME_VARIATIONS = (
    ('Verse', ('куплет', 'strophe', 'couplet', 'verso', 'verse', 'Verse')),
    ('Chorus', ('припев', 'estribillo', 'ritornello', 'coro', 'chorus', 'chrous', 'Chorus')),
    ('Refrain', ('рефрен', 'refrain', 'refarin', 'Refrain')),
    ('Skit', ('скит', 'skit', 'Skit')),
    ('Hook', ('хук', 'hook', 'Hook')),
    ('Bridge', ('бридж', 'переход', 'puente', 'bridge', 'brige', 'Bridge')),
    ('Outro', ('аутро', 'завершение', 'outro', 'Outro')),
    ('Intro', ('интро', 'интерлюдия', 'intro', 'into', 'interlude', 'introduction', 'Intro')),
    ('Other', ('collision', 'spoken word', 'drop', 'spoken', 'talking', 'solo', '?', 'interview', 'inaudioble',
               'break', 'breakdown', 'sample', 'all', 'pre verse', 'both', 'instrumental', 'voicemail',
               'dialogue', 'ad libs', 'singer', 'poet', 'man', 'instrumentale', 'pause',
               'pont', 'Other')),
)


def generalize_part_name(in_part_name):
    """Map a free-form part name onto one canonical part name.

    Args:
        in_part_name: Part name text; matching is by case-sensitive substring,
            so the lower-case variants expect an already lower-cased name.

    Returns:
        One of 'Verse', 'Chorus', 'Refrain', 'Skit', 'Hook', 'Bridge', 'Outro',
        'Intro', 'Other', or ``None`` when nothing matches or the input is not
        a string (e.g. NaN), which previously raised TypeError.
    """
    if not isinstance(in_part_name, str):
        return None
    for canonical_name, variations in _PART_NAME_VARIATIONS:
        if any(variation in in_part_name for variation in variations):
            return canonical_name
    return None

In [20]:
# 'Part Name 3' = canonical part name derived from the normalised name;
# it stays empty (None/NaN) where no known variation matches.
parts['Part Name 3'] = parts['Part Name 2'].apply(generalize_part_name)
parts.head()

Unnamed: 0,Singer,Album,Song,All Singers,Part Singer,Part Name,Part Debug,Part Lyrics,Part Name 2,Part Name 3
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer, killer\nWhat the fuck you gon'...",hook,Hook
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 1,Verse 1: YUNG $NOW,"Bitch, I be the $nowmane\nFifty nine shades of...",verse 1,Verse
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'...",hook,Hook
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Verse 2,Verse 2: ANTHONY MARS,"Yesterday, I rode my bike to the moon\nOr thou...",verse 2,Verse
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,['$UICIDEBOY$'],$UICIDEBOY$,Hook,Hook: Gangsta Pat,"Killer, killer\nWhat the fuck you gon' do?\nI'...",hook,Hook


In [21]:
# Attempts to process incorrect names
def correct_part_name(parts):
    """Repair a part whose header holds a singer name instead of a part name.

    Args:
        parts: One row of the parts table (any mapping with 'All Singers',
            'Part Singer' and 'Part Name'). 'All Singers' may be a list or the
            string repr of a list, which is what a CSV round-trip produces.

    Returns:
        ``(part_singer, part_name)``: ``(part_name, 'Verse')`` when the part
        name is one of the song's singers, ``(part_singer, 'Verse')`` when the
        part name contains the part singer, otherwise both values unchanged.
    """
    all_singers = parts['All Singers']
    part_singer = parts['Part Singer']
    part_name = parts['Part Name']

    # Parse the stringified list so that `in` tests membership. On the raw
    # string it is a substring test, e.g. 'Em' would match "['Eminem']".
    if isinstance(all_singers, str):
        try:
            parsed = literal_eval(all_singers)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            all_singers = parsed
        # otherwise keep the raw text: best effort, same as the old behaviour

    if part_name in all_singers:
        return part_name, 'Verse'
    if part_singer in part_name:
        return part_singer, 'Verse'
    return part_singer, part_name

In [22]:
# Distribution of canonical part names; the NaN row counts parts whose name matched no known variation
parts['Part Name 3'].value_counts(dropna=False)

Verse      305267
Chorus     278375
Hook        58360
Bridge      44192
Outro       40572
Intro       37585
NaN         32009
Refrain     11201
Other        7834
Skit          601
Name: Part Name 3, dtype: int64

In [23]:
# Rows whose part name could not be generalised: try to repair singer / name
null_index = parts.index[parts['Part Name 3'].isna()]
res = parts.loc[null_index].apply(correct_part_name, axis=1)
parts.loc[null_index, 'Part Singer'] = res.map(lambda fixed: fixed[0])
parts.loc[null_index, 'Part Name 3'] = res.map(lambda fixed: fixed[1])

# Generalise once more (repaired rows now hold 'Verse' or their raw name),
# make the result the official 'Part Name' and drop the helper columns
parts['Part Name'] = parts['Part Name 3'].apply(generalize_part_name)
parts = parts.drop(columns=['Part Name 2', 'Part Name 3'])

In [24]:
# Distribution after the repair step; NaN = parts that still have no recognised name
parts['Part Name'].value_counts(dropna=False)

Verse      315450
Chorus     278375
Hook        58360
Bridge      44192
Outro       40572
Intro       37585
NaN         21826
Refrain     11201
Other        7834
Skit          601
Name: Part Name, dtype: int64

In [25]:
# Second chance for parts without a name: look for a known part name anywhere
# in the full (normalised) header text
parts['Part Debug'] = parts['Part Debug'].apply(format_part_name)
parts['Part Name 2'] = parts['Part Debug'].apply(generalize_part_name)
null_index = parts.index[parts['Part Name'].isna() & parts['Part Name 2'].notna()]
# One `.loc` call on the frame itself: the chained form
# `parts['Part Name'].loc[...] = ...` assigns to an intermediate Series and is
# lost under pandas copy-on-write.
parts.loc[null_index, 'Part Name'] = parts.loc[null_index, 'Part Name 2']

# .copy() so that later in-place operations act on an independent frame
parts = parts[['Singer', 'Album', 'Song', 'Part Singer', 'Part Name', 'Part Lyrics']].copy()

In [26]:
# Final distribution of part names (value_counts leaves NaN out by default)
parts['Part Name'].value_counts()

Verse      315545
Chorus     278415
Hook        58370
Bridge      44197
Outro       40577
Intro       37596
Refrain     11201
Other        8131
Skit          601
Name: Part Name, dtype: int64

In [27]:
# Size and preview of the parts table reduced to its six final columns
print(parts.shape)
parts.head()

(815996, 6)


Unnamed: 0,Singer,Album,Song,Part Singer,Part Name,Part Lyrics
0,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,$UICIDEBOY$,Hook,"Killer, killer, killer\nWhat the fuck you gon'..."
1,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,$UICIDEBOY$,Verse,"Bitch, I be the $nowmane\nFifty nine shades of..."
2,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,$UICIDEBOY$,Hook,"Killer, killer\nWhat the fuck you gon' do?\nI'..."
3,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,$UICIDEBOY$,Verse,"Yesterday, I rode my bike to the moon\nOr thou..."
4,$UICIDEBOY$,7th or St. Tammany,40 Oz. & Sober,$UICIDEBOY$,Hook,"Killer, killer\nWhat the fuck you gon' do?\nI'..."


In [28]:
# Keep one row per distinct part text (the last occurrence), then order by
# singer and album and renumber the rows
parts = (parts
         .drop_duplicates(subset='Part Lyrics', keep='last')
         .sort_values(by=['Singer', 'Album'])
         .reset_index(drop=True))
parts.shape

(690645, 6)

In [29]:
# Number of missing values per column, showing only columns that have any
temp = parts.isnull().sum()
temp[temp != 0]

Album        91997
Part Name    20245
dtype: int64

In [30]:
# Write the final parts table as UTF-8 CSV without the index column
parts.to_csv('parts_dataset.csv', encoding='utf-8', index=False)