In [719]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import nltk
import matplotlib.pyplot as plt
from datetime import datetime

In [720]:
def string_to_datetime(string):
    return datetime.strptime(string, "%m/%d/%Y")

In [734]:
#merge dataframe that contains youtube information and dataframe that contains dates
data_yt = pd.read_csv('doolset-lyrics-v8-yt-info.csv')
data_dates = pd.read_csv('doolset-lyrics-v6-dates - doolset-lyrics-v6 (2).csv', converters={'Date': string_to_datetime})

data = data_dates.merge(data_yt, how='left', on=['Title', 'Album', 'URL', 'Complete Translated Lyrics', 'Notes', 'Translated Korean Lyrics Only', 'English Lyrics Only',
                                                'Internal Links'])
#drop irrelevant and empty columns
data.drop(columns=['Soup_x', 'Unnamed: 0', 'Unnamed: 0.1'], inplace=True)

In [736]:
data.sort_values("Date", ascending=True, inplace=True)
data.drop_duplicates(subset=['Title'], inplace=True) #drop duplicate songs, keeping the first/oldest one
#(for when songs are included in multiple albums)

Unnamed: 0,Title,Album,Date,Complete Translated Lyrics,Notes,Translated Korean Lyrics Only,English Lyrics Only,Internal Links,URL,Soup_y,Youtube Views,Youtube Video
147,No More Dream,2 Cool 4 Skool,2013-06-12,"Hey you, what is your dream \nHey you, what is...",Most ‘good’ colleges are geographically concen...,"Hey you, what is your dream \nHey you, what is...","I wanna big house, big cars & big rings \nBut ...",[],https://doolsetbangtan.wordpress.com/2018/06/2...,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<...",165847468.0,http://www.youtube.com/watch?v=rBG5L7UsUxA
145,Intro: 2 Cool 4 Skool (Feat. DJ Friz),2 Cool 4 Skool,2013-06-12,We’re now going to progress to some steps \nwh...,,\n \n \n \nDj Friz \nWho’s that? \nB A N G T ...,We’re now going to progress to some steps \nwh...,[],https://doolsetbangtan.wordpress.com/2018/06/2...,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<...",543883.0,http://www.youtube.com/watch?v=Pd4wMA329zM
149,길 (Road/Path),2 Cool 4 Skool,2013-06-12,"Yeah, wassup \nYou know time flows like stars ...",Hongdae is an abbreviation of Hongik Daehakgyo...,"Yeah, wassup \nYou know time flows like stars ...","Yeah, wassup \nYou know time flows like stars ...",[],https://doolsetbangtan.wordpress.com/2020/01/3...,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<...",0.0,
148,좋아요 (Like),2 Cool 4 Skool,2013-06-12,"Wanna be loved… \nDon’t wanna be fool, wanna b...",도깨비 감투 (goblin’s hat) is a Korean equivalent o...,"Wanna be loved… \nDon’t wanna be fool, wanna b...","Same love \nUh f**k that, all stupid b*******s...",[],https://doolsetbangtan.wordpress.com/2018/06/2...,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<...",1888030.0,http://www.youtube.com/watch?v=62VvYktlk-I
146,We Are Bulletproof Pt.2,2 Cool 4 Skool,2013-06-12,"(What) Give it to me \n(What) Be alert, everyo...",The final boss is the final/strongest opponent...,"() Give it to me \n() Be alert, everyone \n() ...",What \nWhat \nWhat \n(What) We are bulletproof...,[],https://doolsetbangtan.wordpress.com/2018/06/2...,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<...",52051586.0,http://www.youtube.com/watch?v=lE9lkSdtZeQ
...,...,...,...,...,...,...,...,...,...,...,...,...
153,이상하지 않은가 (Strange; Feat. RM),D-2,2020-05-22,Someone please tell me if life is pain \nWell ...,Note: 창궐하다 (to be rampant) is a verb that is m...,Someone please tell me if life is pain \n \nIf...,Everything in dust \nDo you see? \nWell well w...,['https://doolsetbangtan.wordpress.com/2018/06...,https://doolsetbangtan.wordpress.com/2020/05/2...,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<...",0.0,
158,Interlude : Set me free,D-2,2020-05-22,"Set me free, knowing that it won’t go the way ...",,", knowing that it won’t go the way I want \n, ...",Set me free \nSet me free \nSet me free \nSet ...,[],https://doolsetbangtan.wordpress.com/2020/05/2...,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<...",0.0,
192,Still With You (Jungkook),SoundCloud,2020-06-05,Your faint voice that brushes past me \nPlease...,,Your faint voice that brushes past me \nPlease...,Still With You \nStill With You,[],https://doolsetbangtan.wordpress.com/2020/06/0...,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<...",0.0,
208,MAX – Blueberry Eyes (feat. SUGA),Miscellaneous,2020-09-15,Damn you look so good \nLaying there wearing ...,,\n \n \nWanna drive my lips all around it \nC...,Damn you look so good \nLaying there wearing ...,[],https://doolsetbangtan.wordpress.com/2020/09/1...,"<!DOCTYPE html>\n\n<html lang=""en"">\n<head>\n<...",21947941.0,http://www.youtube.com/watch?v=TzFRVk2ektI


In [737]:
data.insert(3, "Track Number", [0 for i in range(len(data))])
#clean albums and label songs by posittion in album (for d3 positioning)
data["Album"] = data["Album"].str.replace('\xa0',' ', regex=False)

for album in data["Album"].unique():
    #only label for actual albums
    if album not in ["SoundCloud", "Miscellaneous"]:
        album_songs = data.loc[data["Album"] == album]
        p = 0
        #not really close to actual position but thats ok
        for i, row in album_songs.iterrows():
            data.at[i, "Track Number"] = p
            p += 1

In [738]:
##proceed with cleaning the text :)

#iterate through text columns
#simple clean text
for c in ['Complete Translated Lyrics', 'Notes', 'Translated Korean Lyrics Only', 'English Lyrics Only']:
    #make all roman words lowercase
    data[c] = data[c].str.lower()
    #remove line breaks
    data[c] = data[c].str.replace('\n',' ', regex=False)
    data[c] = data[c].str.replace('\xa0',' ', regex=False)

    #insert column to hold BOW representation of this column
    data.insert(4, 'BOW_' + c, ['' for i in range(len(data))])

data.insert(4, 'Most Common English', ['' for i in range(len(data))])
data.insert(4, 'Most Common Translated', ['' for i in range(len(data))])
data.insert(4, 'Percent English BOW', ['' for i in range(len(data))])
data.insert(4, 'Percent English Original', ['' for i in range(len(data))])
data.insert(4, 'Percent English Unique', ['' for i in range(len(data))])
data.insert(4, 'Notes to Lyrics Ratio Original', ['' for i in range(len(data))])
data.insert(4, "Unique Word Count", ["" for i in range(len(data))])

In [739]:
##tokenization, lemmatization, bag of words
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

#create tokenizer that uses regular expressions to get rid of punctuation
tokenizer = nltk.RegexpTokenizer(r"\w+")

for i, row in data.iterrows():
    #not dealing with notes for now
    for c in ['Complete Translated Lyrics', 'Translated Korean Lyrics Only', 'English Lyrics Only', 'Notes']:
        try:
            BOW = []
            words_list = tokenizer.tokenize(row[c])

            #strip out english stopwords using default nltk corpus
            words_list = [i for i in words_list if not i in stopwords]

            words_list = nltk.pos_tag(words_list)
            #try to lemmatize words
            for word in words_list:
                if "N" in word[1]:
                    word_l = wordnet_lemmatizer.lemmatize(word[0], pos="n")
                else:
                    word_l = wordnet_lemmatizer.lemmatize(word[0], pos="v")
                BOW.append(word_l)
            
            data.at[i, 'BOW_' + c] = BOW
        except: #if there is an error (like empty notes strings) just skip
            pass

In [740]:
##get basic info about most common word in different languages, ratios of language, etc.
for i, row in data.iterrows():
    english_original = row['BOW_English Lyrics Only']	
    if len(english_original) > 0:
        #print most frequent word originally in engllish 
        frequencies = pd.Series(english_original).value_counts()
        max_count = frequencies[0]
        most_common_english = frequencies.index[:5]
        #for j in range(len(frequencies)):
           # if frequencies[j] == max_count:
              #  most_common_english.append(frequencies.index[j])
        #print(row['Title'] + ': Most Common English Original Words', most_common_english)
        data.at[i, 'Most Common English'] = list(most_common_english)

    korean_translated = row['BOW_Translated Korean Lyrics Only']	
    if len(korean_translated) > 0:
        #print most frequent word originally in korean 
        frequencies = pd.Series(korean_translated).value_counts()
        max_count = frequencies[0]
        most_common_translated = frequencies.index[:5]
        #for j in range(len(frequencies)):
            #if frequencies[j] == max_count:
               # most_common_translated.append(frequencies.index[j])
        #print(row['Title'] + ': Most Common Translated Korean Words:', most_common_translated)
        data.at[i, 'Most Common Translated'] = list(most_common_translated)
    

    #calculate percentage of english original out of complete translated words - BOW (no stop words, etc.)
    try:
        eng_to_trans_ratio = len(row['BOW_English Lyrics Only']) / len(row['BOW_Complete Translated Lyrics'])
    except:
        eng_to_trans_ratio = 0 #handle songs with no english!!
    data.at[i, 'Percent English BOW'] = eng_to_trans_ratio

    #calculate percentage of english original to complete translated words - original (stop words, etc., included)
    try:
        eng_to_trans_ratio = len(row['English Lyrics Only']) / len(row['Complete Translated Lyrics'])
    except:
        eng_to_trans_ratio = 0 #handle songs with no english!!
    data.at[i, 'Percent English Original'] = eng_to_trans_ratio

    #calculate percentage of unique english original out of complete lyrics - BOW (no stop words, etc.)
    #how much meaning is from either language (ie, some songs have lots of repeated english phrases but most meaning and progression is in korean verse)
    try:
        unique_ratio = len(set(row['BOW_English Lyrics Only'])) / len(set(row['BOW_Complete Translated Lyrics']))
    except:
        unique_ratio #handle songs with no english
    data.at[i, 'Percent English Unique'] = unique_ratio

    #calculate length of notes relative to length of song (complete lyrics)
    try:
        notes_lyrics_ratio = len(row['Notes']) / len(row['Complete Translated Lyrics'])
    except: #songs w/o notes
        notes_lyrics_ratio = 0
    data.at[i, 'Notes to Lyrics Ratio Original'] = notes_lyrics_ratio

    #calculate total unique words in song (using BOW representation)
    unique_words = len(set(row["BOW_Complete Translated Lyrics"]))
    data.at[i, "Unique Word Count"] = unique_words

In [741]:
data["Internal Links"] = data["Internal Links"].str.replace("'", "", regex=False)
data["Internal Links"] = data["Internal Links"].str.replace("[", "", regex=False)
data["Internal Links"] = data["Internal Links"].str.replace("]", "", regex=False)
data["Internal Links"] = data["Internal Links"].str.split(",")

In [742]:
data.insert(9, "Reference URL", ["" for i in range(len(data))])

In [743]:
#create clean reference urls that only contain song name and not the weird % strings that were somettimes added
data["Reference URL"] = data["URL"].str.split('/')
data["Reference URL"] = [link[-2] for link in list(data["Reference URL"])]
data["Reference URL"] = data["Reference URL"].str.split('-')
for i, row in data.iterrows():
    clean = [w for w in row["Reference URL"] if "%" not in w]
    clean = "-".join(w for w in clean)
    data.at[i, "Reference URL"] = clean

In [744]:
data.insert(9, "Link References", ["" for i in range(len(data))])

In [745]:
##store list of linkable references for each song
for i, row in data.iterrows():
    link_references = []
    for link in row["Internal Links"]:
        #split into parrts of url
        link_parts = [p for p in link.split("/") if p != '']
        if len(link_parts) > 0:
            #song referenced is last part of the url, clean it andd match to reference urls
            reference = link_parts[-1].split('-')
            clean = [w for w in reference if "%" not in w]
            clean = "-".join(w for w in clean)
            reference_row = data.loc[data["Reference URL"] == clean]
            try:
                link_references.append(list(reference_row["Reference URL"])[0])
            except:
                print("Error ", clean)
    data.at[i, "Link References"] = link_references

Error  euphoria


In [746]:
##random function to find all songs that contain a given word
def keyword_search(keyword, column):
    songs_with_word = []
    for i, row in data.iterrows():
        bow = row[column]
        if keyword in bow:
            songs_with_word.append(row["Title"])
    return songs_with_word

In [747]:
overall_BOW = []
for wl in data['BOW_Complete Translated Lyrics']:
    for word in wl:
        overall_BOW.append(word)
overall_BOW = pd.Series(overall_BOW)
frequencies = overall_BOW.value_counts()
for i in range(len(frequencies)):
    if frequencies[i] > 10:
        print(frequencies.index[i], frequencies[i])

like 679
know 529
love 523
get 454
go 436
even 423
yeah 391
la 374
say 348
want 345
oh 340
let 324
one 312
feel 295
day 272
make 258
dream 255
time 254
come 236
still 233
away 228
life 227
everything 227
world 224
look 223
become 217
would 195
run 187
hey 187
please 185
right 183
hand 174
back 170
though 160
see 151
think 150
heart 149
live 147
wanna 147
way 146
give 144
baby 139
girl 137
never 136
keep 135
fall 133
every 130
put 130
word 124
night 123
tell 120
hold 118
thing 118
stop 118
take 115
hope 115
bang 114
fly 112
well 108
eye 108
forever 107
two 106
try 104
end 103
without 103
everyone 100
na 99
us 99
always 98
really 98
moment 96
could 95
someone 92
together 92
rain 91
side 89
okay 89
light 88
rap 87
good 86
call 85
worry 85
hate 85
anything 83
today 82
smile 82
jump 81
need 80
pain 80
change 79
place 79
sky 77
ya 77
different 76
crazy 76
leave 76
believe 75
walk 75
throw 74
best 74
much 74
wish 74
cry 73
ever 73
mind 73
fuck 73
first 72
tear 72
ah 71
many 70
fun 69
stay 68


In [748]:
data.to_csv('analyzed-lyrics-v2.csv')

In [749]:
data.columns

Index(['Title', 'Album', 'Date', 'Track Number', 'Unique Word Count',
       'Notes to Lyrics Ratio Original', 'Percent English Unique',
       'Percent English Original', 'Percent English BOW', 'Link References',
       'Reference URL', 'Most Common Translated', 'Most Common English',
       'BOW_English Lyrics Only', 'BOW_Translated Korean Lyrics Only',
       'BOW_Notes', 'BOW_Complete Translated Lyrics',
       'Complete Translated Lyrics', 'Notes', 'Translated Korean Lyrics Only',
       'English Lyrics Only', 'Internal Links', 'URL', 'Soup_y',
       'Youtube Views', 'Youtube Video'],
      dtype='object')