In [40]:
import pandas as pd
import nltk
import numpy as np
import re

In [139]:
genius_data = pd.read_csv('genius_bts_discography_rough (1).csv')
genius_data.drop(['Unnamed: 0', 'Written By', 'Produced By', 'Description'], axis=1, inplace=True) #drop columns irrelevant for this analysis

##add columns for most common latin word, most common non-latin word
genius_data.insert(4, "Most Common LATIN Word", [str(np.nan) for i in range(len(genius_data))])
genius_data.insert(4, "Most Common NON-Latin Word", [str(np.nan) for i in range(len(genius_data))])
genius_data.insert(4, "Most Common Word", [str(np.nan) for i in range(len(genius_data))])

genius_data.head()

Unnamed: 0,Song,Album,Release Date,Lyrics,Most Common Word,Most Common NON-Latin Word,Most Common LATIN Word,URL
0,Dynamite,Dynamite (Extended),"August 21, 2020","\n\n[Intro: Jungkook]\n'Cause I, I, I'm in the...",,,,https://genius.com/Bts-dynamite-lyrics
1,Dynamite (Instrumental),Dynamite (Extended),"August 21, 2020",\n\n[Instrumental]\n\n,,,,https://genius.com/Bts-dynamite-instrumental-l...
2,Dynamite (Acoustic Remix),Dynamite (Extended),"August 24, 2020","\n\n[Intro: Jungkook]\n'Cause I, I, I'm in the...",,,,https://genius.com/Bts-dynamite-acoustic-remix...
3,Dynamite (EDM Remix),Dynamite (Extended),"August 24, 2020","\n\n[Intro: Jungkook]\n'Cause I, I, I'm in the...",,,,https://genius.com/Bts-dynamite-edm-remix-lyrics
4,Dynamite (Tropical Remix),Dynamite (Deluxe),"August 28, 2020","\n\n[Intro: Jungkook]\n'Cause I, I, I'm in the...",,,,https://genius.com/Bts-dynamite-tropical-remix...


In [135]:
#tokenize + clean initial lyrics string
def tokenize_lyrics(original_string):
    #split lyrics on lines and remove everything that isnt lyrics (blank lines, verse headers)
    lyrics_lines = original_string.split("\n")
    lyrics_only_lines = " ".join([x for x in lyrics_lines if x != '' and x[0] != '[']) 
    lyrics_only_lines = lyrics_only_lines.replace('\u2005', ' ').replace('\u205f', ' ') #take out all random weirdly formatted spaces

    #lowercase words and split on spaces
    tokenized_lyrics = lyrics_only_lines.lower()
    #remove punctuation except - and ', since they are part of words
    tokenized_lyrics = re.sub(r'[!?.()"":;/,]', '', tokenized_lyrics) 
    tokenized_lyrics = tokenized_lyrics.split(" ")

    ##not that we are NOT removing stop words right now######





    return tokenized_lyrics


In [118]:
##split tokenized lyrics by language
#note that latin words might be in spanish, etc.
#non-latin words could be in korean, japanese, etc.
def split_language(tokenized_lyrics):
    latin_words = []
    non_latin_words = []
    for i in tokenized_lyrics:
        result = re.search(r'([a-z])', i)
        if result == None:
            non_latin_words.append(i)

        else:
            latin_words.append(i)

    return non_latin_words, latin_words


In [127]:
##function to get momst common word in list
def get_most_common(words_list):
    try:
        return pd.Series(words_list).value_counts().index[0]
    except: 
        return np.nan
     

In [140]:
for i, row in genius_data.iterrows():
    this_original_lyrics = row["Lyrics"]

    this_tokenized_lyrics = tokenize_lyrics(this_original_lyrics)

    this_non_latin = split_language(this_tokenized_lyrics)[0]
    this_latin = split_language(this_tokenized_lyrics)[1]

    this_most_common_non_latin = get_most_common(this_non_latin)
    this_most_common_latin = get_most_common(this_latin)
    this_most_common = get_most_common(this_tokenized_lyrics)

    print(row["Song"] + ": ", this_most_common, "| ", this_most_common_non_latin, this_most_common_latin)
    #write into data frame
    genius_data.at[i, "Most Common NON-Latin Word"] = this_most_common_non_latin
    genius_data.at[i, "Most Common LATIN Word"] = this_most_common_latin
    genius_data.at[i, "Most Common Word"] = this_most_common

Dynamite:  the |  nan the
Dynamite (Instrumental):   |   nan
Dynamite (Acoustic Remix):  the |  nan the
Dynamite (EDM Remix):  the |  nan the
Dynamite (Tropical Remix):  the |  nan the
Dynamite (Poolside Remix):  the |  nan the
Dynamite:  the |  nan the
Dynamite (Instrumental):   |   nan
Dynamite (Acoustic Remix):  the |  nan the
Dynamite (EDM Remix):  the |  nan the
INTRO: Calling:  心惹かれて |  君の全てに gold
Stay Gold:  gold |  夢の中でも gold
Boy With Luv - Japanese ver.:  my |  ウォーウォウォウォウォ my
Make It Right - Japanese ver.:  大丈夫です |  大丈夫です 赤ちゃんi
Dionysus - Japanese ver.:  it |  さあ、酔え酔え酔え　今 it
IDOL - Japanese ver.:  i |  オルッス i
Airplane pt.2 - Japanese ver.:  i |   i
FAKE LOVE - Japanese ver.:  love |  君の為なら love
Black Swan - Japanese ver.:  yeah |  海が光飲み込み yeah
ON - Japanese ver.:  oh |  持って来い oh
Lights:  light |  離れていても届いてる light
Your eyes tell:  so |  どこにも行かないように so
OUTRO: The Journey:   |   nan
Intro: Persona:  i |  나 i
작은 것들을 위한 시 (Boy With Luv):  my |  워워워워워 my
Make It Right:  right |  날 r

In [143]:
print(genius_data["Most Common Word"].value_counts())

i          53
you        35
the        24
la         16
           15
           ..
랩몬스터        1
sunday      1
monster     1
base        1
has         1
Name: Most Common Word, Length: 174, dtype: int64
