In [27]:
import os
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import lyricsgenius
from fuzzywuzzy import fuzz

In [28]:
##load the raw chart table; the lyric columns start empty and are filled by the scraping cells
kpop_world_sales = pd.read_csv('Kpop on BB - World Digital Sales.csv')

#a scalar is broadcast to every row, so no per-row filler list is needed
kpop_world_sales.insert(3, 'original_lyrics', '')
kpop_world_sales.insert(4, 'translated_lyrics', '')

kpop_world_sales = kpop_world_sales.rename(columns={'Chart date[169]': 'chart_date',
                                                    'Artist': 'artist',
                                                    'Song/Album Title': 'song_title',
                                                    'Peak position': 'peak_position',
                                                    'Consecutive\nentry weeks': 'entry_weeks'})

##standardize and process chart data

#NOTE: every str.replace below passes regex=True explicitly. pandas >= 2.0 defaults to
#regex=False, which would treat these patterns as literal text and silently clean nothing.

#remove wiki citations, standardize whitespace, remove quotation marks
#regex patterns and replacement strings
regex_pats = [[r"(\[\d+\])", ""], [r"(\s)", " "], [r"(\")", ""]]
for column in ["artist", "song_title", "peak_position"]:
    for pattern, replacement in regex_pats:
        kpop_world_sales[column] = kpop_world_sales[column].str.replace(pattern, replacement, regex=True)

#pull out just main artist, remove features (for genius search)
#\b anchors the match to the start of a word so names that merely contain
#"feat" or "and" (e.g. "...Island ") are not truncated mid-word
kpop_world_sales['main_artist'] = (
    kpop_world_sales["artist"]
    .str.replace(r"\bfeat.*", '', regex=True)
    .str.replace(r"\band .*", '', regex=True)
    .str.strip()
)

#remove parentheticals from title (and the whitespace they leave behind)
kpop_world_sales['search_title'] = (
    kpop_world_sales['song_title']
    .str.replace(r"(\(.*\))", "", regex=True)
    .str.strip()
)

#standardize peak positions: remove parentheticals, trim whitespace, drop lowercase letters
#NOTE(review): only a-z is removed, not every non-digit -- confirm that is enough for this data
kpop_world_sales["peak_position"] = (
    kpop_world_sales["peak_position"]
    .str.replace(r"(\(.*\))", "", regex=True)
    .str.strip()
    .str.replace(r"[a-z]", "", regex=True)
)

In [5]:
##set up genius w/ api key
#the token is read from the environment so it never lands in the notebook or its history;
#any token that was previously committed here should be revoked and regenerated
genius_token = os.environ.get('GENIUS_ACCESS_TOKEN')
if not genius_token:
    raise RuntimeError('Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API token')
genius = lyricsgenius.Genius(genius_token)

In [39]:
#spot-check the "<artist> - <title> (English Translation)" query format on one song
sample_query = 'BTS' + ' - ' + 'DNA' + ' (English Translation)'
genius.search_song(sample_query, 'Genius English Translations')

Searching for "BTS - DNA (English Translation)" by Genius English Translations...
Done.


('BTS - DNA (English Translation)', 'Genius English Translations')

In [32]:
##iterate through all songs to scrape original lyrics
#fuzz.partial_ratio score that both title and artist must exceed for a hit to be accepted
ORIGINAL_MATCH_THRESHOLD = 60

for i, r in kpop_world_sales.iterrows():
    search_title = r['search_title']
    main_artist = r['main_artist']

    #missing values come through as NaN (a float), which can be neither searched nor lowercased
    if not isinstance(search_title, str) or not isinstance(main_artist, str):
        continue
    if not search_title.strip() or not main_artist.strip():
        continue

    #one failed request must not abort the whole scrape: report it and move on
    try:
        this_song = genius.search_song(search_title, main_artist)
    except Exception as err:
        print("Search failed for", search_title, main_artist, " | ", err)
        continue

    #continue if it returns a result
    if not this_song:
        continue

    #partial ratio threshold to prevent mismatches
    title_similarity = fuzz.partial_ratio(search_title.lower(), this_song.title.lower())
    artist_similarity = fuzz.partial_ratio(main_artist.lower(), this_song.artist.lower())

    #if similarity is above threshold, store lyrics
    if title_similarity > ORIGINAL_MATCH_THRESHOLD and artist_similarity > ORIGINAL_MATCH_THRESHOLD:
        print(search_title, main_artist, " | ", this_song.title, this_song.artist)
        kpop_world_sales.at[i, 'original_lyrics'] = this_song.lyrics


Searching for "Destiny" by Mamamoo...
Done.
Destiny Mamamoo  |  우린 결국 다시 만날 운명이었지 (Destiny) MAMAMOO
Searching for "Lion" by (G)I-dle...
Done.
Lion (G)I-dle  |  Lion (G)I-DLE
Searching for "Wanna Go Back" by Park Bom...
Done.
Wanna Go Back Park Bom  |  되돌릴 수 없는 돌아갈 수 없는 돌아갈 곳 없는 (Wanna Go Back) Park Bom
Searching for "Find You" by Monsta X...
Done.
Find You Monsta X  |  Find You MONSTA X
Searching for "Spark" by Taeyeon...
Done.
Spark Taeyeon  |  불티 (Spark) TAEYEON 태연
Searching for "You Calling My Name" by Got7...
Done.
You Calling My Name Got7  |  니가 부르는 나의 이름 (You Calling My Name) GOT7
Searching for "Flower Shower" by Hyuna...
Done.
Flower Shower Hyuna  |  Flower Shower 현아 (HyunA)
Searching for "Love Poem" by IU...
Done.
Love Poem IU  |  Love Poem IU (아이유)
Searching for "Money" by Dawn...
Done.
Money Dawn  |  Money DAWN (KOR)
Searching for "Drip" by Jessi...
Done.
Drip Jessi  |  Drip Jessi (제시)
Searching for "Drip" by Hinipia...
Done.
Drip Hinipia  |  DRIP HINAPIA (히나피아)
Searching fo

In [41]:
##iterate through all songs to scrape translated lyrics
#fuzz.partial_ratio score the returned title must exceed for a hit to be accepted
TRANSLATION_MATCH_THRESHOLD = 60

for i, r in kpop_world_sales.iterrows():

    artist = r['main_artist']
    title = r['search_title']

    #missing values come through as NaN (a float) and cannot be concatenated into a query
    if not isinstance(artist, str) or not isinstance(title, str):
        continue
    if not artist.strip() or not title.strip():
        continue

    #translations are searched as "<artist> - <title> (English Translation)"
    #under the artist name 'Genius English Translations'
    song_label = artist + ' - ' + title
    query = song_label + ' (English Translation)'

    #one failed request must not abort the whole scrape: report it and move on
    try:
        this_song = genius.search_song(query, 'Genius English Translations')
    except Exception as err:
        print("Search failed for", query, " | ", err)
        continue

    #continue if it returns a result
    if not this_song:
        continue

    #partial ratio threshold to prevent mismatches
    similarity = fuzz.partial_ratio(query.lower(), this_song.title.lower())

    #if similarity is above threshold, store lyrics
    if similarity > TRANSLATION_MATCH_THRESHOLD:
        print(song_label, this_song.title)
        kpop_world_sales.at[i, 'translated_lyrics'] = this_song.lyrics

'TXT - Fairy of Shampoo (English Translation) Genius English Translations'
Searching for "TXT - Eternally (English Translation)" by Genius English Translations...
No results found for: 'TXT - Eternally (English Translation) Genius English Translations'
Searching for "TXT - PUMA (English Translation)" by Genius English Translations...
No results found for: 'TXT - PUMA (English Translation) Genius English Translations'
Searching for "TXT - Maze in the Mirror (English Translation)" by Genius English Translations...
No results found for: 'TXT - Maze in the Mirror (English Translation) Genius English Translations'
Searching for "Suga (Agust D) - Daechwita (English Translation)" by Genius English Translations...
No results found for: 'Suga (Agust D) - Daechwita (English Translation) Genius English Translations'
Searching for "Suga (Agust D) - Strange (English Translation)" by Genius English Translations...
Done.
Searching for "Suga (Agust D) - Burn It (English Translation)" by Genius English

In [None]:
##### TODO: process lyrics again (this cell is still empty)

In [42]:
#write the chart table, including the lyric columns, to disk and then display it
csv_path = 'wdss.csv'
kpop_world_sales.to_csv(csv_path)
kpop_world_sales


Unnamed: 0,chart_date,artist,song_title,original_lyrics,translated_lyrics,peak_position,entry_weeks,Total\nweeks,main_artist,search_title
0,2010-07-31,Super Junior,No Other,Romanization\n\nNeo gateun saram ddo eobseo\nJ...,,5,3,3,Super Junior,No Other
1,2010-07-31,Taeyang,Wedding Dress,"[Intro]\nSome say, ""It ain't over ’til it's ov...",[Intro]\nSome say it ain't over 'till it's ove...,9,4,6,Taeyang,Wedding Dress
2,2010-07-31,Taeyang feat. G-Dragon,I Need A Girl,Korean (Original)\n\n[Intro: Taeyang]\nTired o...,,17,1,1,Taeyang,I Need A Girl
3,2010-08-07,Shinee,Lucifer,Hangul\n\n숨을 곳도 찾지 못해 나는\n피하려고 애써 봐도\n거부조차 할 수...,Even if I try to avoid you\nI can't find a pla...,3,20,57,Shinee,Lucifer
4,2010-08-07,Shinee,Up and Down,Hangul\n\n너왜 너왜 DOWN이야 기분이\n오왜 오왜 척하면 척\n너왜 너왜...,,18,1,1,Shinee,Up and Down
...,...,...,...,...,...,...,...,...,...,...
1409,2020-12-05,BTS,Dis-ease,"[방탄소년단 ""병"" 가사]\n\n[Intro: j-hope]\nMm, uh-oh\n...","[Intro: j-hope]\nMm, uh-oh\n\n[Verse 1: j-hope...",5,1,1,BTS,Dis-ease
1410,2020-12-05,BTS,Fly to My Room,"[방탄소년단 ""내 방을 여행하는 법"" 가사]\n\n[Chorus: Jimin, V]...","[Intro: Jimin, V]\nLet's go, let me fly to my\...",6,1,1,BTS,Fly to My Room
1411,2020-12-05,Got7,Breath,,[Verse 1: JB]\nTurn the lights on\nThe lights ...,8,1,1,Got7,Breath
1412,2020-12-05,NCT U,90's Love,"[엔시티 유 ""90's Love"" 가사]\n\n[Intro: All]\nHey, h...","Hey, hey, hey, hey, hey, hey (Hoo, hoo)\nHey, ...",15,1,1,NCT U,90's Love
