In [67]:
import pandas as pd
from datetime import timedelta
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt

import lyricsgenius
from fuzzywuzzy import fuzz

In [68]:
#function to convert strings to dates :)
def string_to_datetime(string):
    result = datetime.strptime(string, "%Y-%m-%d")       
    return result

In [69]:
#read in radio, streaming, sales data
radio = pd.read_csv('subcharts - Radio_Kpop.csv')
streaming = pd.read_csv('subcharts - Streaming_Kpop.csv')
sales = pd.read_csv('subcharts - Sales_Kpop.csv')
hot100 = pd.read_csv('BBSHRK - Spotify_BBH100_Kpop.csv')

In [70]:
##standardize artist and song names 
for chart in [sales, streaming, radio, hot100]:
    #drop random unnamed columns
    drop_columns = [c for c in chart.columns if 'Unnamed' in c ]
    chart.drop(columns=drop_columns, inplace=True)

    #remove wiki citations, standardize whitespace, remove quotation marks
    #regex patterns and replacement strings
    regex_pats = [[r"(\[\d+\])", ""], [r"(\s)", " "], [r"(\")", ""]]
    for pat in regex_pats:
        chart["artist"] = chart["artist"].str.replace(pat[0], pat[1])
        chart["song_title"] = chart["song_title"].str.replace(pat[0], pat[1])
        
    #separate out main artist: split by feat. and and
    chart['main_artist'] = chart['artist'].str.replace(r"(feat.*)", '')
    #handle suga/agust d specifically because this errror comes up a lot
    chart['main_artist'] = ['Agust D' if 'Agust D' in a else a for a in chart['main_artist']]
    chart['main_artist'] = chart['main_artist'].str.replace(r"(and.*)", '')
    chart['main_artist'] = chart['main_artist'].str.replace(r"(,.*)", '')
    chart['main_artist'] = chart['main_artist'].str.replace(r"(&.*)", '')
    chart['main_artist'] = chart['main_artist'].str.replace(r"(:.*)", '')

    #remove parentheticals from song name and artist name
    chart['search_title'] = chart['song_title'].str.replace(r"(\(.*\))", "")
    chart['main_artist'] = chart['main_artist'].str.replace(r"(\(.*\))", "")

    #strip out whitespace
    chart['search_title'] = chart['search_title'].str.strip()
    chart['main_artist'] = chart['main_artist'].str.strip()

    #convert chart_date to datetime object
    #chart["chart_date"] = [string_to_datetime(d) for d in chart["chart_date"]]

In [71]:
## standardize peak positions, take first when multiple are listed
for i, r in sales.iterrows():
    try:
        new_pos = int(r['peak_position'])
    except:
        new_pos = int(r['peak_position'].split(' ')[0])
    sales.at[i, 'peak_position'] = new_pos

In [72]:
sales.rename(columns={'chart_date':'chart_date_sales', 'peak_position':'peak_position_sales', 'chart_weeks':'chart_weeks_sales'}, inplace=True)
streaming.rename(columns={'chart_date':'chart_date_streaming', 'peak_position':'peak_position_streaming', 'chart_weeks':'chart_weeks_streaming'}, inplace=True)
radio.rename(columns={'chart_date':'chart_date_radio','peak_position':'peak_position_radio', 'chart_weeks':'chart_weeks_radio'}, inplace=True)

In [75]:
## merge other charts into the hot 100 data
hot100_radio = hot100.merge(radio, how='outer', on=['search_title', 'main_artist'])
hot100_radio_streaming = hot100_radio.merge(streaming, how='outer', on=['search_title', 'main_artist'])
hot100_radio_streaming_sales = hot100_radio_streaming.merge(sales, how='outer', on=['search_title', 'main_artist'])

In [81]:
# identify all songs missing lyrics (not on hot 100)
to_fill = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['translated_lyrics'].isnull()]

In [78]:
### scrape original lyrics from genius

genius = lyricsgenius.Genius('wxBLY4fHewydC6H8GBlsicom4El7Gyo6vKfejjTR9ojLfNUslIjcmvZ9iYVGi5RC')

for i, r in to_fill.iterrows():
    this_song = genius.search_song(r['search_title'], r['main_artist'])

    #continue if it returns a result
    if this_song:
        #partial ratio threshold to prevent mismatches
        title_similarity = fuzz.partial_ratio(r['search_title'].lower(), this_song.title.lower())
        artist_similarity = fuzz.partial_ratio(r['main_artist'].lower(), this_song.artist.lower())
        

        #if similarity is above threshold, store lyrics + url
        if title_similarity > 60 and artist_similarity > 60:
            print("MATCHED:")
            print(r['search_title'], r['main_artist'], " | ", this_song.title, this_song.artist)
            hot100_radio_streaming_sales.at[i, 'original_lyrics'] = this_song.lyrics
            hot100_radio_streaming_sales.at[i, 'Original URL'] = this_song.url

Searching for "Euphoria" by BTS...
Done.
MATCHED:
Euphoria BTS  |  Euphoria BTS (방탄소년단)
Searching for "I'm Fine" by BTS...
Done.
MATCHED:
I'm Fine BTS  |  I’m Fine BTS (방탄소년단)
Searching for "Epiphany" by BTS...
Done.
MATCHED:
Epiphany BTS  |  Epiphany BTS (방탄소년단)
Searching for "Trivia: Seesaw" by BTS...
Done.
MATCHED:
Trivia: Seesaw BTS  |  Trivia 轉: Seesaw BTS (방탄소년단)
Searching for "Answer: Love Myself" by BTS...
Done.
MATCHED:
Answer: Love Myself BTS  |  Answer: Love Myself BTS (방탄소년단)
Searching for "Trivia: Just Dance" by BTS...
Done.
MATCHED:
Trivia: Just Dance BTS  |  Trivia 起: Just Dance BTS (방탄소년단)
Searching for "Serendipity" by BTS...
Done.
Searching for "Trivia: Love" by BTS...
Done.
MATCHED:
Trivia: Love BTS  |  Trivia 承: Love BTS (방탄소년단)
Searching for "Pop/Stars" by K/DA...
Done.
MATCHED:
Pop/Stars K/DA  |  POP/STARS K/DA
Searching for "2! 3!" by BTS...
Done.
MATCHED:
2! 3! BTS  |  둘! 셋! (그래도 좋은 날이 더 많기를) [2! 3! (Still Wishing For More Good Days)] BTS (방탄소년단)
Searching for "

In [82]:
##iterate through all songs to scrape translated lyrics
for i, r in to_fill.iterrows():

    artist = r['main_artist']
    title = r['search_title']
    this_song = genius.search_song(artist + ' - ' + title + ' (English Translation)', 'Genius English Translations')

    #continue if it returns a result
    if this_song:
        #partial ratio threshold to prevent mismatches
        similarity = fuzz.partial_ratio((artist + ' - ' + title + ' (English Translation)').lower(), this_song.title.lower())
        

        #if similarity is above threshold, store lyrics + url
        if similarity > 60:
            print('MATCHED:')
            print((artist + ' - ' + title), this_song.title)
            hot100_radio_streaming_sales.at[i, 'translated_lyrics'] = this_song.lyrics
            hot100_radio_streaming_sales.at[i, 'English Translation URL'] = this_song.url

Searching for "BTS - Euphoria (English Translation)" by Genius English Translations...
Done.
MATCHED:
BTS - Euphoria BTS - Euphoria (English Translation)
Searching for "BTS - I'm Fine (English Translation)" by Genius English Translations...
Done.
MATCHED:
BTS - I'm Fine BTS - I’m Fine (English Translation)
Searching for "BTS - Epiphany (English Translation)" by Genius English Translations...
Done.
MATCHED:
BTS - Epiphany BTS - Epiphany (English Translation)
Searching for "BTS - Trivia: Seesaw (English Translation)" by Genius English Translations...
Done.
MATCHED:
BTS - Trivia: Seesaw BTS - Trivia 轉: Seesaw (English Translation)
Searching for "BTS - Answer: Love Myself (English Translation)" by Genius English Translations...
Done.
MATCHED:
BTS - Answer: Love Myself BTS - Answer: Love Myself (English Translation)
Searching for "BTS - Trivia: Just Dance (English Translation)" by Genius English Translations...
Done.
MATCHED:
BTS - Trivia: Just Dance BTS - Trivia 起: Just Dance (English Tran

In [100]:
no_original = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['original_lyrics'].isnull()]
no_original['search_title']

Series([], Name: search_title, dtype: object)

In [99]:
#### original lyrics mistakes

## missed Serendipity
#this_song = genius.search_song("Serendipity Full Length", "BTS")
#index = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['search_title'] == 'Serendipity'].index[0]
#hot100_radio_streaming_sales.at[index, 'Original URL'] = this_song.url
#hot100_radio_streaming_sales.at[index, 'original_lyrics'] = this_song.lyrics

## missed winter flower
#this_song = genius.search_song("Winter Flower (雪中梅)", "Younha")
#index = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['search_title'] == 'Winter Flower'].index[0]
#hot100_radio_streaming_sales.at[index, 'Original URL'] = this_song.url
#hot100_radio_streaming_sales.at[index, 'original_lyrics'] = this_song.lyrics

## missed zero o clock
this_song = genius.search_song("Zero O'Clock", "BTS")
index = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['search_title'] == '00:00'].index[0]
hot100_radio_streaming_sales.at[index, 'Original URL'] = this_song.url
hot100_radio_streaming_sales.at[index, 'original_lyrics'] = this_song.lyrics

Searching for "Zero O'Clock" by BTS...
Done.


In [107]:
no_translated = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['translated_lyrics'].isnull()]
no_translated['search_title']

Series([], Name: search_title, dtype: object)

In [102]:
#### translated lyrics mistakes

## sweet night is entirely in english
sn_index = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['search_title'] == 'Sweet Night'].index[0]
hot100_radio_streaming_sales.at[sn_index, 'English Translation URL'] = hot100_radio_streaming_sales.at[sn_index, 'Original URL']
hot100_radio_streaming_sales.at[sn_index, 'translated_lyrics'] = hot100_radio_streaming_sales.at[sn_index, 'original_lyrics']

## who is also entirely in english
who_index = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['search_title'] == 'Who'].index[0]
hot100_radio_streaming_sales.at[who_index, 'English Translation URL'] = hot100_radio_streaming_sales.at[who_index, 'Original URL']
hot100_radio_streaming_sales.at[who_index, 'translated_lyrics'] = hot100_radio_streaming_sales.at[who_index, 'original_lyrics']

In [104]:
## missed translation for blueberry eyes
bbe_genius = genius.search_song("MAX - Blueberry Eyes ft. SUGA (English Translation)", "Genius English Translations")
bbe_index = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['search_title'] == 'Blueberry Eyes'].index[0]
hot100_radio_streaming_sales.at[bbe_index, 'English Translation URL'] = bbe_genius.url
hot100_radio_streaming_sales.at[bbe_index, 'translated_lyrics'] = bbe_genius.lyrics

Searching for "MAX - Blueberry Eyes ft. SUGA (English Translation)" by Genius English Translations...
Done.


In [106]:
## genius does not have english translaltion of gbtb
gbtb_index = hot100_radio_streaming_sales.loc[hot100_radio_streaming_sales['search_title'] == 'G.B.T.B.'].index[0]
hot100_radio_streaming_sales.at[gbtb_index, 'English Translation URL'] = 'https://lyricstranslate.com/en/gbtb-gbtb.html'
hot100_radio_streaming_sales.at[gbtb_index, 'translated_lyrics'] = open("Manual Lyrics/GBTB.txt", "r").read()

In [108]:
#### re-process data now that it is all collected

#scale spotify valence values
#hot100_radio_streaming_sales['scaled_valence'] = [s*2 - 1 for s in hot100_radio_streaming_sales['valence']]

##process text 
#remove romanized lyrics that are in some genius pages
hot100_radio_streaming_sales["original_lyrics"] = [lyrics.split("Romanization")[0] for lyrics in hot100_radio_streaming_sales["original_lyrics"]]

#get rid of bracketed sections like verse indicators
hot100_radio_streaming_sales["original_lyrics"] = hot100_radio_streaming_sales["original_lyrics"].str.replace(r"(\[.*\])", "")
hot100_radio_streaming_sales["translated_lyrics"] = hot100_radio_streaming_sales["translated_lyrics"].str.replace(r"(\[.*\])", "")

#delete trailing whitespace
hot100_radio_streaming_sales["original_lyrics"] = hot100_radio_streaming_sales["original_lyrics"].str.strip()
hot100_radio_streaming_sales["translated_lyrics"] = hot100_radio_streaming_sales["translated_lyrics"].str.strip()

#clean spacing: add space before linebreaks (so theyre counted separately)
hot100_radio_streaming_sales["translated_lyrics"] = hot100_radio_streaming_sales["translated_lyrics"].str.replace(r"(\n+)", " \n")
hot100_radio_streaming_sales["translated_lyrics"] = hot100_radio_streaming_sales["translated_lyrics"].str.replace(r"(\s+)", " ") #restandardize whitespace
hot100_radio_streaming_sales["translated_lyrics"] = hot100_radio_streaming_sales["translated_lyrics"].str.replace(r"(\s[, | \' | \? | \! | \. | \-]+)", " ")  #delete hanging punctuation
hot100_radio_streaming_sales["original_lyrics"] = hot100_radio_streaming_sales["original_lyrics"].str.replace(r"(\n+)", " \n")
hot100_radio_streaming_sales["original_lyrics"] = hot100_radio_streaming_sales["original_lyrics"].str.replace(r"(\s+)", " ") #restandardize whitespace
hot100_radio_streaming_sales["original_lyrics"] = hot100_radio_streaming_sales["original_lyrics"].str.replace(r"(\s[, | ' | \? | \! | \. | \-]+)", " ")  #delete hanging punctuation

#only roman characters and white space and punctuation
hot100_radio_streaming_sales["english_lyrics"] = hot100_radio_streaming_sales["original_lyrics"].str.replace(r"([^a-z | A-Z | \s | , | ' | \? | \! | \. | \- ])", " ")
hot100_radio_streaming_sales["english_lyrics"] = hot100_radio_streaming_sales["english_lyrics"].str.replace(r"(\s+)", " ") #restandardize whitespace
hot100_radio_streaming_sales["english_lyrics"] = hot100_radio_streaming_sales["english_lyrics"].str.replace(r"(\s[, | ' | \? | \! | \. | \-]+)", " ")  #delete hanging punctuation

##count number of english lyrics and total lyrics
hot100_radio_streaming_sales['english_count'] = [len(lyr.split(" ")) for lyr in hot100_radio_streaming_sales['english_lyrics']]
hot100_radio_streaming_sales['total_count'] = [len(lyr.split(" ")) for lyr in hot100_radio_streaming_sales['original_lyrics']]

##count number of unique english and total lyrics
hot100_radio_streaming_sales['english_unique'] = [len(set(lyr.split(" "))) for lyr in hot100_radio_streaming_sales['english_lyrics']]
hot100_radio_streaming_sales['total_unique'] = [len(set(lyr.split(" "))) for lyr in hot100_radio_streaming_sales['original_lyrics']]

### create columns for english percentage
hot100_radio_streaming_sales['english_percentage'] = hot100_radio_streaming_sales['english_count'] / hot100_radio_streaming_sales['total_count']
hot100_radio_streaming_sales['unique_english_percentage'] = hot100_radio_streaming_sales['english_unique'] / hot100_radio_streaming_sales['total_unique']

In [109]:
hot100_radio_streaming_sales

Unnamed: 0,chart_date,artist_x,song_title_x,Original URL,original_lyrics,English Translation URL,translated_lyrics,peak_position,chart_weeks,english_lyrics,...,chart_weeks_streaming,chart_date_sales,artist_y,song_title_y,peak_position_sales,chart_weeks_sales,english_unique,total_unique,english_percentage,unique_english_percentage
0,2019-01-12,Pinkfong,Baby Shark,https://genius.com/Pinkfong-baby-shark-lyrics,"Baby shark, doo doo doo doo doo doo Baby shark...",https://genius.com/Pinkfong-baby-shark-lyrics,"Baby shark, doo doo doo doo doo doo Baby shark...",32.0,20.0,"Baby shark, doo doo doo doo doo doo Baby shark...",...,6 (R) 59,,,,,,24,23,1.032520,1.043478
1,2009-10-31,Wonder Girls,Nobody (English release),https://genius.com/Wonder-girls-nobody-english...,You know I still love you baby And it will nev...,https://genius.com/Wonder-girls-nobody-english...,You know I still love you baby And it will nev...,76.0,1.0,You know I still love you baby And it will nev...,...,,,,,,,119,123,1.000000,0.967480
2,2012-09-22,PSY,Gangnam Style,https://genius.com/Psy-gangnam-style-lyrics,Hangul 오빤 강남 스타일 강남 스타일 낮에는 따사로운 인간적인 여자 커피 한잔...,https://genius.com/Genius-english-translations...,Oppa Gangnam Style Gangnam style Warm human wo...,2.0,3.0,Hangul Hey Hey Hey Hey sexy lady sexy lady Hey...,...,122,2012-10-06,PSY,Gangnam Style,1,32,12,83,0.114391,0.144578
3,2013-04-27,PSY,Gentleman,https://genius.com/Psy-gentleman-lyrics,알랑가몰라 왜 화끈해야 하는건지 알랑가몰라 왜 말끔해야 하는건지 알랑가몰라 아리까리...,https://lyricstranslate.com/en/gentleman-gentl...,I don’t know if you know why it needs to be ho...,5.0,1.0,"We like We-we-we like party Damn, girl You so...",...,19,2013-05-04,PSY,Gentleman,20,4,37,75,0.710660,0.493333
4,2014-06-28,PSY feat. Snoop Dogg,Hangover,https://genius.com/Psy-hangover-lyrics,"Hangover, hangover, hangover, hangover-over-ov...",https://colorcodedlyrics.com/2014/06/psy-ssai-...,"Hangover, hangover, hangover, hangover-over-ov...",26.0,1.0,"Hangover, hangover, hangover, hangover-over-ov...",...,1,,,,,,170,211,0.948603,0.805687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,,,,https://genius.com/Bts-stay-gold-lyrics,Ooh ooh In a world where you feel cold You got...,https://genius.com/Genius-english-translations...,"Oh, woah, oh In a world where you feel cold Yo...",,,Ooh ooh In a world where you feel cold You got...,...,,2020-07-04,BTS,Stay Gold,6,1,41,91,0.718919,0.450549
81,,,,https://genius.com/Bts-your-eyes-tell-lyrics,何故、こんなにも 涙が溢れるの ねえ、側にいて そして笑ってよ 君のいない未来は 色のない世...,https://genius.com/Genius-english-translations...,"Why are my eyes filled with tears? Hey, stay b...",,,So beautiful follow me I ll find you So color...,...,,2020-07-25,BTS,Your Eyes Tell,12,1,16,57,0.313433,0.280702
82,,,,https://genius.com/K-da-the-baddest-lyrics,Baddest do what the baddest do (Hey) The badde...,https://genius.com/Genius-english-translations...,Baddest do what the baddest do (Hey) The badde...,,,Baddest do what the baddest do Hey The baddest...,...,,2020-09-12,K/DA,The Baddest,28,1,180,220,0.903967,0.818182
83,,,,https://genius.com/Max-blueberry-eyes-lyrics,"Mmm, mmm Da-da, ooh woah-oh, yeah MAX, baby Le...",https://genius.com/Genius-english-translations...,"Mmm, mmm Da-da, ooh woah-oh, yeah MAX, baby Le...",,,"Mmm, mmm Da-da, ooh woah-oh, yeah MAX, baby Le...",...,,2020-09-26,Max Schneider feat. Suga,Blueberry Eyes,22,1,87,131,0.817427,0.664122


In [169]:
#write to csv
#radio.to_csv('subcharts - Radio_Kpop.csv')
#streaming.to_csv('subcharts - Streaming_Kpop.csv')
#sales.to_csv('subcharts - Sales_Kpop.csv')
hot100_radio_streaming_sales.to_csv('hot100_radio_streaming_sales_Kpop.csv')