In [63]:
import pandas as pd
from datetime import timedelta
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob

In [2]:
#function to convert strings to dates :)
def string_to_datetime(string):
    """Parse a ``YYYY-MM-DD`` date string into a ``datetime``.

    Parameters
    ----------
    string : str
        Date text in year-month-day form, e.g. ``"2010-08-07"``.

    Returns
    -------
    datetime.datetime
        The parsed date; the time part is midnight because the format
        carries no time fields.

    Raises
    ------
    ValueError
        If ``string`` does not match the ``%Y-%m-%d`` format.
    """
    return datetime.strptime(string, "%Y-%m-%d")

In [48]:
# Load the chart/lyrics table, parsing chart_date into datetime objects while reading.
lyrics_data = pd.read_csv('Spotify_WDSS_Kpop.csv', converters={'chart_date': string_to_datetime})
# drop old index columns (reassign instead of inplace so re-running the cell stays predictable)
lyrics_data = lyrics_data.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], axis=1)

## mark whether each song has translated lyrics, original lyrics and spotify data, and which have all of them
# a lyrics cell counts as present only when it holds a str (empty CSV cells are presumably read as NaN, a float)
lyrics_data['has_translation'] = lyrics_data['translated_lyrics'].map(lambda t: isinstance(t, str))
lyrics_data['has_original'] = lyrics_data['original_lyrics'].map(lambda t: isinstance(t, str))
# NaN >= 0 evaluates to False, so rows without a valence value are flagged as lacking spotify data
lyrics_data['has_spotify'] = lyrics_data['valence'] >= 0
# vectorized AND of the three flags (replaces a row-by-row iterrows() loop)
lyrics_data['has_all'] = (
    lyrics_data['has_translation']
    & lyrics_data['has_original']
    & lyrics_data['has_spotify']
)

## locate songs that have all information
# .copy() detaches the subset from lyrics_data so later column assignments on
# complete_data do not raise SettingWithCopyWarning or silently hit a temporary
complete_data = lyrics_data.loc[lyrics_data['has_all']].copy()
print('# songs with translation, original lyrics, and spotify data:', len(complete_data))
# complete_data is used for the rest of the analysis

# songs with translation, original lyrics, and spotify data: 113


In [62]:
##process text
# Work on an independent copy: complete_data was filtered out of lyrics_data, and
# assigning columns on such a slice triggers SettingWithCopyWarning.
complete_data = complete_data.copy()

# NOTE: every .str.replace below passes regex=True explicitly. From pandas 2.0 the
# default is regex=False, which would treat these patterns as literal text and
# leave the lyrics uncleaned.

#remove romanized lyrics that are in some genius pages (keep only the text before the marker)
complete_data["original_lyrics"] = complete_data["original_lyrics"].str.split("Romanization").str[0]

#get rid of bracketed sections like verse indicators
# ('.' does not match a newline, so each match stays within one line)
complete_data["original_lyrics"] = complete_data["original_lyrics"].str.replace(r"(\[.*\])", "", regex=True)
complete_data["translated_lyrics"] = complete_data["translated_lyrics"].str.replace(r"(\[.*\])", "", regex=True)

#delete leading and trailing whitespace
complete_data["original_lyrics"] = complete_data["original_lyrics"].str.strip()
complete_data["translated_lyrics"] = complete_data["translated_lyrics"].str.strip()

#clean spacing: collapse each run of linebreaks into one linebreak preceded by a space
complete_data["translated_lyrics"] = complete_data["translated_lyrics"].str.replace(r"(\n+)", " \n", regex=True)
complete_data["original_lyrics"] = complete_data["original_lyrics"].str.replace(r"(\n+)", " \n", regex=True)

#only roman characters, white space and punctuation: everything else becomes a space
# NOTE(review): inside [...] the '|' and the spaces are literal class members, not
# alternation. So '|' is also kept by the first pattern, and the second pattern
# (whitespace followed by spaces/punctuation) also collapses runs of spaces.
# The patterns are left unchanged so english_count stays comparable with earlier runs.
complete_data["english_lyrics"] = complete_data["original_lyrics"].str.replace(r"([^a-z | A-Z | \s | \, | \' | \? | \! | \. | \- ])", " ", regex=True)
complete_data["english_lyrics"] = complete_data["english_lyrics"].str.replace(r"(\s[, | \' | \? | \! | \. | \-]+)", " ", regex=True)  #delete hanging punctuation

##count number of english lyrics and total lyrics
# counts are the number of pieces produced by splitting on a single space character
complete_data['english_count'] = complete_data['english_lyrics'].str.split(" ").str.len()
complete_data['total_count'] = complete_data['original_lyrics'].str.split(" ").str.len()

#add columns for sentiment polarity for both translated and english lyrics
# initialised to 0.0 here; a later cell fills them in per song
complete_data['translated_sentiment'] = 0.0
complete_data['english_sentiment'] = 0.0

complete_data

Unnamed: 0,Unnamed: 0.1.1.1,chart_date,artist,song_title,original_lyrics,translated_lyrics,peak_position,entry_weeks,Total\nweeks,main_artist,...,scaled_valence,has_translation,has_original,has_spotify,has_all,english_lyrics,english_count,total_count,translated_sentiment,english_sentiment
0,3,2010-08-07,Shinee,Lucifer,Hangul \n숨을 곳도 찾지 못해 나는 \n피하려고 애써 봐도...,Even if I try to avoid you \nI can't find...,3,20,57,Shinee,...,0.732,True,True,True,True,Hangul \nHer whisper is the Lucifer \nT...,121,892,0.0,0.0
10,32,2011-08-06,2NE1,Ugly,밝게 웃어보지만 \n내 맘에 들지 않아 \n난 예쁘지 않아 아름답...,"I put a big smile on, but I don’t like how I l...",2,6,6,2NE1,...,0.036,True,True,True,True,\nOh oh oh oh \nOh oh oh oh \nOh oh oh o...,260,716,0.0,0.0
11,33,2011-08-13,2NE1,I Am the Best,내가 제일 잘 나가 \n내가 제일 잘 나가 \n내가 제일 잘 나가...,I am the Best \nI am the Best \nI am...,1,31,121,2NE1,...,0.374,True,True,True,True,"Beat! \nBam, ra-ta-ta-ta, ta-ta-ta-ta-ta ...",142,599,0.0,0.0
13,49,2011-11-19,Wonder Girls,Be My Baby,JYP \nAnd the Wonder Girls \nWe're b...,JYP \nAnd the Wonder Girls \nWe're b...,2,13,13,Wonder Girls,...,-0.138,True,True,True,True,JYP \nAnd the Wonder Girls \nWe're back! \nWat...,104,540,0.0,0.0
18,66,2012-03-17,Big Bang,Bad Boy,"Ah \nBaby, come back \nA-yo, Choice,...",Ah... \nBaby come back... \nAyo choi...,3,11,11,Big Bang,...,-0.150,True,True,True,True,"Ah \nBaby, come back \nA-yo, Choice, drop it o...",276,824,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,1372,2020-10-17,Blackpink,Pretty Savage,"Uh-huh, uh-huh \nUh-huh, uh-huh (Prrr) ...","Uh-huh, uh-huh \nUh-huh, uh-huh (Prrr) ...",2,2,4,Blackpink,...,-0.236,True,True,True,True,"Uh-huh, uh-huh \nUh-huh, uh-huh Prrr \nBLACKPI...",246,616,0.0,0.0
241,1393,2020-11-14,CL,Hwa,"Yeah, you know \nYou want some tight tigh...",Nana Nanna Nana Nana~ (*3) \nThe rose of ...,3,1,1,CL,...,0.802,True,True,True,True,"Yeah, you know \nYou want some tight tight pus...",627,1033,0.0,0.0
242,1405,2020-12-05,BTS,Life Goes On,어느 날 세상이 멈췄어 \n아무런 예고도 하나 없이 \n봄은 기다...,"I remember \nI, I, I remember \nAh, ...",1,1,1,BTS,...,-0.100,True,True,True,True,yeah oh \nLike an ec...,137,593,0.0,0.0
243,1406,2020-12-05,BTS,Blue & Grey,Where is my angel? \n하루의 끝을 드리운 \nSo...,Where is my angel? \nThe end of the day ...,2,1,1,BTS,...,-0.272,True,True,True,True,Where is my angel? \nSomeone come and save me...,115,564,0.0,0.0


In [64]:
# Score each song's lyrics with TextBlob and store the polarity values in the
# english_sentiment / translated_sentiment columns, printing them as we go.
for row_label, song in complete_data.iterrows():
    print(song['song_title'])

    # polarity of the english-only text extracted from the original lyrics
    english_polarity = TextBlob(song['english_lyrics']).sentiment.polarity
    print('english', english_polarity)
    complete_data.at[row_label, 'english_sentiment'] = english_polarity

    # polarity of the translated lyrics
    translated_polarity = TextBlob(song['translated_lyrics']).sentiment.polarity
    print('translated', translated_polarity)
    complete_data.at[row_label, 'translated_sentiment'] = translated_polarity
    

Lucifer
english 0.0
translated 0.09833539476396619
Ugly
english 0.04583333333333334
translated -0.05732323232323231
I Am the Best
english 0.340087890625
translated 0.5342498110355253
Be My Baby
english 0.325
translated 0.1354761904761905
Bad Boy
english -0.14236111111111108
translated -0.029742547425474213
Fantastic Baby
english 0.26666666666666666
translated 0.08903846153846154
Gangnam Style
english -0.10191896226671007
translated 0.3921874999999999
No More Dream
english 0.031746031746031744
translated 0.08017625231910944
Ringa Linga
english 0.0987202380952381
translated 0.19795739348370928
Doom Dada
english 0.028241031843973013
translated 0.10162220026350462
Boy in Luv
english -0.6999999999999998
translated -0.00197916666666665
Eyes, Nose, Lips
english 0.0
translated 0.15582922824302137
Touch My Body
english 0.55
translated 0.14183872767857142
Ice Cream Cake
english 0.3229166666666667
translated 0.20024321556579616
Loser
english 0.0
translated -0.18020993343573985
Just Right
english 