In [1]:
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
import unidecode
from lyricsgenius import Genius
import spotipy
import spotipy.util as util
import text2emotion as te
from difflib import SequenceMatcher
import nltk
import ast
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package stopwords to /Users/f/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/f/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/f/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
env_path = find_dotenv()
load_dotenv(env_path)

True

In [3]:
# loading the dataset into a dataframe
data = pd.read_csv('../raw_data/data.csv')
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [4]:
# create a sample to work with a smaller dataset throughout development
data_sample = data.sample(10, ignore_index=True, random_state=0)

In [5]:
# clean strings
def clean_text(text):
    """
    Removes symbols, accents and uppercases from text.
    """
    text = text.replace("[",'')\
        .replace("]",'')\
        .replace("'",'')\
        .replace(",",'')\
        .replace(":",'')\
        .replace(")",'')\
        .replace("(",'')\
        .replace(".",'')\
        .replace('"','')\
        .replace('/','')\
        .replace("\\",'')\
        .replace("(?)",'')\
        .replace("-",'')
        
    return unidecode.unidecode(text.lower())


In [6]:
# get lyrics
def get_lyrics(artist, song_title):
    """
    Returns lyrics from Genius API.
    """
    artist = clean_text(artist)
    song_title = clean_text(song_title)
    
    # loading needed credentials
    genius_token = os.environ.get('LYRICSGENIUS_ACCESS_TOKEN')
    
    # instantiate the class with some useful hyperparameters
    genius = Genius(genius_token, 
                timeout=220, 
                remove_section_headers=True, 
                sleep_time=0.3, 
                skip_non_songs=True)
    
    # call the API to search for the song
    song = genius.search_song(title=song_title, artist=artist)
    
    if song != None:
        
        # get song name from the API response to compare similarity
        api_response = unidecode.unidecode(song.to_dict()['full_title'].replace('\xa0', ' ').replace('\u200b', ' ').lower())
        api_response = api_response.split(' by ')

        # check similarity
        song_similar = similar(api_response[0], song_title)
        # artist_similar = similar(api_response[1], artist)
    
        if song_similar >= 0.9:
        
            # all lyrics responses come with the song's title and 'Lyrics' str
            # so we count how many characters should be removed in order 
            # to delete any extra text
            characters_to_remove = len(song.to_dict()['title'] + ' Lyrics')   
            
            # get lyrics from API
            lyrics = genius.lyrics(song.to_dict()['id'])[characters_to_remove:-5].replace('\n', ' ').strip()

        else:
            
            lyrics = 'None'
    
    else:
        
        lyrics = 'None'
    
    return lyrics


# we need to check similarity between the input of the API call and the response
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()


In [7]:
test_lyrics = get_lyrics(data_sample['artists'][7], data_sample['name'][7])
test_lyrics

Searching for "look what youve done" by drake...
Done.


"Yeah Yeah Man  It's like '09 in your basement and I'm in love with Nebby And I still love her, but it fell through because I wasn't ready And your back hurt, and your neck hurt and you smokin' heavy And I sit next to you, and I lecture you because those are deadly And then you ash it, and we argue about spendin' money on bullshit And you tell me I'm just like my father, my one button, you push it Now it's fuck you, I hate you, I'll move out in a heartbeat And I leave out and you call me and you tell me that you sorry And you love me and I love you and your heart hurts, mine does too And it's just words and they cut deep, but it's our world, it's just us two I see painkillers on the kitchen counter, I hate to see it all hurt so bad But maybe I wouldn't have worked this hard if you were healthy and it weren't so bad, uh Maybe I should walk up the street And try and get a job at the bank 'Cause leave it up to me, J, and Neeks We'll probably end up robbin' a bank Then Wayne calls up my ph

In [8]:
# get sentiments
def get_sentiments(lyrics):
    """
    Get sentiments from lyrics using VADER.
    """
    if lyrics != 'None':
        
        sid = SentimentIntensityAnalyzer()

        sentiments = sid.polarity_scores(clean_text(lyrics))    
    
        return sentiments
    
    else:
        
        return 'None'
 

In [9]:
test_sentiments = get_sentiments(test_lyrics)
test_sentiments

{'neg': 0.118, 'neu': 0.714, 'pos': 0.168, 'compound': 0.992}

In [10]:
# get emotions
def get_emotions(lyrics):
    """
    Get emotions from lyrics using text2emotion.
    """
    if lyrics != 'None':
        emotions = te.get_emotion(clean_text(lyrics))
        
        emotions = {key.lower(): value for key, value in emotions.items()}
        
        return emotions
    
    else:
        
        return 'None'


In [11]:
test_emotions = get_emotions(test_lyrics)
test_emotions

{'happy': 0.23, 'angry': 0.04, 'surprise': 0.1, 'sad': 0.38, 'fear': 0.26}

In [12]:
# create df containing lyrics, sentiments and emotions
def get_full_dataframe(df):
    """
    Returns a DataFrame adding lyrics, sentiments and emotions.
    """
    # add lyrics to df
    df['lyrics'] = ''
    
    for index in range(len(df)):
        
        lyrics = get_lyrics(df['artists'][index], df['name'][index])
        
        df['lyrics'][index] = df['lyrics'][index].replace('', lyrics)
    
    # add sentiments to df
    df['sentiments'] = ''
    
    for index in range(len(df)):

        sentiments = get_sentiments(df['lyrics'][index])         
        
        df['sentiments'][index] = df['sentiments'][index].replace('', str(sentiments))
    
    # transform sentiments from string to dict    
    df['sentiments'] = df['sentiments'].apply(lambda x: ast.literal_eval(x))

    # create a dataframe with each sentiment in a separate columns
    sentiments_df = df['sentiments'].apply(pd.Series)
    sentiments_df = sentiments_df.fillna('None')
    
    df = pd.concat([df, sentiments_df], axis=1)

    # add emotions to df
    df['emotions'] = ''
    
    for index in range(len(df)):

        emotions = get_emotions(df['lyrics'][index])         
        
        df['emotions'][index] = df['emotions'][index].replace('', str(emotions))
    
    # transform emotions from string to dict    
    df['emotions'] = df['emotions'].apply(lambda x: ast.literal_eval(x))

    # create a dataframe with each emotion in a separate columns
    emotions_df = df['emotions'].apply(pd.Series)
    emotions_df = emotions_df.fillna('None')
    
    df = pd.concat([df, emotions_df], axis=1)
    
    return df


In [13]:
test_df = get_full_dataframe(data_sample)
test_df

Searching for "close your eyes" by parmalee...
Done.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lyrics'][index] = df['lyrics'][index].replace('', lyrics)


Searching for "99 problems" by jayz...
Done.
Searching for "raised on it" by sam hunt...
Done.
Searching for "drifter  2015 remaster" by iron maiden...
No results found for: 'drifter  2015 remaster iron maiden'
Searching for "soultana maurofora" by markos vamvakaris apostolos xatzixristos...
No results found for: 'soultana maurofora markos vamvakaris apostolos xatzixristos'
Searching for "just because" by frankie yankovic...
No results found for: 'just because frankie yankovic'
Searching for "a girl in the night" by ray price...
Done.
Searching for "look what youve done" by drake...
Done.
Searching for "wild for the night feat skrillex & birdy nam nam" by a$ap rocky skrillex birdy nam nam lord flacko...
Done.
Searching for "painkiller" by ruel...
Done.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiments'][index] = df['sentiments'][index].replace('', str(sentiments))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['emotions'][index] = df['emotions'][index].replace('', str(emotions))


Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,neg,neu,pos,compound,emotions,happy,angry,surprise,sad,fear
0,0.817,2013,0.0158,['Parmalee'],0.551,214933,0.863,0,3Bdqlr7jQLNhITAgcBGQBG,0.0,...,0.0,0.854,0.146,0.9939,"{'happy': 0.24, 'angry': 0.07, 'surprise': 0.2...",0.24,0.07,0.28,0.34,0.07
1,0.548,2003,0.00661,['JAY-Z'],0.494,234627,0.887,1,7sLpSWxQazJzDVG6YGzlVs,0.0,...,0.269,0.633,0.098,-0.9995,"{'happy': 0.05, 'angry': 0.13, 'surprise': 0.2...",0.05,0.13,0.23,0.14,0.45
2,0.732,2014,0.0477,['Sam Hunt'],0.59,235507,0.94,0,3BuPop8SzLG2Q88TJcFAjp,0.0,...,0.06,0.824,0.116,0.9786,"{'happy': 0.09, 'angry': 0.13, 'surprise': 0.1...",0.09,0.13,0.15,0.31,0.32
3,0.475,1981,0.000473,['Iron Maiden'],0.34,288947,0.974,0,7EvjTEzuv7TWaIaWY63sWV,0.0928,...,,,,,,,,,,
4,0.55,1930,0.994,"['Markos Vamvakaris', 'Apostolos Xatzixristos']",0.41,197653,0.169,0,38PozVGXXoeO8dTEVzy74Y,0.901,...,,,,,,,,,,
5,0.973,1947,0.389,['Frankie Yankovic'],0.875,169760,0.391,0,1HK6Zbd0cqB6c67jLVtWjI,0.0,...,,,,,,,,,,
6,0.607,1963,0.742,['Ray Price'],0.482,168267,0.272,0,3GBqKmJ62SJ943NQ9i1JuE,0.0,...,0.118,0.756,0.126,0.7391,"{'happy': 0.12, 'angry': 0.12, 'surprise': 0.2...",0.12,0.12,0.29,0.33,0.12
7,0.271,2011,0.485,['Drake'],0.495,301960,0.351,1,7t1lBIr3WIEtqQEOdZFMUf,0.0,...,0.118,0.714,0.168,0.992,"{'happy': 0.23, 'angry': 0.04, 'surprise': 0.1...",0.23,0.04,0.1,0.38,0.26
8,0.686,2013,0.0481,"['A$AP Rocky', 'Skrillex', 'Birdy Nam Nam', 'L...",0.757,212640,0.848,1,2rzBvHM9h36Tpdj7Jdajka,0.0,...,,,,,,,,,,
9,0.439,2019,0.105,['Ruel'],0.675,213598,0.414,0,1abFkY2jm6KDFMZ7RD9YJh,0.0,...,0.248,0.66,0.092,-0.9957,"{'happy': 0.42, 'angry': 0.09, 'surprise': 0.2...",0.42,0.09,0.2,0.18,0.11


In [14]:
test_df.to_csv('../processed_data/processed_test_data.csv')