In [64]:
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
import unidecode
from lyricsgenius import Genius
import spotipy
import spotipy.util as util
import text2emotion as te
from difflib import SequenceMatcher
import nltk
import ast
import time
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [65]:
env_path = find_dotenv()
load_dotenv(env_path)

True

In [66]:
# loading the dataset into a dataframe
data = pd.read_csv('../raw_data/data.csv')
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [67]:
# create a sample to work with a smaller dataset throughout development
data_sample = data.sample(10, ignore_index=True, random_state=0)

In [68]:
# clean strings
def clean_text(text):
    """
    Removes symbols, accents and uppercases from text.
    """
    text = text.replace("[",'')\
        .replace("]",'')\
        .replace("'",'')\
        .replace(",",'')\
        .replace(":",'')\
        .replace(")",'')\
        .replace("(",'')\
        .replace(".",'')\
        .replace('"','')\
        .replace('/','')\
        .replace("\\",'')\
        .replace("(?)",'')\
        .replace("-",'')
        
    return unidecode.unidecode(text.lower())


In [69]:
# get lyrics
def get_lyrics(artist, song_title,genius):
    """
    Returns lyrics from Genius API.
    """
    artist = clean_text(artist)
    song_title = clean_text(song_title)
    
    # call the API to search for the song
    song = genius.search_song(title=song_title, artist=artist)
    
    if song != None:
        
        # get song name from the API response to compare similarity
        api_response = unidecode.unidecode(song.to_dict()['full_title'].replace('\xa0', ' ').replace('\u200b', ' ').lower())
        api_response = api_response.split(' by ')

        # check similarity
        song_similar = similar(api_response[0], song_title)
        # artist_similar = similar(api_response[1], artist)

        
        if song_similar >= 0.9:

            # all lyrics responses come with the song's title and 'Lyrics' str
            # so we count how many characters should be removed in order 
            # to delete any extra text
            characters_to_remove = len(song.to_dict()['title'] + ' Lyrics')   
            
            # get lyrics from API
            lyrics = genius.lyrics(song.to_dict()['id'])[characters_to_remove:-5].replace('\n', ' ').strip()

        else:
            
            lyrics = 'None'
    
    else:
        
        lyrics = 'None'
    
    return lyrics


# we need to check similarity between the input of the API call and the response
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()


In [70]:
# test_lyrics = get_lyrics(data_sample['artists'][7], data_sample['name'][7])
# test_lyrics

In [71]:
# get sentiments
def get_sentiments(lyrics):
    """
    Get sentiments from lyrics using VADER.
    """
    if lyrics != 'None':
        
        sid = SentimentIntensityAnalyzer()

        sentiments = sid.polarity_scores(clean_text(lyrics))    
    
        return sentiments
    
    else:
        
        return 'None'
 

In [72]:
# test_sentiments = get_sentiments(test_lyrics)
# test_sentiments

In [73]:
# get emotions
def get_emotions(lyrics):
    """
    Get emotions from lyrics using text2emotion.
    """
    if lyrics != 'None':
        emotions = te.get_emotion(clean_text(lyrics))
        emotions = {key.lower(): value for key, value in emotions.items()}    
        return emotions
    
    else:
        
        return 'None'


In [74]:
# test_emotions = get_emotions(test_lyrics)
# test_emotions

In [96]:
# create df containing lyrics, sentiments and emotions
def get_full_dataframe(df,genius):
    """
    Returns a DataFrame adding lyrics, sentiments and emotions.
    """
    # add lyrics to df
    df['lyrics'] = ''
    df['sentiments'] = ''
    df['emotions'] = ''
    
    for index in range(len(df)):
        
        start = time.time()
        lyrics = get_lyrics(df['artists'][index], df['name'][index],genius)
        end = time.time()
        print(f"lyrics {end - start} ms")
 
        start = time.time()
        sentiments = get_sentiments(lyrics)         
        df['sentiments'][index] = df['sentiments'][index].replace('', str(sentiments))
        end = time.time()
        print(f"sentiments {end - start} ms")
 
        start = time.time()
        emotions = get_emotions(lyrics)         
        df['emotions'][index] = df['emotions'][index].replace('', str(emotions))
        end = time.time()
        print(f"emotions {end - start} ms")
 
    # transform sentiments from string to dict    
    df['sentiments'] = df['sentiments'].apply(lambda x: ast.literal_eval(x))

    # create a dataframe with each sentiment in a separate columns
    sentiments_df = df['sentiments'].apply(pd.Series)
    sentiments_df = sentiments_df.fillna('None')
    df = pd.concat([df, sentiments_df], axis=1)

    df['emotions'] = df['emotions'].apply(lambda x: ast.literal_eval(x))
    # create a dataframe with each emotion in a separate columns
    emotions_df = df['emotions'].apply(pd.Series)
    emotions_df = emotions_df.fillna('None')
    df = pd.concat([df, emotions_df], axis=1)
    
    return df

SyntaxError: unmatched '}' (4280174264.py, line 14)

In [95]:
genius_token = os.environ.get('LYRICSGENIUS_ACCESS_TOKEN')

  # instantiate the class with some useful hyperparameters
genius = Genius(genius_token, 
            timeout=220, 
            remove_section_headers=True, 
            sleep_time=0.05, 
            skip_non_songs=True)
test_df = get_full_dataframe(data_sample, genius)
# print(test_df)

Searching for "close your eyes" by parmalee...
Done.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiments'][index] = df['sentiments'][index].replace('', str(sentiments))


KeyboardInterrupt: 

In [92]:
test_df.to_csv('../processed_data/processed_test_data.csv')

In [None]:
test_df["emotions"]

NameError: name 'df' is not defined