# 1) Imports

In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv, find_dotenv
import unidecode
from lyricsgenius import Genius
import spotipy
import spotipy.util as util
import text2emotion as te
from difflib import SequenceMatcher
import nltk
import ast
import time
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from joblib import Parallel, delayed

[nltk_data] Downloading package stopwords to /Users/f/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/f/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/f/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
env_path = find_dotenv()
load_dotenv(env_path)

True

# 2) Getting raw data

In [3]:
# loading the dataset into a dataframe
data = pd.read_csv('../raw_data/data.csv')
data.head(3)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339


In [4]:
# create a sample to work with a smaller dataset throughout development
data_sample = data.sample(100, ignore_index=True, random_state=0)

# 3) Functions

### 3.1) Clean text

In [5]:
# clean strings
def clean_text(text):
    """
    Removes symbols, accents and uppercases from text.
    """
    text = text.replace("[",'')\
        .replace("]",'')\
        .replace("'",'')\
        .replace(",",'')\
        .replace(":",'')\
        .replace(")",'')\
        .replace("(",'')\
        .replace(".",'')\
        .replace('"','')\
        .replace('/','')\
        .replace("\\",'')\
        .replace("(?)",'')\
        .replace("-",'')
        
    return unidecode.unidecode(text.lower())


### 3.2) get_lyrics

In [6]:
# get lyrics
def get_lyrics(artist, song_title, genius):
    """
    Returns lyrics from Genius API.
    """
    # cleanup the inputs
    artist = clean_text(artist)
    song_title = clean_text(song_title)
    
    # call the API to search for the song
    song = genius.search_song(title=song_title, artist=artist)

    if song != None:
        # get song name from the API response to compare similarity
        api_response = unidecode.unidecode(song.to_dict()['full_title'].replace('\xa0', ' ').replace('\u200b', ' ').lower())
        api_response = api_response.split(' by ')
        # check similarity
        song_similar = similar(api_response[0], song_title)
        # artist_similar = similar(api_response[1], artist)

        if song_similar >= 0.9:
            # all lyrics responses come with the song's title and 'Lyrics' str
            # so we count how many characters should be removed in order 
            # to delete any extra text
            characters_to_remove = len(song.to_dict()['title'] + ' Lyrics')   
            
            # get lyrics from API
            lyrics = genius.lyrics(song.to_dict()['id'])[characters_to_remove:-5].replace('\n', ' ').strip()

        else:
            lyrics = None
    
    else:
        lyrics = None
    
    return lyrics


# we need to check similarity between the input of the API call and the response
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()


In [7]:
# test_lyrics = get_lyrics(data_sample['artists'][7], data_sample['name'][7])
# test_lyrics

### 3.3) get_sentiments

In [8]:
# get sentiments
def get_sentiments(lyrics):
    """
    Get sentiments from lyrics using VADER.
    """
    sid = SentimentIntensityAnalyzer()
    sentiments = sid.polarity_scores(clean_text(lyrics))
    
    return sentiments


In [9]:
# test_sentiments = get_sentiments(test_lyrics)
# test_sentiments

### 3.4) get_emotions

In [10]:
# get emotions
def get_emotions(lyrics):
    """
    Get emotions from lyrics using text2emotion.
    """
    emotions = te.get_emotion(clean_text(lyrics))
    emotions = {key.lower(): value for key, value in emotions.items()}   
     
    return emotions


In [11]:
# test_emotions = get_emotions(test_lyrics)
# test_emotions

# 4) Loading credentials

In [12]:
# get credentials for Genius API
genius_token = os.environ.get('LYRICSGENIUS_ACCESS_TOKEN')

# instantiate the Genius class with some useful hyperparameters
genius = Genius(genius_token, 
                timeout=220, 
                remove_section_headers=True,
                skip_non_songs=True)

# 5) Getting full DataFrame ❗️

❗️ Choose **one** option and remove the other: ❗️\
\
**A)** Regular Processing\
**B)** Parallel Multiprocessing

### A) Regular Processing ❌

In [13]:
'''
# create df containing lyrics, sentiments and emotions
def get_full_dataframe(df, genius):
    """
    Returns a DataFrame adding lyrics, sentiments and emotions.
    """
    # add lyrics to df
    df['lyrics'] = ''
    df['sentiments'] = ''
    df['emotions'] = ''
    
    for index in range(len(df)):
        #start = time.time()
        lyrics = get_lyrics(df['artists'][index], df['name'][index], genius)
        #end = time.time()
        #print(f"get_lyrics {end - start} s")
        
        #start = time.time()
        df['lyrics'][index] = df['lyrics'][index].replace('', str(lyrics))
        #end = time.time()
        #print(f"saving lyrics {end - start} s")

        if lyrics != 'None':
            #start = time.time()
            sentiments = get_sentiments(lyrics)         
            df['sentiments'][index] = df['sentiments'][index].replace('', str(sentiments))
            #end = time.time()
            #print(f"get_sentiments {end - start} s")

            #start = time.time()
            emotions = get_emotions(lyrics)         
            df['emotions'][index] = df['emotions'][index].replace('', str(emotions))
            #end = time.time()
            #print(f"get_emotions {end - start} s")
            
        else:
            df['sentiments'][index] = df['sentiments'][index].replace('', 'None')
            df['emotions'][index] = df['emotions'][index].replace('', 'None')
 
    # transform sentiments and emotions from string to dict    
    df['sentiments'] = df['sentiments'].apply(lambda x: ast.literal_eval(x))
    df['emotions'] = df['emotions'].apply(lambda x: ast.literal_eval(x))

    # create a dataframe with each sentiment in separate columns
    sentiments_df = df['sentiments'].apply(pd.Series)
    sentiments_df = sentiments_df.fillna('None')
    df = pd.concat([df, sentiments_df], axis=1)
    

    # create a dataframe with each emotion in separate columns
    emotions_df = df['emotions'].apply(pd.Series)
    emotions_df = emotions_df.fillna('None')
    df = pd.concat([df, emotions_df], axis=1)
    
    return df
'''

'\n# create df containing lyrics, sentiments and emotions\ndef get_full_dataframe(df, genius):\n    """\n    Returns a DataFrame adding lyrics, sentiments and emotions.\n    """\n    # add lyrics to df\n    df[\'lyrics\'] = \'\'\n    df[\'sentiments\'] = \'\'\n    df[\'emotions\'] = \'\'\n    \n    for index in range(len(df)):\n        #start = time.time()\n        lyrics = get_lyrics(df[\'artists\'][index], df[\'name\'][index], genius)\n        #end = time.time()\n        #print(f"get_lyrics {end - start} s")\n        \n        #start = time.time()\n        df[\'lyrics\'][index] = df[\'lyrics\'][index].replace(\'\', str(lyrics))\n        #end = time.time()\n        #print(f"saving lyrics {end - start} s")\n\n        if lyrics != \'None\':\n            #start = time.time()\n            sentiments = get_sentiments(lyrics)         \n            df[\'sentiments\'][index] = df[\'sentiments\'][index].replace(\'\', str(sentiments))\n            #end = time.time()\n            #print(f"get_se

In [14]:
# get full dataframe
#test_df = get_full_dataframe(data_sample, genius)

In [15]:
#test_df.head(5)

### B) Parallel Multiprocessing ✅

In [16]:
def get_full_song(artist, song_title, genius):

    lyrics = get_lyrics(artist, song_title, genius)

    if lyrics != None:
        sentiments = get_sentiments(lyrics)
        emotions = get_emotions(lyrics)         

    else:
        sentiments = None
        emotions = None
    
    return lyrics, sentiments, emotions

In [17]:
# create df containing lyrics, sentiments and emotions
def get_full_dataframe_parallel(df, genius):
    """
    Returns a DataFrame adding lyrics, sentiments and emotions.
    """
    full_songs = Parallel(n_jobs=os.cpu_count())\
        (delayed(get_full_song)(artist, song_title, genius)\
        for artist, song_title in zip(df['artists'], df['name']))
    
    full_songs = np.array(full_songs)
    
    df['lyrics'], df['sentiments'], df['emotions'] = full_songs[:,0], full_songs[:,1], full_songs[:,2]

    # create a dataframe with each sentiment in separate columns
    sentiments_df = df['sentiments'].apply(pd.Series)
    sentiments_df = sentiments_df.fillna('None')
    df = pd.concat([df, sentiments_df], axis=1)
    
    # create a dataframe with each emotion in separate columns
    emotions_df = df['emotions'].apply(pd.Series)
    emotions_df = emotions_df.fillna('None')
    df = pd.concat([df, emotions_df], axis=1)
    
    return df

In [18]:
start = time.time()
test_result = get_full_dataframe_parallel(data_sample, genius)
end = time.time()

[nltk_data] Downloading package stopwords to /Users/f/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/f/nltk_data...
[nltk_data] Downloading package stopwords to /Users/f/nltk_data...
[nltk_data] Downloading package stopwords to /Users/f/nltk_data...
[nltk_data] Downloading package stopwords to /Users/f/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/f/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/f/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/f/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/f/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/f/nltk_data...
[nltk_data]   Package punkt 

Searching for "raised on it" by sam hunt...
Searching for "drifter  2015 remaster" by iron maiden...
Searching for "99 problems" by jayz...
Searching for "close your eyes" by parmalee...
No results found for: 'drifter  2015 remaster iron maiden'
Searching for "soultana maurofora" by markos vamvakaris apostolos xatzixristos...
Done.
Done.
Done.
No results found for: 'soultana maurofora markos vamvakaris apostolos xatzixristos'
Searching for "just because" by frankie yankovic...
No results found for: 'just because frankie yankovic'
Searching for "a girl in the night" by ray price...
Done.
Searching for "look what youve done" by drake...
Done.
Searching for "wild for the night feat skrillex & birdy nam nam" by a$ap rocky skrillex birdy nam nam lord flacko...
Searching for "painkiller" by ruel...
Done.
Searching for "available" by frank sinatra...
Done.
Done.
Searching for "voices" by disturbed...
Done.
Searching for "so icy feat young jeezy" by gucci mane jeezy...
Searching for "gone too 

In [19]:
print(f"get_full_dataframe_parallel for {len(test_result)} samples: {(end - start)/60} m")

get_full_dataframe_parallel for 100 samples: 1.4295635024706523 m


In [21]:
test_result.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,emotions,neg,neu,pos,compound,happy,angry,surprise,sad,fear
0,0.817,2013,0.0158,['Parmalee'],0.551,214933,0.863,0,3Bdqlr7jQLNhITAgcBGQBG,0.0,...,"{'happy': 0.24, 'angry': 0.07, 'surprise': 0.2...",0.0,0.854,0.146,0.9939,0.24,0.07,0.28,0.34,0.07
1,0.548,2003,0.00661,['JAY-Z'],0.494,234627,0.887,1,7sLpSWxQazJzDVG6YGzlVs,0.0,...,"{'happy': 0.05, 'angry': 0.13, 'surprise': 0.2...",0.269,0.633,0.098,-0.9995,0.05,0.13,0.23,0.14,0.45
2,0.732,2014,0.0477,['Sam Hunt'],0.59,235507,0.94,0,3BuPop8SzLG2Q88TJcFAjp,0.0,...,"{'happy': 0.09, 'angry': 0.13, 'surprise': 0.1...",0.06,0.824,0.116,0.9786,0.09,0.13,0.15,0.31,0.32
3,0.475,1981,0.000473,['Iron Maiden'],0.34,288947,0.974,0,7EvjTEzuv7TWaIaWY63sWV,0.0928,...,,,,,,,,,,
4,0.55,1930,0.994,"['Markos Vamvakaris', 'Apostolos Xatzixristos']",0.41,197653,0.169,0,38PozVGXXoeO8dTEVzy74Y,0.901,...,,,,,,,,,,


# 6) Saving DataFrame to CSV

In [20]:
# save dataframe to csv
test_df.to_csv('../processed_data/processed_test_data.csv')

NameError: name 'test_df' is not defined