In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv, find_dotenv
import unidecode
from lyricsgenius import Genius
from difflib import SequenceMatcher
from joblib import Parallel, delayed
import time

In [2]:
env_path = find_dotenv()
load_dotenv(env_path)

True

In [3]:
# loading the dataset into a dataframe
data = pd.read_csv('../raw_data/data.csv')

# create a sample to work with a smaller dataset throughout development
data_sample = data[30001:40000]
# Fede [:50000]
# Yann [50001:100000]
# Adri [100001:]

# get credentials for Genius API
genius_token = os.environ.get('LYRICSGENIUS_ACCESS_TOKEN')

# instantiate the Genius class with some useful hyperparameters
genius = Genius(genius_token, 
                timeout=220, 
                remove_section_headers=True,
                skip_non_songs=True)

In [4]:
# clean strings
def clean_text(text):
    """
    Removes symbols, accents and uppercases from text.
    """
    text = text.replace("[",'')\
        .replace("]",'')\
        .replace("'",'')\
        .replace(",",'')\
        .replace(":",'')\
        .replace(")",'')\
        .replace("(",'')\
        .replace(".",'')\
        .replace('"','')\
        .replace('/','')\
        .replace("\\",'')\
        .replace("(?)",'')\
        .replace("-",'')
        
    return unidecode.unidecode(text.lower())

In [5]:
# get lyrics
def get_lyrics(artist, song_title, genius):
    """
    Returns lyrics from Genius API.
    """
    # cleanup the inputs
    artist = clean_text(artist)
    song_title = clean_text(song_title)
    
    # call the API to search for the song
    song = genius.search_song(title=song_title, artist=artist)

    if song != None:
        # get song name from the API response to compare similarity
        api_response = unidecode.unidecode(song.to_dict()['full_title']\
            .replace('\xa0', ' ')\
            .replace('\u200b', ' ')\
            .lower())
        api_response = api_response.split(' by ')
        # check similarity
        song_similar = similar(api_response[0], song_title)
        # artist_similar = similar(api_response[1], artist)

        if song_similar >= 0.9:
            # all lyrics responses come with the song's title and 'Lyrics' str
            # so we count how many characters should be removed in order 
            # to delete any extra text
            characters_to_remove = len(song.to_dict()['title'] + ' Lyrics')   
            
            # get lyrics from API
            lyrics = genius.lyrics(song.to_dict()['id'])[characters_to_remove:-5]\
                .replace('\n', ' ')\
                .replace('\u205f', ' ')\
                .replace('\u2005', ' ')\
                .replace('\\', ' ')\
                .strip()

        else:
            lyrics = 'None'
    
    else:
        lyrics = 'None'
    
    return lyrics


# we need to check similarity between the input of the API call and the response
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [6]:
# create df containing lyrics, sentiments and emotions
def get_lyrics_df(df, genius):
    """
    Returns a DataFrame adding lyrics, sentiments and emotions.
    """
    full_songs = Parallel(n_jobs=os.cpu_count())\
        (delayed(get_lyrics)(artist, song_title, genius)\
        for artist, song_title in zip(df['artists'], df['name']))
    
    df['lyrics'] = full_songs
    
    return df

In [7]:
start = time.time()
data_lyrics = get_lyrics_df(data_sample, genius)
end = time.time()
print(f"get_lyrics_df for {len(data_sample)} samples: {(end - start)/60} m")

Searching for "what a difference youve made in my life" by ronnie milsap...
Searching for "im in you" by peter frampton...
Searching for "poor poor pitiful me" by linda ronstadt...
Searching for "egyptian reggae  live" by jonathan richman & the modern lovers...
No results found for: 'egyptian reggae  live jonathan richman & the modern lovers'
Searching for "venus" by television...
Done.
Done.
Done.
Done.
Searching for "ya me voy para siempre" by vicente fernandez...
Searching for "the heathen" by bob marley & the wailers...
Searching for "motorhead" by motorhead...
Searching for "scenes from an italian restaurant" by billy joel...
Done.
Done.
Searching for "lady love" by lou rawls...
Done.
Done.
Searching for "cheree  2019  remaster" by suicide...
Searching for "more than a woman  from saturday night fever soundtrack" by bee gees...
Done.
Searching for "chanson" by art farmer...
Searching for "bouree" by jethro tull...
Done.
Specified song does not contain lyrics. Rejecting.
Searching 

HTTPError: [Errno 503] 503 Server Error: Service Unavailable for url: https://genius.com/Dead-kennedys-nazi-punks-fuck-off-lyrics

In [None]:
# save dataframe to csv
data_lyrics.to_csv('../processed_data/data_lyrics.csv', mode='a')

In [None]:
# def get_translation(lyric):
#    translator = Translator()
#    lyrics_trans = translator.translate(lyric, dest='en').text
    
#   return lyrics_trans

In [None]:
# create df containing lyrics, sentiments and emotions
#def get_translations_df(df):
#    """
#    Returns a DataFrame adding lyrics, sentiments and emotions.
#    """
#    
#    full_songs = Parallel(n_jobs=os.cpu_count())\
#        (delayed(get_translation)(lyric)\
#        for lyric in df['lyrics'])
#
#    df['lyrics_trans'] = full_songs
#    
#    return df

In [None]:
#start = time.time()
#data_translated_10k = get_translations_df(data_lyrics_10k)
#end = time.time()
#print(f"get_translations_df for {len(data_sample_10k)} samples: {(end - start)/60} m")

In [None]:
# save dataframe to csv
#data_translated_10k.to_csv('../processed_data/data_translated_10k.csv')