In [2]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv, find_dotenv
import unidecode
from lyricsgenius import Genius
from difflib import SequenceMatcher
from joblib import Parallel, delayed
import time
from googletrans import Translator

In [3]:
env_path = find_dotenv()
load_dotenv(env_path)

True

In [4]:
# loading the dataset into a dataframe
data = pd.read_csv('../raw_data/data.csv')

# create a sample to work with a smaller dataset throughout development
data_sample =
# Fede [:50000]
# Yann [50001:100000]
# Adri [100001:]

# get credentials for Genius API
genius_token = os.environ.get('LYRICSGENIUS_ACCESS_TOKEN')

# instantiate the Genius class with some useful hyperparameters
genius = Genius(genius_token, 
                timeout=220, 
                remove_section_headers=True,
                skip_non_songs=True)

In [5]:
data[:50000]

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.9820,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878000,10,0.6650,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.9630,1921,0.7320,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.000000,7,0.1600,-12.441,1,Clancy Lowered the Boom,5,1921,0.4150,60.936
2,0.0394,1921,0.9610,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913000,3,0.1010,-14.850,1,Gati Bali,5,1921,0.0339,110.339
3,0.1650,1921,0.9670,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,0.000028,5,0.3810,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.2530,1921,0.9570,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,0.000002,3,0.2290,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.0380,101.665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.1280,1983,0.7640,['Amy Grant'],0.273,224747,0.206,0,7a8kBco1oyGHthxCeSll9P,0.000000,2,0.1400,-9.348,1,Heirlooms,35,1983,0.0299,173.546
49996,0.3910,1983,0.8930,['Amy Grant'],0.659,99373,0.189,0,5YLmwVmax8Ptix6qcJXJX0,0.767000,2,0.0837,-10.064,1,Preiset Dem Konig (Praise The King),35,1983,0.0315,111.758
49997,0.4550,1983,0.0745,['Violent Femmes'],0.367,332667,0.428,0,0BGwIQYp5vtd64sWdwo1Sa,0.000784,7,0.1010,-9.996,1,Confessions,37,1983-04-13,0.0712,79.121
49998,0.7380,1983,0.1120,['Bob Marley & The Wailers'],0.669,205693,0.268,0,1eN3VT6F8JNkjF9b4AMhzd,0.000024,9,0.1250,-12.571,0,Stiff Necked Fools,42,1983,0.2610,143.362


In [4]:
# clean strings
def clean_text(text):
    """
    Removes symbols, accents and uppercases from text.
    """
    text = text.replace("[",'')\
        .replace("]",'')\
        .replace("'",'')\
        .replace(",",'')\
        .replace(":",'')\
        .replace(")",'')\
        .replace("(",'')\
        .replace(".",'')\
        .replace('"','')\
        .replace('/','')\
        .replace("\\",'')\
        .replace("(?)",'')\
        .replace("-",'')
        
    return unidecode.unidecode(text.lower())

In [5]:
# get lyrics
def get_lyrics(artist, song_title, genius):
    """
    Returns lyrics from Genius API.
    """
    # cleanup the inputs
    artist = clean_text(artist)
    song_title = clean_text(song_title)
    
    # call the API to search for the song
    song = genius.search_song(title=song_title, artist=artist)

    if song != None:
        # get song name from the API response to compare similarity
        api_response = unidecode.unidecode(song.to_dict()['full_title']\
            .replace('\xa0', ' ')\
            .replace('\u200b', ' ')\
            .lower())
        api_response = api_response.split(' by ')
        # check similarity
        song_similar = similar(api_response[0], song_title)
        # artist_similar = similar(api_response[1], artist)

        if song_similar >= 0.9:
            # all lyrics responses come with the song's title and 'Lyrics' str
            # so we count how many characters should be removed in order 
            # to delete any extra text
            characters_to_remove = len(song.to_dict()['title'] + ' Lyrics')   
            
            # get lyrics from API
            lyrics = genius.lyrics(song.to_dict()['id'])[characters_to_remove:-5]\
                .replace('\n', ' ')\
                .replace('\u205f', ' ')\
                .replace('\u2005', ' ')\
                .replace('\\', ' ')\
                .strip()

        else:
            lyrics = 'None'
    
    else:
        lyrics = 'None'
    
    return lyrics


# we need to check similarity between the input of the API call and the response
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [6]:
# create df containing lyrics, sentiments and emotions
def get_lyrics_df(df, genius):
    """
    Returns a DataFrame adding lyrics, sentiments and emotions.
    """
    full_songs = Parallel(n_jobs=os.cpu_count())\
        (delayed(get_lyrics)(artist, song_title, genius)\
        for artist, song_title in zip(df['artists'], df['name']))
    
    df['lyrics'] = full_songs
    
    return df

In [7]:
#start = time.time()
data_lyrics = get_lyrics_df( , genius)
#end = time.time()
#print(f"get_lyrics_df for {len(data_sample)} samples: {(end - start)/60} m")

Searching for "99 problems" by jayz...
Searching for "drifter  2015 remaster" by iron maiden...
Searching for "close your eyes" by parmalee...
Searching for "raised on it" by sam hunt...
No results found for: 'drifter  2015 remaster iron maiden'
Searching for "soultana maurofora" by markos vamvakaris apostolos xatzixristos...
Done.
Done.
No results found for: 'soultana maurofora markos vamvakaris apostolos xatzixristos'
Searching for "just because" by frankie yankovic...
Done.
No results found for: 'just because frankie yankovic'
Searching for "a girl in the night" by ray price...
Searching for "look what youve done" by drake...
Done.
Done.
Searching for "wild for the night feat skrillex & birdy nam nam" by a$ap rocky skrillex birdy nam nam lord flacko...
Searching for "painkiller" by ruel...
Done.
Searching for "available" by frank sinatra...
Done.
Searching for "voices" by disturbed...
Done.
Searching for "so icy feat young jeezy" by gucci mane jeezy...
Searching for "gone too soon" 

In [12]:
# save dataframe to csv
data_lyrics.to_csv('../processed_data/data_lyrics.csv', mode='a')

In [17]:
#def get_translation(lyric):
#    translator = Translator()
#    lyrics_trans = translator.translate(lyric, dest='en').text
    
 #   return lyrics_trans

In [18]:
# create df containing lyrics, sentiments and emotions
#def get_translations_df(df):
#    """
#    Returns a DataFrame adding lyrics, sentiments and emotions.
#    """
#    
#    full_songs = Parallel(n_jobs=os.cpu_count())\
#        (delayed(get_translation)(lyric)\
#        for lyric in df['lyrics'])
#
#    df['lyrics_trans'] = full_songs
#    
#    return df

In [19]:
#start = time.time()
#data_translated_10k = get_translations_df(data_lyrics_10k)
#end = time.time()
#print(f"get_translations_df for {len(data_sample_10k)} samples: {(end - start)/60} m")

TypeError: the JSON object must be str, bytes or bytearray, not NoneType

In [None]:
# save dataframe to csv
#data_translated_10k.to_csv('../processed_data/data_translated_10k.csv')