In [None]:
import os
import time
import requests 
import numpy as np
import pandas as pd
import lyricsgenius

from ssl import *
from selenium import *
from PyLyrics import *
from lyricwikia import *
from bs4 import BeautifulSoup
from selenium import webdriver
from youtube_search import YoutubeSearch
from selenium.common.exceptions import NoSuchElementException, InvalidArgumentException

# Client used to query the Genius API through lyricsgenius.
# The access token is read from the GENIUS_ACCESS_TOKEN environment variable
# when it is set (and non-empty); the literal below is kept only as a
# backward-compatible fallback so the notebook still runs unchanged.
# NOTE(review): a token committed in a notebook should be treated as leaked --
# consider revoking it and removing the fallback.
genius = lyricsgenius.Genius(
    os.environ.get("GENIUS_ACCESS_TOKEN")
    or "ZsnGoTworxfA9binAE4P5enmkpu5MMOnW9FcNsYtPfl2EjFm2cskLNsuH_sv2wrf"
)
# Library options used for every search made in this notebook.
genius.verbose = False
genius.remove_section_headers = True
genius.skip_non_songs = False
genius.excluded_terms = ["(Remix)", "(Live)"]
genius.timeout = 10

# HTTP headers sent along with the Google queries issued through `requests`.
headers_Get = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# 0. Helper function

In [None]:
def format_string(lyrics):
    """ Normalise raw lyrics so that they look like the lyrics
        already stored in the billboard dataframe: lower case,
        without punctuation and on a single line.
        args:
            lyrics: the string that we want to format
        return:
            the formatted lyrics """

    # Characters replaced by a blank so that the words around them stay apart.
    table = dict.fromkeys("\n()-", " ")
    # Punctuation that is simply removed.
    table.update(dict.fromkeys(",'[]:.\"?!/", ""))
    return lyrics.lower().translate(str.maketrans(table))

In [None]:
def slice_artist(string_to_remove, art):
    """ Return the part of the artist's name located before the
        first stand-alone occurrence of string_to_remove (the name
        is split on single spaces). Handy to drop featuring artists.
        The name is returned untouched when the word is absent."""
    tokens = art.split(" ")
    try:
        cut = tokens.index(string_to_remove)
    except ValueError:
        # The marker word is not one of the tokens: nothing to strip.
        return art
    return " ".join(tokens[:cut])

In [None]:
def get_lyrics_from_txt(df, missing_lyrics_artists, missing_lyrics_songs, manually_extracted=False):
    """ This function reads the txt files, formats the lyrics with
        format_string and adds them to the corresponding rows of the dataframe.
        args:
            df: the df for which lyrics are missing
            missing_lyrics_artists: the artists for which the lyrics are missing
            missing_lyrics_songs: the corresponding songs for which the lyrics are missing
            manually_extracted: whether the lyrics have been extracted by hand
                                (selects the folder the files are read from)
        returns:
            a new dataframe with missing lyrics filled in; df itself is not modified"""

    # Only the folder differs between automatically and manually extracted lyrics.
    folder = "manually_extracted_lyrics" if manually_extracted else "missing_lyrics"
    df_res = df.copy()
    for artist, song in zip(missing_lyrics_artists, missing_lyrics_songs):
        path = "datasets/songs/{}/lyrics_{}_{}.txt".format(folder,
                                                           artist.replace(" ", "_").replace("'", ""),
                                                           song.replace(" ", "_").replace("/", "_"))
        try:
            with open(path, "r") as file:
                lyrics = file.read()
        except FileNotFoundError:
            # No file for this song (yet): leave the entry untouched.
            continue
        # Store the lyrics in the same format as the ones already in the dataframe.
        df_res.loc[(df_res['Artist'] == artist) & (df_res['Song'] == song), 'Lyrics'] = format_string(lyrics)
    return df_res

In [None]:
def google_request(artist, song, feature):
    """ This function queries the feature from Google, given
        an artist and a song, using the library Beautiful Soup.
        args:
            artist: the artist for which to query the feature
            song: the song for which to query the feature
            feature: feature to query
        returns: the stripped text of the first matching result box,
                 or None when none of the known boxes is present """
    from urllib.parse import quote_plus

    # Collapse the whitespace, then percent-encode the query so that characters
    # such as '&', '#' or '+' in a name stay part of the search terms instead
    # of being interpreted as URL syntax.
    terms = " ".join("{} {} {}".format(artist, song, feature).split())
    url = 'https://www.google.com/search?q=' + quote_plus(terms) + '&ie=utf-8&oe=utf-8'

    # The context manager releases the session's connections; the timeout
    # prevents a stalled connection from blocking the notebook forever.
    with requests.Session() as session:
        response = session.get(url, headers=headers_Get, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

    # CSS classes of the <div> elements that are tried, in order of preference.
    for css_class in ('Z0LcW', 'title', 'FLP8od'):
        node = soup.find('div', {'class': css_class})
        if node is not None:
            return node.text.strip()
    return None

In [None]:
def selenium_request(browser, artist, song, feature):
    """ This function queries the feature from Google, given
        an artist and a song, using the library Selenium.
        args:
            browser: the Selenium webdriver used to load the result page
            artist: the artist for which to query the feature
            song: the song for which to query the feature
            feature: feature to query
        returns: the text of the first matching result box, or None when
                 the URL is rejected or no known box is present """
    from urllib.parse import quote_plus

    # Collapse the whitespace, then percent-encode the query so that characters
    # such as '&', '#' or '+' in a name stay part of the search terms instead
    # of being interpreted as URL syntax.
    terms = " ".join("{} {} {}".format(artist, song, feature).split())
    url = 'https://www.google.com/search?q=' + quote_plus(terms) + '&ie=utf-8&oe=utf-8'

    try:
        browser.get(url)
    except InvalidArgumentException:
        return None

    # CSS classes of the elements that are tried, in order of preference.
    for css_class in ("Z0LcW", "title", "FLP8od"):
        try:
            # "class name" is the value of selenium's By.CLASS_NAME; the generic
            # find_element(by, value) is used because the find_element_by_*
            # shortcuts are not available in recent Selenium releases.
            return browser.find_element("class name", css_class).text
        except NoSuchElementException:
            continue
    return None

In [None]:
def request(method, artist, song, feature):
    """ This function performs a request depending on the method
    args:
        method: method for the query, can be 'bs4' or 'selenium'
        artist: the artist for which to query the feature
        song: the song for which to query the feature
        feature: the feature to query (it is lower-cased before querying)
    returns:
        the result of the query
    raises:
        ValueError: if method is neither 'bs4' nor 'selenium' """
    wanted = feature.lower()
    if method == 'selenium':
        browser = webdriver.Safari()
        try:
            return selenium_request(browser, artist, song, wanted)
        finally:
            # Always end the driver session, even when the lookup fails;
            # quit() (unlike close()) shuts down the whole session.
            browser.quit()
    if method == 'bs4':
        return google_request(artist, song, wanted)
    # Previously an unknown method ended in an UnboundLocalError.
    raise ValueError("Unknown method {!r}: expected 'bs4' or 'selenium'".format(method))

In [None]:
def print_line(i, l, p_l, artist, song, feature_found):
    """ Print, in green and on the current console line, the progress
        (i+1 out of l) together with the feature found for the song.
        args:
            i: zero-based position of the song being processed
            l: total number of songs to process
            p_l: length of the previously printed line, used to pad with
                 blanks so that a shorter line overwrites a longer one
            artist: the artist of the song
            song: the title of the song
            feature_found: the value that was found
        returns: the length of the printed line (without padding) """
    template = "\t\033[92m{}/{}\t{}, {}:\t{}\033[92;m"
    line = template.format(i+1, l, artist, song, feature_found)
    padding = " " * (p_l - len(line))
    # The carriage return lets the next call overwrite this line.
    print(line + padding, end="\r")
    return len(line)

In [None]:
def print_line_nf(i, l, p_l, artist, song):
    """ Print, in red and on the current console line, the progress
        (i+1 out of l) for a song whose feature was not found.
        args:
            i: zero-based position of the song being processed
            l: total number of songs to process
            p_l: length of the previously printed line, used to pad with
                 blanks so that a shorter line overwrites a longer one
            artist: the artist of the song
            song: the title of the song
        returns: the length of the printed line (without padding) """
    template = "\t\033[91m{}/{}\t{}, {}\033[91;m"
    line = template.format(i+1, l, artist, song)
    padding = " " * (p_l - len(line))
    # The carriage return lets the next call overwrite this line.
    print(line + padding, end="\r")
    return len(line)

In [None]:
def get_feature(df, feature, method, allow_artist_search=False):
    """ This function gets the missing feature for the given dataframe. It first tries to
        search for the missing feature given the artist and the song and, if it cannot find
        anything and allow_artist_search is set, it searches given the artist only.
        args:
            df: the dataframe for which to fill in the missing feature
            feature: the feature to search (also the name of the column to fill in)
            method: method for the query, can be 'bs4' or 'selenium'
            allow_artist_search: whether it allows to search for a feature given only the artist's name
        returns:
            a new dataframe with the missing values filled in (possibly only partially
            when the user interrupts the process) """

    print("Getting the {}s from \033[1mgoogle\033[1;m using \033[1m{}\033[1;m for:\n".format(feature, method))
    df_res = df.copy()
    while True:
        # Recomputed after every restart so that values already found are not queried again.
        missing = df_res[df_res[feature].isna()]
        missing_artists = missing['Artist'].values
        missing_songs = missing['Song'].values
        total = len(missing_artists)
        try:
            p_l = 0
            for i, (artist, song) in enumerate(zip(missing_artists, missing_songs)):
                feature_found = request(method, artist, song, feature)
                if feature_found is None and allow_artist_search:
                    # Then we try to search using only the artist
                    feature_found = request(method, artist, "", feature)
                if feature_found is None:
                    p_l = print_line_nf(i, total, p_l, artist, song)
                    continue
                p_l = print_line(i, total, p_l, artist, song, feature_found)
                # Write into the requested column (the artist-only fallback used to
                # write into 'Genre' whatever the requested feature was).
                df_res.loc[(df_res['Artist'] == artist) & (df_res['Song'] == song), feature] = feature_found
            return df_res
        except KeyboardInterrupt:
            print("\n\nInterrupted by user.")
            return df_res
        except Exception as error:
            # Best effort: any failure restarts the loop on the values still missing.
            # NOTE(review): a persistent error makes this loop forever; interrupt to stop.
            print("\n\nInternal error. Process will restart. Found values won't be lost.\n")
            print("\tCause: {!r}".format(error))
            continue

# 1. Data preprocessing

### 1.1 Import the dataset

Note that some manual processing was done on the dataset beforehand: for a few entries, two artists and two songs were ranked at the same position in the same year. This is why we get a dataframe with 5118 entries instead of 5100. Also, since the libraries we use are not 100% accurate, this script has been run over many iterations and some manual modifications have been performed to obtain a clean dataset.

In [None]:
if not os.path.exists("datasets/songs/billboard_full.csv"):
    # No enriched dataset saved yet: start from the original file
    # and drop the unnecessary features
    df_billboard = pd.read_csv('datasets/songs/billboard_lyrics_1964-2015.csv', sep=';')
    df_billboard = df_billboard.drop(columns=['Source'])
else:
    # Resume from the dataset saved at the end of this notebook
    df_billboard = pd.read_csv('datasets/songs/billboard_full.csv', sep=',', index_col=0)
# Keep only the main artist's name
df_billboard['Artist'] = df_billboard['Artist'].apply(lambda name: slice_artist("featuring", name))
# Replace the whitespace-only values by NaN values
df_billboard['Lyrics'] = df_billboard['Lyrics'].apply(
    lambda text: float('nan') if str(text).isspace() else text)
df_billboard.head(5)

### 1.2 Handle NaN values

#### 1.2.1 Getting lyrics with lyricsgenius

We are going to use the library lyricsgenius to fill in the lyrics. You can find more information [here](https://pypi.org/project/lyricsgenius/). This library exports the lyrics as a .txt file. After that, we open the corresponding files, format the text so that it matches the format of the lyrics that are already in the dataframe, and fill in the missing values in the dataframe.

In [None]:
# Entries whose lyrics are missing, and the artist/song pairs to look up
df_nan = df_billboard.loc[df_billboard['Lyrics'].isna()]
print("We have {} entries for which the lyrics are not present.".format(df_nan.shape[0]))
missing_lyrics_artists, missing_lyrics_songs = df_nan['Artist'].values, df_nan['Song'].values

In [None]:
# We get the missing values from the library lyricsgenius
print("Getting the missing lyrics from \033[1mlyricsgenius\033[1;m for:\n")
lyrics_dir = "datasets/songs/missing_lyrics"
# Create the folder the files are moved into (a "missing_lyrics" folder used to
# be created in the working directory, which is not where the files end up).
os.makedirs(lyrics_dir, exist_ok=True)
done = False
while not done:
    try:
        for artist, song in zip(missing_lyrics_artists, missing_lyrics_songs):
            file_name = "lyrics_{}_{}.txt".format(artist.replace(" ", "_").replace("'", ""),
                                                  song.replace(" ", "_").replace("/", "_"))
            target = "{}/{}".format(lyrics_dir, file_name)
            if os.path.isfile(target):
                # Already downloaded during a previous run or before a restart.
                continue
            # search_song expects the title first, then the artist.
            x = genius.search_song(song, artist)
            if x is None:
                print("\t\033[91m{}, {}\033[91;m".format(artist, song))
                continue
            print("\t\033[92m{}, {}\033[92;m".format(artist, song))
            # The file is written in the working directory, then moved.
            x.save_lyrics(filename=file_name, extension='txt', full_data=False,
                          verbose=False, overwrite=False)
            try:
                os.rename(file_name, target)
            except FileNotFoundError:
                # Nothing was saved under the expected name: skip this song.
                pass
        done = True
    except KeyboardInterrupt:
        # Let the user stop the download (a bare except used to restart the loop).
        print("\n\nInterrupted by user.")
        break
    except Exception:
        # Best effort: restart, the files already saved are skipped.
        print("\nInternal error. Process will restart. Saved files won't be lost.\n")
        continue
print("\nDone")

#### 1.2.2 Getting lyrics with lyricwikia

For the remaining missing lyrics, we try with another library called lyricwikia. More information about this library can be found [here](https://pypi.org/project/lyricwikia/).

In [None]:
# Load the lyrics saved as text files into the dataframe
df_billboard = get_lyrics_from_txt(df=df_billboard,
                                   missing_lyrics_artists=missing_lyrics_artists,
                                   missing_lyrics_songs=missing_lyrics_songs)
# Work out which entries still have no lyrics
df_nan = df_billboard.loc[df_billboard['Lyrics'].isna()]
print("We have {} entries for which the lyrics are not present.".format(df_nan.shape[0]))
missing_lyrics_artists, missing_lyrics_songs = df_nan['Artist'].values, df_nan['Song'].values

In [None]:
# We get the missing values from the library lyricwikia
print("Getting the missing lyrics from \033[1mlyricwikia\033[1;m for:\n")
lyrics_dir = "datasets/songs/missing_lyrics"
# Make sure the destination folder exists before writing into it.
os.makedirs(lyrics_dir, exist_ok=True)
for artist, song in zip(missing_lyrics_artists, missing_lyrics_songs):
    try:
        lyrics = get_lyrics(artist, song)
    except LyricsNotFound:
        print("\t\033[91m{}, {}\033[91;m".format(artist, song))
        continue
    print("\t\033[92m{}, {}\033[92;m".format(artist, song))
    file_name = "lyrics_{}_{}.txt".format(artist.replace(" ", "_").replace("'", ""),
                                          song.replace(" ", "_").replace("/", "_"))
    # Write straight to the final location (no stray file is left in the working
    # directory if a move fails); the with-block closes the file.
    with open("{}/{}".format(lyrics_dir, file_name), "w") as file:
        file.write(lyrics)
print("\nDone")

#### 1.2.3 Getting lyrics by hand

For the remaining missing lyrics, we need to fill them in by hand, which is not very hard since most of them are instrumental.

In [None]:
# Add the lyrics that were just saved as text files to the dataframe
df_billboard = get_lyrics_from_txt(df_billboard,
                                   missing_lyrics_artists,
                                   missing_lyrics_songs)
# List the entries that are still without lyrics
still_missing = df_billboard['Lyrics'].isna()
df_nan = df_billboard[still_missing]
del still_missing
print("We have {} entries for which the lyrics are not present.".format(len(df_nan)))
missing_lyrics_songs = df_nan['Song'].values
missing_lyrics_artists = df_nan['Artist'].values

In [None]:
# Create one empty text file per song still without lyrics; the files are
# then filled in by hand. Files that already exist are left untouched.
for artist, song in zip(missing_lyrics_artists, missing_lyrics_songs):
    path = "datasets/songs/manually_extracted_lyrics/lyrics_{}_{}.txt".format(
        artist.replace(" ", "_").replace("'", ""),
        song.replace(" ", "_").replace("/", "_"))
    if os.path.isfile(path):
        continue
    with open(path, "w") as file:
        pass

In [None]:
# Read back the lyrics that were filled in by hand
df_billboard = get_lyrics_from_txt(df_billboard,
                                   missing_lyrics_artists,
                                   missing_lyrics_songs,
                                   manually_extracted=True)
# Count what is left without lyrics
df_nan = df_billboard.loc[df_billboard['Lyrics'].isna()]
print("We have {} entries for which the lyrics are not present.".format(df_nan.shape[0]))

### 1.3 Add genre feature

#### 1.3.1 Using web scraping from Google

In [None]:
# Create an empty Genre column unless the loaded dataset already has one
if 'Genre' not in df_billboard.columns:
    df_billboard['Genre'] = np.nan

In [None]:
# First pass for the missing genres: plain HTTP requests parsed with Beautiful Soup
df_billboard = get_feature(df=df_billboard, feature="Genre", method="bs4",
                           allow_artist_search=True)

In [None]:
# Second pass for the genres still missing: a real browser driven by Selenium
df_billboard = get_feature(df=df_billboard, feature="Genre", method="selenium",
                           allow_artist_search=True)

In [None]:
# Entries that still have no genre after the automatic searches
df_genre_nan = df_billboard.loc[df_billboard['Genre'].isna()]
print("We have {} entries for which the genre is not present.".format(df_genre_nan.shape[0]))

#### For the remaining genres, we query them by hand

In [None]:
# Genres that were looked up by hand, one row per artist/song
df_missing_genres = pd.read_csv("datasets/songs/missing_genres/missing_genres.csv",
                                sep=';', index_col=0)
for _, row in df_genre_nan.iterrows():
    artist = row['Artist']
    song = row['Song']
    manual_genres = df_missing_genres.loc[(df_missing_genres['Artist'] == artist) &
                                          (df_missing_genres['Song'] == song), 'Genre'].values
    if len(manual_genres) == 0:
        # Not in the hand-made file: leave the value missing so that the
        # count displayed afterwards reports it, instead of failing here.
        continue
    # Assign a scalar (the first match) so that the assignment also works when
    # several rows of the dataframe share this artist and song.
    df_billboard.loc[(df_billboard['Artist'] == artist) & (df_billboard['Song'] == song), 'Genre'] = manual_genres[0]

In [None]:
# Check what is left without a genre once the hand-made values are merged
genre_is_missing = df_billboard['Genre'].isna()
df_genre_nan = df_billboard[genre_is_missing]
del genre_is_missing
print("We have {} entries for which the genre is not present.".format(len(df_genre_nan)))

In [None]:
# Clean the genre strings in a single pass: lower case, '-' and '/' turned into
# blanks, then a few German words and the "rnb " spelling are rewritten.
# The replacements are applied in the same order as before.
df_billboard['Genre'] = df_billboard['Genre'].apply(
    lambda genre: genre.lower()
                       .replace("-", " ")
                       .replace("/", " ")
                       .replace("zeitgenössischer", "contemporary")
                       .replace("klassischer", "classic")
                       .replace("rnb ", "r&b"))
# Exact-match rewrites applied to the cleaned genre strings: each key is a
# value as it appears after the clean-up above, each value the genre that
# replaces it. Keys that look like song titles or artist names are search
# results that were not genres.
# NOTE(review): "undergrund rap" looks like a typo for "underground rap";
# left unchanged in case later code matches on it -- confirm.
map_genre = {
    "country musik": "country",
    "klassische musik": "classic music",
    "elektropop": "electro",
    "christliche musicgospel": "gospel",
    "elektronische tanzmusik": "electro",
    "elektronische popmusik": "electro",
    "urban adult contemporary": "adult contemporary",
    "rock ’n’ roll": "rock",
    "progressive rock artrock": "rock",
    "untergrund rap": "undergrund rap",
    "rock aus spanien": "rock",
    "midwest rap": "rap",
    "southern rock": "rock",
    "musik": "music",
    "christliche popmusik": "pop",
    "indie folk indie pop rock": "indie folk",
    "arena rock": "rock",
    "dance électronique": "electro",
    "dirty rap": "rap",
    "popfolk": "pop",
    "euro disco": "disco",
    "eastcoast hip hop": "hip hop",
    "westcoast hip hop": "hip hop",
    "kindermusik": "children music",
    "synthiepop": "pop",
    "traditionelle popmusik": "pop",
    "bcontemporain": "r&b",
    "beatmusik": "beat music",
    "moderne country": "country",
    "phillysound": "philadelphia soul",
    "filmmusik": "soundtrack",
    "rockabilly": "rock",
    "volksmusik": "folk",
    "dutch house": "house",
    "my whole world ended (the moment you left me)": "r&b soul",
    "actionfilm": "soundtrack",
    "artrock": "rock",
    "vokal jazz": "jazz",
    "elton john": "rock",
    "young, wild & free": "pop rap",
    "jazzfunk": "jazz funk",
    "electronica": "electro",
    "hip house": "house",
    "liebesfilm": "soundtrack",
    "deep house": "house",
    "regionale mexikanische musik": "regional mexican music",
    "country music": "country",
    "rootsmusik": "roots",
    "musicder romantik": "romantic",
    "early morning love": "r&b",
    "tanzmusik": "dance",
    "italo disco": "disco",
    "saisonale musik": "season music",
    "rockmusik": "rock",
    "christliche musik gospel": "gospel",
}
# Rewrite the genres listed in map_genre; any other value is kept as it is
df_billboard['Genre'] = df_billboard['Genre'].apply(lambda genre: map_genre.get(genre, genre))

### 1.4 Add album feature

#### 1.4.1 Using web scraping from Google

In [None]:
# Create an empty Album column unless the loaded dataset already has one
if 'Album' not in df_billboard.columns:
    df_billboard['Album'] = np.nan

In [None]:
# First pass for the missing albums: plain HTTP requests parsed with Beautiful Soup
df_billboard = get_feature(df=df_billboard, feature="Album", method="bs4")

In [None]:
# Second pass for the albums still missing: a real browser driven by Selenium
df_billboard = get_feature(df=df_billboard, feature="Album", method="selenium")

In [None]:
# Entries that still have no album after the automatic searches
df_album_nan = df_billboard.loc[df_billboard['Album'].isna()]
print("We have {} entries for which the album is not present.".format(df_album_nan.shape[0]))

For all the remaining missing albums, we will simply use the title of the song.

In [None]:
# Fall back on the song title for every artist/song pair without an album
for artist, song in zip(df_album_nan['Artist'], df_album_nan['Song']):
    same_entry = (df_billboard['Artist'] == artist) & (df_billboard['Song'] == song)
    df_billboard.loc[same_entry, 'Album'] = song

In [None]:
# Check that no entry is left without an album
album_is_missing = df_billboard['Album'].isna()
df_album_nan = df_billboard[album_is_missing]
del album_is_missing
print("We have {} entries for which the album is not present.".format(len(df_album_nan)))

### 1.5 Add Youtube video link

In [None]:
# Create an empty Youtube column unless the loaded dataset already has one
if 'Youtube' not in df_billboard.columns:
    df_billboard['Youtube'] = np.nan

In [None]:
# Look up a Youtube link for every entry that does not have one yet
youtube_nan = df_billboard[df_billboard['Youtube'].isna()]
total = len(youtube_nan)
# `position` counts the processed rows; the dataframe index label is not a
# position, so it cannot be used for the "x/total" progress display.
for position, (_, row) in enumerate(youtube_nan.iterrows(), start=1):
    try:
        print("{}/{}".format(position, total), end='\r')
        artist = row['Artist']
        song = row['Song']
        results = YoutubeSearch('{} {}'.format(artist, song), max_results=1)
        if(len(results.videos) > 0):
            url = "https://www.youtube.com" + results.videos[0]['link']
            df_billboard.loc[(df_billboard['Artist'] == artist) & (df_billboard['Song'] == song), 'Youtube'] = url
    except KeyboardInterrupt:
        # Stop on user request; the links found so far are kept.
        break
    except Exception:
        # Best effort: a failed search leaves the link missing for this entry.
        continue

In [None]:
# Entries for which no Youtube link could be found
youtube_nan = df_billboard.loc[df_billboard['Youtube'].isna()]
print("We have {} entries for which the Youtube link is not present.".format(youtube_nan.shape[0]))

We will save the dataframe and fill in the missing values by hand.

### 1.6 Save the dataframe

In [None]:
# Persist the enriched dataframe; this is the file loaded first when the notebook is re-run
df_billboard.to_csv(path_or_buf="datasets/songs/billboard_full.csv", sep=",")