In [1]:
import pandas as pd
import numpy as np

import base64
import os
import time

from requests import get
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

import discogs_client

from utils import lower_and_remove_diacritics, check_row_similariy, similar

In [2]:
def scrape_tekstovinet(artist, diskografija_artist_name, url_teksovinet):
    """Scrape every song linked from an artist page on tekstovi.net into a CSV.

    Parameters
    ----------
    artist : str
        Artist name; stored in the ``Artist`` column and used in the output
        file name ``data/{artist}_tekstovinet.csv``.
    diskografija_artist_name : str
        Stored verbatim in the ``Artist_diskografija`` column.
    url_teksovinet : str
        URL of the artist's song list on tekstovi.net (change it to change
        the artist).

    Returns
    -------
    None
        The collected rows are written to ``data/{artist}_tekstovinet.csv``.
    """
    column_names = ["Song_ID", "Artist", "Artist_diskografija", "Song_tekstovinet", "Views_tekstovinet", "Lyrics_tekstovinet", "Url_tekstovinet"]

    # Random per-run artist ID. '_' is the separator used inside Song_ID, so
    # regenerate in case the encoded ID ever contains one.
    artist_ID = base64.b64encode(os.urandom(6)).decode('ascii')
    while '_' in artist_ID:
        artist_ID = base64.b64encode(os.urandom(6)).decode('ascii')

    delay = 2  # seconds to wait between song-page requests

    response = get(url_teksovinet)
    listing_soup = BeautifulSoup(response.text, 'html.parser')
    songs = listing_soup.find_all('p', class_='artLyrList')

    # Collect plain rows and build the frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
    # row by row is quadratic anyway.
    records = []
    for index, song in enumerate(songs):
        link = song.find("a")
        song_url = link.get("href") if link is not None else None
        if song_url is None:
            # List entry without a usable link; nothing to fetch.
            continue
        song_name = link.text

        url = 'https://tekstovi.net/'+song_url

        response = get(url)
        song_soup = BeautifulSoup(response.text, 'html.parser')

        lyrics = " ".join(p.text for p in song_soup.find_all('p', class_='lyric'))
        song_ID = artist_ID + '_' + str(1) + '_' + str(index)

        # The view counter is optional: a page without it (or without digits
        # in it) yields NaN instead of aborting the whole scrape.
        views_tag = song_soup.find('p', class_='lyric_impressions')
        digits = "".join(c for c in views_tag.text if c.isdigit()) if views_tag is not None else ""
        views_count = int(digits) if digits else np.nan

        records.append([song_ID, artist, diskografija_artist_name, song_name, views_count, lyrics, url])

        print("Lyrics successfully written to file for : " + song_name)

        time.sleep(delay)

    df = pd.DataFrame(records, columns=column_names)
    df.to_csv(f'data/{artist}_tekstovinet.csv', index=False)

    


In [3]:
def search_google(search_query):
    """Submit ``search_query`` through the Google search box of the current page.

    Operates on the module-level ``driver``: the input named ``q`` is cleared,
    the query is typed and submitted with RETURN, then the function waits
    three seconds.
    """
    box_locator = (By.NAME, 'q')
    driver.find_element(*box_locator).clear()
    query_field = driver.find_element(*box_locator)
    for keystrokes in (search_query, Keys.RETURN):
        query_field.send_keys(keystrokes)
    time.sleep(3)

In [4]:
def get_lyricstranslate():
    """Open the lyricstranslate.com hit on the current results page and scrape it.

    Uses the module-level ``driver``. The first link whose text contains
    'lyricstranslate.com' is opened in a new tab (Ctrl+Enter), the page is
    scraped, and the tab is always closed again with focus returned to the
    first window, even if scraping fails unexpectedly.

    Returns
    -------
    tuple
        ``(writer, original_title, eng_title, album_name, original_lyrics,
        eng_lyrics, song_url)``. Every piece that cannot be located on the
        page is ``np.nan``.
    """
    driver.find_element(By.PARTIAL_LINK_TEXT, 'lyricstranslate.com').send_keys(Keys.CONTROL + Keys.RETURN)
    time.sleep(0.5)
    window_handles = driver.window_handles
    driver.switch_to.window(window_name=window_handles[1])

    try:
        time.sleep(1)

        # Best-effort clicks: consent banner, English translation, and the
        # toggle that reveals the original lyrics. Each may be absent.
        optional_clicks = [
            (By.XPATH, "//*[contains(text(), 'AGREE')]"),
            (By.PARTIAL_LINK_TEXT, 'English'),
            (By.PARTIAL_LINK_TEXT, 'Click to see the original lyrics'),
        ]
        for by, locator in optional_clicks:
            try:
                driver.find_element(by, locator).click()
            except Exception:
                pass
            time.sleep(0.5)

        # Original-language title and lyrics.
        original_lyrics_raw = None
        try:
            original_lyrics_raw = driver.find_element(By.CLASS_NAME, 'song-node-text')
            original_title = original_lyrics_raw.find_element(By.CLASS_NAME, 'title-h2').text
            paragraphs = original_lyrics_raw.find_elements(By.CLASS_NAME, 'par')
            original_lyrics = ". ".join([p.text for p in paragraphs]).replace("*", "")
        except Exception:
            print("Couldn't find original lyrics")
            original_title = np.nan
            original_lyrics = np.nan

        # English title and lyrics.
        try:
            eng_lyrics_raw = driver.find_element(By.CLASS_NAME, 'translate-node-text')
            eng_title = eng_lyrics_raw.find_element(By.CLASS_NAME, 'title-h2').text
            paragraphs = eng_lyrics_raw.find_elements(By.CLASS_NAME, 'par')
            eng_lyrics = ". ".join([p.text for p in paragraphs]).replace("*", "")
        except Exception:
            print("Couldn't find english lyrics")
            eng_title = np.nan
            eng_lyrics = np.nan

        # Writer credit: the first line of the copyright text mentioning
        # "Writer", taking what follows ": ". Default to NaN so the name is
        # always bound, including when the original-lyrics node was not found.
        writer = np.nan
        try:
            copyright_lines = original_lyrics_raw.find_element(By.CLASS_NAME, 'copyrighttext').text.split("\n")
            writer_info = [line for line in copyright_lines if "Writer" in line][0]
            writer = writer_info.split(": ")[1]
        except Exception:
            print("Couldn't find writer info")

        # Album: the leading '.' keeps the XPath relative to song_info; a
        # bare '//' would search the whole document from the root instead.
        try:
            song_info = driver.find_element(By.CLASS_NAME, 'song-node-info')
            album_name = song_info.find_element(By.XPATH, ".//*[contains(text(), 'Album')]").text
            album_name = album_name.split(": ")[1]
        except Exception:
            print("Couldn't find album info")
            album_name = np.nan

        song_url = driver.current_url
    finally:
        # Always leave the browser on the search-results window.
        driver.close()
        driver.switch_to.window(window_name=window_handles[0])

    return writer, original_title, eng_title, album_name, original_lyrics, eng_lyrics, song_url

In [5]:
def scrape_lyricstranslate(artist_name):
    """Enrich the tekstovi.net CSV with data scraped from lyricstranslate.com.

    Reads ``data/{artist_name}_tekstovinet.csv``, searches Google for each
    song, scrapes the lyricstranslate.com hit, blanks out rows flagged by
    ``check_row_similariy`` and writes
    ``data/{artist_name}_tekstovinet_lyricstranslate.csv``.

    Parameters
    ----------
    artist_name : str
        Artist name used in the input and output file names.
    """
    # search_google() and get_lyricstranslate() look up the module-level name
    # `driver`, so the browser created here must be bound globally; as a
    # local variable the helpers fail with NameError on a fresh kernel.
    global driver

    df = pd.read_csv(f"data/{artist_name}_tekstovinet.csv")

    new_columns = [
        "Writer_lyricstranslate",
        "Song_lyricstranslate",
        "EngTitle_lyricstranslate",
        "Album_lyricstranslate",
        "Lyrics_lyricstranslate",
        "EngLyrics_lyricstranslate",
        "Url_lyricstranslate",
    ]
    # Create the result columns up front so they exist even when every
    # lookup fails and the similarity check below still finds its column.
    for column in new_columns:
        if column not in df.columns:
            df[column] = pd.Series(np.nan, index=df.index, dtype=object)

    ser = Service("chromedriver.exe")
    op = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=ser, options=op)
    try:
        driver.get("https://google.com")
        time.sleep(0.5)

        # Consent dialog: try the known button labels, but do not abort the
        # whole run when none of them is present.
        for label in ("Aceito", "I agree"):
            try:
                driver.find_element(By.XPATH, f"//*[contains(text(), '{label}')]").click()
                break
            except Exception:
                continue
        time.sleep(0.5)
        try:
            driver.find_element(By.XPATH, "//*[contains(text(), 'English')]").click()
        except Exception:
            pass
        time.sleep(0.5)

        # Iterate over a copy because df itself is updated inside the loop.
        for row_index, row in df.copy().iterrows():
            time.sleep(1)
            # Separate names so the `artist_name` parameter (used for the
            # output file name) is not overwritten by row values.
            row_artist = row.Artist
            song_name = row.Song_tekstovinet
            if isinstance(song_name, str):
                # Drop everything from the first "(" onwards.
                song_name = song_name.split("(")[0]

            try:
                search_google(row_artist+" "+song_name+ " english lyricstranslate")
                scraped = get_lyricstranslate()

                # get_lyricstranslate() returns its values in the same order
                # as new_columns.
                for column, value in zip(new_columns, scraped):
                    df.loc[row_index, column] = value

                print(f"Succesfully found information for {song_name} from {row_artist}")

            except Exception as e:
                print(f"Problem with finding info for {song_name} from {row_artist}")
                print(e)
            print("#######")
            time.sleep(3)
    finally:
        # Release the browser even if the run is interrupted.
        driver.quit()

    # NOTE(review): check_row_similariy comes from utils; presumably it returns
    # the rows whose two lyric columns are too dissimilar - confirm there.
    wrong_rows = check_row_similariy(df, "Lyrics_tekstovinet", "Lyrics_lyricstranslate", threshold=0.7)

    # Blank the lyricstranslate data of the flagged rows. Accept either bare
    # index labels or (label, ...) pairs, since this file indexes the result
    # both ways.
    for row in wrong_rows:
        row_label = row[0] if isinstance(row, (tuple, list)) else row
        df.loc[row_label, new_columns] = np.nan

    df.to_csv(f"data/{artist_name}_tekstovinet_lyricstranslate.csv", index=False)


In [6]:
def scrape_for_authors_and_albums(song_name, artist_name):
    """Search the current site for ``song_name`` and scrape the hit by ``artist_name``.

    Uses the module-level ``driver``. Every link under the 'Pjesme' heading of
    the search results is opened in a new tab; the first page for which
    ``check_performer(artist_name)`` is true is scraped.

    Parameters
    ----------
    song_name : str
        Song title; normalised with ``lower_and_remove_diacritics`` before
        being typed into the search box.
    artist_name : str
        Performer that the song page must list.

    Returns
    -------
    tuple or bool
        ``(subfields, authors, albums, song_url, song_name_website)`` for the
        first matching page, or ``False`` when nothing matched or scraping
        failed.
    """
    search_box = driver.find_element(By.CLASS_NAME, 'search-page')
    search_box.clear()
    song_name = lower_and_remove_diacritics(song_name)
    search_box.send_keys(song_name)
    search_box.send_keys(Keys.RETURN)
    time.sleep(3)

    try:
        song_results = driver.find_element(By.XPATH, "//*[contains(text(), 'Pjesme')]/following-sibling::ul")
        song_links = song_results.find_elements(By.TAG_NAME, 'a')

        for song_link in song_links:
            # Ctrl+Enter opens the result in a new tab.
            song_link.send_keys(Keys.CONTROL + Keys.RETURN)
            window_handles = driver.window_handles
            driver.switch_to.window(window_handles[1])

            try:
                time.sleep(1)

                if check_performer(artist_name):
                    subfields, authors = get_authors()
                    albums = get_albums()
                    song_url = driver.current_url
                    song_name_website = driver.find_element(By.XPATH, "//*[contains(text(), 'Pjesma')]/following-sibling::h1").text
                    return subfields, authors, albums, song_url, song_name_website
            finally:
                # Close the result tab and return to the search window on
                # every path (match, no match, or error); otherwise one
                # failure leaves the driver on the wrong window for all
                # following searches.
                driver.close()
                driver.switch_to.window(window_name=window_handles[0])

    except Exception as e:
        print(f"Problem with scraping authors and albums for song {song_name}")
        print(e)

    return False


In [7]:
def check_performer(artist_name):
    """Return True when the open song page lists ``artist_name`` as a performer.

    Uses the module-level ``driver``. All ``h3`` headings except the first are
    scanned for the one whose capitalised text is "Izvedba"; the links that
    follow it, minus the links that also follow the next heading, are taken
    as the performers.

    Parameters
    ----------
    artist_name : str
        Name to look for; compared after ``lower_and_remove_diacritics`` as a
        substring of each performer name.

    Returns
    -------
    bool
        ``False`` when no performer matches, including when the page has no
        "Izvedba" heading at all.
    """
    headings = driver.find_elements(By.TAG_NAME, "h3")[1:]
    wanted = lower_and_remove_diacritics(artist_name)

    # Start with an empty list so a page without an "Izvedba" section gives
    # False instead of raising UnboundLocalError.
    performers = []
    for index, heading in enumerate(headings):
        subfield = heading.text.capitalize()
        if subfield != "Izvedba":
            continue
        siblings_of_current = driver.find_elements(By.XPATH, f"//*[contains(text(), '{subfield}')]/following-sibling::a")
        if index < len(headings) - 1:
            # Links after the next heading are also siblings of this one;
            # drop that tail so only this section's links remain.
            siblings_of_next = driver.find_elements(By.XPATH, f"//*[contains(text(), '{headings[index+1].text.capitalize()}')]/following-sibling::a")
            performers = [r.text for r in siblings_of_current[0: len(siblings_of_current)-len(siblings_of_next)]]
        else:
            performers = [r.text for r in siblings_of_current]

    return any(wanted in lower_and_remove_diacritics(name) for name in performers)


In [8]:
def get_authors():
    """Collect the author credits shown on the open song page.

    Uses the module-level ``driver``. Each ``h3`` that follows the 'Autori'
    element is treated as one credit section; its names are the links that
    follow the heading, minus the links that also follow the next heading.
    Sections without any link are skipped.

    Returns
    -------
    tuple of (list of str, list of str)
        The capitalised heading texts, and for each heading the ``str()`` of
        its list of link texts.
    """
    headings = driver.find_elements(By.XPATH, f"//*[contains(text(), 'Autori')]/following-sibling::h3")
    labels, credited = [], []
    last_position = len(headings) - 1

    for position in range(len(headings)):
        label = headings[position].text.capitalize()
        links = driver.find_elements(By.XPATH, f"//*[contains(text(), '{label}')]/following-sibling::a")
        if not links:
            continue
        if position < last_position:
            # Trim the links that belong to the sections after this one.
            trailing = driver.find_elements(By.XPATH, f"//*[contains(text(), '{headings[position+1].text.capitalize()}')]/following-sibling::a")
            links = links[0: len(links)-len(trailing)]
        labels.append(label)
        credited.append(str([link.text for link in links]))

    return labels, credited


In [9]:
def get_albums():
    """Return the album entries listed on the open song page.

    Uses the module-level ``driver`` and reads every ``div`` that follows the
    'Albumi' element.

    Returns
    -------
    list of set
        One single-element set per album, holding that element's text.
    """
    # NOTE(review): each text is wrapped in a one-element set; this looks like
    # it may be a leftover from an f-string - confirm before relying on it.
    album_nodes = driver.find_elements(By.XPATH, f"//*[contains(text(), 'Albumi')]/following-sibling::div")
    wrapped_titles = []
    for node in album_nodes:
        wrapped_titles.append({node.text})
    return wrapped_titles
    

In [10]:
def scrape_diskografija(artist_name):
    """Enrich the lyricstranslate CSV with authors and albums from diskografija.com.

    Reads ``data/{artist_name}_tekstovinet_lyricstranslate.csv``, looks every
    song up with ``scrape_for_authors_and_albums`` and writes
    ``data/{artist_name}_tekstovinet_lyricstranslate_diskografija.csv``.

    Parameters
    ----------
    artist_name : str
        Artist name used in the input and output file names. The performer
        searched for on the site is taken per row from ``Artist_diskografija``.
    """
    # The scraping helpers look up the module-level name `driver`, so the
    # browser created here must be bound globally; as a local variable the
    # helpers fail with NameError on a fresh kernel.
    global driver

    df = pd.read_csv(f"data/{artist_name}_tekstovinet_lyricstranslate.csv")

    ser = Service("chromedriver.exe")
    op = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=ser, options=op)
    try:
        driver.get("https://diskografija.com/")

        # Iterate over a copy because df itself is updated inside the loop.
        for row_index, row in df.copy().iterrows():
            # Keep the row's performer in its own name: overwriting
            # `artist_name` would change the output file name below.
            performer = row.Artist_diskografija
            song_name = row.Song_lyricstranslate
            if pd.isna(song_name):
                # No lyricstranslate title; fall back to the tekstovi.net one.
                song_name = row.Song_tekstovinet
            if isinstance(song_name, str):
                # Drop everything from the first "(" onwards.
                song_name = song_name.split("(")[0]

            try:
                found = scrape_for_authors_and_albums(song_name, performer)
                if not found:
                    # The helper returns False when nothing matched.
                    print(f"Problem with finding info for {song_name} from {performer}")
                    print("No matching song page was returned")
                else:
                    subfields, authors, albums, song_url, song_title_website = found

                    for subfield, author in zip(subfields, authors):
                        df.loc[row_index, f"{subfield}_diskografija"] = author

                    for i, album in enumerate(albums):
                        # get_albums() wraps each title in a one-element set;
                        # store the bare title so the cell holds plain text
                        # regardless of how pandas treats set values.
                        if isinstance(album, (set, frozenset)) and len(album) == 1:
                            album = next(iter(album))
                        df.loc[row_index, f"Album_{i+1}_diskografija"] = album
                    df.loc[row_index, "Url_diskografija"] = song_url
                    df.loc[row_index, "Song_diskografija"] = song_title_website

                    print(f"Succesfully found information for {song_name} from {performer}")

            except Exception as e:
                print(f"Problem with finding info for {song_name} from {performer}")
                print(e)
            print("#######")
            time.sleep(3)
    finally:
        # Release the browser even if the run is interrupted.
        driver.quit()

    df.to_csv(f"data/{artist_name}_tekstovinet_lyricstranslate_diskografija.csv", index=False)



In [11]:
def scrape_cuspajz(artist_name, url_cuspajz):
    """Add lyrics from cuspajz.com to the songs already collected for an artist.

    Reads ``data/{artist_name}_tekstovinet_lyricstranslate_diskografija.csv``,
    matches each ``Song_tekstovinet`` title against the song list at
    ``url_cuspajz`` (similarity above 0.9), downloads the matched lyrics,
    blanks out rows flagged by ``check_row_similariy`` and writes
    ``data/{artist_name}_tekstovinet_lyricstranslate_diskografija_cuspajz.csv``.

    Parameters
    ----------
    artist_name : str
        Artist name used in the input and output file names.
    url_cuspajz : str
        URL of the artist's song list on cuspajz.com.
    """
    df = pd.read_csv(f"data/{artist_name}_tekstovinet_lyricstranslate_diskografija.csv")

    new_columns = ["Lyrics_cuspajz", "Song_cuspajz", "Url_cuspajz"]
    # Create the result columns up front so they exist even when no song
    # matches and the similarity check below still finds its column.
    for column in new_columns:
        if column not in df.columns:
            df[column] = pd.Series(np.nan, index=df.index, dtype=object)

    response = get(url_cuspajz)
    listing_soup = BeautifulSoup(response.text, 'html.parser')

    # The song list is split over several <ul class="songList"> blocks.
    song_names = []
    song_urls = []
    for column in listing_soup.find_all('ul', class_='songList'):
        for item in column.find_all('li'):
            if item.a is None:
                continue
            song_names.append(item.a.text)
            song_urls.append(item.a['href'])

    # Normalise the candidate titles once instead of once per data row.
    candidate_titles = [lower_and_remove_diacritics(s.split('(')[0]) for s in song_names]

    similarity_threshold = 0.9
    for row_index, row in df.iterrows():
        if not candidate_titles:
            # Nothing was listed on the artist page; max() below would fail.
            break

        song_title = lower_and_remove_diacritics(row.Song_tekstovinet).split('(')[0]
        similarities = [similar(song_title, candidate) for candidate in candidate_titles]
        max_similarity = max(similarities)

        if max_similarity > similarity_threshold:
            best = similarities.index(max_similarity)
            cuspajz_name = song_names[best]
            url = 'https://cuspajz.com/'+song_urls[best]

            response = get(url)
            song_soup = BeautifulSoup(response.text, 'html.parser')

            song_lyrics = song_soup.find_all('p', class_='clearfix')
            if song_lyrics:
                df.loc[row_index, "Lyrics_cuspajz"] = song_lyrics[0].get_text()
                df.loc[row_index, "Song_cuspajz"] = cuspajz_name
                df.loc[row_index, "Url_cuspajz"] = url

                print("Lyrics successfully written to file for : " + song_title)
            else:
                # Page without the expected lyrics paragraph: skip this song
                # instead of aborting the run with an IndexError.
                print("Couldn't find lyrics on page for : " + song_title)

            time.sleep(3)

    # NOTE(review): check_row_similariy comes from utils; presumably it returns
    # the rows whose two lyric columns are too dissimilar - confirm there.
    wrong_rows = check_row_similariy(df, "Lyrics_tekstovinet", "Lyrics_cuspajz", threshold=0.7)

    # Blank the cuspajz data of the flagged rows. Accept either bare index
    # labels or (label, ...) pairs, since this file indexes the result both
    # ways.
    for row in wrong_rows:
        row_label = row[0] if isinstance(row, (tuple, list)) else row
        df.loc[row_label, new_columns] = np.nan

    df.to_csv(f'data/{artist_name}_tekstovinet_lyricstranslate_diskografija_cuspajz.csv', index=False)


In [12]:
def scrape_discogs(artist_name, artist_id_discogs):
    """Collect the artist's tracks from Discogs and merge them into the final CSV.

    Walks the artist's releases with role "Main", records one row per track
    (title, duration, per-role credits, album title/year/genres), keeps one
    row per song title, and copies selected columns onto the rows of
    ``data/{artist_name}_tekstovinet_lyricstranslate_diskografija_cuspajz.csv``
    whose ``Song_tekstovinet`` title is more than 0.9 similar.

    Writes ``data/{artist_name}_final.csv`` and ``data/{artist_name}_discogs.csv``.

    Parameters
    ----------
    artist_name : str
        Artist name used in the file names and stored in the ``Artist`` column.
    artist_id_discogs : int
        Discogs artist ID.

    Notes
    -----
    The Discogs user token is read from the ``DISCOGS_USER_TOKEN`` environment
    variable so that no credential is stored in the notebook. When it is not
    set, ``None`` is passed as the token.
    """
    user_token = os.environ.get("DISCOGS_USER_TOKEN")
    d = discogs_client.Client('datamilas/0.1', user_token=user_token)
    artist_object = d.artist(artist_id_discogs)
    artist_discogs = artist_object.name
    albums = artist_object.releases

    source_csv = f"data/{artist_name}_tekstovinet_lyricstranslate_diskografija_cuspajz.csv"

    # One dict per track; the frame is built once at the end instead of being
    # grown cell by cell.
    records = []
    for album in albums:
        if album.data.get("role") != "Main":
            continue
        print(album)
        time.sleep(0.5)
        for track in album.tracklist:
            # Keep tracks without explicit artists, or credited to this artist.
            if len(track.artists) == 0 or (artist_object in track.artists):
                record = {
                    "Artist": artist_name,
                    "Artist_discogs": artist_discogs,
                    "Song_discogs": track.title,
                }
                if "duration" in track.data:
                    record["Song_duration_discogs"] = track.data["duration"]
                for ea in track.data.get("extraartists") or []:
                    for role in ea["role"].split(", "):
                        key = f"{role}_discogs"
                        # Several people can share a role: append with "/"
                        # instead of overwriting the earlier name.
                        if key in record:
                            record[key] = record[key] + "/" + ea["name"]
                        else:
                            record[key] = ea["name"]
                record["Album_title_discogs"] = album.title
                record["Album_year_discogs"] = album.year
                record["Album_genres_discogs"] = str(album.genres)
                records.append(record)

    df = pd.read_csv(source_csv)

    if not records:
        # Nothing found on Discogs: pass the data through unchanged.
        print(f"No Discogs tracks found for {artist_name}")
        df.to_csv(f"data/{artist_name}_final.csv", index=False)
        pd.DataFrame(columns=["Artist"]).to_csv(f"data/{artist_name}_discogs.csv", index=False)
        return

    df_discogs = pd.DataFrame(records)

    # A missing or non-numeric year becomes 0 rather than aborting the cast.
    df_discogs["Album_year_discogs"] = (
        pd.to_numeric(df_discogs["Album_year_discogs"], errors="coerce").fillna(0).astype(int)
    )
    df_discogs["Album_year_title_discogs"] = df_discogs["Album_year_discogs"].astype(str)+"-"+df_discogs["Album_title_discogs"]
    df_discogs["All_albums_discogs"] = df_discogs.groupby("Song_discogs")["Album_year_title_discogs"].transform(lambda x: '/'.join(x))

    # Sort by year BEFORE dropping duplicates so the lowest-year row of each
    # song is the one kept; the result must be assigned back to take effect.
    df_discogs = df_discogs.sort_values("Album_year_discogs", kind="stable").reset_index(drop=True)
    df_discogs = df_discogs.drop_duplicates(subset=["Song_discogs"])
    df_discogs = df_discogs.reset_index(drop=True)

    columns_to_take = [col for col in df_discogs.columns if col in ['Artist_discogs', 'Song_discogs', 'Song_duration_discogs',
       'Album_title_discogs', 'Album_year_discogs', 'Album_genres_discogs','All_albums_discogs',
       'Lyrics By_discogs', 'Music By_discogs', 'Featuring_discogs']]

    similarity_threshold = 0.9
    song_names = [lower_and_remove_diacritics(s).split('(')[0] for s in df_discogs.Song_discogs]
    for row_index, row in df.iterrows():
        song_title = lower_and_remove_diacritics(row.Song_tekstovinet).split('(')[0]
        similarities = [similar(song_title, s) for s in song_names]
        max_similarity = max(similarities)

        if max_similarity > similarity_threshold:
            # df_discogs has a fresh 0..n-1 index, so the list position of the
            # best match is also its row label.
            best = similarities.index(max_similarity)
            df.loc[row_index, columns_to_take] = df_discogs.loc[best][columns_to_take]

    df.to_csv(f"data/{artist_name}_final.csv", index=False)
    df_discogs.to_csv(f"data/{artist_name}_discogs.csv", index=False)



In [None]:
# Settings for the artist to scrape.
artist_id_discogs = 345842
url_cuspajz = "https://cuspajz.com/tekstovi-pjesama/izvodjac/magazin.html"
url_tekstovinet = "https://tekstovi.net/2,95,0.html"
artist_name = artist_name_diskografija = "Magazin"

# Run the stages in order: each one reads the CSV written by the previous one.
scrape_tekstovinet(
    artist=artist_name,
    diskografija_artist_name=artist_name_diskografija,
    url_teksovinet=url_tekstovinet,
)
scrape_lyricstranslate(artist_name=artist_name)
scrape_diskografija(artist_name=artist_name)
scrape_cuspajz(artist_name=artist_name, url_cuspajz=url_cuspajz)
scrape_discogs(artist_name=artist_name, artist_id_discogs=artist_id_discogs)