In [None]:
import requests
import os
import pandas as pd
from bs4 import BeautifulSoup
import urllib
import re

In [None]:
def get_singers(url=None):
    """Scrape a listing page and return the singers linked from it.

    The page at ``url`` is downloaded and parsed; every ``<a>`` tag inside
    the first ``div`` with class ``itemListSubCategories`` becomes one row.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` (anchor text with surrounding whitespace stripped)
        and ``link`` (the anchor's ``href`` attribute), one row per anchor.

    Raises
    ------
    ValueError
        If ``url`` is ``None``.
    IndexError
        If the page has no ``itemListSubCategories`` div.
    """
    if url is None:
        raise ValueError('You have to enter a valid URL')

    response = requests.get(url)
    page = BeautifulSoup(response.text, "html.parser")
    # Only the first matching container is inspected.
    container = page.find_all("div", class_="itemListSubCategories")[0]
    rows = [
        [anchor.text.strip(), anchor.get('href')]
        for anchor in container.find_all('a')
    ]
    return pd.DataFrame(rows, columns=['name', 'link'])

In [None]:
def get_lyrics(song_url=None):
    """Download one song page and return its lyrics block as text.

    ``song_url`` is appended to ``http://fnanen.net``. The first ``div``
    with class ``itemFullText`` on that page is serialised to a string and
    every HTML tag in it is replaced by a newline character.

    Parameters
    ----------
    song_url : str
        Path of the song page, appended verbatim to the site root.

    Returns
    -------
    str
        The markup of the lyrics container with each tag turned into ``\\n``.

    Raises
    ------
    ValueError
        If ``song_url`` is ``None``.
    IndexError
        If the page has no ``itemFullText`` div.
    """
    if song_url is None:
        raise ValueError('You have to enter a valid URL')

    site_root = 'http://fnanen.net'
    response = requests.get(site_root + song_url)
    page = BeautifulSoup(response.text, "html.parser")
    lyrics_div = page.find_all("div", class_="itemFullText")[0]
    # Tags are swapped for newlines rather than deleted, so the opening and
    # closing tags of the container itself also contribute a newline each.
    return re.sub('<[^>]*>', '\n', str(lyrics_div))

In [None]:
def get_songs_from_url(url=None, with_lyrics=True, verbose=True):
    """Scrape the songs listed on a single page.

    The page at ``url`` is downloaded; inside the first ``div`` with class
    ``itemListView`` every ``<a>`` tag with an empty ``class`` and
    ``data-animation="true"`` is treated as one song.

    Parameters
    ----------
    url : str
        Full address of the listing page.
    with_lyrics : bool, default True
        When true, ``get_lyrics`` is called with each song's ``href`` and the
        result is stored in a ``lyrics`` column.
    verbose : bool, default True
        When true, print one line per song processed.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``link`` (plus ``lyrics`` when ``with_lyrics``
        is true). The frame is empty, with the same columns, when the page
        lists no matching anchors.

    Raises
    ------
    ValueError
        If ``url`` is ``None``.
    IndexError
        If the page has no ``itemListView`` div.
    """
    if url is None:
        raise ValueError('You have to enter a valid URL')

    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    table = soup.find_all("div", class_="itemListView")[0]
    titles = table.find_all('a', attrs={'class': '', 'data-animation': 'true'})

    columns = ['title', 'link', 'lyrics'] if with_lyrics else ['title', 'link']
    songs = []
    for title in titles:
        href = title.get('href')
        row = [title.text, href]
        if with_lyrics:
            row.append(get_lyrics(href))
        songs.append(row)
        if verbose:
            print('Song ', title.text, ' fetched.')

    # Build the frame once, after the loop: this keeps the result defined
    # when no anchors matched and avoids rebuilding it for every song.
    return pd.DataFrame(songs, columns=columns)

In [None]:
def get_songs(url=None, with_lyrics=True, verbose=True):
    """Collect the songs from a singer page and its paginated follow-ups.

    ``url`` is appended to ``http://fnanen.net`` and downloaded once to look
    for a ``k2Pagination`` div. If one exists, the ``href`` of every ``<a>``
    in it whose ``title`` attribute is ``'2'``, ``'3'`` or ``'4'`` is queued
    after the first page. Each queued page is then handed to
    ``get_songs_from_url`` and the resulting frames are concatenated.

    A page that fails to parse is reported on stdout and skipped; the
    remaining pages are still processed.

    Parameters
    ----------
    url : str
        Path of the singer page, appended verbatim to the site root.
    with_lyrics : bool, default True
        Forwarded to ``get_songs_from_url``.
    verbose : bool, default True
        When true, print each page address before parsing it; also
        forwarded to ``get_songs_from_url``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``link`` and ``lyrics`` with a fresh integer
        index. ``lyrics`` is present but empty (NaN) when the per-page
        frames do not carry it. The frame is empty when no page yielded
        any result.

    Raises
    ------
    ValueError
        If ``url`` is ``None``.
    """
    base = 'http://fnanen.net'
    if url is None:
        raise ValueError('You have to enter a valid URL')

    first_page = base + url
    r = requests.get(first_page)
    soup = BeautifulSoup(r.text, "html.parser")
    pagination = soup.find_all('div', 'k2Pagination')

    links = [first_page]
    if pagination:
        # NOTE(review): only pagination links titled 2-4 are followed, so
        # any further pages are not visited - confirm this cap is intended.
        pages = pagination[0].find_all('a', attrs={'title': ['2', '3', '4']})
        links.extend(base + p.get('href') for p in pages)

    frames = []
    for link in links:
        if verbose:
            print('Parsing.. ', link)
        try:
            frames.append(
                get_songs_from_url(link, with_lyrics=with_lyrics, verbose=verbose)
            )
        except Exception as exc:
            # Best effort: report the failing page (with its cause) and move
            # on. KeyboardInterrupt/SystemExit are deliberately not caught.
            print('ERROR: broken link? check ', link, '-', repr(exc))

    if not frames:
        return pd.DataFrame({'title': [], 'link': [], 'lyrics': []})

    # A single concat replaces the repeated DataFrame.append calls, which
    # are unavailable in pandas >= 2.0. reindex guarantees the three
    # expected columns even when the per-page frames have no lyrics.
    combined = pd.concat(frames, ignore_index=True)
    return combined.reindex(columns=['title', 'link', 'lyrics'])

In [None]:
# Download the site's home page and gather the links found in its menu.
r = requests.get('http://fnanen.net/')
soup = BeautifulSoup(r.text, "html.parser")
# The first <ul> matching this class string is used (note the trailing space).
letters = soup.find_all("ul", class_="menu menu-vertical dropdown-hover ")[0]
# Each href is appended to the site root to form the URL that is fetched later.
subpages = [
    'http://fnanen.net' + anchor.get('href')
    for anchor in letters.find_all('a')
]

In [None]:
# Drop one listing page from the crawl: list.remove() deletes the first
# element equal to subpages[16].
# NOTE(review): this equals `del subpages[16]` only if no identical URL
# occurs earlier in the list - confirm which was intended.
# NOTE(review): the cell is not idempotent; every re-run without rebuilding
# `subpages` removes another entry. The reason this page is skipped is not
# visible here.
subpages.remove(subpages[16])

In [None]:
# Crawl the listing pages: for each page fetch its singers, then fetch every
# singer's songs (with lyrics) and keep one DataFrame per singer.
# NOTE(review): only the pages from position 17 onwards are visited; the
# reason for this starting offset is not visible here - confirm.
all_songs = []
for page in subpages[17:]:
    m_singers = get_singers(page)
    for singer in m_singers.iterrows():
        # iterrows() yields (index, row) tuples. The row is read by column
        # label ('name' / 'link', as produced by get_singers) because
        # integer keys on a label-indexed Series are deprecated in pandas.
        singer_info = singer[1]
        url = singer_info['link']
        songs = get_songs(url, with_lyrics=True, verbose=False)
        songs = songs.assign(singer=singer_info['name'])
        all_songs.append(songs)
        # len(all_songs) counts the per-singer frames collected so far.
        print('done with ', singer_info['name'], '. All songs len is: ', len(all_songs) )

In [None]:
# Merge the per-singer frames into a single DataFrame with a fresh integer index.
# NOTE(review): this rebinds `all_songs` from a list of frames to a DataFrame,
# so re-running this cell without re-running the crawl would hand pd.concat a
# DataFrame instead of a list - consider a separate name for the result.
all_songs = pd.concat(all_songs, ignore_index=True)

In [None]:
# Display the (rows, columns) shape of the combined frame.
all_songs.shape

In [None]:
# Save the combined frame as a pickle file. The path is relative, so the file
# is written to the current working directory.
all_songs.to_pickle('all_songs.pickle')