In [135]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

START_PAGE = 'wiki/Categoria:Singoli_certificati_disco_d%27oro_in_Italia'
BASE_URL = 'https://it.wikipedia.org/'

def scrape_page(page):
    """Download one category page and extract its song links and next-page link.

    Args:
        page: Absolute URL of the category page to fetch.

    Returns:
        A ``(songs, next_page)`` tuple holding the result of
        ``get_songs_urls`` and the result of ``get_next_page`` for the
        downloaded document.
    """
    with urlopen(page) as response:
        html = response.read().decode('utf-8')
        document = BeautifulSoup(html, 'html.parser')
        # Tuple elements are evaluated left to right: songs first, then the link.
        return get_songs_urls(document), get_next_page(document)

def get_songs_urls(body):
    """Collect the link target of every entry listed on a category page.

    Args:
        body: Parsed page (a BeautifulSoup document, or any object exposing
            a compatible ``select_one`` method).

    Returns:
        A list with the ``href`` of the first link found in each ``<li>`` of
        the first ``.mw-category`` element, in document order. The list is
        empty when the page has no ``.mw-category`` element.
    """
    songs_div = body.select_one('.mw-category')
    if songs_div is None:
        # No category listing on this page: nothing to collect.
        return []
    links = (item.find('a', href=True) for item in songs_div.find_all('li'))
    # Skip list items without a link instead of failing on ``None['href']``.
    return [link['href'] for link in links if link is not None]

def get_next_page(body):
    """Return the target of the "pagina successiva" link, if the page has one.

    Args:
        body: Parsed page (a BeautifulSoup document, or any object exposing
            a compatible ``select_one`` method).

    Returns:
        The ``href`` of the link whose text is exactly ``pagina successiva``
        inside the first ``.mw-category-generated`` element, or ``None`` when
        the container, the link, or its ``href`` is missing.
    """
    container = body.select_one('.mw-category-generated')
    if container is None:
        # The original only caught TypeError, so a missing container escaped
        # as AttributeError; treat it like "no further page".
        return None
    link = container.find('a', string='pagina successiva')
    if link is None:
        return None
    return link.get('href')

In [139]:
def scrape_song_urls():
    """Walk the paginated category from ``START_PAGE`` and gather song links.

    Each page is fetched with ``scrape_page`` using ``BASE_URL`` as prefix;
    the walk stops as soon as a page reports no next-page link. A progress
    line is printed after every page and a final line when done.

    Returns:
        A list concatenating the song lists returned by ``scrape_page`` for
        every visited page, in visiting order.
    """
    collected = []
    pending = START_PAGE
    while pending:
        page_songs, pending = scrape_page(BASE_URL + pending)
        collected.extend(page_songs)
        print('Scraping links . . .')
    print('Links scraped')
    return collected

In [277]:
# Crawl the whole category and report how many song links were collected.
songs_urls = scrape_song_urls()
print('Num of songs: {}'.format(len(songs_urls)))

Scraping links . . .
Scraping links . . .
Scraping links . . .
Scraping links . . .
Scraping links . . .
Scraping links . . .
Scraping links . . .
Links scraped
Num of songs: 1261


In [321]:
import pandas as pd

def scrape_song(url):
    """Download a song page and return the fields extracted by ``parse_song``.

    Args:
        url: Absolute URL of the song page.

    Returns:
        Whatever ``parse_song`` returns for the downloaded document.
    """
    with urlopen(url) as response:
        html = response.read().decode('utf-8')
    # The payload is fully read above, so parsing can happen after the close.
    document = BeautifulSoup(html, 'html.parser')
    return parse_song(document)


def parse_song(body):
    """Extract title, artists, date, duration and genre from a song page.

    The data is read from the first ``<table class="sinottico">`` of the
    page. Every row that has both a header cell and at least one data cell
    contributes a ``header text -> stripped text of the first data cell``
    entry, and the wanted fields are then looked up by their Italian labels
    (``Artista``, ``Pubblicazione``/``Data``, ``Durata``, ``Genere``).

    Args:
        body: Parsed page (a BeautifulSoup document, or any object exposing
            a compatible ``find`` method).

    Returns:
        A dict with the keys ``title``, ``artists``, ``date``, ``duration``
        and ``genre``, in that order. A field that cannot be found is
        ``None``; when the page has no ``sinottico`` table every field is
        ``None``.
    """
    song = dict.fromkeys(('title', 'artists', 'date', 'duration', 'genre'))

    table = body.find('table', {"class": "sinottico"})
    if table is None:
        # Page without an infobox: return the all-None record instead of
        # crashing, matching the per-field fallbacks below.
        return song
    table_body = table.find('tbody')
    if table_body is None:
        # html.parser does not synthesize <tbody>; read rows from the table.
        table_body = table

    data_dict = {}
    for row in table_body.find_all('tr'):
        # ``string=`` is the current name of the deprecated ``text=`` filter.
        row_key = row.find('th', string=True)
        cells = row.find_all('td')
        if row_key is not None and cells:
            data_dict[row_key.text] = cells[0].text.strip()

    header_row = table_body.find('tr', {"class": "sinottico_testata"})
    if header_row is not None:
        song['title'] = header_row.text.strip()

    if 'Artista' in data_dict:
        # Normalise "A ,  B" into "A,B".
        song['artists'] = ','.join(name.strip() for name in data_dict['Artista'].split(','))

    # 'Pubblicazione' takes precedence; 'Data' is the fallback label.
    for label in ('Pubblicazione', 'Data'):
        if label in data_dict:
            song['date'] = data_dict[label]
            break

    song['duration'] = data_dict.get('Durata')
    song['genre'] = data_dict.get('Genere')
    return song

def scrape_songs_info(songs_urls, limit=10):
    """Download and parse song pages into a DataFrame, one row per song.

    Args:
        songs_urls: Sized sequence of song links; each one is appended to
            ``BASE_URL`` before being fetched with ``scrape_song``.
        limit: Maximum number of songs to download, taken from the start of
            ``songs_urls``. Defaults to 10, the cap this function previously
            hard-coded; pass ``None`` to download every song.

    Returns:
        A DataFrame built from the dicts returned by ``scrape_song``, in
        download order. It is empty (no rows, no columns) when there is
        nothing to download.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError('limit must be non-negative or None')

    targets = list(songs_urls)
    if limit is not None:
        targets = targets[:limit]
    total = len(targets)

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every row.
    records = []
    for done, url in enumerate(targets, start=1):
        records.append(scrape_song(BASE_URL + url))
        # Report after the download so the counter matches the work done.
        percentage = round(done / total * 100, 3)
        print('\rDownloading: {}/{} - {}%'.format(done, total, percentage), end='')
    return pd.DataFrame(records)

In [322]:
# Build the song metadata table from the collected links and preview its first rows.
df = scrape_songs_info(songs_urls)
df.head()

Downloading: 11/1261 - 0.793%

Unnamed: 0,title,artists,date,duration,genre
0,0ffline,Tha Supreme,31 luglio 2020,3:21,Trap
1,"1, 2, 3",Sofía Reyes,16 febbraio 2018,3:21,Pop latinoReggaeton
2,3 Words,Cheryl,18 dicembre 2009,4:33,Dance pop
3,4/3/1943/Il fiume e la città,Lucio Dalla,1971,3:40 (4/3/1943)3:48 (Il fiume e la città),Musica d'autore
4,5olo,Tha Supreme,9 febbraio 2018,2:07,Hip hop


In [324]:
import csv
# Write the table to CSV without the index column, quoting every field with double quotes.
df.to_csv('../data/disco-oro.csv', index=False, quoting=csv.QUOTE_ALL, quotechar='"')