# Input parameters

In [12]:
# Base address of the listing to scrape; '?page=N' is appended per request
url = 'http://comunitatvalenciana.com/viaje/alojamiento/viviendas-turisticas'

# How many listing pages to request (page indices 0 .. max_pages - 1)
max_pages = 3810

# Upper bound on download attempts for a single page before giving up
max_attempts = 5

# Seconds allowed for a page to load; raise it on a slow connection
timeout = 30

# Seconds to pause between requests so the site does not block the scraper
sleep_time = 3

# Path of the CSV file the scraped data is written to
output_file = 'viviendas_turisticas.csv'

# Modules

In [2]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
from tqdm import tnrange, tqdm_notebook

# Download data

In [3]:
# Initialize variables: one list per output column. The lists must always have
# the same length, because they are zipped row-by-row after the download.
signatures = list()
links = list()
cities = list()
provinces = list()
addresses = list()
phones = list()

# Iterate through pages
for page_number in tnrange(max_pages): # Use tnrange to display progress bar
    attempts = 0
    while True:
        try:
            # Download page
            page = requests.get(url + '?page=' + str(page_number), timeout=timeout)
            # Treat HTTP error statuses (4xx/5xx) as a failed attempt
            page.raise_for_status()
            # Extract HTML
            html = BeautifulSoup(page.content, 'html.parser')
            # Extract apartments (one table row each)
            apartments = html.tbody.find_all('tr')
            # Stage this page's rows locally. Nothing is added to the global
            # lists until the whole page has parsed, so a failure half-way
            # through cannot leave partial rows behind or duplicate them
            # when the page is retried.
            page_rows = list()
            for apartment in apartments:
                # The signature cell holds both the signature text and the link
                signature_cell = apartment.find('td', 'views-field views-field-field-shared-ws-signature-value')
                page_rows.append((
                    # Signature
                    signature_cell.text.strip(),
                    # Link
                    signature_cell.a.get('href').strip(),
                    # City
                    apartment.find('td', 'views-field views-field-field-shared-rel-municipio-nid').text.strip(),
                    # Province
                    apartment.find('td', 'views-field views-field-field-shared-provincia-value').text.strip(),
                    # Address
                    apartment.find('td', 'views-field views-field-field-shared-address-value').text.strip(),
                    # Phone
                    apartment.find('td', 'views-field views-field-field-shared-phone-value').text.strip(),
                ))
        except Exception as error:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit still stop the scrape.
            attempts += 1
            # Count the attempt that just failed before comparing, so at most
            # max_attempts requests are made for a page.
            if attempts >= max_attempts:
                raise Exception('Unable to download the page. Maximum number of retries reached.') from error
            print(f'Page {page_number}, attempt {attempts} failed. Retrying...')
            time.sleep(sleep_time)
        else:
            # Page parsed completely: commit its rows to the column lists
            for signature, link, city, province, address, phone in page_rows:
                signatures.append(signature)
                links.append(link)
                cities.append(city)
                provinces.append(province)
                addresses.append(address)
                phones.append(phone)
            # Continue to next page
            time.sleep(sleep_time)
            break

HBox(children=(IntProgress(value=0, max=3810), HTML(value='')))

Page 571, attempt 1 failed. Retrying...
Page 581, attempt 1 failed. Retrying...



# Process data

In [4]:
# Pair up the six parallel column lists into one tuple per apartment
raw_data = list(zip(signatures, links, cities, provinces, addresses, phones))
# Column headers, in the same order as the zipped lists
column_names = ['firma', 'enlace', 'municipio', 'provincia', 'direccion', 'telefono']
# Build the dataframe from the row tuples
data = pd.DataFrame(data=raw_data, columns=column_names)

# Export data

In [13]:
# Write the dataframe to output_file as CSV, without the index column;
# the 'utf-8-sig' encoding prefixes the file with a UTF-8 byte order mark
data.to_csv(
    path_or_buf=output_file,
    encoding='utf-8-sig',
    index=False,
)