# Input parameters

In [21]:
# --- Target website ---
# Address of the listing that is scraped
url = 'http://comunitatvalenciana.com/viaje/alojamiento/apartamentos'
# How many listing pages the target website has
max_pages = 4

# --- Network behaviour ---
# Seconds to wait for a page to load; raise it on a slow connection
timeout = 30
# Upper bound on the number of download attempts per page
max_attempts = 5
# Seconds to pause between pages to avoid being blocked
sleep_time = 3

# --- Output ---
# File the scraped data is written to
output_file = 'apartamentos.csv'

# Modules

In [14]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
from tqdm import tnrange, tqdm_notebook

# Download data

In [65]:
# Initialize variables: one list per output column. Every apartment appends
# exactly one value (or None) to each list, so they always stay aligned.
names = list()
links = list()
cities = list()
signatures = list()
addresses = list()
provinces = list()
phones = list()
emails = list()
webs = list()
postal_codes = list()
locations = list()
categories = list()
aparthotels = list()
eeat_signs = list()
eeat_names = list()
capacities = list()
nums_apartments = list()

# Detail-page fields that share the plain "label: value" layout, given as
# (class attribute of the field's <div>, list that collects the value) pairs
simple_fields = [
    ('field field-type-nodereference field-field-shared-rel-municipio', cities),
    ('field field-type-text field-field-shared-provincia', provinces),
    ('field field-type-text field-field-shared-code-postal', postal_codes),
    ('field field-type-email field-field-shared-email', emails),
    ('field field-type-text field-field-alojamiento-category', categories),
    ('field field-type-text field-field-alojamiento-situation', locations),
    ('field field-type-text field-field-appartment-aparthotel', aparthotels),
    ('field field-type-text field-field-appartment-sign-eeat', eeat_signs),
    ('field field-type-text field-field-appartment-name-eeat', eeat_names),
    ('field field-type-number-integer field-field-shared-capacity', capacities),
    ('field field-type-text field-field-appartment-num', nums_apartments),
    ('field field-type-text field-field-shared-ws-signature', signatures),
]


def _download_html(address):
    """Download `address` and return it parsed with BeautifulSoup.

    Connection errors and timeouts are retried, pausing `sleep_time` seconds
    between tries, until `max_attempts` downloads have been attempted; the
    last error is then re-raised. At least one attempt is always made.
    """
    attempts = 0
    while True:
        attempts += 1
        try:
            response = requests.get(address, timeout=timeout)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            if attempts >= max_attempts:
                raise
            time.sleep(sleep_time)
        else:
            return BeautifulSoup(response.content, 'html.parser')


def _field_value(html, css_class, separator=':'):
    """Return the stripped text following `separator` in the <div> whose class
    attribute is `css_class`, or None when the field is absent or malformed."""
    try:
        return html.find('div', css_class).text.split(separator)[1].strip()
    except (AttributeError, IndexError):
        # AttributeError: the <div> was not found (find() returned None)
        # IndexError: the text does not contain the separator
        return None


# Iterate through pages
for page_number in tnrange(max_pages): # Use tnrange to display progress bar
    # Download the listing page (errors propagate once the retries are exhausted)
    listing = _download_html(url + '?page=' + str(page_number))
    # Extract apartments: one table row per apartment
    for apartment in listing.tbody.find_all('tr'):
        # Get link of apartment's page
        link = apartment.find('td', 'views-field views-field-title active').a.get('href').strip()
        # Download apartment's page. If it cannot be fetched, keep the link and
        # record None for every detail instead of aborting the whole run.
        try:
            detail = _download_html(link)
        except requests.exceptions.RequestException as error:
            print('Could not download ' + link + ': ' + str(error))
            detail = BeautifulSoup('', 'html.parser')  # empty document: every lookup yields None
        links.append(link)
        # Name
        title = detail.find('title')
        names.append(title.text.strip() if title is not None else None)
        # Address: needs street type, street name and number; None if any is missing
        street = _field_value(detail, 'field field-type-text field-field-shared-address')
        street_type = _field_value(detail, 'field field-type-text field-field-shared-via')
        street_number = _field_value(detail, 'field field-type-text field-field-shared-address-n')
        if None in (street, street_type, street_number):
            addresses.append(None)
        else:
            addresses.append(street_type + ' ' + street + ', ' + street_number)
        # Phone: keep only the first 9 characters of the value
        phone = _field_value(detail, 'field field-type-text field-field-shared-phone')
        phones.append(phone[:9] if phone is not None else None)
        # Web: split on the full label because the URL itself contains ':'
        webs.append(_field_value(detail, 'field field-type-link field-field-shared-web', separator='Web:'))
        # Remaining "label: value" fields
        for css_class, values in simple_fields:
            values.append(_field_value(detail, css_class))
    # Continue to next page
    time.sleep(sleep_time)


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

# Process data

In [66]:
# Pair every output column with the list that holds its values;
# the insertion order of this mapping is the column order of the dataframe
column_lists = {
    'nombre': names,
    'enlace': links,
    'firma': signatures,
    'capacidad': capacities,
    'numero apartamentos': nums_apartments,
    'direccion': addresses,
    'municipio': cities,
    'provincia': provinces,
    'codigo postal': postal_codes,
    'telefono': phones,
    'email': emails,
    'web': webs,
    'situacion': locations,
    'categoria': categories,
    'aparthotel': aparthotels,
    'firma eeat': eeat_signs,
    'nombre eeat': eeat_names,
}

# Turn the column lists into one tuple per apartment and load them into a dataframe
raw_data = list(zip(*column_lists.values()))
data = pd.DataFrame(raw_data, columns=list(column_lists))

# Export data

In [67]:
# Write the dataframe to the output CSV without the index column.
# 'utf-8-sig' prefixes the file with a UTF-8 byte order mark.
data.to_csv(
    path_or_buf=output_file,
    encoding='utf-8-sig',
    index=False,
)