In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Downloading the page where I will get the data
def url_vivareal(page, timeout=30):
    """Download one result page of the VivaReal listing and parse it.

    The page number is appended to the fixed listing URL (apartments for
    rent in Sao Carlos, SP) as the ``pagina`` query value.

    Parameters
    ----------
    page : int or str
        Page number; it is converted with ``str()`` before being appended.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the whole scraping loop indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``html.parser`` backend.

    Raises
    ------
    requests.exceptions.RequestException
        Raised by ``requests.get`` on connection problems or when the
        timeout expires.
    """
    listing_url = 'https://www.vivareal.com.br/aluguel/sp/sao-carlos/apartamento_residencial/?pagina=' + str(page)
    response = requests.get(listing_url, timeout=timeout)
    # The HTTP status code is not checked: whatever body comes back is parsed.
    return BeautifulSoup(response.text, 'html.parser')

In [3]:
# Column-oriented store for the scraped listings: one independent, initially
# empty list per column. It is turned into a DataFrame after scraping.
data = {
    column: []
    for column in ('neighbourhood', 'area', 'rooms', 'bathrooms', 'rent')
}

In [4]:
# Scraped numeric values arrive as strings; this function converts the price string into an integer.

def make_int(price, numbers=('1', '2', '3', '4', '5', '6', '7', '8', '9', '0')):
    """Convert a scraped price string into an integer.

    Every character of ``price`` that is not in ``numbers`` is dropped and
    the remaining characters are parsed with ``int()``.

    Parameters
    ----------
    price : str
        Raw text, e.g. ``'R$ 1.200 /mes'``.
    numbers : container of str, optional
        Characters to keep. The default is an immutable tuple of the ten
        ASCII digits; any container supporting ``in`` (list, set, str) works.

    Returns
    -------
    int
        The kept characters read as a base-10 integer.

    Raises
    ------
    ValueError
        If no character of ``price`` is in ``numbers``, or if the kept
        characters are not a valid integer literal.
    """
    # Single pass over the text instead of one str.replace() per unwanted character.
    digits = ''.join(char for char in price if char in numbers)
    if not digits:
        # int('') would also raise ValueError, but without naming the input.
        raise ValueError('no digits found in {!r}'.format(price))
    return int(digits)

In [5]:
# Scraping returns the full address; this function extracts only the neighbourhood.

def find_neighbourhood(address):
    """Extract the neighbourhood from a scraped address string.

    The text before the last comma is inspected (what follows the last
    comma is ignored):

    * if it contains a ``-``, the neighbourhood is whatever follows the
      last ``-`` (``'Rua X, 12 - Centro, City'`` -> ``'Centro'``);
    * otherwise, if the address starts with ``rua`` or ``avenida``
      (case-insensitive), only a street is present and ``''`` is returned;
    * otherwise the whole text before the last comma is the neighbourhood
      (``'Centro, City'`` -> ``'Centro'``).

    Parameters
    ----------
    address : str
        Address text as scraped from a listing card.

    Returns
    -------
    str
        The neighbourhood with surrounding whitespace removed, or ``''``
        when none can be determined (no comma, or a street with no
        ``-`` separated neighbourhood).
    """
    before_last_comma, comma, _ = address.rpartition(',')
    if not comma:
        # Without a comma there is nothing to separate the neighbourhood from.
        return ''

    if '-' in before_last_comma:
        # "<street>, <number> - <neighbourhood>": keep the part after the dash.
        candidate = before_last_comma.rpartition('-')[2]
    elif address.lstrip().lower().startswith(('rua', 'avenida')):
        # A street name with no dash carries no neighbourhood.
        return ''
    else:
        candidate = before_last_comma

    return candidate.strip()

In [None]:
# Web scraping

# Start from empty columns so that re-running this cell does not append a
# second copy of every listing to the module-level `data` dict.
for column_values in data.values():
    column_values.clear()

skipped_listings = 0  # cards whose fields could not be parsed

# Result pages 1 to 72 (the page count is hard-coded).
for page in range(1, 73):
    vivareal_soup = url_vivareal(page)
    appartments = vivareal_soup.find_all('div', class_='js-card-selector')
    for appartment in appartments:
        try:
            # [:-11] drops the last 11 characters of the address text
            # (presumably a fixed trailing suffix -- TODO confirm against the page).
            address = appartment.find('span', class_='property-card__address-container').text[:-11]
            neighbourhood = find_neighbourhood(address)
            # The first three "card value" spans are read as area, rooms, bathrooms.
            property_card_value = appartment.find_all('span', class_='js-property-card-value')
            area = int(property_card_value[0].text)
            rooms = int(property_card_value[1].text)
            bathrooms = int(property_card_value[2].text)
            # The rent is taken from the first <p> element of the card.
            rent = make_int(appartment.find('p').text)
        except (AttributeError, IndexError, ValueError):
            # A missing element (find() returned None), fewer than three value
            # spans, or non-numeric text: skip this card instead of aborting
            # the whole scrape.
            skipped_listings += 1
            continue
        # Append only after every field parsed, so all columns keep the same length.
        data['neighbourhood'].append(neighbourhood)
        data['area'].append(area)
        data['rooms'].append(rooms)
        data['bathrooms'].append(bathrooms)
        data['rent'].append(rent)

if skipped_listings:
    print(skipped_listings, 'listing(s) skipped: a field was missing or not numeric')

# Creating and saving the dataframe
df = pd.DataFrame(data)
df.to_csv('rent_sao_carlos.csv')
df