# Web Scraping on a Brazilian real estate website

In [1]:
from urllib.error import URLError, HTTPError #Error treatment
from urllib.parse import urljoin #Resolving relative listing links
from urllib.request import Request, urlopen #Opening and page request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 

In [2]:
def website_acess(url):
    """Download ``url`` with a browser-like user agent and return the raw body.

    Parameters
    ----------
    url : str
        Address to request.

    Returns
    -------
    bytes or None
        The undecoded response body, or ``None`` when the request failed.
        Failures are reported with ``print`` rather than raised.
    """
    header= {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

    try:
        req = Request(url, headers=header)
        # The context manager closes the response even if read() fails.
        with urlopen(req) as response:
            return response.read()
    except HTTPError as error:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        print(error.code, error.reason)
    except URLError as error:
        print(error.reason)
    return None

In [3]:
def get_code(soup):
    """Return the listing code found in the page, or NaN when it cannot be read.

    The code is the second whitespace-separated token of the
    ``span.title__code.js-external-id`` element's text.

    Parameters
    ----------
    soup : parsed listing page exposing ``find``.

    Returns
    -------
    str or float
        The code string, or ``np.nan`` if the element or token is missing.
    """
    try:
        label = soup.find('span', class_='title__code js-external-id').get_text()
        return label.strip().split()[1]
    except Exception:
        # Best-effort parsing: any parsing failure becomes NaN, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return np.nan

In [4]:
def get_area(features):
    """Return the integer parsed from the first feature item, or NaN.

    The first whitespace-separated token of ``features[0]`` is used, with
    any ``'m²'`` suffix removed before conversion.

    Parameters
    ----------
    features : sequence of elements exposing ``get_text``; index 0 is taken
        to be the area item (positional assumption).

    Returns
    -------
    int or float
        The parsed number, or ``np.nan`` when the item is missing or not numeric.
    """
    try:
        first_token = features[0].get_text().split()[0]
        return int(first_token.replace('m²', ''))
    except Exception:
        # Best-effort parsing; unlike a bare except this lets
        # KeyboardInterrupt/SystemExit propagate.
        return np.nan

In [5]:
def get_bedrooms(features):
    """Return the leading integer of the second feature item, or NaN.

    Parameters
    ----------
    features : sequence of elements exposing ``get_text``; index 1 is taken
        to be the bedrooms item (positional assumption).

    Returns
    -------
    int or float
        The parsed count, or ``np.nan`` when the item is missing or not numeric.
    """
    try:
        leading_token = features[1].get_text().split()[0]
        return int(leading_token)
    except Exception:
        # Best-effort parsing; KeyboardInterrupt/SystemExit still propagate.
        return np.nan

In [6]:
def get_bathrooms(features):
    """Return the leading integer of the third feature item, or NaN.

    Parameters
    ----------
    features : sequence of elements exposing ``get_text``; index 2 is taken
        to be the bathrooms item (positional assumption).

    Returns
    -------
    int or float
        The parsed count, or ``np.nan`` when the item is missing or not numeric.
    """
    try:
        leading_token = features[2].get_text().split()[0]
        return int(leading_token)
    except Exception:
        # Best-effort parsing; KeyboardInterrupt/SystemExit still propagate.
        return np.nan

In [7]:
def get_parking(features):
    """Return the leading integer of the fourth feature item, or NaN.

    Parameters
    ----------
    features : sequence of elements exposing ``get_text``; index 3 is taken
        to be the parking item (positional assumption).

    Returns
    -------
    int or float
        The parsed count, or ``np.nan`` when the item is missing or not numeric.
    """
    try:
        leading_token = features[3].get_text().split()[0]
        return int(leading_token)
    except Exception:
        # Best-effort parsing; KeyboardInterrupt/SystemExit still propagate.
        return np.nan

In [8]:
def get_value(soup):
    """Return the sale price shown on the page as an integer, or NaN.

    The text of the ``h3.price__price-info.js-price-sale`` element is
    stripped, the ``'R$ '`` prefix and every ``'.'`` are removed, and the
    remainder is converted with ``int``.

    Parameters
    ----------
    soup : parsed listing page exposing ``find``.

    Returns
    -------
    int or float
        The parsed price, or ``np.nan`` when the element is missing or its
        text is not a plain number.
    """
    try:
        price_text = soup.find('h3', class_='price__price-info js-price-sale').get_text()
        digits = price_text.strip().replace('R$ ', '').replace('.', '')
        return int(digits)
    except Exception:
        # Best-effort parsing; KeyboardInterrupt/SystemExit still propagate.
        return np.nan

In [9]:
def get_condo(soup):
    """Return the first ``price__list-value`` amount as an integer, or NaN.

    Parameters
    ----------
    soup : parsed listing page exposing ``find_all``.

    Returns
    -------
    int or float
        The parsed amount, or ``np.nan`` when the page has no such element
        or its text is not a plain number.
    """
    # Search the page once and reuse the result.
    values = soup.find_all('span', class_='price__list-value')
    if not values:
        return np.nan

    # NOTE(review): assumes the condo fee is always the first value in the
    # list; confirm for listings that show only one of the two fees.
    try:
        raw = values[0].get_text()
        return int(raw.strip().replace('R$ ', '').replace('.', ''))
    except Exception:
        # Best-effort parsing; KeyboardInterrupt/SystemExit still propagate.
        return np.nan

In [10]:
def get_IPTU_tax(soup):
    """Return the second ``price__list-value`` amount as an integer, or NaN.

    Parameters
    ----------
    soup : parsed listing page exposing ``find_all``.

    Returns
    -------
    int or float
        The parsed amount, or ``np.nan`` when fewer than two such elements
        exist or the text is not a plain number.
    """
    # Search the page once and reuse the result.
    values = soup.find_all('span', class_='price__list-value')
    if not values:
        return np.nan

    # NOTE(review): assumes the IPTU tax is always the second value in the
    # list; confirm for listings that show only one of the two fees.
    try:
        raw = values[1].get_text()
        return int(raw.strip().replace('R$ ', '').replace('.', ''))
    except Exception:
        # Covers a missing second value (IndexError) and non-numeric text;
        # KeyboardInterrupt/SystemExit still propagate.
        return np.nan

In [11]:
def get_amenities(soup):
    """Return the amenity names listed on the page, or NaN when there are none.

    The text of the ``ul.amenities__list`` element is split on double
    spaces and the first and last pieces are dropped.

    Parameters
    ----------
    soup : parsed listing page exposing ``find``.

    Returns
    -------
    list of str or float
        The amenity strings, or ``np.nan`` when the list element is missing
        or cannot be read.
    """
    amenities_list = soup.find('ul', class_='amenities__list')
    # find() returns None (never an empty list) when nothing matches.
    if amenities_list is None:
        return np.nan

    try:
        pieces = amenities_list.get_text().split('  ')
        return pieces[1:-1]
    except Exception:
        # Best-effort parsing; KeyboardInterrupt/SystemExit still propagate.
        return np.nan

In [13]:
def data_recovery(soup):
    """Collect every scraped field of one listing page into a dict.

    Parameters
    ----------
    soup : parsed listing page; it is searched with ``findAll`` and handed
        to the ``get_*`` helpers.

    Returns
    -------
    dict
        Keys, in insertion order: 'COD.:', 'Area', 'Beadrooms', 'Bathrooms',
        'Parking', 'Value', 'Condo', 'IPTU_tax', 'Amenities'. Each value is
        whatever the matching helper returns.
    """
    # Listing code
    info = {'COD.:': get_code(soup)}

    # Feature items; the helpers below read them by position.
    features = soup.findAll('li',class_='features__item')

    info.update({
        'Area': get_area(features),
        # NOTE(review): 'Beadrooms' is misspelled but kept unchanged here,
        # since renaming the key would change the resulting record schema.
        'Beadrooms': get_bedrooms(features),
        'Bathrooms': get_bathrooms(features),
        'Parking': get_parking(features),
        # Sale price of the property
        'Value': get_value(soup),
        # Condominium fee of the property
        'Condo': get_condo(soup),
        # IPTU tax of the property
        'IPTU_tax': get_IPTU_tax(soup),
        # Amenities
        'Amenities': get_amenities(soup),
    })

    return info

In [14]:
def get_inners_urls(soup):
    """Return the absolute URL of every listing linked from a results page.

    Parameters
    ----------
    soup : parsed search-results page exposing ``find_all``.

    Returns
    -------
    list of str
        One URL per ``div.property-card__main-info`` card that contains a
        main link with an ``href``; cards without one are skipped.
    """
    base_url = 'https://www.vivareal.com.br/'
    links = []
    for card in soup.find_all('div', class_='property-card__main-info'):
        anchor = card.find('a', class_='property-card__main-link js-carousel-link')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # A card without a usable link must not abort the whole page.
            continue
        # urljoin avoids the '//' produced by plain concatenation when the
        # href starts with '/', and leaves absolute hrefs untouched.
        links.append(urljoin(base_url, href))
    return links

In [15]:
def soup_create(html):
    """Decode raw page bytes as UTF-8 and parse them with ``html.parser``.

    Parameters
    ----------
    html : bytes
        Undecoded page body.

    Returns
    -------
    BeautifulSoup
        The parsed document.
    """
    return BeautifulSoup(html.decode('UTF-8'), 'html.parser')

In [16]:
def create_dataset(nPages=200):
    """Scrape the search-result pages and return one record per listing.

    Parameters
    ----------
    nPages : int, default 200
        Number of result pages to request.

    Returns
    -------
    list of dict
        One ``data_recovery`` record per listing page that was downloaded.
        Pages or listings whose download failed are skipped.
    """
    search_url = 'https://www.vivareal.com.br/venda/minas-gerais/belo-horizonte/?pagina='
    listings = []
    # NOTE(review): page numbers run from 0 to nPages - 1; confirm the site
    # does not serve the same results for pagina=0 and pagina=1.
    for page in range(nPages):
        html = website_acess(search_url + str(page))
        if html is None:
            # website_acess returns None after reporting a failed request;
            # skip the page instead of crashing and losing collected data.
            continue

        for link in get_inners_urls(soup_create(html)):
            inner_html = website_acess(link)
            if inner_html is None:
                continue
            listings.append(data_recovery(soup_create(inner_html)))
    return listings

In [17]:
# Number of search-result pages to scrape; every listing found becomes one row.
page_numbers = 10
dataset = pd.DataFrame(data=create_dataset(nPages=page_numbers))

In [18]:
# Preview the scraped listings (the display is truncated for long frames).
dataset

Unnamed: 0,COD.:,Area,Beadrooms,Bathrooms,Parking,Value,Condo,IPTU_tax,Amenities
0,VULDTO6C,300,4,3,4.0,1180000.0,2740.0,,"[Lavanderia, Churrasqueira, Piscina, Aceita..."
1,34331,57,2,1,1.0,,,,
2,2VUWZTQN,75,3,2,2.0,385000.0,350.0,2000.0,"[Aceita animais, Janela de alumínio, Área de..."
3,UHJSZZTJ,80,3,3,1.0,250000.0,,,[Aceita animais]
4,WPCHAKX6,170,4,3,3.0,700000.0,900.0,375.0,
...,...,...,...,...,...,...,...,...,...
355,XV7LXY5X,550,7,8,4.0,1995000.0,5900.0,,"[Aceita animais, Closet, Academia, Churrasq..."
356,KHHFG6SA,60,3,2,1.0,189900.0,320.0,726.0,"[Condomínio fechado, Portão eletrônico, Port..."
357,QW4QFCJE,250,4,4,2.0,1650000.0,1500.0,5200.0,"[Aceita animais, Closet, Acesso para deficie..."
358,1203,100,3,3,2.0,475000.0,400.0,156.0,


In [19]:
# Column dtypes and non-null counts, to see which fields are often missing.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   COD.:      360 non-null    object 
 1   Area       360 non-null    int64  
 2   Beadrooms  360 non-null    int64  
 3   Bathrooms  360 non-null    int64  
 4   Parking    350 non-null    float64
 5   Value      350 non-null    float64
 6   Condo      300 non-null    float64
 7   IPTU_tax   230 non-null    float64
 8   Amenities  280 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 25.4+ KB


In [21]:
# Save the scraped table as a semicolon-separated file without the row index.
dataset.to_csv(path_or_buf='VivaReal_BD.csv', index=False, sep=';')