In [2]:
# Load the autoreload extension and set it to mode 2, which re-imports
# modules before code is executed so edits to local modules are picked up.
%load_ext autoreload
%autoreload 2
# Load jupyter_black, which formats code cells with Black.
%load_ext jupyter_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Web Scraping

In [3]:
import json
import warnings

import numpy as np
import pandas as pd
import requests as rq

In [4]:
# Browser-like User-Agent string, wrapped in the header mapping that is
# passed to the HTTP requests made later in the notebook.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/35.0.1916.47 Safari/537.36"
)
HEADERS = {"User-Agent": USER_AGENT}

In [5]:
# Base search URL; a page number is appended to the trailing "pagina=" query.
URL = "https://www.zapimoveis.com.br/venda/?pagina="
# Page numbers 1 through 10, kept as strings.
pages = list(map(str, range(1, 11)))

In [7]:
# Listing fields whose raw value is a list, paired with the value stored
# when that list is empty (or the key is absent).
_LIST_FIELD_DEFAULTS = (
    ("parkingSpaces", 0),
    ("suites", np.nan),
    ("bathrooms", 0),
    ("bedrooms", np.nan),
    ("usableAreas", np.nan),
    ("totalAreas", np.nan),
)


def _first_as_int(values, default):
    """Return ``int(values[0])``, or ``default`` when ``values`` is empty or None."""
    return int(values[0]) if values else default


def _parse_iptu(raw_iptu):
    """Convert a yearly IPTU value such as ``"R$ 1.234,56"`` to a float.

    Empty or missing values become NaN. Numeric inputs are converted directly.
    """
    if not raw_iptu:
        return np.nan
    if isinstance(raw_iptu, (int, float)):
        return float(raw_iptu)
    # "." is dropped before conversion, as the original parsing did. "," is
    # then read as the decimal separator (assumes pt-BR formatting - TODO
    # confirm against real payloads); without this "1.234,56" raised ValueError.
    cleaned = raw_iptu.replace("R$", "").replace(".", "").replace(",", ".")
    return float(cleaned)


def process_json(json_dict):
    """Normalize the numeric fields of every listing in a search payload.

    For each entry of ``json_dict["results"]["listings"]`` the list-valued
    fields (parking spaces, suites, bathrooms, bedrooms, usable/total areas)
    are replaced by the integer value of their first element, falling back to
    ``0`` or ``NaN`` when empty, and ``pricingInfo.yearlyIptu`` is converted
    from a currency string to a float (``NaN`` when empty).

    Keys that are absent from a listing are treated like empty values and are
    filled with the same defaults instead of raising ``KeyError``.

    Parameters
    ----------
    json_dict : dict
        Parsed payload containing ``results -> listings``; each item must
        have a ``"listing"`` mapping.

    Returns
    -------
    dict
        The same ``json_dict`` object, modified in place.
    """
    for entry in json_dict["results"]["listings"]:
        listing = entry["listing"]
        for field, default in _LIST_FIELD_DEFAULTS:
            listing[field] = _first_as_int(listing.get(field), default)

        pricing = listing.get("pricingInfo") or {}
        pricing["yearlyIptu"] = _parse_iptu(pricing.get("yearlyIptu"))
        listing["pricingInfo"] = pricing

    return json_dict

In [8]:
def get_house_info(html: str) -> pd.DataFrame:
    """Extract the listings embedded in a results page as a flat DataFrame.

    The page HTML carries a JSON fragment that starts at
    ``"results":{"listings"`` and runs up to the ``;(function(){var s`` script
    that follows it. The fragment is wrapped with a leading ``{``, parsed,
    cleaned with ``process_json`` and flattened with ``pd.json_normalize``.

    Parameters
    ----------
    html : str
        Full HTML text of one results page.

    Returns
    -------
    pandas.DataFrame
        One row per listing, with nested keys flattened into dotted columns.

    Raises
    ------
    ValueError
        If either marker cannot be found in ``html`` (previously this
        surfaced as an opaque JSON decode error), or if the extracted
        fragment is not valid JSON (``json.JSONDecodeError``).
    """
    start_marker = '"results":{"listings"'
    end_marker = ";(function(){var s"

    json_start = html.find(start_marker)
    if json_start == -1:
        raise ValueError(
            "Could not find the embedded listings JSON in the page HTML."
        )

    # Search for the end marker only after the start: an earlier occurrence
    # of the same script prefix would otherwise produce an empty slice.
    json_end = html.find(end_marker, json_start)
    if json_end == -1:
        raise ValueError(
            "Could not find the end of the embedded listings JSON in the page HTML."
        )

    # The fragment starts inside an object, so restore its opening brace.
    json_dict = json.loads("{" + html[json_start:json_end])
    json_dict = process_json(json_dict)
    return pd.json_normalize(json_dict["results"]["listings"])


def get_page_info(url: str, pages: list, columns: list, timeout: float = 30.0):
    """Download several result pages and combine their listings.

    Parameters
    ----------
    url : str
        Base URL; each page identifier is appended to it.
    pages : list
        Page identifiers (strings or integers) to fetch.
    columns : list
        Columns to keep in the result. An empty list keeps every column.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Listings of all successfully fetched pages, re-indexed from 0.

    Raises
    ------
    ValueError
        If no page could be fetched (including an empty ``pages``).

    Notes
    -----
    Pages answering with a status other than 200 are skipped and reported
    through a warning instead of being dropped silently.
    """
    frames = []
    skipped = []
    for page in pages:
        # Build the address from the `url` argument (not the global URL).
        response = rq.get(f"{url}{page}", headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            skipped.append((page, response.status_code))
            continue
        frames.append(get_house_info(response.text))

    if skipped:
        warnings.warn(f"Skipped pages with non-200 responses: {skipped}")
    if not frames:
        raise ValueError(
            "No listings could be downloaded: `pages` is empty or every request failed."
        )

    data = pd.concat(frames, ignore_index=True)
    if columns:
        data = data[columns]

    return data

#### Extract Data

In [9]:
# Fetch every configured page; the empty column list keeps all columns.
raw_data = get_page_info(url=URL, pages=pages, columns=[])

#### Save Data

In [10]:
# The row count is embedded in the file name.
raw_data_path = f"../data/raw_data_{len(raw_data)}.csv"
raw_data.to_csv(raw_data_path, index=False)

In [11]:
# Preview the first five rows of the scraped data.
raw_data.head(n=5)

Unnamed: 0,type,link.data.zone,link.data.neighborhood,link.data.street,link.data.streetNumber,link.data.state,link.data.city,link.name,link.href,link.rel,...,listing.link,listing.showRentalDatazapStamp,listing.listingsCount,listing.address.point.lon,listing.address.point.source,listing.address.point.lat,account.minisite.logoUrl,account.minisite.coverUrl,account.minisite.coverUrlMobile,account.minisite.description
0,superPremium,Bairros,Ponta da Praia,,,São Paulo,Santos,"Apartamento com 2 Quartos à venda, 82m²",/imovel/venda-apartamento-2-quartos-com-piscin...,,...,/imovel/venda-apartamento-2-quartos-com-piscin...,False,0,,,,,,,
1,superPremium,Zona Sul,Ipanema,Avenida Vieira Souto,,Rio de Janeiro,Rio de Janeiro,"Apartamento com 2 Quartos à venda, 100m²",/imovel/venda-apartamento-2-quartos-mobiliado-...,,...,/imovel/venda-apartamento-2-quartos-mobiliado-...,False,0,,,,,,,
2,superPremium,Zona Sul,Botafogo,Rua Marechal Ramon Castilla,,Rio de Janeiro,Rio de Janeiro,"Apartamento com 3 Quartos à venda, 100m²",/imovel/venda-apartamento-3-quartos-com-piscin...,,...,/imovel/venda-apartamento-3-quartos-com-piscin...,False,0,,,,,,,
3,,Bairros,Residencial Menezes,,,São Paulo,Bady Bassitt,"Terreno / Lote / Condomínio à venda, 200m²",/imovel/venda-terreno-lote-condominio-residenc...,,...,/imovel/venda-terreno-lote-condominio-residenc...,False,0,,,,,,,
4,,Bairros,Centro,Avenida Getúlio Vargas,1.0,Bahia,Feira de Santana,"Casa com 3 Quartos à venda, 178m²",/imovel/venda-casa-3-quartos-centro-feira-de-s...,,...,/imovel/venda-casa-3-quartos-centro-feira-de-s...,False,0,-38.96384,GOOGLE,-12.255758,,,,
