In [1]:
# Reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Load the jupyter_black extension (Black formatting of code cells).
%load_ext jupyter_black

## Web Scraping

In [2]:
import requests as rq
import json
import pandas as pd
import numpy as np
import time

In [3]:
# Browser-like User-Agent sent with every request; the literal is split across
# lines only for readability (adjacent string literals are concatenated).
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/35.0.1916.47 Safari/537.36"
)
HEADERS = dict([("User-Agent", USER_AGENT)])

In [59]:
# URL pieces: S_URL + <city slug> + E_URL + <page number> is the address of one
# results page.
S_URL = "https://www.zapimoveis.com.br/venda/imoveis/ce+"
E_URL = "/?pagina="
# City slugs inserted into the URL right after the "ce+" prefix.
# NOTE(review): "sao_gonçalo_do_amarante" and "irauçuba" keep the cedilla while
# other entries appear to be written without accents (e.g. "missao_velha",
# "maracanau") — confirm which spelling the site expects.
CITYS = [
    "fortaleza",
    "itaitinga",
    "juazeiro_do_norte",
    "caucaia",
    "crato",
    "eusebio",
    "horizonte",
    "aquiraz",
    "pacatuba",
    "paracuru",
    "maracanau",
    "guaiuba",
    "jijoca_de_jericoacoara",
    "sao_gonçalo_do_amarante",
    "mulungu",
    "cascavel",
    "beberibe",
    "maranguape",
    "paraipaba",
    "cruz",
    "barbalha",
    "quixeramobim",
    "fortim",
    "missao_velha",
    "guaramiranga",
    "umirim",
    "itapipoca",
    "trairi",
    "camocim",
    "aracati",
    "itarema",
    "pentecoste",
    "sobral",
    "milagres",
    "amontada",
    "pindoretama",
    "pacoti",
    "irauçuba",
    "tabuleiro_do_norte",
    "pacajus",
    "acarau",
]
pages = 101  # Maximum available on the site

In [60]:
def _first_as_int(values, default):
    """Return ``int(values[0])``, or ``default`` when ``values`` is missing/empty."""
    return int(values[0]) if values else default


def _parse_yearly_iptu(raw):
    """Convert a yearly IPTU value such as ``"R$ 1.234,56"`` to a float.

    "." is treated as a thousands separator and "," as the decimal separator.
    Missing, empty or unparseable values become ``np.nan`` so that one odd
    value does not invalidate the whole page of listings.
    """
    if not raw:
        return np.nan
    if isinstance(raw, (int, float)):
        return float(raw)
    text = str(raw).replace("R$", "").strip().replace(".", "").replace(",", ".")
    try:
        return float(text)
    except ValueError:
        return np.nan


def process_json(json_dict):
    """Normalise the numeric fields of every listing, in place.

    For each entry of ``json_dict["results"]["listings"]`` the fields below,
    read from ``entry["listing"]``, are replaced by the integer value of their
    first element, or by a default when the field is empty or absent:

    * ``parkingSpaces``, ``suites`` -> ``0``
    * ``bathrooms``, ``bedrooms``, ``usableAreas``, ``totalAreas`` -> ``np.nan``

    ``pricingInfo["yearlyIptu"]`` is converted to a float (``np.nan`` when it
    is empty, absent or cannot be parsed).

    Args:
        json_dict: Parsed payload containing ``results -> listings``.

    Returns:
        The same ``json_dict`` object, modified in place.

    Raises:
        KeyError: If ``results``, ``listings`` or an entry's ``listing`` key
            is missing.
    """
    # Field name -> value used when the field is empty or missing.
    defaults = {
        "parkingSpaces": 0,
        "suites": 0,
        "bathrooms": np.nan,
        "bedrooms": np.nan,
        "usableAreas": np.nan,
        "totalAreas": np.nan,
    }

    listings = json_dict["results"]["listings"]
    for entry in listings:
        listing = entry["listing"]
        for field, default in defaults.items():
            # .get() so that a listing lacking one field is still kept.
            listing[field] = _first_as_int(listing.get(field), default)

        pricing = listing.get("pricingInfo") or {}
        pricing["yearlyIptu"] = _parse_yearly_iptu(pricing.get("yearlyIptu"))
        listing["pricingInfo"] = pricing

    json_dict["results"]["listings"] = listings
    return json_dict

In [61]:
# Preview the address built for page 1 of a single city.
f"{S_URL}caucaia{E_URL}1"

'https://www.zapimoveis.com.br/venda/imoveis/ce+caucaia/?pagina=1'

In [62]:
from tqdm.auto import tqdm


def get_house_info(html: str):
    """Extract the listings embedded in a results page as a flat DataFrame.

    The page text between the ``"results":{"listings"`` marker and the
    following ``;(function(){var s`` marker is parsed as JSON (after
    prepending the opening ``{``), passed through ``process_json`` and
    flattened with ``pd.json_normalize``.

    Args:
        html: Full text of one results page.

    Returns:
        DataFrame with one row per listing and nested keys flattened into
        dotted column names.

    Raises:
        ValueError: If either marker is absent, or the extracted text is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    start_marker = '"results":{"listings"'
    end_marker = ";(function(){var s"

    json_start = html.find(start_marker)
    if json_start == -1:
        # str.find returns -1; slicing with it would silently grab garbage.
        raise ValueError("listings start marker not found in page HTML")

    # Search only after the payload start so an earlier occurrence of the end
    # marker cannot produce an empty slice.
    json_end = html.find(end_marker, json_start)
    if json_end == -1:
        raise ValueError("listings end marker not found in page HTML")

    json_str = "{" + html[json_start:json_end]
    json_dict = process_json(json.loads(json_str))
    return pd.json_normalize(json_dict["results"]["listings"])


def get_page_info(
    s_url: str,
    e_url: str,
    pages: int,
    columns: list,
    cities: list = None,
    timeout: float = 30,
    error_wait: float = 1000,
):
    """Download and parse the result pages of every city into one DataFrame.

    For each city and each page number in ``1..pages`` the URL
    ``s_url + city + e_url + str(page)`` is requested with ``HEADERS``.
    Pages answering with status 200 are parsed by ``get_house_info``; pages
    with another status, or whose content cannot be parsed, are skipped and
    counted in a single summary line printed at the end.

    Args:
        s_url: URL prefix placed before the city slug.
        e_url: URL part placed between the city slug and the page number.
        pages: Number of pages requested per city.
        columns: Columns to keep in the result; an empty list keeps all.
        cities: City slugs to scrape. Defaults to the module-level ``CITYS``.
        timeout: Seconds to wait for each HTTP response.
        error_wait: Seconds to pause after a failed request before moving on
            to the next page.

    Returns:
        DataFrame with the listings of all parsed pages, or an empty
        DataFrame (with ``columns``, if given) when no page could be parsed.

    Raises:
        KeyError: If ``columns`` names a column absent from the scraped data.
    """
    if cities is None:
        cities = CITYS

    result = []
    bad_status = 0
    unparsed = 0

    for city in tqdm(cities):
        for page in range(1, pages + 1):
            url = f"{s_url}{city}{e_url}{page}"
            try:
                response = rq.get(url, headers=HEADERS, timeout=timeout)
            except Exception as e:
                print(e)
                time.sleep(error_wait)
                # No response for this page: skip it rather than falling
                # through to an unbound or stale ``response``.
                continue

            if response.status_code != 200:
                bad_status += 1
                continue

            try:
                result.append(get_house_info(response.text))
            except Exception:
                # Best effort: one unparseable page must not abort the run,
                # but it is counted, and KeyboardInterrupt is not swallowed.
                unparsed += 1

    if bad_status or unparsed:
        print(
            f"Skipped pages: {bad_status} with non-200 status, "
            f"{unparsed} that could not be parsed"
        )

    if not result:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=columns or None)

    data = pd.concat(result, ignore_index=True)
    if columns:
        data = data[columns]

    return data

#### Extract Data

In [63]:
# Scrape every configured city; the empty column list keeps all columns.
raw_data = get_page_info(s_url=S_URL, e_url=E_URL, pages=pages, columns=[])

100%|██████████| 41/41 [28:59<00:00, 42.44s/it]  


In [64]:
# Number of rows collected by the extraction step.
len(raw_data)

11724

In [65]:
# Lower-cased distinct city names found in the scraped listings.
scraped_city_names = raw_data["link.data.city"].unique()
citys = [name.lower() for name in scraped_city_names]

In [66]:
# Number of distinct cities present in the scraped data.
len(citys)

5

In [67]:
# Requested slugs with no matching city name in the scraped data.
# NOTE(review): CITYS holds URL slugs (underscores); if the scraped names use
# spaces or accents, multi-word cities will look missing here — confirm.
set(CITYS).difference(citys)

{'acarau',
 'amontada',
 'aquiraz',
 'aracati',
 'barbalha',
 'beberibe',
 'cascavel',
 'caucaia',
 'crato',
 'cruz',
 'eusebio',
 'fortim',
 'guaiuba',
 'guaramiranga',
 'horizonte',
 'irauçuba',
 'itapipoca',
 'itarema',
 'jijoca_de_jericoacoara',
 'juazeiro_do_norte',
 'maracanau',
 'maranguape',
 'milagres',
 'missao_velha',
 'pacajus',
 'pacatuba',
 'pacoti',
 'paracuru',
 'paraipaba',
 'pentecoste',
 'pindoretama',
 'quixeramobim',
 'sao_gonçalo_do_amarante',
 'sobral',
 'tabuleiro_do_norte',
 'umirim'}

#### Save Data

In [10]:
# Persist the scraped rows; the row count is embedded in the file name.
raw_data.to_csv(
    path_or_buf=f"../data/raw_data/ce_raw_data_{len(raw_data)}.csv",
    index=False,
)