In [1]:
import os
import requests
import requests
import ast

import pandas as pd
import glob
import plotly.express as px

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from time import sleep

from locations import *
from parsing import *

# VARIABLES GLOBALES

In [2]:
# Province slug: interpolated into the listing URL, stored in the 'province'
# column of the dataset and used as the name of the exported CSV file.
PROVINCE = 'malaga'

# CONSTANTES GLOBALES

In [3]:
# Site root. The trailing slash is kept on purpose: other cells either append
# path segments to it or strip it with base_url[:-1] before joining endpoints.
base_url = 'https://www.pisos.com/'

# 'venta/' section of the site, built on top of the root.
venta_url = base_url + 'venta/'

# 'viviendas/' listing page of the configured province.
URL = base_url + f'viviendas/{PROVINCE}/'

# SINGLE PAGE SCRAPING

In [4]:
def scrape(urls):
    """Open every URL in Chrome, scroll to the bottom and save the rendered HTML.

    Each page is written to '../html_content/<timestamp>_<soft_url>.html', where
    <soft_url> is the URL without the 'https://www.pisos.com/comprar/' prefix and
    with '/' and '-' replaced by '_'.

    Parameters
    ----------
    urls : iterable of str
        Absolute URLs to visit, in order. The cookie banner is accepted only on
        the first one.

    Returns
    -------
    None

    Notes
    -----
    The browser is always closed, even when a page load, the cookie wait or a
    script call raises; the exception itself still propagates to the caller.
    """
    # Imported locally because the notebook's import cell does not import it
    # explicitly (it would otherwise have to leak in through a star import).
    import datetime

    output_dir = '../html_content'
    # open() does not create missing folders, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    browser = webdriver.Chrome()
    try:
        for idx, url in enumerate(urls):
            browser.get(url)

            if idx == 0:
                # The consent banner is only dismissed once, on the first page.
                element = WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="didomi-notice-agree-button"]'))
                )
                element.click()  # Accept cookies

            # Keep smooth-scrolling to the bottom until the viewport reports it
            # has reached the end of the document (checked before each scroll).
            while True:
                is_at_bottom = browser.execute_script("return window.scrollY + window.innerHeight >= document.body.scrollHeight")
                browser.execute_script("window.scroll({ top: document.body.scrollHeight, behavior: 'smooth' });")
                if is_at_bottom:
                    break
                sleep(0.2)

            html_content = browser.page_source

            # Epoch timestamp with the decimal point removed, used as a file prefix.
            timestamp = str(datetime.datetime.now().timestamp()).replace('.', '')

            soft_url = url.replace('https://www.pisos.com/comprar/', '')
            soft_url = soft_url.replace('/', '_').replace('-', '_')

            file_path = f'{output_dir}/{timestamp}_{soft_url}.html'
            try:
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(html_content)
            except Exception as e:
                # Best effort: if the page could not be written, leave the error
                # representation in the file instead so the failure is traceable.
                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(repr(e))
    finally:
        # Always release the Chrome process, including on errors.
        browser.quit()

# TESTING

In [5]:
def scanRegions(url, max_results=3000):
    """Collect the region endpoints listed on a page, with their result counts.

    Every 'div.zoneList a.item:not(.item-subitem)' link on the page is read.
    Its 'span.total' text is expected to look like '(1.234)': the first and
    last characters are dropped and dots are removed before casting to int.

    Parameters
    ----------
    url : str
        Absolute URL of the page to scan.
    max_results : int, optional
        Regions with more results than this are scanned recursively instead of
        being stored as a leaf. Defaults to 3000, the value previously
        hard-coded (presumably a site listing cap -- TODO confirm).

    Returns
    -------
    dict
        Maps each endpoint (the link's href) to either its integer result count
        or, for regions above ``max_results``, a nested dict of the same shape.
        A count that is missing, empty or not parseable is stored as 0 (after
        printing a notice) so the endpoint is kept instead of raising.
    """
    response = requests.get(url)

    print(f'URL: {url} | STATUS {response.status_code}')

    soup = BeautifulSoup(response.text, 'html.parser')

    items = soup.select('div.zoneList a.item:not(.item-subitem)')

    endpoints = {}
    for item in items:
        endpoint = item['href']

        total = item.find('span', class_='total')
        raw_count = total.text if total is not None else ''

        # Default to 0 so a failed parse never leaves a string behind: comparing
        # a str with the integer threshold below would raise TypeError.
        n_results = 0
        if len(raw_count) != 0:
            try:
                # Strip the surrounding characters and the thousands separators.
                n_results = int(raw_count[1:-1].replace('.', ''))
            except ValueError:
                print(f'FAIL CASTING TO INTEGER {endpoint}')
        else:
            print(f'n_results EMPTY {endpoint}')

        if n_results > max_results:
            # Too many results for a single listing: descend into its sub-regions.
            endpoints[endpoint] = scanRegions(base_url[:-1] + endpoint, max_results)
        else:
            endpoints[endpoint] = n_results

    return endpoints

In [6]:
def parseRegions(endpoints):
    """Flatten a nested endpoint mapping into a list of listing endpoints.

    Parameters
    ----------
    endpoints : dict
        Maps an endpoint to an int (leaf) or to another mapping of the same
        shape (branch).

    Returns
    -------
    list of str
        The leaf endpoints in depth-first order, following dict order, with
        every '/venta/pisos-' occurrence rewritten to '/viviendas/'. Branch
        keys themselves are not included.
    """

    def walk(tree):
        # Integer values mark leaves; anything else is treated as a sub-mapping.
        for path, payload in tree.items():
            if isinstance(payload, int):
                yield path
            else:
                yield from walk(payload)

    # str.replace leaves paths without the marker untouched.
    return [
        path.replace('/venta/pisos-', '/viviendas/')
        for path in walk(endpoints)
    ]

In [7]:
# urls = scanRegions(URL)
# urls = parseRegions(urls)
# urls = [base_url + x for x in urls]
# urls = [x.replace('//viviendas/', '/venta/pisos-') for x in urls]
# print(urls)

In [8]:
def scrapeUrls(endpoint, page_size=30):
    """Collect the ad links of every result page of a listing endpoint.

    The total number of results is read from the last <span> inside
    'div.grid__title' (all numeric characters of its text are joined), then
    pages '<endpoint>1' .. '<endpoint>N' are fetched and the href of every
    'a.ad-preview__title' is gathered.

    Parameters
    ----------
    endpoint : str
        Absolute listing URL; the page number is appended to it directly.
    page_size : int, optional
        Number of ads assumed per page. Defaults to 30, the value previously
        hard-coded.

    Returns
    -------
    list of str
        The hrefs in page order (duplicates are not removed). Empty when the
        result count cannot be found on the first page.

    Raises
    ------
    ValueError
        If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError('page_size must be a positive integer')

    response = requests.get(endpoint)

    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.find('div', class_='grid__title')
    spans = title.find_all('span') if title is not None else []
    digits = ''.join(ch for ch in spans[-1].text if ch.isnumeric()) if spans else ''

    if not digits:
        # Without a count there is nothing to paginate; report it instead of
        # failing with an AttributeError/ValueError deep inside the parsing.
        print(f'NO RESULT COUNT FOUND {endpoint}')
        return []

    n_results = int(digits)
    # Ceiling division: the former (n // 30) + 1 requested one page too many
    # whenever the count was an exact multiple of the page size.
    n_pages = (n_results + page_size - 1) // page_size

    urls = []
    for page in range(1, n_pages + 1):
        response = requests.get(f'{endpoint}{page}')

        soup = BeautifulSoup(response.text, 'html.parser')

        ads = soup.find_all('a', class_='ad-preview__title')
        urls.extend(ad['href'] for ad in ads)
    return urls

In [9]:
# for url in urls:
#     urls_ = scrapeUrls(url)
#     urls_ = list(set(urls_))
#     urls_ = [base_url[:-1] + x for x in urls_]
#     with open('urls.csv', 'a+') as file:
#         file.writelines([x + ',\n' for x in urls_])

In [10]:
# Load the ad URLs stored in urls.csv; the (commented-out) writer cell
# terminates every record with ',\n'.
with open('urls.csv') as file:
    urls = file.read()

# Splitting on the terminator leaves an empty string after the last record
# (and for any blank line). Drop those so only real URLs remain; order and
# duplicates are kept, so positional slices such as urls[n:] stay meaningful.
urls = [url for url in urls.split(',\n') if url.strip()]

In [11]:
# scrape(urls[24686:])

In [12]:
# Build the dataset from the HTML files stored in the 'html_content' folder.

# One list per output column; the key order defines the DataFrame column order.
data = {
    column: []
    for column in (
        'price',
        'title',
        'province',
        'location',
        'lat',
        'lng',
        'characteristics',
        'agency',
        'updated',
        'numeric_data',
    )
}

# glob returns matches in arbitrary order; sorting makes the row order
# reproducible between runs.
files = sorted(glob.glob('../html_content/*.html'))

for file in files:
    with open(file, encoding='utf-8') as f:
        source = f.read()
    soup = BeautifulSoup(source, 'html.parser')

    # The get* helpers are not defined in this notebook (presumably provided by
    # the star imports -- TODO confirm); they are called in the original order.
    price = getPrice(soup)
    title = getTitle(soup)
    location = getLocation(soup)
    lat, lng = getLatLong(soup)
    characteristics = getCharacteristics(soup)
    updated, agency = getAgencyDate(soup)

    # Purely numeric, underscore-separated tokens of the file name. basename
    # replaces the former file[16:] slice, which only worked while the folder
    # prefix was exactly 16 characters long.
    numeric_data = [token for token in os.path.basename(file).split('_') if token.isnumeric()]

    row = {
        'price': price,
        'title': title,
        'province': PROVINCE,
        'location': location,
        'lat': lat,
        'lng': lng,
        'characteristics': characteristics,
        'agency': agency,
        'updated': updated,
        'numeric_data': numeric_data,
    }
    for column, value in row.items():
        data[column].append(value)

df = pd.DataFrame(data)

In [13]:
# Export the dataset to ../data/<PROVINCE>.csv without the index column.
# to_csv does not create missing parent folders, so make sure it exists first.
os.makedirs('../data', exist_ok=True)
df.to_csv(f'../data/{PROVINCE}.csv', index = False)

In [14]:
####################################################################################################################