In [19]:
import os
import requests
import numpy as np
import requests
import random
import re
from unidecode import unidecode

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from time import sleep

In [20]:
def replace_with_underscore(names_list):
    """Turn display names into ``pisos-<slug>/`` URL path segments.

    For every name, any parenthesised text is dropped (parentheses included,
    non-greedy), surrounding whitespace is stripped, the remainder is
    transliterated to ASCII with ``unidecode`` and each run of whitespace
    and/or dashes is collapsed into a single underscore.

    Args:
        names_list: Iterable of name strings.

    Returns:
        list[str]: One ``"pisos-<slug>/"`` string per input name, in input order.
    """
    def slugify(raw_name):
        # Drop "(...)" groups first so their contents never reach the slug.
        without_parens = re.sub(r'\(.*?\)', '', raw_name).strip()
        ascii_name = unidecode(without_parens)
        # Any mix of spaces and dashes becomes exactly one underscore.
        return re.sub(r'[\s\-]+', '_', ascii_name)

    return [f"pisos-{slugify(raw_name)}/" for raw_name in names_list]


def getOffersFrom(url, max_pages=100, timeout=30):
    """Download consecutive result pages for a listing URL.

    Pages are requested as ``{url}1/``, ``{url}2/``, ... and fetching stops at
    the first page that answers with a non-200 status, contains a
    ``div.no-results`` element or has no body element, or once ``max_pages``
    pages have been collected.

    Args:
        url: Listing URL prefix; the page number and a trailing slash are
            appended to it.
        max_pages: Upper bound on the number of pages requested. Defaults to
            100, the limit that used to be hard-coded.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.

    Returns:
        list: The parsed body element of every page fetched before the stop
        condition, in page order.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts); these
        are deliberately not swallowed.
    """
    soups = []
    for page_number in range(1, max_pages + 1):
        endpoint = f'{url}{page_number}/'
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(endpoint, timeout=timeout)

        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        if soup.find('div', class_='no-results'):
            break

        # A document without a body cannot hold listings; appending None here
        # would only crash the code that later walks the collected pages.
        if soup.body is None:
            break

        soups.append(soup.body)
        print(f'Scraped page {page_number}')
    return soups

def getUrls(soups):
    """Collect the ad links found on a sequence of parsed listing pages.

    On each page the ``div.grid__wrapper`` container is searched for
    ``div.ad-preview`` elements, and the ``href`` of each preview's
    ``a.ad-preview__title`` link is collected.

    Pages that are ``None`` or lack the grid container, and previews without
    a title link or without an ``href``, are skipped instead of raising.

    Args:
        soups: Iterable of parsed pages (objects offering ``find``).

    Returns:
        list: The collected ``href`` values, in page and document order.
    """
    ads = []
    for soup in soups:
        if soup is None:
            continue

        grid = soup.find('div', class_='grid__wrapper')
        if grid is None:
            continue

        for preview in grid.find_all('div', class_='ad-preview'):
            link = preview.find('a', class_='ad-preview__title')
            # ``get`` yields None for a missing attribute instead of raising KeyError.
            href = link.get('href') if link is not None else None
            if href:
                ads.append(href)
    return ads

def getPrice(soup):
    """Extract the price text of an ad page.

    Reads the stripped text of ``div.priceBox-price`` nested inside
    ``div.maindata``.

    Args:
        soup: Parsed ad page (object offering ``find``).

    Returns:
        str: The stripped price text, or ``np.nan`` when an element on the
        lookup path is missing.
    """
    try:
        price = soup.find('div', class_='maindata').find('div', class_='priceBox-price').text.strip()
    except AttributeError:
        # A lookup that finds nothing yields None, so the next attribute access
        # raises AttributeError. Only that case is treated as "no price";
        # KeyboardInterrupt and unrelated errors are no longer swallowed.
        price = np.nan
    return price

def getTitle(soup):
    """Extract the title text of an ad page.

    Reads the stripped text of ``h1.title`` nested inside ``div.maindata``.

    Args:
        soup: Parsed ad page (object offering ``find``).

    Returns:
        str: The stripped title text, or ``np.nan`` when an element on the
        lookup path is missing.
    """
    try:
        title = soup.find('div', class_='maindata').find('h1', class_='title').text.strip()
    except AttributeError:
        # A lookup that finds nothing yields None, so the next attribute access
        # raises AttributeError. Only that case is treated as "no title";
        # KeyboardInterrupt and unrelated errors are no longer swallowed.
        title = np.nan
    return title

def getLocation(soup):
    """Extract the location text of an ad page.

    Reads the stripped text of ``div.subtitle`` inside ``div.location``
    inside the element with id ``location``.

    Args:
        soup: Parsed ad page (object offering ``find``).

    Returns:
        str: The stripped location text, or ``np.nan`` when an element on the
        lookup path is missing.
    """
    try:
        location = soup.find('div', id='location').find('div', class_='location').find('div', class_='subtitle').text.strip()
    except AttributeError:
        # A lookup that finds nothing yields None, so the next attribute access
        # raises AttributeError. Only that case is treated as "no location";
        # KeyboardInterrupt and unrelated errors are no longer swallowed.
        location = np.nan
    return location

def getLatLong(soup):
    """Extract the latitude and longitude strings embedded in an ad page.

    The first ``script`` of type ``text/javascript`` inside the element with
    id ``location`` is split on ``;``. The text after the last ``=`` of the
    first statement is returned as latitude, and of the second statement as
    longitude. Values are returned as stripped strings, not numbers.

    Args:
        soup: Parsed ad page (object offering ``find``).

    Returns:
        tuple: ``(lat, long)`` as strings, or ``(np.nan, np.nan)`` when the
        script is missing or holds fewer than two ``;``-separated statements.
    """
    try:
        # Look the script up once; both coordinates come from the same text.
        script_text = soup.find('div', id='location').find('script', type='text/javascript').text.strip()
        statements = script_text.split(';')
        lat = statements[0].split('=')[-1].strip()
        lon = statements[1].split('=')[-1].strip()
    except (AttributeError, IndexError):
        # AttributeError: an element was not found (lookup returned None).
        # IndexError: the script has no second statement.
        lat, lon = np.nan, np.nan
    return lat, lon

def getCharacteristics(soup):
    """Extract the characteristic entries of an ad page.

    Gathers every ``li`` found in the ``div.charblock`` elements of
    ``div.characteristics``. Newlines inside each entry are replaced by
    spaces and the result is stripped.

    Args:
        soup: Parsed ad page (object offering ``find``).

    Returns:
        list[str]: One cleaned string per ``li`` in document order, or
        ``np.nan`` when the ``div.characteristics`` container is missing.
    """
    try:
        charblocks = soup.find('div', class_='characteristics').find_all('div', class_='charblock')
        items = [item for charblock in charblocks for item in charblock.find_all('li')]
        characteristics = [' '.join(item.text.split('\n')).strip() for item in items]
    except AttributeError:
        # A lookup that finds nothing yields None, so the chained call raises
        # AttributeError. Only that case maps to NaN; KeyboardInterrupt and
        # unrelated errors are no longer swallowed.
        characteristics = np.nan
    return characteristics

def scrapeOnePage(url):
    """Render one page in Chrome and return its HTML after scrolling to the bottom.

    A Chrome session is started with the ``chromedriver`` binary located in
    the current working directory. The page is opened, the window maximised,
    the cookie banner accepted, and the page smooth-scrolled to the bottom
    (re-checked once per second) before the page source is read.

    The browser is always closed, even when navigation, the cookie-banner
    wait or scrolling raises, so failed runs do not leave Chrome processes
    behind.

    Args:
        url: Address of the page to render.

    Returns:
        str: The page source as reported by the browser after scrolling.

    Raises:
        Whatever the driver raises; in particular the 10-second wait for the
        cookie button fails if the button never appears.
    """
    chrome_driver = f"{os.getcwd()}/chromedriver"

    # NOTE(review): ``executable_path`` may not be accepted by newer Selenium
    # releases - confirm against the installed version before upgrading.
    browser = webdriver.Chrome(executable_path = chrome_driver)

    try:
        browser.get(url)

        browser.maximize_window()

        accept_button = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="didomi-notice-agree-button"]'))
        )

        accept_button.click() # Accept cookies

        while True:
            # Measure first, then scroll: the loop only ends once a check made
            # before scrolling already reports the bottom of the page.
            is_at_bottom = browser.execute_script("return window.scrollY + window.innerHeight >= document.body.scrollHeight")
            browser.execute_script("window.scroll({ top: document.body.scrollHeight, behavior: 'smooth' });")
            if is_at_bottom:
                break
            sleep(1)

        return browser.page_source
    finally:
        browser.quit()

def scrape(urls):
    """Scrape every ad URL and gather the extracted fields column-wise.

    Each URL is rendered with ``scrapeOnePage``, parsed with BeautifulSoup's
    ``html.parser`` and passed through the ``get*`` extractors.

    Args:
        urls: Iterable of ad page URLs.

    Returns:
        dict: Keys ``title``, ``price``, ``location``, ``lat``, ``long`` and
        ``characteristics``, each mapping to a list with one entry per URL,
        in input order.
    """
    columns = ('title', 'price', 'location', 'lat', 'long', 'characteristics')
    data = {column: [] for column in columns}

    for ad_url in urls:
        page = BeautifulSoup(scrapeOnePage(ad_url), "html.parser")

        title = getTitle(page)
        price = getPrice(page)
        location = getLocation(page)
        latitude, longitude = getLatLong(page)
        characteristics = getCharacteristics(page)

        # Row values are listed in the same order as ``columns``.
        row = (title, price, location, latitude, longitude, characteristics)
        for column, value in zip(columns, row):
            data[column].append(value)

    return data

# Scraping

In [21]:
# Site root and the 'venta/' section URL derived from it

base_url = 'https://www.pisos.com/'

venta_url = base_url + 'venta/'

In [22]:
# Build one listing URL per province from the selector on the home page

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly instead of parsing an error page.
response = requests.get(base_url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

# The province names are read from the last <ul> of the home page select box.
provincias_str = soup.find('div', class_ = 'home-container').find('div', class_ = 'selectBox').find_all('ul')[-1].text

# Skip blank lines so they do not turn into bogus "pisos-/" endpoints.
provincias = [x.strip().lower() for x in provincias_str.strip().split('\n') if x.strip()]

endpoints = replace_with_underscore(provincias)

endpoints = [venta_url + endpoint for endpoint in endpoints]

print(endpoints)

# Mode 'w' keeps the cell idempotent: re-running it rewrites the file instead
# of appending a duplicate copy of every endpoint.
with open('locations.txt', 'w', encoding='utf-8') as file:
    file.writelines(endpoint + '\n' for endpoint in endpoints)

['https://www.pisos.com/venta/pisos-a_coruna/', 'https://www.pisos.com/venta/pisos-alava_araba/', 'https://www.pisos.com/venta/pisos-albacete/', 'https://www.pisos.com/venta/pisos-alicante/', 'https://www.pisos.com/venta/pisos-almeria/', 'https://www.pisos.com/venta/pisos-andorra/', 'https://www.pisos.com/venta/pisos-asturias/', 'https://www.pisos.com/venta/pisos-avila/', 'https://www.pisos.com/venta/pisos-badajoz/', 'https://www.pisos.com/venta/pisos-baleares_balears/', 'https://www.pisos.com/venta/pisos-barcelona/', 'https://www.pisos.com/venta/pisos-bilbao_bilbo/', 'https://www.pisos.com/venta/pisos-bizkaia/', 'https://www.pisos.com/venta/pisos-burgos/', 'https://www.pisos.com/venta/pisos-caceres/', 'https://www.pisos.com/venta/pisos-cadiz/', 'https://www.pisos.com/venta/pisos-cantabria/', 'https://www.pisos.com/venta/pisos-castellon_castello/', 'https://www.pisos.com/venta/pisos-cerdanya_francesa/', 'https://www.pisos.com/venta/pisos-ceuta/', 'https://www.pisos.com/venta/pisos-ciud