## Save listing contents as text
Save the contents of each listing (house or apartment) into a text file, to be used later by BeautifulSoup.

In [3]:
# Import necessary libraries
from selenium import webdriver
from bs4 import BeautifulSoup
import os
import time
import random
import re
from queue import Queue
from threading import Thread
import requests

In [4]:
# Create a folder for data
data_folder_path = os.path.abspath("./data/")
if not os.path.exists(data_folder_path):
    os.makedirs(data_folder_path)

In [5]:
# Folder for page contents
contents_folder_path = os.path.join(data_folder_path,"contents/")
if not os.path.exists(contents_folder_path):
    os.makedirs(contents_folder_path)

In [6]:
# The txt file for the links of the individual listings
listings_url_file_name = "listings_url_list_processed.txt" 
listings_url_file_path = os.path.join(data_folder_path,listings_url_file_name)

In [7]:
# Load links of listings
with open(listings_url_file_path, "r+") as file:
    urls = file.readlines()
print(urls)

['https://www.immoweb.be/en/classified/apartment/for-sale/ixelles/1050/8956021\n', 'https://www.immoweb.be/en/classified/house/for-sale/denderleeuw/9470/9528614\n', 'https://www.immoweb.be/en/classified/duplex/for-sale/geraardsbergen/9500/9555271\n', 'https://www.immoweb.be/en/classified/house/for-sale/uccle/1180/9064209\n', 'https://www.immoweb.be/en/classified/house/for-sale/anderlecht/1070/9391667\n', 'https://www.immoweb.be/en/classified/duplex/for-sale/arlon/6700/9551192\n', 'https://www.immoweb.be/en/classified/house/for-sale/bredene/8450/9559175\n', 'https://www.immoweb.be/en/classified/villa/for-sale/nivelles/1400/9451348\n', 'https://www.immoweb.be/en/classified/apartment/for-sale/uccle/1180/9317406\n', 'https://www.immoweb.be/en/classified/house/for-sale/woluwe-saint-pierre/1150/9406426\n', 'https://www.immoweb.be/en/classified/house/for-sale/limal-wavre/1300/9478369\n', 'https://www.immoweb.be/en/classified/house/for-sale/wangenies/6220/9419687\n', 'https://www.immoweb.be/en

In [8]:
print(len(urls))

9306


In [9]:
# For a single entry, get contents as a string
#set up the queue to hold all the urls
q = Queue(maxsize=0)
# Use many threads (50 max, or one for each url)
num_theads = min(20, len(urls))

In [10]:
# Populating Queue with tasks
# Specify the urls to run
range_url = range(0,min(30, len(urls)))  # Change to run for each person

In [11]:
#Populating Queue with tasks
results = [{} for x in urls];
# load up the queue with the urls to fetch and the index for each job (as a tuple):
for i in range_url:
    #need the index and the url in each queue item.
    q.put((i,urls[i],contents_folder_path))

In [12]:
print(q.queue)

deque([(0, 'https://www.immoweb.be/en/classified/apartment/for-sale/ixelles/1050/8956021\n', 'C:\\Users\\ecebo\\MyRepos\\challenge-collecting-data\\data\\contents/'), (1, 'https://www.immoweb.be/en/classified/house/for-sale/denderleeuw/9470/9528614\n', 'C:\\Users\\ecebo\\MyRepos\\challenge-collecting-data\\data\\contents/'), (2, 'https://www.immoweb.be/en/classified/duplex/for-sale/geraardsbergen/9500/9555271\n', 'C:\\Users\\ecebo\\MyRepos\\challenge-collecting-data\\data\\contents/'), (3, 'https://www.immoweb.be/en/classified/house/for-sale/uccle/1180/9064209\n', 'C:\\Users\\ecebo\\MyRepos\\challenge-collecting-data\\data\\contents/'), (4, 'https://www.immoweb.be/en/classified/house/for-sale/anderlecht/1070/9391667\n', 'C:\\Users\\ecebo\\MyRepos\\challenge-collecting-data\\data\\contents/'), (5, 'https://www.immoweb.be/en/classified/duplex/for-sale/arlon/6700/9551192\n', 'C:\\Users\\ecebo\\MyRepos\\challenge-collecting-data\\data\\contents/'), (6, 'https://www.immoweb.be/en/classified

In [13]:
def save_contents_to_text(q, result):
    while not q.empty():
        work = q.get()  #fetch new work from the Queue
        #try:
        url = work[1]
        folder_path = work[2]
        print(folder_path)
        m = re.search("/(?P<immoweb_code>[\w\-]+)$", url)
        immoweb_code = m.group("immoweb_code")
            
        file_name = f"Contents_{immoweb_code}.txt"
        filepath = os.path.join(folder_path,file_name)
        print(filepath)
        # Check if the text file already exists
        if os.path.exists(filepath):
            print("File already exists")
            # return True
            
        print(url)
        r = requests.get(url)
        time.sleep(random.uniform(2.0, 3.0))
        print("Made request")
        # Display the requested url and the return of the server
        print(url, r.status_code)
        # Store the HTML content of the website in a text
        page_contents = r.text
        print(len(page_contents))
        
        with open(filepath,"w", encoding="utf-8") as input_file:
            input_file.write(page_contents)

        print("Written file ..." + immoweb_code)
            
             # Save file to text
        #except:
            #print('Error with URL check!')
        # Signal to the queue that task has been processed
        q.task_done()
    return True

In [14]:
# Starting worker threads on queue processing
for i in range(num_theads):
    print('Starting thread ', i)
    worker = Thread(target=save_contents_to_text, args=(q,results))
    worker.setDaemon(True)    #setting threads as "daemon" allows main program to 
                              #exit eventually even if these dont finish 
                              #correctly.
    worker.start()

Starting thread  0
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/Contents_8956021.txt
Starting thread  1
https://www.immoweb.be/en/classified/apartment/for-sale/ixelles/1050/8956021

C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/Contents_9528614.txt
Starting thread  2
https://www.immoweb.be/en/classified/house/for-sale/denderleeuw/9470/9528614

C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/Contents_9555271.txt
https://www.immoweb.be/en/classified/duplex/for-sale/geraardsbergen/9500/9555271

Starting thread  3
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/Contents_9064209.txt
Starting thread  4
https://www.immoweb.be/en/classified/house/for-sale/uccle/1180/9064209

C:\

In [15]:
# Wait until the queue has been processed
q.join()
print('All tasks completed.')

Made request
https://www.immoweb.be/en/classified/duplex/for-sale/geraardsbergen/9500/9555271
 200
166041
Written file ...9555271
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/Contents_9374417.txt
https://www.immoweb.be/en/classified/apartment/for-sale/etterbeek/1040/9374417

Made request
https://www.immoweb.be/en/classified/house/for-sale/woluwe-saint-pierre/1150/9406426
 200
185199
Written file ...9406426
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/Contents_9559342.txt
https://www.immoweb.be/en/classified/duplex/for-sale/knokke-heist/8300/9559342

Made request
https://www.immoweb.be/en/classified/house/for-sale/anderlecht/1070/9391667
 200
177729
Written file ...9391667
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/
C:\Users\ecebo\MyRepos\challenge-collecting-data\data\contents/Contents_9557465.txt
https:/

Find the clickable for Search on the list and click on it

In [29]:
r = requests.get(urls[0])
# Display the requested url and the return of the server
print(urls[0], r.status_code)
# Store the HTML content of the website in a text
page_contents = r.content

https://www.immoweb.be/en/classified/apartment/for-sale/ixelles/1050/8956021
 200


In [30]:
print(type(page_contents))
print(page_contents)

<class 'bytes'>
