In [1]:
# For Python 3
# WARNING: requires PhantomJS to be installed (the scraper shells out to the CasperJS scripts under ./files)!

import bs4
import sys
from os.path import isfile
from urllib.request import urlopen
import subprocess
import numpy as np
import pandas as pd
import sqlite3
import datetime
import unicodedata

In [None]:
def get_list_urls_to_scrape(array_known_urls, max_consecutive_known=60):
    """Get the list of ad urls to scrape, walking the results ordered by date of publication.

    Only 100 pages are available for 1 request, which does not give all the
    results, so the search is split in several parts thanks to the price filter.

    For each price filter, paging stops as soon as one of these happens:
      * `max_consecutive_known` consecutive ads are already known (the results
        are sorted by date, so only older, already seen ads would follow);
      * the "next page" link reads 'Limite atteinte';
      * the "next page" link is disabled, or there is no such link at all.

    Args:
        array_known_urls: iterable of ad urls that are already known.
        max_consecutive_known: number of consecutive known ads after which
            paging stops for the current price filter.

    Returns:
        numpy array of the unique new urls (as produced by `np.unique`).
    """
    # A set makes the per-article membership test O(1) instead of a list scan.
    known_urls = set(array_known_urls)

    price_filters = ['prix-max=2000&', 'prix-min=2000&']

    list_urls_to_scrape = []
    n_pages_visited = 1

    for price_filter in price_filters:
        url_recherche = 'https://www.bienici.com/recherche/location/paris-75000?tri=publication-desc&' + \
            price_filter + 'page='

        num_page = 1
        known_urls_found = 0
        more_pages = True

        while more_pages:
            sys.stdout.write("\r  Current page: {0}".format(n_pages_visited))

            url = url_recherche + str(num_page)
            HTML = subprocess.check_output(["./files/casperjs-1.1.3/bin/casperjs", "./files/get_html.js", url])
            soup = bs4.BeautifulSoup(HTML, 'html.parser')

            articles = soup.find('div', {'class': 'resultsListContainer'}).findAll('article')

            for article in articles:
                new_url = 'https://www.bienici.com' + \
                    article.find('div', {'class': 'sideListItemFirstBlock'}).find('a')['href']

                # Check if this ad is already known
                if new_url in known_urls:
                    known_urls_found += 1
                else:
                    list_urls_to_scrape.append(new_url)
                    known_urls_found = 0

                if known_urls_found >= max_consecutive_known:
                    # Enough consecutive known ads: no need to keep going,
                    # the ads are sorted in chronological order.
                    more_pages = False
                    break

            num_page += 1
            n_pages_visited += 1

            # Find out whether there are more result pages
            footer = soup.find('div', {'class': 'sideListFooter'})
            pagination = footer.find('div', {'class': 'pagination'}) if footer is not None else None
            next_button = pagination.find('a', {'class': 'nextPage'}) if pagination is not None else None

            if next_button is None:
                # No "next page" link at all: nothing more to page through.
                more_pages = False
                continue

            # Compare text with text: encoding to bytes first would never
            # match a str literal on Python 3. `.string` can also be None.
            if (next_button.string or '').strip() == 'Limite atteinte':
                more_pages = False
                print('WARNING: Limite atteinte')
            if next_button.has_attr('disabled') and next_button['disabled'] == "disabled":
                more_pages = False

    return np.unique(list_urls_to_scrape)

In [2]:
def make_list_to_scrape():
    """Build the list of new ad urls and save it to ./files/urls_to_scrape.csv.

    The urls already stored in ./results/scraping_bienici.db (table
    real_estate_ad) are treated as known; when that file does not exist, no
    url is known. The csv file is only written when at least one new url was
    found, so an existing file is left untouched otherwise.
    """
    print("Getting the list of pages to scrape")

    # Get the list of urls we already know
    known_urls = []
    if isfile('./results/scraping_bienici.db'):
        db_connector = sqlite3.connect('./results/scraping_bienici.db')
        try:
            cursor = db_connector.cursor()
            cursor.execute("""Select url from real_estate_ad""")
            known_urls = [row[0] for row in cursor.fetchall()]
        finally:
            # This connection is only needed for the read above; release it
            # instead of leaving it open for the rest of the session.
            db_connector.close()

    list_urls_to_scrape = get_list_urls_to_scrape(known_urls)

    # Save the result:
    print("\n  %s new pages to scrape" %len(list_urls_to_scrape))
    if len(list_urls_to_scrape) > 0:
        df_urls_to_scrape = pd.DataFrame(list_urls_to_scrape, columns=['Url'])
        df_urls_to_scrape.to_csv('./files/urls_to_scrape.csv', sep=',', index=False)
        print("File of urls to scrape saved")

    print('Done')

In [3]:
def connect_dataBase():
    """Open ./results/scraping_bienici.db and make sure the ad table exists.

    The ./results directory is created when missing (sqlite3 creates the
    database file but not its parent directory), then the real_estate_ad
    table is created if it does not exist yet.

    Returns:
        The open sqlite3 connection; closing it is up to the caller.
    """
    import os  # local import: the notebook only imports `isfile` from os.path

    db_path = './results/scraping_bienici.db'
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    db_connector = sqlite3.connect(db_path)
    cursor = db_connector.cursor()
    # NOTE: the column name `number_bathromms` is misspelled, but the INSERT in
    # scrape_all_pages uses the same spelling, so it has to stay as is.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS real_estate_ad(
             url UNIQUE,
             scraping_date TEXT,
             title TEXT,
             price REAL,
             charges REAL,
             place TEXT,
             surface REAL,
             construction_year INTEGER,
             description TEXT,
             number_pictures INTEGER,
             furnished TEXT,
             number_rooms INTEGER,
             number_bedrooms INTEGER,
             number_bathromms INTEGER,
             floor TEXT,
             heating TEXT,
             lift TEXT,
             contact TEXT,
             contact_address TEXT,
             contact_rcs TEXT,
             contact_type TEXT,
             ref_annonce TEXT,
             publication_date TEXT,
             modification_date TEXT,
             infos_quartier TEXT,
             other_info TEXT
        )
        """)
    db_connector.commit()
    return db_connector

In [4]:
def scrape_page(url):
    """Scrape one ad page and return the values of interest.

    The page HTML is produced by the CasperJS script
    ./files/get_html_realEstateAd.js and parsed with BeautifulSoup.

    Args:
        url: url of the ad page.

    Returns:
        -1 when the output contains the '!&!Not anymore to sell!&!' marker,
        0 when it contains the '!&!Connexion failed!&!' marker,
        otherwise a list of 26 values, in the order of the columns of the
        INSERT statement in scrape_all_pages. Fields that were not found on
        the page are left as '' (furnished defaults to 'No').
    """
    # Get the HTML
    HTML = subprocess.check_output(["./files/casperjs-1.1.3/bin/casperjs", "./files/get_html_realEstateAd.js", url])
    html_as_text = str(HTML)
    if '!&!Not anymore to sell!&!' in html_as_text:
        return -1
    if '!&!Connexion failed!&!' in html_as_text:
        return 0
    soup = bs4.BeautifulSoup(HTML, 'html.parser')

    # Get current date
    scraping_date = datetime.datetime.now().strftime("%Y-%m-%d@%H:%M")

    # Get the title and the city
    title_block = soup.find('div', {'class': 'titleInside'})
    title_el = title_block.find('h1')
    city = title_el.find('span', {'class': 'city'}).string

    line_break = title_el.find('br')
    if line_break is not None:
        line_break.extract()
    # NOTE(review): `.string` is None when the <h1> still has several children
    # (e.g. text plus the city span) — confirm this yields the expected title.
    title = title_el.string

    # Get the price (in euros, charges included, per month)
    price_cont = title_block.find('div', {'class': 'itemPriceContainer'}).find('div', {'class': 'price'})
    # `.string` is None for a span with several children; skip those pieces.
    price = ''.join(el.string or '' for el in price_cont.findAll('span'))
    if ' à ' not in price:
        # A price range ("... à ...") is kept as text; a single price becomes a float.
        price = float(price.replace('€CC\xa0/\xa0mois', '').replace('\xa0', ''))

    # Get publication date, modification date and reference
    publication_date = ''
    ref_annonce = ''
    modification_date = ''
    for info in soup.find('div', {'class': 'realEstateAdsMainInfo'}).findAll('span'):
        info_markup = str(info)
        info_text = info.string or ''
        if 'Publiée le ' in info_markup:
            publication_date = info_text
        elif "Référence de l’annonce" in info_markup:
            ref_annonce = info_text
        elif "Modifiée le " in info_markup:
            modification_date = info_text
    publication_date = publication_date.replace("Publiée le ", "")
    ref_annonce = ref_annonce.replace("Référence de l’annonce\xa0: ", "")
    modification_date = modification_date.replace("Modifiée le ", '')

    # Get the number of pictures
    photos_counter = soup.find('div', {'class': 'detailedSheetHeaderDescriptionContainer'}).find('div', {'class': 'photosCounter'})
    if photos_counter is None:
        nb_pictures = 0
    elif photos_counter.find('span') is None:
        nb_pictures = 1
    else:
        # Drop the <span> and <i> children so that only the count text remains.
        photos_counter.find('span').extract()
        photos_counter.find('i').extract()
        nb_pictures = int(photos_counter.string.replace('/', ''))

    # Get the description
    description_section = soup.find('section', {'class': 'description'})
    if description_section is None:
        description = "NO DESCRIPTION AVAILABE"
    else:
        description_content = description_section.find('div', {'class': 'descriptionContent'})
        description = description_content.string
        if description is None:
            # The description is not a single text node: keep its raw HTML instead.
            description = "Description HTML_format -- " + description_content.decode_contents(formatter="html")

    # Get the name of the contact and its address
    contact_info = soup.find('div', {'class': 'contact-info'})
    contact = contact_info.find('div', {'class': 'contact-name'}).string

    address_location = contact_info.find('div', {'class': 'contact-address'})
    contact_address = address_location.string if address_location is not None else ''

    rcs_location = contact_info.find('div', {'class': 'contact-rcs'})
    contact_rcs = rcs_location.string if rcs_location is not None else ''

    # Get the type of seller (agency, usually)
    # NOTE(review): this reads the 'contact-address' element again, which looks
    # like a copy/paste of the address lookup — confirm the intended class.
    # The element is optional (see above), so guard against its absence.
    contact_type_el = soup.find('div', {'class': 'contact-address'})
    contact_type = contact_type_el.string if contact_type_el is not None else ''

    # Get all the other info
    charges = ''  # /month, in euros
    surface = ''  # in m²
    n_rooms = ''  # number of rooms
    construction_year = ''
    n_bedrooms = ''
    n_bathrooms = ''
    other_info = ''
    floor = ''
    has_lift = ''
    heating = ''
    furnished = 'No'

    sep = '\n\\&\\\n'  # a separator
    for line in soup.find('div', {'class': 'allDetails'}).findAll('div'):
        detail_span = line.find('span')
        line_string = detail_span.string if detail_span is not None else None
        if line_string is None:
            # No usable text in this <div>: nothing to classify.
            continue

        if '€/mois dont' in line_string:
            charges = float(line_string.split('€')[1].replace('/mois dont ', '').replace('\xa0', ''))
        elif 'm²' in line_string and 'balcon' not in line_string and ' de ' not in line_string:
            surface = float(line_string.replace('m²', ''))
        elif 'pièce' in line_string:
            n_rooms = int(line_string.replace('pièces', '').replace('pièce', ''))
        elif 'chambre' in line_string:
            n_bedrooms = int(line_string.replace('chambres', '').replace('chambre', ''))
        elif 'étage' in line_string:
            floor = line_string
        elif 'Rez-de-chaussée' in line_string:
            floor = 'Rez-de-chaussée'
        elif 'Construit en' in line_string:
            construction_year = int(line_string.replace('Construit en ', ''))
        elif 'Ascenseur' in line_string:
            has_lift = 'True'
        elif 'salle de bain' in line_string or 'salles de bain' in line_string or 'salle d’eau' in line_string or 'salles d’eau' in line_string:
            n_bathrooms = int(line_string.replace('salles de bain', '').replace('salle de bain', '').replace("salles d’eau", '').replace("salle d’eau", ''))
        elif 'Chauffage' in line_string:
            heating = line_string.replace('Chauffage\xa0: ', '')
        elif 'Meublé' in line_string:
            furnished = 'Yes'
        else:
            # Anything unrecognised is kept verbatim, separated by `sep`.
            other_info += line_string
            other_info += sep

    # Get infos about the place
    quartier_HTML_place = soup.find('div', {'class': 'neighborhoodDescription'})
    if quartier_HTML_place is None:
        quartier = ''
    else:
        quartier = quartier_HTML_place.decode_contents(formatter="html")

    values = [url,
              scraping_date,
              title,
              price,
              charges,
              city,
              surface,
              construction_year,
              description,
              nb_pictures,
              furnished,
              n_rooms,
              n_bedrooms,
              n_bathrooms,
              floor,
              heating,
              has_lift,
              contact,
              contact_address,
              contact_rcs,
              contact_type,
              ref_annonce,
              publication_date,
              modification_date,
              quartier,
              other_info]

    return values

In [5]:
def scrape_all_pages(list_url_to_scrape, db_connector, n_save=10):
    """Scrape every url that is not in the database yet and store the results.

    Args:
        list_url_to_scrape: sequence of ad urls (a list or a numpy array).
            It is never modified.
        db_connector: open sqlite3 connection holding the real_estate_ad table.
        n_save: the inserts are committed each time the page counter reaches a
            multiple of n_save (and once more at the end).

    Pages for which scrape_page returns -1 (ad not available anymore) are
    removed from ./files/urls_to_scrape.csv, which is rewritten at the end.
    Pages for which it returns 0 are skipped.
    """
    cursor = db_connector.cursor()

    # Urls already stored; a set keeps the filtering below linear.
    already_scraped = {row[0] for row in cursor.execute("select url from real_estate_ad").fetchall()}

    # Get the list of urls we really need to scrape
    list_to_scrape = [url for url in list_url_to_scrape if url not in already_scraped]
    print('  %s pages to scrape' %len(list_to_scrape))

    list_urls_not_to_scrape = []

    for counter, url in enumerate(list_to_scrape, start=1):
        sys.stdout.write("\r  Scraping the page {0}/{1}".format(counter, len(list_to_scrape)))

        values = scrape_page(url)
        if values == -1:  # Means that this flat isn't available anymore
            list_urls_not_to_scrape.append(url)

        elif values != 0:  # Then the list of values has been found
            # Store this information in the db
            cursor.execute("""INSERT INTO real_estate_ad (
             url,
             scraping_date,
             title,
             price,
             charges,
             place,
             surface,
             construction_year,
             description,
             number_pictures,
             furnished,
             number_rooms,
             number_bedrooms,
             number_bathromms,
             floor,
             heating,
             lift,
             contact,
             contact_address,
             contact_rcs,
             contact_type,
             ref_annonce,
             publication_date,
             modification_date,
             infos_quartier,
             other_info)
            VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", values)

            # Every n_save pages, the results are saved
            if counter % n_save == 0:
                db_connector.commit()

    # At the end, re-write the list of pages to scrape without the ads which are
    # not available anymore. A new list is built rather than calling `.remove`
    # on the argument: numpy arrays have no `.remove`, and the caller's
    # sequence should not be altered.
    if list_urls_not_to_scrape:
        unavailable = set(list_urls_not_to_scrape)
        remaining_urls = [url for url in list_url_to_scrape if url not in unavailable]
        df_urls_to_scrape = pd.DataFrame(remaining_urls, columns=['Url'])
        df_urls_to_scrape.to_csv('./files/urls_to_scrape.csv', sep=',', index=False)

    db_connector.commit()
    print('\nDone')

In [6]:
# Refresh the saved list of urls, load it back as a 1-D array (first csv
# column), then open the results database.
make_list_to_scrape()
array_urls_to_scrape = pd.read_csv('./files/urls_to_scrape.csv').iloc[:, 0].to_numpy()
db_connector = connect_dataBase()

Getting the list of pages to scrape
  Current page: 14
  190 new pages to scrape
File of urls to scrape saved
Done


In [7]:
# Run the scraping, restarting it after an exception (sometimes raised because
# of internet issues), up to n_exceptions_max restarts. Pages already stored in
# the database are skipped by scrape_all_pages, so a restart resumes the work.
print('*********\nScraping')
managed_all = False
n_exceptions = 0
n_exceptions_max = 40
while not managed_all:
    try:
        scrape_all_pages(array_urls_to_scrape, db_connector, n_save=5)
        managed_all = True
    except Exception as exc:
        # Keep what was inserted before the failure.
        db_connector.commit()
        n_exceptions += 1
        # Compare against the maximum: comparing the counter with itself is
        # never true and would let this loop run forever.
        if n_exceptions > n_exceptions_max:
            print("More than %s exceptions, BREAK..." %n_exceptions_max)
            break  # If too many exceptions, stop
        print('\nException caught - Restart scraping')
        # Show what went wrong so that a persistent failure is visible.
        print('  Cause: %r' % (exc,))
db_connector.close()

*********
Scraping
  190 pages to scrape
  Scraping the page 3/190
Exception caught - Restart scraping
  188 pages to scrape
  Scraping the page 41/188
Exception caught - Restart scraping
  148 pages to scrape
  Scraping the page 148/148
Done


In [8]:
# Closes the database connection again; the scraping cell above already ends
# with the same call.
# NOTE(review): presumably a harmless leftover — confirm and remove if not needed.
db_connector.close()