# Realtor Scrapy

Scrapy the real state website and retrieve house listing of a target price and retrieve the info.

### Import Dependencies

In [1]:
import os
from bs4 import BeautifulSoup
import requests
from splinter import Browser
import pandas as pd
import time
import pprint

### Setup Splinter (For Mac)

In [2]:
# identify location of chromedriver and store it as a variable
driverPath = !which chromedriver

# Setup configuration variables to enable Splinter to interact with browser
executable_path = {'executable_path': driverPath[0]}
browser = Browser('chrome', **executable_path, headless=False)

### Scraping

In [27]:
# URL of page to be scraped
url_realtor = "https://www.realtor.com/realestateandhomes-search/Houston_TX/price-"
link_details = "https://www.realtor.com"
min_price = '250000'
max_price = '300000'
sort_by = '/sby-2' # Highest to lowest price
page_number = 1

query_url = f"{url_realtor}{min_price}-{max_price}{sort_by}/pg-{page_number}"
print(query_url)

https://www.realtor.com/realestateandhomes-search/Houston_TX/price-250000-300000/sby-2/pg-1


### BeautifulSoup

In [None]:
# Scrap with BeautifulSoup. However it does 
# page = requests.get(query_url)
# soup = BeautifulSoup(page.content, 'html.parser')

### Splinter

In [4]:
# Use the browser to visit the url
browser.visit(query_url)

In [5]:
# Wait for 5 seconds for error purpouses
time.sleep(15)

In [6]:
# Return the rendered page by the browser
html_realtor = browser.html

In [7]:
# Use beatifulsoup to scrap the page rendered by the browser
soup = BeautifulSoup(html_realtor, 'html.parser')

In [35]:
# Search for the div where the title is located
results = soup.find_all('div', class_="card-box")
# print(results[1].prettify())
print(f"Total results: {len(results)}")
print('-----------------------------------')

news_title = results[1].text
# print(f"Title: {news_title}")
house_price = results[1].find('span', class_="sc-pZnSc jEfqsK")
# print(f"Price: {house_price.text}")
img_label = results[1].find('img', class_="fade top")
# print(f"Address: {img_label['alt']}")
link_page = results[1].find('a')['href']


print(f"Price: {house_price.text} | Address: {img_label['alt']}")
print(f"Link: {link_details}{link_page}")
print(f"Photo link: {img_label['srcset'].split(',')[1]}")

Total results: 48
-----------------------------------
Price: $300,000 | Address: 3419 Francis St, Houston, TX 77004
Link: https://www.realtor.com/realestateandhomes-detail/3419-Francis-St_Houston_TX_77004_M70177-82186
Photo link:  https://ap.rdcpix.com/fcaff661b166f17179ed2bfaf4c823c1l-m1141071019od-w480_h360_x2.jpg 2x


In [59]:
# Print results and save to a dictionary
n = 0
realstate_list = []
for result in results:
    n = n + 1
    print('-----------------------------------')
    print('')
    print(f'Result: {n} of {len(results)}')
    if not result.find('div', class_="ads"):
#         print(f'Result: {n} of {len(results)}')
        price_div = result.find('div', class_="price")
        house_price = price_div.find('span').text.split('$')[-1]
        link_page = result.find('a')['href']
        img_label = result.find('img')
        address = img_label['alt']
        print(f"Price: ${house_price} | Address: {address}")
        try:
            price_reduced = result.find('span', class_="price-reduced-amount")
            price_now = price_reduced.text
            print(f"Price reduced: {price_now}")
        except:
            pass
        print(f"Link: {link_details}{link_page}")
        try:
            photo_url = img_label['srcset'].split(',')[1]
            print(f"Photo link: {photo_url}")
        except:
            print(f"Photo not available")
    else:
        print('Data not available')

    print('')
    
#     Save results to a dictionary
    realstate_list.append(
        {
            "Price": int(house_price.replace(',','')),
            "Address": address,
            "Link": str(link_details+link_page),
            "Photo link": photo_url
        }
    )


-----------------------------------

Result: 1 of 48
Price: $300,000 | Address: 4719 Braesvalley Dr, Houston, TX 77096
Link: https://www.realtor.com/realestateandhomes-detail/4719-Braesvalley-Dr_Houston_TX_77096_M86439-72354
Photo link:  https://ap.rdcpix.com/323798002be5650a73cb16f207d38378l-m2592828076od-w480_h360_x2.jpg 2x

-----------------------------------

Result: 2 of 48
Price: $300,000 | Address: 3419 Francis St, Houston, TX 77004
Link: https://www.realtor.com/realestateandhomes-detail/3419-Francis-St_Houston_TX_77004_M70177-82186
Photo link:  https://ap.rdcpix.com/fcaff661b166f17179ed2bfaf4c823c1l-m1141071019od-w480_h360_x2.jpg 2x

-----------------------------------

Result: 3 of 48
Price: $300,000 | Address: 1911 Bering Dr Apt 26, Houston, TX 77057
Link: https://www.realtor.com/realestateandhomes-detail/1911-Bering-Dr-Apt-26_Houston_TX_77057_M85449-39776
Photo link:  https://ap.rdcpix.com/80ccfe9b5e9efd3a671dc639a49c413fl-m2381122050od-w480_h360_x2.jpg 2x

-----------------

In [11]:
# When you’ve finished testing, close your browser using browser.quit:
browser.quit()

### Data Cleaning

In [66]:
# Save the data to a dataframe
listing_df = pd.DataFrame(realstate_list)
listing_df.head()

Unnamed: 0,Price,Address,Link,Photo link
0,300000,"4719 Braesvalley Dr, Houston, TX 77096",https://www.realtor.com/realestateandhomes-det...,https://ap.rdcpix.com/323798002be5650a73cb16f...
1,300000,"3419 Francis St, Houston, TX 77004",https://www.realtor.com/realestateandhomes-det...,https://ap.rdcpix.com/fcaff661b166f17179ed2bf...
2,300000,"1911 Bering Dr Apt 26, Houston, TX 77057",https://www.realtor.com/realestateandhomes-det...,https://ap.rdcpix.com/80ccfe9b5e9efd3a671dc63...
3,300000,"1911 Bering Dr Apt 26, Houston, TX 77057",https://www.realtor.com/realestateandhomes-det...,https://ap.rdcpix.com/80ccfe9b5e9efd3a671dc63...
4,300000,"6534 Madrid St, Houston, TX 77021",https://www.realtor.com/realestateandhomes-det...,https://ap.rdcpix.com/2067127680/b4da0b225880...


In [67]:
# Dataframe to dictionary
listing_df.to_dict(orient="records")

[{'Price': 300000,
  'Address': '4719 Braesvalley Dr, Houston, TX 77096',
  'Link': 'https://www.realtor.com/realestateandhomes-detail/4719-Braesvalley-Dr_Houston_TX_77096_M86439-72354',
  'Photo link': ' https://ap.rdcpix.com/323798002be5650a73cb16f207d38378l-m2592828076od-w480_h360_x2.jpg 2x'},
 {'Price': 300000,
  'Address': '3419 Francis St, Houston, TX 77004',
  'Link': 'https://www.realtor.com/realestateandhomes-detail/3419-Francis-St_Houston_TX_77004_M70177-82186',
  'Photo link': ' https://ap.rdcpix.com/fcaff661b166f17179ed2bfaf4c823c1l-m1141071019od-w480_h360_x2.jpg 2x'},
 {'Price': 300000,
  'Address': '1911 Bering Dr Apt 26, Houston, TX 77057',
  'Link': 'https://www.realtor.com/realestateandhomes-detail/1911-Bering-Dr-Apt-26_Houston_TX_77057_M85449-39776',
  'Photo link': ' https://ap.rdcpix.com/80ccfe9b5e9efd3a671dc639a49c413fl-m2381122050od-w480_h360_x2.jpg 2x'},
 {'Price': 300000,
  'Address': '1911 Bering Dr Apt 26, Houston, TX 77057',
  'Link': 'https://www.realtor.com