## This notebook visits and scrapes a list of home-details URLs that were previously collected by scraping the target site's home search results pages

In [1]:
import pandas as pd
import numpy as np

import requests, bs4
from bs4 import BeautifulSoup as bs

import time, random

import csv

# 1) Use Previously Grabbed URL List to Get Home Details from website: 

### Single Family Homes that have recently sold in the City of Los Angeles

#### (Note: Because the website detects and halts automated/repetitive activity, this should be done piecemeal - no more than 100 URLs at a time)

In [2]:
# Request headers sent with every page fetch. The User-Agent is set to a desktop
# Chrome string, following this StackOverflow post on 403 Forbidden responses:
#     https://stackoverflow.com/questions/38489386/python-requests-403-forbidden
#
# NOTE(review): 'br' advertises Brotli support; as far as I know requests can only
# decode Brotli bodies when a brotli package is installed - confirm, or drop 'br'.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.131 Safari/537.36'
    ),
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept': '*/*',
    'Connection': 'keep-alive',
}

In [3]:
# Prefix that is prepended to every entry of the URL list before it is requested
# (placeholder - fill in the site's homepage URL).
url_base = "<website homepage url here>"

In [1]:
# Load the previously collected home URLs. The file is read without a header row
# and only the first column of each row is kept.
urls = list(
    pd.read_csv('<text filename with specific home urls to visit>', header=None)
    .values[:, 0]
)

In [2]:
# Visit every collected home-details URL, pull the listing attributes out of the
# returned HTML and append one row per successfully fetched page to home_details_1.csv.

REQUEST_TIMEOUT = 30  # seconds to wait on the server before giving up on a request


def _labelled_value(soup, label):
    """Return `.text` of the node that follows the first text node equal to `label`.

    Returns NaN when the label is not on the page or the following node has no text.
    """
    try:
        return soup.find(text=label).next.text
    except Exception:
        return np.nan


for n, url in enumerate(urls):
    url = url_base + url

    # A timeout keeps one stalled connection from hanging the whole run, and a
    # network-level error is reported like any other failed page instead of
    # aborting the loop.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        status = response.status_code
    except requests.RequestException as err:
        response = status = None
        print('Fail! Request error for page {}: {}'.format(n, err))

    # Short pause before processing. requests.get has already returned by now,
    # so this only adds to the pacing between requests.
    time.sleep(5)

    if status == 200:
        print('Success! Got page {}'.format(n))
        page = response.text
        # NOTE(review): no parser is named, so bs4 picks whichever one is installed;
        # consider pinning one (e.g. 'html.parser') once the results are confirmed equal.
        soup = bs(page)

        # Get the variables we need; start with zipcode, sold_price, beds, baths & hm_sqft.
        # The first 'top-stats' div is flattened to text, its labels and punctuation are
        # blanked out, and the last five whitespace-separated tokens are kept.
        try:
            top_stats = soup.find(id='content').find_all('div', {'class': 'top-stats'})[0].text
            is_sold = 'Sold Price' in top_stats
            # Pages without a sold price are treated as showing a 'Redfin Estimate' instead
            price_label = 'Sold Price' if is_sold else 'Redfin Estimate'
            stats = top_stats.replace('CA', ' ').replace('$', ' ').replace(price_label, ' ') \
                             .replace('Beds', ' ').replace('Baths', ' ').replace('Sq Ft', ' ') \
                             .replace(',', '').split()[-5:]
            zipcode, price, beds, baths, hm_sqft = stats
            # An estimate is not a sale price, so it is recorded as missing
            sold_price = price if is_sold else np.nan
        except Exception:
            zipcode = sold_price = beds = baths = hm_sqft = np.nan

        # Next get stories, yr_built and lot_size
        stories = _labelled_value(soup, 'Stories')
        yr_built = _labelled_value(soup, 'Year Built')
        lot_size = _labelled_value(soup, 'Lot Size')

        # Now get garage & pool: True when any child of the amenities container mentions it
        try:
            amenities = [el.text.lower() for el in soup.find('div', {'class': 'amenities-container'})]
        except Exception:
            amenities = []
        garage = any('garage' in el for el in amenities)
        pool = any('pool' in el for el in amenities)

        # Finally, get schools: average of the digits found right before each '/'.
        # NOTE(review): only the last character before each '/' is read, so a two-digit
        # rating such as 10 would be recorded as 0 - confirm against the page format.
        try:
            schools = soup.find(text='GreatSchools Rating').next.text.split('/')
            ratings = [int(el[-1:]) for el in schools if el[-1:].isnumeric()]
            # Guard the empty case explicitly rather than letting np.mean([]) warn
            schools = np.mean(ratings) if ratings else np.nan
        except Exception:
            schools = np.nan

        # Save this row of data to csv (append mode, so earlier batches are kept).
        # newline='' lets the csv writer control line endings itself.
        tup = (sold_price, beds, baths, hm_sqft, lot_size, yr_built, zipcode, pool, garage, stories, schools)
        with open('home_details_1.csv', 'a', newline='') as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerow(tup)

    elif status is not None:
        print('Fail! Received status code {} for page {}'.format(status, n))

    # Delay some more to linger on the page before moving on
    # (20 +/- 3.5 seconds, on top of the 5 second pause above)
    time.sleep(20 + 7*(random.random() - 0.5))

print('DONE!')