In [159]:
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import sys
import numpy as np
import pandas as pd
import regex as re
import requests
import lxml
from lxml.html.soupparser import fromstring
import prettify
import numbers
import htmltext

In [160]:
#add headers in case you use chromedriver (captchas are no fun); namely used for chromedriver
req_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

In [161]:
#create url variables for each zillow page
with requests.Session() as s:    
    url = 'https://www.zillow.com/homes/for_sale/chicago'
    url2 = 'https://www.zillow.com/homes/for_sale/chicago/2_p/'

    r = s.get(url, headers=req_headers)
    r2 = s.get(url2, headers=req_headers)
    
    url_links = [url, url2]

In [162]:
#add contents of urls to soup variable from each url
soup = BeautifulSoup(r.content, 'html.parser')
soup1 = BeautifulSoup(r2.content, 'html.parser')

In [163]:
all_homes_df = pd.DataFrame()

In [164]:
def gatherZillowDetails(all_homes_df, soup):
    
    df = pd.DataFrame()
    
    #all for loops are pulling the specified variable using beautiful soup and inserting into said variable
    for i in soup:
        address = soup.find_all (class_= 'list-card-addr')
        price = list(soup.find_all (class_='list-card-price'))
        beds = list(soup.find_all("ul", class_="list-card-details"))
        details = soup.find_all ('div', {'class': 'list-card-details'})
        home_type = soup.find_all ('div', {'class': 'list-card-footer'})
        last_updated = soup.find_all ('div', {'class': 'list-card-top'})
        brokerage = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
        link = soup.find_all (class_= 'list-card-link')

        #create dataframe columns out of variables
        df['prices'] = price
        df['address'] = address
        df['beds'] = beds

    #create empty url list
    urls = []

    #loop through url, pull the href and strip out the address tag
    for link in soup.find_all("article"):
        href = link.find('a',class_="list-card-link")
        addresses = href.find('address')
        addresses.extract()
        urls.append(href)

    #import urls into a links column
    df['links'] = urls
    df['links'] = df['links'].astype('str')

    #remove html tags
    df['links'] = df['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
    df['links'] = df['links'].replace('" tabindex="0"></a>', ' ', regex=True)
    
    if len(all_homes_df.columns) > 0: # Not the first page
        #append first two dataframes
        return all_homes_df.append(df, ignore_index = True) 
    else:
        return df

In [165]:
all_homes_df = gatherZillowDetails(all_homes_df, soup)
all_homes_df = gatherZillowDetails(all_homes_df, soup1)

In [166]:
def getBedsBathsSqFt(html_str):
    split_strs = [s.split('>') for s in html_str.split('<')]
    key_stats = []
    for split_str in split_strs:
        for short_str in split_str:
            try:
                key_stats.append(int(short_str.replace(',','')))
            except:
                pass
    if len(key_stats) == 3: #All data available
        beds, baths, sqft = key_stats
    else:
        beds, baths = key_stats
        sqft = -1
    return beds, baths, sqft

In [167]:
def parseZillowDetails(df):

    #convert columns to str
    df['prices'] = df['prices'].astype('str')
    df['address'] = df['address'].astype('str')
    df['beds'] = df['beds'].astype('str')
    
    #remove html tags
    df['prices'] = df['prices'].replace('<div class="list-card-price">', ' ', regex=True)
    df['address'] = df['address'].replace('<address class="list-card-addr">', ' ', regex=True)
    df['prices'] = df['prices'].replace('</div>', ' ', regex=True)
    df['address'] = df['address'].replace('</address>', ' ', regex=True)
    df['prices'] = df['prices'].str.replace(r'\D', '')

    #split beds column into beds, bath and sq_feet
    df['beds'], df['baths'], df['sq_feet'] = zip(*all_homes_df.beds.apply(getBedsBathsSqFt))

    #remove commas from sq_feet and convert to float
    df.replace(',','', regex=True, inplace=True)

    #drop nulls
    df = df[(df['prices'] != '') & (df['prices']!= ' ')]

    #convert column to float
    df['prices'] = df['prices'].astype('float')
    # d['sq_feet'] = df['sq_feet'].astype('float')

    #remove spaces from link column
    df['links'] = df.links.str.replace(' ','')

    #rearrange the columns
    df = df[['prices', 'address', 'links', 'beds', 'baths', 'sq_feet']]
    return df

In [168]:
parsed_df = parseZillowDetails(all_homes_df)

In [169]:
parsed_df

Unnamed: 0,prices,address,links,beds,baths,sq_feet
0,317000.0,4736 W Addison St Chicago IL 60641,"<aclass=""list-card-linklist-card-link-top-marg...",3,2,1062
1,424900.0,2665 W Maypole Ave Chicago IL 60612,"<aclass=""list-card-linklist-card-link-top-marg...",3,3,1578
2,589900.0,4450 N Richmond St Chicago IL 60625,"<aclass=""list-card-linklist-card-link-top-marg...",4,4,2674
3,169999.0,7949 S Dobson Ave Chicago IL 60619,"<aclass=""list-card-linklist-card-link-top-marg...",4,3,2500
4,244900.0,7538 S Wood St Chicago IL 60620,"<aclass=""list-card-linklist-card-link-top-marg...",4,3,2800
...,...,...,...,...,...,...
75,699000.0,200 E Delaware Pl APT 15A Chicago IL 60611,"<aclass=""list-card-linklist-card-link-top-marg...",3,2,1550
76,239000.0,420 E Waterside Dr UNIT 306 Chicago IL 60601,"<aclass=""list-card-linklist-card-link-top-marg...",1,1,-1
77,229999.0,10553 S Sangamon St Chicago IL 60643,"<aclass=""list-card-linklist-card-link-top-marg...",3,2,1200
78,379999.0,7735 W Farragut Ave Chicago IL 60656,"<aclass=""list-card-linklist-card-link-top-marg...",3,2,1500


In [None]:
#calculate the zestimate and insert into a dataframe
zillow_zestimate = []
for link in df['links']:
    r = s.get(link, headers=req_headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    home_value = soup.select_one('h4:contains("Home value")')
    if not home_value:
        home_value = soup.select_one('.zestimate').text.split()[-1]
    else:
        home_value = home_value.find_next('p').get_text(strip=True)
    zillow_zestimate.append(home_value)

cols=['zestimate']
zestimate_result = pd.DataFrame(zillow_zestimate, columns=cols)
# zestimate_result

#convert zestimate column to float, and remove , and $
zestimate_result['zestimate'] = zestimate_result['zestimate'].str.replace('$','')
zestimate_result['zestimate'] = zestimate_result['zestimate'].str.replace('/mo','')
zestimate_result['zestimate'] = zestimate_result['zestimate'].str.replace(',','')

#covert rows with non zestimate to 0
def non_zestimate(zestimate_result):
    if len(zestimate_result['zestimate']) > 20:
        return '0'
    elif len(zestimate_result['zestimate']) < 5:
        return '0'
    else:
        return zestimate_result['zestimate']

zestimate_result['zestimate'] = zestimate_result.apply(non_zestimate,axis=1)

# zestimate_result

#concat zestimate dataframe and original df
df = pd.concat([df, zestimate_result], axis=1)
df['zestimate'] = df['zestimate'].astype('float')

#create best deal column and sort by best_deal
df ['best_deal'] = df['prices'] - df['zestimate']
df = df.sort_values(by='best_deal')

df