In [1]:
import os
import sys
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
sys.path.append("../python")

import general
import zillow_parse as zp

In [2]:
zillow_pages = zp.getZillowPages()

In [3]:
#add contents of urls to soup variable from each url
soup = BeautifulSoup(zillow_pages['req1'].content, 'html.parser')
soup1 = BeautifulSoup(zillow_pages['req2'].content, 'html.parser')

In [4]:
all_homes_df = pd.DataFrame()

In [5]:
all_homes_df = zp.gatherZillowDetails(all_homes_df, soup)
all_homes_df = zp.gatherZillowDetails(all_homes_df, soup1)

In [6]:
parsed_df = zp.parseZillowDetails(all_homes_df)

In [7]:
parsed_df.links[39]

'https://www.zillow.com/homedetails/2238-W-Carmen-Ave-Chicago-IL-60625/2095377216_zpid/'

In [8]:
parsed_df.links[40]

'https://www.zillow.com/homedetails/4940-N-Hamilton-Ave-Chicago-IL-60625/3698057_zpid/'

In [9]:
ZILLOW_FEATURES = ['basement', 'flooring', 'heating features', 'cooling features', 'laundry features', 
                   'total spaces', 'parking features', 'garage spaces', 'covered spaces', 'attached garage',
                   'construction materials', 'year built', 'hoa fee', 'annual tax amount']

In [37]:
def extractDetails(zillow_details, soup):
    
    details_found = {}
    for feature in ZILLOW_FEATURES:
        details_found[feature] = 0
    
    retrieved_zestimate = False # So we can grab once and only once
    for span in soup.find_all('span'):
        span_txt = span.get_text()
        if ':' in span_txt:
            split_span = [s.lower() for s in span_txt.split(':')]
            if len(split_span) == 2:
                if 'zestimate' in split_span[0] and not retrieved_zestimate and '$' in split_span[1]:
                    general.safeAppend(zillow_details, 'zestimate', int(split_span[1].split('$')[-1].replace(',','')))
                    retrieved_zestimate = True
                for feature in ZILLOW_FEATURES:
                    if len(split_span[0]) > 3 and split_span[0] in feature and details_found[feature] == 0 and len(split_span[1])>0:
                        general.safeAppend(zillow_details, feature, split_span[1].strip())
                        details_found[feature] = 1 # Feature located
                        
    for feature in ZILLOW_FEATURES:
        if details_found[feature] == 0: # Still haven't found it
            general.safeAppend(zillow_details, feature, -1)
    if not retrieved_zestimate: general.safeAppend(zillow_details, 'zestimate', -1)

In [38]:
import requests 

L = len(parsed_df)
zillow_details = {}
with requests.Session() as s:
    for i,link in enumerate(parsed_df.links):
        if i%10 == 0: print("Processing link {} of {}".format(i,L))
        r = s.get(link, headers=zp.REQ_HEADERS)
        soup = BeautifulSoup(r.content, 'html.parser')
        extractDetails(zillow_details, soup)

Processing link 0 of 80
Processing link 10 of 80
Processing link 20 of 80
Processing link 30 of 80
Processing link 40 of 80
Processing link 50 of 80
Processing link 60 of 80
Processing link 70 of 80


In [39]:
for z in zillow_details:
    print(z, len(zillow_details[z]), zillow_details[z][-1])

zestimate 80 270464
basement 80 none
heating features 80 natural gas, forced air
cooling features 80 central air
laundry features 80 in unit, laundry hook-up in unit
total spaces 80 1
parking features 80 off alley
garage spaces 80 1
covered spaces 80 -1
attached garage 80 -1
construction materials 80 brick
year built 80 -1
hoa fee 80 $236 monthly
flooring 80 hardwood
annual tax amount 80 -1


In [40]:
zillow_detail_df = pd.DataFrame(zillow_details)

In [41]:
zillow_detail_df[37:43]

    


Unnamed: 0,zestimate,basement,heating features,cooling features,laundry features,total spaces,parking features,garage spaces,covered spaces,attached garage,construction materials,year built,hoa fee,flooring,annual tax amount
37,1274155,"full,english,finished",natural gas,central air,"flooring(other), 2nd level, size(6x4)",2,garage,2,2,-1,brick,2010,-1,-1,"$20,542"
38,-1,none,"natural gas, forced air",central air,"in unit, in kitchen",0,,0,-1,-1,brick,1923,$259 monthly,hardwood,-1
39,-1,"full,rec/family area",natural gas,-1,-1,0,,0,-1,-1,-1,1890,-1,-1,-1
40,506774,unfinished,"forced air, gas",central,-1,1,"garage - detached, covered",1,-1,-1,frame,1906,-1,"tile, carpet, hardwood","$9,485"
41,343458,-1,-1,-1,-1,0,-1,0,-1,-1,-1,-1,$287 monthly,-1,"$6,196"
42,370741,none,"natural gas, forced air",central air,"gas dryer hookup, in unit, laundry hook-up in ...",1,"assigned, off alley",1,-1,-1,"brick, limestone",2006,$200 monthly,hardwood,"$5,678"


In [42]:
zillow_detail_df.columns

Index(['zestimate', 'basement', 'heating features', 'cooling features',
       'laundry features', 'total spaces', 'parking features', 'garage spaces',
       'covered spaces', 'attached garage', 'construction materials',
       'year built', 'hoa fee', 'flooring', 'annual tax amount'],
      dtype='object')

In [16]:
# #calculate the zestimate and insert into a dataframe
# zillow_zestimate = []
# for link in df['links']:
#     r = s.get(link, headers=req_headers)
#     soup = BeautifulSoup(r.content, 'html.parser')
#     home_value = soup.select_one('h4:contains("Home value")')
#     if not home_value:
#         home_value = soup.select_one('.zestimate').text.split()[-1]
#     else:
#         home_value = home_value.find_next('p').get_text(strip=True)
#     zillow_zestimate.append(home_value)

# cols=['zestimate']
# zestimate_result = pd.DataFrame(zillow_zestimate, columns=cols)
# # zestimate_result

# #convert zestimate column to float, and remove , and $
# zestimate_result['zestimate'] = zestimate_result['zestimate'].str.replace('$','')
# zestimate_result['zestimate'] = zestimate_result['zestimate'].str.replace('/mo','')
# zestimate_result['zestimate'] = zestimate_result['zestimate'].str.replace(',','')

# #covert rows with non zestimate to 0
# def non_zestimate(zestimate_result):
#     if len(zestimate_result['zestimate']) > 20:
#         return '0'
#     elif len(zestimate_result['zestimate']) < 5:
#         return '0'
#     else:
#         return zestimate_result['zestimate']

# zestimate_result['zestimate'] = zestimate_result.apply(non_zestimate,axis=1)

# # zestimate_result

# #concat zestimate dataframe and original df
# df = pd.concat([df, zestimate_result], axis=1)
# df['zestimate'] = df['zestimate'].astype('float')

# #create best deal column and sort by best_deal
# df ['best_deal'] = df['prices'] - df['zestimate']
# df = df.sort_values(by='best_deal')

# df