In [31]:
import os
import sys
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
sys.path.append("../python")

import general
import zillow_parse as zp

In [32]:
# Fetch the Zillow result pages; the returned mapping is read below via the
# keys 'req1' and 'req2', each exposing a `.content` attribute.
zillow_pages = zp.getZillowPages()

In [33]:
# Parse the raw content of both fetched pages ('req1', 'req2') into
# BeautifulSoup trees: `soup` for the first page, `soup1` for the second.
soup, soup1 = (
    BeautifulSoup(zillow_pages[page_key].content, 'html.parser')
    for page_key in ('req1', 'req2')
)

In [34]:
# Empty frame that the gatherZillowDetails calls below fill in page by page.
all_homes_df = pd.DataFrame()

In [35]:
# Gather the listings from both parsed result pages into a single frame.
#
# The frame is re-created here so this cell gives the same result every time it
# is run. Previously it fed its own output back into gatherZillowDetails, so a
# second run started from an already-populated frame.
# NOTE(review): presumably gatherZillowDetails adds the page's listings to the
# frame it is given and returns the result -- confirm in zillow_parse.
all_homes_df = pd.DataFrame()
for page_soup in (soup, soup1):
    all_homes_df = zp.gatherZillowDetails(all_homes_df, page_soup)

In [36]:
# Turn the gathered listing details into the typed table previewed below
# (prices, address, links, beds, baths, sq_feet).
parsed_df = zp.parseZillowDetails(all_homes_df)

In [37]:
# Preview the first rows of the parsed listings.
parsed_df.head()

Unnamed: 0,prices,address,links,beds,baths,sq_feet
0,435000.0,5067 N Lincoln Ave APT 304 Chicago IL 60625,https://www.zillow.com/homedetails/5067-N-Linc...,3,2,1587
1,209900.0,4923 N Wolcott Ave APT 2B Chicago IL 60640,https://www.zillow.com/homedetails/4923-N-Wolc...,1,1,900
2,1037000.0,4932 N Mozart St Chicago IL 60625,https://www.zillow.com/homedetails/4932-N-Moza...,5,3,-1
3,339900.0,4924 N Rockwell St APT 1N Chicago IL 60625,https://www.zillow.com/homedetails/4924-N-Rock...,2,2,-1
4,274900.0,2144 W Giddings St #2 Chicago IL 60625,https://www.zillow.com/homedetails/2144-W-Gidd...,2,1,-1


In [7]:
# Pick a single listing URL (entry 10 of the `links` column); the cells below
# request this page and scrape its details.
link = parsed_df.links[10]
print(link)

https://www.zillow.com/homedetails/4847-N-Seeley-Ave-Chicago-IL-60625/54533290_zpid/


In [12]:
# Lower-case feature labels to look for on a listing page. Each name also
# becomes a key of the scraped-details dict built further down.
ZILLOW_FEATURES = [
    'basement',
    'flooring',
    'heating features',
    'cooling features',
    'laundry features',
    'total spaces',
    'parking features',
    'garage spaces',
    'covered spaces',
    'attached garage',
    'construction materials',
    'year built',
    'hoa fee',
    'annual tax amount',
]

In [14]:
import requests

# Download the selected listing page using the request headers defined in
# zillow_parse (zp.REQ_HEADERS).
# - timeout: requests waits indefinitely by default, so a stalled connection
#   would otherwise hang the kernel.
# - raise_for_status(): surfaces a blocked or failed request (4xx/5xx) here,
#   instead of letting the scraping cell below quietly record -1 for everything.
with requests.Session() as s:
    r = s.get(link, headers=zp.REQ_HEADERS, timeout=30)
r.raise_for_status()

In [15]:
# Parse the single listing page fetched above.
# NOTE: this rebinds `soup`, which earlier held the first search-results page;
# re-running the gather cell after this point would read the listing page instead.
soup = BeautifulSoup(r.content, 'html.parser')

In [29]:
# Scrape "label: value" facts from the <span> tags of the listing page.
#
# Result: `zillow_details` gets an entry for 'zestimate' and one for every name
# in ZILLOW_FEATURES. Anything that could not be found on the page is recorded
# as -1, so every listing yields the same set of keys.
# NOTE(review): values are stored through general.safeAppend, which appears to
# append to a list kept under the given key -- confirm in general.py.
zillow_details = {}

# 0 = feature not seen yet, 1 = its first value has already been stored.
details_found = {feature: 0 for feature in ZILLOW_FEATURES}

retrieved_zestimate = False # So we can grab once and only once
for span in soup.find_all('span'):
    span_txt = span.get_text()
    split_span = [part.strip().lower() for part in span_txt.split(':')]
    if len(split_span) != 2:
        continue  # Not a single "label: value" pair (no colon, or several)
    label, value = split_span

    if 'zestimate' in label and not retrieved_zestimate:
        try:
            zestimate = int(value.split('$')[-1].replace(',', ''))
        except ValueError:
            # Non-numeric text such as "None" or "$2,100/mo": skip this span
            # and keep looking instead of aborting the whole scrape.
            pass
        else:
            general.safeAppend(zillow_details, 'zestimate', zestimate)
            retrieved_zestimate = True

    # An empty label is a substring of every feature name and would fill all
    # of them at once; an empty value carries no information.
    if not label or not value:
        continue

    # Substring match, so a shortened label (e.g. "heating") still maps onto
    # its feature ("heating features"). Only the first value per feature is kept.
    # NOTE(review): a generic label such as "spaces" matches several features
    # at once -- tighten to equality if that turns out to be a problem.
    for feature in ZILLOW_FEATURES:
        if label in feature and details_found[feature] == 0:
            general.safeAppend(zillow_details, feature, value)
            details_found[feature] = 1 # Feature located

# Mark everything that was not found with the -1 sentinel.
if not retrieved_zestimate:
    general.safeAppend(zillow_details, 'zestimate', -1)
for feature in ZILLOW_FEATURES:
    if details_found[feature] == 0: # Still haven't found it
        general.safeAppend(zillow_details, feature, -1)

In [30]:
# Show the details scraped for the selected listing.
zillow_details

{'zestimate': [1008372],
 'basement': ['full,english,finished'],
 'flooring': ['hardwood'],
 'heating features': ['natural gas, forced air, zoned'],
 'cooling features': ['central air, zoned'],
 'laundry features': ['2nd floor laundry, flooring(porcelain tile), 2nd level, size(05x06)'],
 'total spaces': ['2'],
 'parking features': ['garage'],
 'garage spaces': ['2'],
 'covered spaces': ['2'],
 'construction materials': ['vinyl siding'],
 'year built': ['1996'],
 'attached garage': [-1],
 'hoa fee': [-1],
 'annual tax amount': [-1]}

In [None]:
# Look up the Zestimate of every listing and rank listings by
# (asking price - Zestimate); the most negative values are the best deals.
#
# NOTE(review): this cell used to read `df`, `s` and `req_headers`. `df` and
# `req_headers` are not defined anywhere in the visible notebook, and `s` is
# the session that was already closed by its `with` block. It now starts from
# `parsed_df`, opens its own session and uses zp.REQ_HEADERS, mirroring the
# single-listing request above -- confirm this is the intended input.

# Cleaned Zestimate strings whose length falls outside this range are treated
# as "no Zestimate available".
MIN_ZESTIMATE_CHARS = 5
MAX_ZESTIMATE_CHARS = 20


def fetch_zestimate_text(session, url):
    """Return the raw Zestimate text of one listing page, or '' if none is found.

    Looks for an <h4> containing "Home value" and takes the text of the next
    <p>; otherwise falls back to the last whitespace-separated token of the
    first element with class "zestimate". Missing elements yield '' instead of
    raising AttributeError/IndexError.
    """
    response = session.get(url, headers=zp.REQ_HEADERS, timeout=30)
    listing_soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): ':contains' is the deprecated spelling of ':-soup-contains'
    # in newer soupsieve releases; kept so older installs keep working.
    heading = listing_soup.select_one('h4:contains("Home value")')
    if heading is not None:
        value_tag = heading.find_next('p')
        return value_tag.get_text(strip=True) if value_tag is not None else ''
    fallback = listing_soup.select_one('.zestimate')
    tokens = fallback.text.split() if fallback is not None else []
    return tokens[-1] if tokens else ''


def non_zestimate(zestimate_result):
    """Return the row's 'zestimate' string, or '0' when it cannot be a price.

    `zestimate_result` is one row (as passed by DataFrame.apply with axis=1).
    Strings shorter than MIN_ZESTIMATE_CHARS or longer than MAX_ZESTIMATE_CHARS
    are replaced by '0'.
    """
    value = zestimate_result['zestimate']
    if MIN_ZESTIMATE_CHARS <= len(value) <= MAX_ZESTIMATE_CHARS:
        return value
    return '0'


# Scrape one Zestimate string per listing, in the order of parsed_df.
with requests.Session() as session:
    zillow_zestimate = [fetch_zestimate_text(session, url) for url in parsed_df['links']]

# Reuse parsed_df's index so the values stay attached to the right listing even
# if parsed_df does not have a default 0..n-1 index.
cols = ['zestimate']
zestimate_result = pd.DataFrame(zillow_zestimate, columns=cols, index=parsed_df.index)

# Strip currency formatting. regex=False makes '$' a literal character on every
# pandas version (as a regex it is the end-of-string anchor).
for token in ('$', '/mo', ','):
    zestimate_result['zestimate'] = zestimate_result['zestimate'].str.replace(token, '', regex=False)

# Convert rows without a usable Zestimate to '0'.
zestimate_result['zestimate'] = zestimate_result.apply(non_zestimate, axis=1)

# Attach the numeric Zestimate to the listings. assign() replaces an existing
# 'zestimate' column instead of duplicating it, so the cell can be re-run.
# Text that is still not numeric becomes 0, like the other missing values.
df = parsed_df.assign(
    zestimate=pd.to_numeric(zestimate_result['zestimate'], errors='coerce').fillna(0.0)
)

# Create the best_deal column and sort by it. Listings with a 0 Zestimate get
# best_deal == price, which pushes them towards the end of the ranking.
df['best_deal'] = df['prices'] - df['zestimate']
df = df.sort_values(by='best_deal')

df