In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as BS
import re
import math
import pandas as pd
import traceback 
import numpy as np

In [2]:
def get_soup_for_url(url):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address to fetch; it is handed to ``urlopen`` unchanged.

    Returns
    -------
    BeautifulSoup
        The document parsed with the ``'html.parser'`` backend.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    """
    # The context manager closes the HTTP response once parsing has consumed
    # it, instead of leaving one open connection per scraped page.
    with urlopen(url) as htmlfile:
        return BS(htmlfile, 'html.parser')

In [3]:

# Empty module-level list (not referenced by the other cells shown here).
property_data = []

def get_table_values(soup, prefix=''):
    """Collect label/value pairs from every ``div.table-row`` under ``soup``.

    For each row, the text of its ``span.table-label`` (with ``prefix``
    prepended) becomes the key and the text of its ``div.table-value`` becomes
    the value.  A later row with the same label overwrites an earlier one.

    Raises ``AttributeError`` when a row lacks either element, because the
    ``.text`` access is then made on the ``None`` returned by ``find``.
    """
    table = {}
    for entry in soup.find_all('div', {'class': 'table-row'}):
        value_text = entry.find('div', {'class': 'table-value'}).text
        label_text = entry.find('span', {'class': 'table-label'}).text
        table[prefix + label_text] = value_text
    return table

In [4]:

def get_head_stats(soup, id):
    """Return the text of the ``statsValue`` div inside a header stat block.

    Parameters
    ----------
    soup
        Parsed page (or any tag) to search.
    id : str
        Value of the ``data-rf-test-id`` attribute that identifies the stat
        block, e.g. ``'abp-price'``.  The name shadows the ``id`` builtin; it
        is kept so existing keyword callers keep working.

    Returns
    -------
    str
        ``.text`` of the first ``div.statsValue`` found inside that block.

    Raises
    ------
    AttributeError
        When either lookup finds nothing and the next attribute access is
        made on ``None``.
    """
    stat_block = soup.find('div', {'data-rf-test-id': id})
    return stat_block.find('div', {'class': 'statsValue'}).text


In [5]:

def get_sale_home_data(soup):
    """Extract the key-detail tables of a for-sale listing page.

    The label/value rows of the first two ``div.keyDetailsList`` sections are
    merged into a single dict; on a key clash the second section wins.

    Parameters
    ----------
    soup
        Parsed listing page.

    Returns
    -------
    dict
        Row label -> row value text, as produced by ``get_table_values``.

    Raises
    ------
    IndexError
        When the page has fewer than two ``div.keyDetailsList`` sections.
    """
    # Query the document once rather than repeating the same find_all call
    # for each section.
    detail_sections = soup.find_all('div', {'class': 'keyDetailsList'})

    # A plain local name is used so the ``property`` builtin is not shadowed.
    sale_details = {}
    sale_details.update(get_table_values(detail_sections[0]))  # first section: price details
    sale_details.update(get_table_values(detail_sections[1]))  # second section: house details
    return sale_details

In [6]:
def get_pet_fee_details(fee):
    """Gather the pet-policy rows from every ``div.PetsBlock`` under ``fee``.

    For each block, the text of its ``div.block-label`` followed by ``'_'`` is
    used as the key prefix for the rows that ``get_table_values`` reads from
    the block's ``div.listed-pet-info`` element.  All blocks are merged into
    one dict, which is returned.
    """
    pet_details = {}
    for block in fee.find_all('div', {'class': 'PetsBlock'}):
        kind_label = block.find('div', {'class': 'block-label'}).text
        info_table = block.find('div', {'class': 'listed-pet-info'})
        pet_details.update(get_table_values(info_table, kind_label + '_'))
    return pet_details

In [7]:
def get_parking_fee_details(soup):
    """Count the parking options present on a listing page.

    Parameters
    ----------
    soup
        Parsed listing page.

    Returns
    -------
    dict
        ``{'parking_size': n}`` where ``n`` is the number of
        ``div.ParkingTypeBlock`` elements under ``soup``; ``n`` stays ``0``
        when the lookup raises.
    """
    values = {'parking_size': 0}
    try:
        parking_blocks = soup.find_all('div', {'class': 'ParkingTypeBlock'})
        values['parking_size'] = len(parking_blocks)
    except Exception:
        # Best effort: keep the default of 0 but show what went wrong.
        # ``traceback.exc`` does not exist, so the old handler raised an
        # AttributeError of its own; ``print_exc`` is the actual API.
        traceback.print_exc()
    return values
   
       

In [8]:
def get_leasing_fee_details(soup):
    """Extract the lease-term rows from a rental listing page.

    Parameters
    ----------
    soup
        Parsed listing page.

    Returns
    -------
    dict
        The rows of the ``div.DPTableDisplay listed-lease-info`` table inside
        ``div.LeaseTermBlock``, with every key prefixed by ``'lease_'``.
        When the block or table cannot be read, ``{'lease_deposit': np.nan}``
        is returned instead.
    """
    lease = soup.find('div', {'class': 'LeaseTermBlock'})
    try:
        lease_table = lease.find('div', {'class': 'DPTableDisplay listed-lease-info'})
        return get_table_values(lease_table, 'lease_')
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit are not swallowed while scraping.
        return {'lease_deposit': np.nan}

In [9]:
def get_aminities(aminities):
    """Count the in-unit and community amenities listed on a rental page.

    Parameters
    ----------
    aminities
        Parsed page (or subtree) containing the ``div.AmenitiesBlock``
        sections.  The parameter's spelling is kept for existing callers.

    Returns
    -------
    dict
        ``in_unit_aminity_count`` and ``community_aminity_count``: the number
        of ``<li>`` items in the block whose ``<h3>`` text is
        ``'In-unit amenities'`` / ``'Community amenities'``.  A count stays
        ``0`` when its block is absent or when parsing raises.
    """
    values = {'in_unit_aminity_count': 0, 'community_aminity_count': 0}
    count_key_by_heading = {
        'In-unit amenities': 'in_unit_aminity_count',
        'Community amenities': 'community_aminity_count',
    }

    try:
        # Search the argument itself.  The earlier code discarded it and read
        # a module-level ``soup``, so the result depended on whichever cell
        # had last assigned that global.
        for block in aminities.find_all('div', {'class': 'AmenitiesBlock'}):
            count_key = count_key_by_heading.get(block.find('h3').text)
            if count_key is not None:
                values[count_key] = len(block.find_all('li'))
    except Exception:
        # ``traceback.exc`` does not exist; ``print_exc`` reports the error
        # while the counts gathered so far are still returned.
        traceback.print_exc()

    return values


In [10]:
def get_rent_home_data(soup):
    """Merge the rental-specific fields scraped from one listing page.

    The parking, amenity and lease dictionaries are computed in that order
    from the same ``soup`` and combined into one dict; on a key clash the
    later source wins.
    """
    return {
        **get_parking_fee_details(soup),
        **get_aminities(soup),
        **get_leasing_fee_details(soup),
    }

In [11]:
# Load the listing index from house_links.csv.  The displayed frame carries
# price, address, link, zipcode, city and rent_or_sale columns, plus two
# "Unnamed" columns that repeat the row index.
df = pd.read_csv('house_links.csv')
# Bare expression: renders the frame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,price,address,link,zipcode,city,rent_or_sale
0,0,"$1,999,888","41247 Apricot Ln, Fremont, CA 94539",/CA/Fremont/41247-Apricot-Ln-94539/home/1637276,94539,Fremont,Sale
1,1,"$1,180,000","34116 Pavia Ter, Fremont, CA 94555",/CA/Fremont/34116-Pavia-Ter-94555/home/17183337,94555,Fremont,Sale
2,2,"$1,499,888","36778 Oak St, Fremont, CA 94536",/CA/Fremont/36778-Oak-St-94536/home/1050807,94536,Fremont,Sale
3,3,"$598,500","4046 Abbey Ter #113, Fremont, CA 94536",/CA/Fremont/4046-Abbey-Ter-94536/unit-113/home...,94536,Fremont,Sale
4,4,"$799,888","345 Torrano Cmn, Fremont, CA 94536",/CA/Fremont/345-Torrano-Cmn-94536/home/1878408,94536,Fremont,Sale
...,...,...,...,...,...,...,...
4397,4397,"$2,100/mo","468 90th St Unit 205, Daly City, CA 94015",/CA/Daly-City/468-90th-St-94015/unit-205/home/...,94015,Daly City,Rent
4398,4398,"$3,150/mo","468 90th St Unit 204, Daly City, CA 94015",/CA/Daly-City/468-90th-St-94015/unit-204/apart...,94015,Daly City,Rent
4399,4399,"$3,999/mo","40 John Glenn Cir, Daly City, CA 94015",/CA/Daly-City/40-John-Glenn-Cir-94015/home/173...,94015,Daly City,Rent
4400,4400,"$2,650/mo","372 Imperial Way #1, Daly City, CA 94015",/CA/Daly-City/372-Imperial-Way-94015/unit-1/ho...,94015,Daly City,Rent


In [12]:
# Keep only the rows whose rent_or_sale column equals 'Sale'.
sale_df = df[df['rent_or_sale']== 'Sale']
sale_df

Unnamed: 0.1,Unnamed: 0,price,address,link,zipcode,city,rent_or_sale
0,0,"$1,999,888","41247 Apricot Ln, Fremont, CA 94539",/CA/Fremont/41247-Apricot-Ln-94539/home/1637276,94539,Fremont,Sale
1,1,"$1,180,000","34116 Pavia Ter, Fremont, CA 94555",/CA/Fremont/34116-Pavia-Ter-94555/home/17183337,94555,Fremont,Sale
2,2,"$1,499,888","36778 Oak St, Fremont, CA 94536",/CA/Fremont/36778-Oak-St-94536/home/1050807,94536,Fremont,Sale
3,3,"$598,500","4046 Abbey Ter #113, Fremont, CA 94536",/CA/Fremont/4046-Abbey-Ter-94536/unit-113/home...,94536,Fremont,Sale
4,4,"$799,888","345 Torrano Cmn, Fremont, CA 94536",/CA/Fremont/345-Torrano-Cmn-94536/home/1878408,94536,Fremont,Sale
...,...,...,...,...,...,...,...
4356,4356,"$1,150,000","151 Wellington Ave, Daly City, CA 94014",/CA/Daly-City/151-Wellington-Ave-94014/home/20...,94014,Daly City,Sale
4357,4357,"$850,000","127 Wellington Ave, Daly City, CA 94014",/CA/Daly-City/127-Wellington-Ave-94014/home/13...,94014,Daly City,Sale
4358,4358,"$875,000","475 Irvington St, Daly City, CA 94014",/CA/Daly-City/475-Irvington-St-94014/home/2054701,94014,Daly City,Sale
4359,4359,"$1,250,000","68 Lausanne Ave, Daly City, CA 94014",/CA/Daly-City/68-Lausanne-Ave-94014/home/1207054,94014,Daly City,Sale


In [13]:
# Keep only the rows whose rent_or_sale column equals 'Rent'.
# NOTE(review): in the displayed output, rows whose address is a community
# name have zipcode values such as 'ents' or 'lage' -- presumably the tail of
# the address text rather than a real zip code; confirm where house_links.csv
# is produced.
rent_df = df[df['rent_or_sale']== 'Rent']
rent_df

Unnamed: 0.1,Unnamed: 0,price,address,link,zipcode,city,rent_or_sale
185,185,"$3,020+ /mo","Palmia, Aged 55+ Luxury Apartments",/CA/Fremont/Palmia-Aged-55-Luxury-Apartments/a...,ents,Fremont,Rent
186,186,"$2,551+ /mo",Creekside Village,/CA/Fremont/Creekside-Village/apartment/21968393,lage,Fremont,Rent
187,187,"$2,250+ /mo",Sundale North And South,/CA/Fremont/Sundale-North-And-South/apartment/...,outh,Fremont,Rent
188,188,"$2,300/mo","39993 Fremont Blvd, Fremont, CA 94538",/CA/Fremont/39993-Fremont-Blvd-94538/home/1426751,94538,Fremont,Rent
189,189,"$2,430+ /mo",Pebble Creek Communities,/CA/Fremont/Pebble-Creek-Communities/apartment...,ties,Fremont,Rent
...,...,...,...,...,...,...,...
4397,4397,"$2,100/mo","468 90th St Unit 205, Daly City, CA 94015",/CA/Daly-City/468-90th-St-94015/unit-205/home/...,94015,Daly City,Rent
4398,4398,"$3,150/mo","468 90th St Unit 204, Daly City, CA 94015",/CA/Daly-City/468-90th-St-94015/unit-204/apart...,94015,Daly City,Rent
4399,4399,"$3,999/mo","40 John Glenn Cir, Daly City, CA 94015",/CA/Daly-City/40-John-Glenn-Cir-94015/home/173...,94015,Daly City,Rent
4400,4400,"$2,650/mo","372 Imperial Way #1, Daly City, CA 94015",/CA/Daly-City/372-Imperial-Way-94015/unit-1/ho...,94015,Daly City,Rent


In [14]:
 
# Convert each frame to a list with one dict per row (column name -> value);
# the scraping loops iterate over these and read the 'link' and 'city' keys.
rent_houses = rent_df.to_dict('records')
sale_houses = sale_df.to_dict('records')



In [15]:
def get_common_props(soup,city):
    """Build the fields shared by rental and sale listings from a parsed page.

    Reads the address from ``div.bp-homeAddress``, the price / beds / baths
    header stats through ``get_head_stats``, and the area from the
    ``span.statsValue`` inside the ``abp-sqFt`` block (with ``' Sq ft'``
    appended).  ``city`` is stored as given, and ``zipcode`` is the last five
    characters of the address text.

    Returns a dict with the keys ``address``, ``price``, ``beds``, ``baths``,
    ``area``, ``city`` and ``zipcode``, in that order.
    """
    address_text = soup.find('div', {'class': 'bp-homeAddress'}).text
    price_text = get_head_stats(soup, 'abp-price')
    beds_text = get_head_stats(soup, 'abp-beds')
    baths_text = get_head_stats(soup, 'abp-baths')

    sqft_block = soup.find('div', {'data-rf-test-id': 'abp-sqFt'})
    area_text = sqft_block.find('span', {'class': 'statsValue'}).text + ' Sq ft'

    return {
        'address': address_text,
        'price': price_text,
        'beds': beds_text,
        'baths': baths_text,
        'area': area_text,
        'city': city,
        'zipcode': address_text[-5:],
    }

In [16]:

# Scrape every rental listing.  A listing whose page cannot be fetched or
# parsed is skipped, and the reason is recorded rather than silently dropped.
rent_property_data = []
rent_scrape_failures = []  # (link, repr(error)) for each skipped listing

for rent_house in rent_houses:
    try:
        soup = get_soup_for_url('https://www.redfin.com' + rent_house['link'])
        # Not named ``property``: that would shadow the builtin at module level.
        rent_property = get_common_props(soup, rent_house['city'])
        rent_property.update(get_rent_home_data(soup))
        rent_property_data.append(rent_property)
    except Exception as exc:
        # ``Exception`` instead of a bare ``except`` so that interrupting the
        # kernel actually stops this long-running loop.
        rent_scrape_failures.append((rent_house.get('link'), repr(exc)))



In [17]:

# Build a frame from the scraped rental records and write it to
# rent_property_data.csv (the frame's index is written as the first column).
# NOTE(review): this rebinds ``df``, which earlier held the house_links.csv
# frame -- re-running the Sale/Rent filter cells after this one would filter
# the wrong frame.
df = pd.DataFrame(rent_property_data)
df.to_csv('rent_property_data.csv', sep=',', encoding='utf-8')



In [18]:
# Scrape every for-sale listing.  A listing whose page cannot be fetched or
# parsed is skipped, and the reason is recorded rather than silently dropped.
sale_property_data = []
sale_scrape_failures = []  # (link, repr(error)) for each skipped listing

for sale_house in sale_houses:
    try:
        soup = get_soup_for_url('https://www.redfin.com' + sale_house['link'])
        # Not named ``property``: that would shadow the builtin at module level.
        sale_property = get_common_props(soup, sale_house['city'])
        sale_property.update(get_sale_home_data(soup))
        sale_property_data.append(sale_property)
    except Exception as exc:
        # ``Exception`` instead of a bare ``except`` so that interrupting the
        # kernel actually stops this long-running loop.
        sale_scrape_failures.append((sale_house.get('link'), repr(exc)))


In [19]:

# Build a frame from the scraped sale records and write it to
# sale_prop_data.csv (the frame's index is written as the first column).
# NOTE(review): ``df`` is rebound again here; after this cell it no longer
# refers to the frame read from house_links.csv.
df = pd.DataFrame(sale_property_data)
df.to_csv('sale_prop_data.csv', sep=',', encoding='utf-8')

In [20]:
# Completion marker: prints once the cells above have finished running.
print('done')

done


In [21]:
# Number of sale listings scraped successfully (failed pages were skipped
# by the loop and never appended).
len(sale_property_data)

2403