In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as BS
import re
import math
import pandas as pd


In [2]:
## Function for creating the soup object
def get_soup_for_url(url):
    """Download `url` and parse the response into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch; passed straight to `urlopen`.

    Returns:
        The BeautifulSoup object built with the 'html.parser' backend.

    Raises:
        Whatever `urlopen` raises for an unreachable or rejected request
        (e.g. `urllib.error.URLError` / `HTTPError`).
    """
    # The response is used as a context manager so the underlying connection
    # is closed as soon as parsing finishes; this function is called once per
    # result page, so leaked handles would otherwise pile up.
    with urlopen(url) as htmlfile:
        # BeautifulSoup reads the whole response while constructing the tree,
        # so the tree stays usable after the response is closed.
        return BS(htmlfile, 'html.parser')

In [3]:
## Function for getting the house links and other metadata by iterating over the home cards
def get_home_links(soup, rent_sale, city):
    """Extract one record per home card found in a parsed results page.

    Args:
        soup: Parsed results page (the object returned by `get_soup_for_url`).
        rent_sale: "Sale" for for-sale listings; any other value is handled
            as a rental listing.
        city: City label copied unchanged into every record.

    Returns:
        A list of dicts with the keys 'price', 'address', 'link', 'zipcode',
        'city' and 'rent_or_sale'. The list is empty when the page has no
        'HomeViews' container.

    Raises:
        AttributeError: if a card lacks the price span or every candidate
            address span.
        TypeError / KeyError: if a card has no usable `<a href=...>` link.
    """
    # The 'HomeViews' container wraps the list of houses shown on the page.
    parent = soup.find('div', {'class': 'HomeViews'})
    if parent is None:
        # Nothing to parse on this page; report "no homes" instead of failing
        # with an opaque AttributeError on None.
        return []

    ## Each house is rendered as a card whose id carries the prefix
    # MapHomeCard_<ID>, so the prefix is used to collect every card on the page.
    cards = parent.find_all('div', {'id': re.compile('^MapHomeCard_')})

    # Address span classes to try, in priority order. Rental cards prefer the
    # property name and fall back to the address line when it is absent.
    if "Sale" == rent_sale:
        address_classes = ['collapsedAddress primaryLine']
    else:
        address_classes = ['propertyName collapsed', 'collapsedAddress primaryLine']

    property_list = []

    ## Iterating over the house cards and creating a dictionary per house
    for card in cards:
        address_span = None
        for css_class in address_classes:
            address_span = card.find('span', {'class': css_class})
            if address_span is not None:
                break

        entry = {}
        entry['price'] = card.find('span', {'class': 'homecardV2Price'}).text
        # Still raises AttributeError when no candidate span exists.
        entry['address'] = address_span.text
        entry['link'] = card.a['href']
        # NOTE(review): this is simply the last five characters of the address
        # text; when a rental card supplies a property name instead of a street
        # address this is presumably not a real zip code -- confirm downstream.
        entry['zipcode'] = entry['address'][-5:]
        entry['city'] = city
        entry['rent_or_sale'] = rent_sale
        property_list.append(entry)

    return property_list
        

In [4]:
# Load the list of Redfin search URLs (one row per city and listing type)
# and display it as the cell output.
df = pd.read_csv('redfine_url_links_citywise.csv')
df

Unnamed: 0,no,url,city,rent_sale
0,1,https://www.redfin.com/city/6671/CA/Fremont,Fremont,Sale
1,2,https://www.redfin.com/city/6671/CA/Fremont/ap...,Fremont,Rent
2,3,https://www.redfin.com/city/17420/CA/San-Jose,San Jose,Sale
3,4,https://www.redfin.com/city/17420/CA/San-Jose/...,San Jose,Rent
4,5,https://www.redfin.com/city/8439/CA/Hayward/ap...,Hayward,Rent
5,6,https://www.redfin.com/city/8439/CA/Hayward,Hayward,Sale
6,7,https://www.redfin.com/city/20321/CA/Union-City,Union City,Sale
7,8,https://www.redfin.com/city/20321/CA/Union-Cit...,Union City,Rent
8,9,https://www.redfin.com/city/17447/CA/San-Leandro,San Leandro,Sale
9,10,https://www.redfin.com/city/17447/CA/San-Leand...,San Leandro,Rent


In [5]:
# One dict per row of the URL table, keyed by column name.
target_cities = df.to_dict(orient='records')

In [7]:
## The master lists hold the house-link records collected from every URL in
## `target_cities`, grouped by listing type ('Sale' / 'Rent').

# Redfin's default number of homes per results page; used both to read the
# "<N> of <total>" banner and to derive how many pages must be fetched.
HOMES_PER_PAGE = 40

prop_master_list = {'Sale': [], 'Rent': []}

for target in target_cities:
    print(target)
    url = target['url']
    city = target['city']
    rent_sale = target['rent_sale']

    ## The base URL serves the first page (up to HOMES_PER_PAGE homes)
    soup = get_soup_for_url(url)
    # extend() appends in place; rebuilding the list with `+` on every page
    # would copy everything collected so far each time.
    prop_master_list[rent_sale].extend(get_home_links(soup, rent_sale, city))

    ## The summary banner carries the total home count, which decides how many
    ## further pages have to be requested for the same target area.
    summary_div = soup.find('div', {'class': 'homes summary'})
    summary = summary_div.get_text() if summary_div is not None else ''

    # Raw strings keep `\d` a regex token instead of an invalid string escape.
    # `\d[\d,]*` also accepts a comma-grouped number, in case the banner prints
    # large totals such as "1,234" (a plain `\d+` would stop at the comma).
    if "Sale" == rent_sale:
        count_pattern = str(HOMES_PER_PAGE) + r' of (\d[\d,]*)'
    else:
        count_pattern = r'(\d[\d,]*)  apartments for rent'

    # Fall back to the generic "<total>  homes" wording when the type-specific
    # wording is not present in the banner.
    count_match = re.search(count_pattern, summary) or re.search(r'(\d[\d,]*)  homes', summary)
    if count_match is None:
        # The total is unknown: keep the first page that was already collected
        # and move on rather than aborting the whole run.
        print('Could not read the total home count for ' + url + '; keeping the first page only')
        continue

    total_house = int(count_match.group(1).replace(',', ''))
    page_count = math.ceil(total_house / HOMES_PER_PAGE)

    ## Pages 2..page_count live under <url>/page-<n>; each page's homes are
    ## appended to the master list of the matching listing type.
    for page_number in range(2, page_count + 1):
        soup = get_soup_for_url(url + '/page-' + str(page_number))
        prop_master_list[rent_sale].extend(get_home_links(soup, rent_sale, city))
    


{'no': 1, 'url': 'https://www.redfin.com/city/6671/CA/Fremont', 'city': 'Fremont', 'rent_sale': 'Sale'}
{'no': 2, 'url': 'https://www.redfin.com/city/6671/CA/Fremont/apartments-for-rent', 'city': 'Fremont', 'rent_sale': 'Rent'}
{'no': 3, 'url': 'https://www.redfin.com/city/17420/CA/San-Jose', 'city': 'San Jose', 'rent_sale': 'Sale'}
{'no': 4, 'url': 'https://www.redfin.com/city/17420/CA/San-Jose/apartments-for-rent', 'city': 'San Jose', 'rent_sale': 'Rent'}
{'no': 5, 'url': 'https://www.redfin.com/city/8439/CA/Hayward/apartments-for-rent', 'city': 'Hayward', 'rent_sale': 'Rent'}
{'no': 6, 'url': 'https://www.redfin.com/city/8439/CA/Hayward', 'city': 'Hayward', 'rent_sale': 'Sale'}
{'no': 7, 'url': 'https://www.redfin.com/city/20321/CA/Union-City', 'city': 'Union City', 'rent_sale': 'Sale'}
{'no': 8, 'url': 'https://www.redfin.com/city/20321/CA/Union-City/apartments-for-rent', 'city': 'Union City', 'rent_sale': 'Rent'}
{'no': 9, 'url': 'https://www.redfin.com/city/17447/CA/San-Leandro',

In [None]:

# Write every collected for-sale record to a CSV file.
sale_records = prop_master_list['Sale']
df = pd.DataFrame(sale_records)
df.to_csv('house_links_sale.csv', sep=',', encoding='utf-8')

In [None]:

# Write every collected rental record to a CSV file.
rent_records = prop_master_list['Rent']
df = pd.DataFrame(rent_records)
df.to_csv('house_links_rent.csv', sep=',', encoding='utf-8')