# Example showing usage of BeautifulSoup to extract/crawl data from a website

BeautifulSoup is a Python package for parsing HTML and XML documents. It creates a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping.

In [1]:
# import statements
import requests
from bs4 import BeautifulSoup
import pandas as pd

Fetch the web page using the requests library

In [2]:
# fetch web page
# A timeout keeps this cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops the notebook on an HTTP error instead
# of letting an error page be parsed as if it were the agent listing.
response = requests.get("https://www.realtor.com/realestateagents/san-jose_ca", timeout=30)
response.raise_for_status()
# `r` holds the raw response body (bytes), as before.
r = response.content

In [3]:
# Build the parse tree for the downloaded page with the lxml parser
soup = BeautifulSoup(markup=r, features="lxml")

In [4]:
# Collect every agent card <div> on the page
agent_card_attrs = {"class": "agent-list-card clearfix"}
agents = soup.find_all("div", attrs=agent_card_attrs)
agent_count = len(agents)
print('Number of Agents:', agent_count)

Number of Agents: 20


Print the details of the first agent. Observe the output: it is used to identify the tags to fetch data from.

In [5]:
# Show the indented markup of the first agent card
first_agent_markup = agents[0].prettify()
print(first_agent_markup)

<div class="agent-list-card clearfix" data-linkname="realtors:srp_list:agent_list_card:whole_card" data-url="/realestateagents/gagan-singh_pleasanton_ca_2049387_112984048" onclick="myFunction()">
 <div class="agent-list-card-img-wrapper col-lg-2 col-sm-3 col-xxs-4">
  <a href="/realestateagents/gagan-singh_pleasanton_ca_2049387_112984048">
   <div class="agent-list-card-img">
    <img alt="Gagan Singh" class="lazy" data-original="https://ap.rdcpix.com/1028087148/02dd2af750889f4adf03bc448bab0321a-e0od-r7_w110.jpg" src="//static-far.rdc.moveaws.com/assets/Empty_AgentPhoto-076002f5c5b4a35ccaab1efd3d71b3565ddf293c040a602f2267d45b8e438849.svg"/>
   </div>
  </a>
 </div>
 <div class="agent-list-card-title col-lg-3 col-sm-4 col-xxs-12 clearfix" itemscope="" itemtype="//schema.org/RealEstateAgent">
  <div class="agent-list-card-title-text clearfix">
   <link content="https://ap.rdcpix.com/1028087148/02dd2af750889f4adf03bc448bab0321a-e0od-r7_w110.jpg" itemprop="image"/>
   <meta content="San Ra

Extracting firm address from the crawled web page

In [6]:
# Re-parse the agent cards as a single document so they can be searched
# together. The parser is named explicitly (the same "lxml" used for the full
# page) so the result does not depend on which parsers happen to be installed
# and bs4 does not warn that no parser was specified.
sub_soup = BeautifulSoup(str(agents), "lxml")

In [7]:
# All <span itemprop="address"> elements found inside the agent cards
address_attrs = {"itemprop": "address"}
address = sub_soup.find_all("span", attrs=address_attrs)

In [8]:
# Build one "street, locality, region, postal code" string per address span.
# The <meta itemprop=...> children are looked up directly on the already-parsed
# span; re-parsing str(span) is unnecessary and, without a parser argument,
# triggers a bs4 warning on every iteration.
address_fields = ("streetAddress", "addressLocality", "addressRegion", "postalCode")

addresses = []
# NOTE(review): only every second span is used -- presumably each agent card
# carries its address span twice; confirm against the page markup.
for i in range(0, len(address), 2):
    parts = []
    for field in address_fields:
        meta = address[i].find("meta", {"itemprop": field})
        # A missing <meta> tag or missing content attribute becomes an empty
        # string so the address still has four comma-separated slots.
        content = meta.get("content") if meta is not None else None
        parts.append(content or "")

    addresses.append(', '.join(parts))

In [9]:
# Show the collected addresses followed by how many there are
address_count = len(addresses)
print(addresses, address_count)

['900 Main St, Pleasanton, CA, 94566', '12029 Saratoga Sunnyvale Rd, Saratoga, CA, 95070', '5580 Almaden Expressway, San Jose, CA, 95118', '1041 41st Avenue, Santa Cruz, CA, 95062', '3340 Walnut Ave Ste 110, Fremont, CA, 94538', '11 Innes Court, San Francisco, CA, 94124', '12029 Saratoga Sunnyvale Rd, Saratoga, CA, 95070', '5898 SILVER CREEK VALLEY RD, SAN JOSE, CA, 95138', '16268 LOS GATOS BLVD, LOS GATOS, , 95032', '19400 STEVENS CREEK BLVD STE 200, CUPERTINO, CA, 95014', '10105 S De Anza Blvd, Cupertino, CA, 95014', '42820 ALBRAE ST, FREMONT, CA, 94538', '1712 Meridian Ave, San Jose, CA, 95125', '750 UNIVERSITY AVE STE 150 , Los Gatos, CA, 95032', ', , , ', ', , , ', '1975 HAMILTON AVE STE 25, SAN JOSE, CA, 95125', '950 Tennant Station, Morgan Hill, CA, 95037', '1712 Meridian Ave, San Jose, CA, 95125', '467 1st Street #200, Los Altos, CA, 94022'] 20


Extracting agent name from the crawled web page

In [10]:
# One entry per agent card: the stripped text of its "agent-name" <div>, or
# None when the card has no such element (keeps the list aligned with `agents`).
name = []

for agent in agents:
    # Search the card's Tag directly; wrapping it in a new BeautifulSoup object
    # re-parses the same markup and warns because no parser is specified.
    names = agent.find("div", {"class":"agent-name"})
    if names is not None:
        name.append(names.text.strip())
    else:
        name.append(None)

Extracting the firm each agent works for from the crawled web page

In [11]:
# One entry per agent card: the stripped text of its "agent-group" <div> (the
# firm), or None when the card has no such element.
group = []

for agent in agents:
    # Search the card's Tag directly instead of re-parsing its markup.
    groups = agent.find("div", {"class":"agent-group"})
    # Check the element found for THIS agent; checking any other variable
    # either crashes on a missing firm or records the wrong value.
    if groups is not None:
        group.append(groups.text.strip())
    else:
        group.append(None)

Extracting agent/firm phone number from the crawled web page

In [12]:
# One entry per agent card: the stripped text of its "agent-phone" <div>, or
# None when the card has no such element.
phone = []

for agent in agents:
    # Search the card's Tag directly instead of re-parsing its markup.
    phones = agent.find("div", {"class":"agent-phone"})
    # Check the element found for THIS agent; checking any other variable
    # either crashes on a missing phone or records the wrong value.
    if phones is not None:
        phone.append(phones.text.strip())
    else:
        phone.append(None)

Extracting agent's experience from the crawled web page

In [13]:
# One entry per agent card: the experience text, or None when it is absent.
exp = []

for agent in agents:
    # Search the card's Tag directly instead of re-parsing its markup.
    exps = agent.find("div", {"class":"agent-detail-item ellipsis hidden-xs hidden-xxs"})
    if exps is None:
        exp.append(None)
        continue

    # The wanted value is the second non-blank line of the element's text
    # (presumably the first line is a label -- confirm against the markup).
    # Splitting on line breaks and stripping each line does not depend on the
    # exact indentation in the HTML, and a one-line element yields None
    # instead of raising IndexError.
    text_lines = [line.strip() for line in exps.text.splitlines() if line.strip()]
    exp.append(text_lines[1] if len(text_lines) > 1 else None)

Storing the extracted data into a dataframe

In [14]:
# Combine the per-agent lists row by row into a table; zip stops at the
# shortest list, so the frame has as many rows as the shortest input.
column_names = ['Name','Firm','Experience','Phone','Address']
agent_rows = list(zip(name, group, exp, phone, addresses))
df = pd.DataFrame(agent_rows, columns=column_names)
df

Unnamed: 0,Name,Firm,Experience,Phone,Address
0,Gagan Singh,Compass,28 years 11 months,(925) 212-3478,"900 Main St, Pleasanton, CA, 94566"
1,Steve McCarrick,Coldwell Banker Residential Brokerage - Saratoga,,(408) 656-0788,"12029 Saratoga Sunnyvale Rd, Saratoga, CA, 95070"
2,Mike D'Ambrosio,Intero Almaden,12 years,(408) 630-0101,"5580 Almaden Expressway, San Jose, CA, 95118"
3,Genie Lawless,David Lyng Real Estate,18 years,(831) 464-4447,"1041 41st Avenue, Santa Cruz, CA, 95062"
4,Everett Eslinger\n \nTeam,Coldwell Banker Realty - Fremont,48 years,(510) 608-7636,"3340 Walnut Ave Ste 110, Fremont, CA, 94538"
5,Bernadette Troncales Weir,Lennar - The San Francisco Shipyard,17 years,(510) 290-2915,"11 Innes Court, San Francisco, CA, 94124"
6,Maryam Tabatabaei,Coldwell Banker Residential Brokerage - Saratoga,,(408) 872-3524,"12029 Saratoga Sunnyvale Rd, Saratoga, CA, 95070"
7,William Chea,Intero Real Estate Services,17 years 9 months,(408) 373-4424,"5898 SILVER CREEK VALLEY RD, SAN JOSE, CA, 95138"
8,LILA KAZEMI,Compass,21 years,(408) 206-4536,"16268 LOS GATOS BLVD, LOS GATOS, , 95032"
9,Prashant Vanka\n \nTeam,Keller Williams Realty - Cupertino,10 years 2 months,(650) 460-1188,"19400 STEVENS CREEK BLVD STE 200, CUPERTINO, C..."
