In [1]:
import pandas as pd
import math
import requests
from bs4 import BeautifulSoup

In [13]:
# Entry point of the crawl: the apartments.com listing for Chicago, IL
# (page 1, judging by the trailing path segment of the URL).
starting_url = "https://www.apartments.com/chicago-il/1/"

def main_crawler(starting_url):
    """
    Crawl the paginated listing that begins at ``starting_url`` and collect
    every property link found along the way.

    Each page is handed to ``mini_crawler``, which returns the url of the
    next page (or None when there is none) together with the property links
    found on that page. The starting page is always crawled. Crawling stops
    when there is no next page, or when the next page has already been
    visited, so a pagination cycle cannot keep the loop running forever.

    Input:
        starting_url: the url of the page where the crawl starts

    Output:
        all_retrieves: a list with the links of all properties, in the
            order in which the pages were visited
    """
    all_retrieves = []
    visited_pages = set()
    page_to_check = starting_url

    while True:
        visited_pages.add(page_to_check)
        page_to_check, apt_links = mini_crawler(page_to_check)
        all_retrieves.extend(apt_links)

        # stop on the last page, or when pagination points back to a page
        # that was already crawled
        if page_to_check is None or page_to_check in visited_pages:
            break

    return all_retrieves

def mini_crawler(url):
    """
    Extract the links of all properties listed on a single result page.

    Input:
        url: the url of the page we hope to check

    Output:
        link_next_page: the url of the next result page, taken from the
            page's <link rel="next"> tag, or None if the page has no such tag
        list_apt_link: a list of the properties' links found on this page;
            listings without an anchor or without an href are skipped
    """

    # download and parse the html of the current page
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=300)
    soup = BeautifulSoup(response.text, "html5lib")

    # link of the next page (looked up once and reused)
    next_page = soup.find("link", rel="next")
    link_next_page = None if next_page is None else next_page.get("href")

    # links of all properties within the current page
    list_apt_link = []
    for house in soup.find_all("li", class_="mortar-wrapper"):
        anchor = house.find("a")
        if anchor is None:
            # find() returns None when the listing has no <a>; skip it
            # instead of failing the whole crawl with an AttributeError
            continue
        apt_link = anchor.get("href")
        if apt_link:
            list_apt_link.append(apt_link)

    return link_next_page, list_apt_link

# Run the crawl from starting_url and keep the list of property links it returns.
all_properties = main_crawler(starting_url)

In [49]:
# names of the features tracked for each property
features = [
    "property name", "address",
    "price range", "room area",
    "bedroom", "bathroom",
    "description", "security level",
]

In [14]:
def page_crawler(single_property_url):
    """
    Scrape the page of one property and pull out its listing details.

    Input:
        single_property_url: the url of a single property page

    Output:
        a tuple (property_name, address, longitude, latitude, price_range,
        bedroom, bathroom, room_area); every element is text read from
        the page
    """
    request_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"}
    page = requests.get(single_property_url, headers=request_headers, timeout=300)
    soup = BeautifulSoup(page.text, 'html.parser')

    # coordinates are read from the page's <meta> tags
    longitude = soup.find('meta', property="place:location:longitude")['content']
    latitude = soup.find('meta', property="place:location:latitude")['content']

    # name of the property, taken from the page heading
    property_name = soup.find('h1', class_='propertyName').get_text().strip()

    # address: the first span is used as the street, the next two are joined
    # into the city / state / zip part
    address_container = soup.find('div', class_='propertyAddressContainer')
    address_parts = [span.get_text() for span in address_container.find_all('span')[0:3]]
    street = address_parts[0]
    city_state_zip = (address_parts[1] + address_parts[2]).replace('\n', ' ').strip()
    address = ', '.join([street, city_state_zip])

    # the first four rent details are read, in page order, as
    # price range, bedroom, bathroom and room area
    rent_details = soup.find_all('p', class_='rentInfoDetail')
    price_range, bedroom, bathroom, room_area = [
        rent_details[position].get_text() for position in range(4)
    ]

    # description: not extracted yet (p.property-amenities in outside page!)
    # security level: to be computed later from the crime data

    return (
        property_name,
        address,
        longitude,
        latitude,
        price_range,
        bedroom,
        bathroom,
        room_area,
    )

In [16]:
# let's test the code on the first ten links
# (the end of a slice is exclusive, so [:10] -- not [0:9] -- yields ten items)
for property_link in all_properties[:10]:
    p, a, long, lat, pr, be, ba, ra = page_crawler(property_link)
    print(p, a, long, lat, pr, be, ba, ra)

The Residences at NewCity 1457 N Halsted St, Chicago IL 60642 -87.64773 41.90801 $2,126 - $4,215 Studio - 2 bd 1 - 2 ba 590 - 1,315 sq ft
Elevate 930 W Altgeld St, Chicago IL 60614 -87.65243 41.9276 $1,895 - $14,435 Studio - 3 bd 1 - 2.5 ba 464 - 2,616 sq ft
1407 On Michigan 1407 S Michigan Ave, Chicago IL 60605 -87.6236 41.86377 $1,750 - $5,650 Studio - 3 bd 1 - 2 ba 446 - 1,687 sq ft
The Pavilion 5441 NE NEast River Rd, Chicago IL 60656 -87.84335 41.97831 $1,100 - $2,275 Studio - 4 bd 1 - 3 ba 502 - 2,317 sq ft
Gild 1206-1212 N State Pky, Chicago IL 60610 -87.62905 41.90423 $2,260 - $6,322 Studio - 2 bd 1 - 2 ba 463 - 1,272 sq ft
Residences at 8 East Huron 8 E Huron St, Chicago IL 60611 -87.62791 41.89508 $3,050 - $6,550 1 - 3 bd 1 - 3.5 ba 540 - 2,680 sq ft
Union West 939 W Washington Blvd, Chicago IL 60607 -87.65139 41.88281 $2,035 - $6,000 Studio - 2 bd 1 - 2 ba 482 - 1,122 sq ft
Optima Signature 220 E Illinois St, Chicago IL 60611 -87.62163 41.8914 $2,485 - $9,487 Studio - 3 bd 1

In [13]:
# load the crime data, reading only the case id and the coordinates,
# and discard every row that has a missing value in those columns
columns = ["CASE#", "LATITUDE", "LONGITUDE"]
crime = pd.read_csv("Crimes_Data.csv", usecols=columns).dropna()
crime

CASE#           0
LATITUDE     2884
LONGITUDE    2884
dtype: int64


Unnamed: 0,CASE#,LATITUDE,LONGITUDE
2,JE266628,41.748486,-87.602675
3,JE266536,41.880661,-87.731186
5,JE267466,41.871540,-87.705839
6,JE266473,41.780851,-87.649674
7,JE267222,41.859989,-87.735995
...,...,...,...
205762,JE492545,41.902703,-87.629950
205763,JF103181,41.731960,-87.636205
205764,JF102974,41.904582,-87.714655
205765,JF103076,41.987483,-87.712672


In [22]:

#check whether a crime incident lies within a given radius of a location
#haversine formula, from website: https://www.geeksforgeeks.org/program-distance-two-points-earth/#:~:text=For%20this%20divide%20the%20values,is%20the%20radius%20of%20Earth.
def if_crime_in_neighborhood(lo1, la1, lo2, la2, radius):
    """
    Tell whether two points on Earth are at most ``radius`` miles apart.

    The great-circle distance is computed with the haversine formula.

    Input:
        lo1, la1: longitude and latitude of the first point, in degrees
        lo2, la2: longitude and latitude of the second point, in degrees
        radius: the largest distance, in miles, still counted as "inside"

        Coordinates may be numbers or numeric strings; they are converted
        with float(), so values scraped as text can be passed directly.

    Output:
        True if the distance between the points is <= radius, else False

    Raises:
        ValueError: if a coordinate string cannot be parsed as a number
    """
    lo1, la1, lo2, la2 = (
        math.radians(float(value)) for value in (lo1, la1, lo2, la2)
    )

    lo_dist = lo1 - lo2
    la_dist = la1 - la2

    a = math.sin(la_dist / 2) ** 2 + math.cos(la1) * math.cos(la2) * math.sin(lo_dist / 2) ** 2
    # floating point rounding can push `a` marginally outside [0, 1], which
    # would make sqrt/asin raise a domain error; clamp it back into range
    a = min(1.0, max(0.0, a))

    # central angle between the two points, in radians
    c = 2 * math.asin(math.sqrt(a))

    earth_r_in_mile = 3956

    return c * earth_r_in_mile <= radius


def find_lo_range(lo1, la1):
    """
    Placeholder for a helper that has not been written yet.

    NOTE(review): judging by the name and parameters, this is presumably
    meant to compute a longitude range around the point (lo1, la1) --
    confirm the intended behavior before implementing.

    Raises:
        NotImplementedError: always, until the function is implemented
    """
    # a `def` without a body is a SyntaxError; fail explicitly instead
    raise NotImplementedError("find_lo_range is not implemented yet")



In [None]:
#data visualization