In [1]:
import pandas as pd
import math
import requests
from bs4 import BeautifulSoup

In [2]:
# Listing page the crawl starts from; it is passed to
# collect_all_apts_links() further down in this cell.
starting_url = "https://www.apartments.com/chicago-il/1/"

def listing_page_crawler(url):
    """
    This crawler collects the links for every property listed in the
    current listing page.

    Input:
        url: the url of the listing page we hope to crawl

    Output:
        link_next_page: the href of the page's <link rel="next"> tag,
            or None when the page has no such tag
        lst_apts_links: a list of properties' links, i.e. the href of the
            first <a> inside every <li class="mortar-wrapper">; list items
            without a usable link are skipped

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status
    """

    # Extracting html information of current page
    headers = {"User-Agent":
        "Coco, Juno, Feihong and Chongyu from UChicago"}
    response = requests.get(url, headers = headers, timeout = 300)
    # Fail loudly on an error response instead of parsing the error page
    # and silently reporting "no listings, no next page".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Getting the link of next page (look the tag up only once)
    next_page = soup.find("link", rel = "next")
    link_next_page = None if next_page is None else next_page.get("href")

    # Extracting all links of properties within current page
    lst_apts_links = []
    for apt in soup.find_all("li", class_ = "mortar-wrapper"):
        anchor = apt.find("a")
        apt_link = None if anchor is None else anchor.get('href')
        # A list item with no <a>, or an <a> with no href, carries no link
        if apt_link:
            lst_apts_links.append(apt_link)

    return link_next_page, lst_apts_links

def collect_all_apts_links(starting_url):
    """
    Running the listing page crawler page after page and returns
    all property links reachable from the starting url.

    Input:
        starting_url: the url of the page we start; if it is None,
            nothing is crawled and an empty list is returned

    Output:
        all_apts_links: a list of the links of all the properties,
            in the order the pages were visited
    """

    all_apts_links = []
    # Pages already crawled; protects against a "next" link that points
    # back to an earlier page, which would otherwise loop forever.
    visited_pages = set()
    page_to_check = starting_url

    # Collect links until there is no next page (or it was seen before)
    while page_to_check is not None and page_to_check not in visited_pages:
        visited_pages.add(page_to_check)
        page_to_check, apt_links = listing_page_crawler(page_to_check)
        all_apts_links += apt_links

    return all_apts_links

# Crawl the listing pages starting at starting_url and keep every property
# link found. Note: this issues live HTTP requests when the cell runs.
all_properties = collect_all_apts_links(starting_url)

In [49]:
# Names of the features recorded for each property.
# The first nine follow the order of property_page_crawler's return value;
# "security" is not scraped (see the "security" note in that function).
features = [
    "latitude", "longitude",
    "property_name", "address",
    "price_range", "bedroom", "bathroom", "room_area",
    "amenities",
    "security",
]

In [12]:
def _meta_content(soup, meta_property):
    """Return the content of <meta property=meta_property>, or None if absent."""
    tag = soup.find('meta', property = meta_property)
    return None if tag is None else tag.get('content')


def _text_at(tags, index):
    """Return the text of tags[index], or None when the list is too short."""
    return tags[index].get_text() if index < len(tags) else None


def property_page_crawler(single_property_url):
    """
    Crawler for a single property page. Extract information of
    the features.

    Any feature whose markup is missing from the page is returned as
    None (amenities as an empty string) instead of aborting the crawl.

    Input:
        single_property_url: the url of a single property

    Output:
        a tuple of nine values, in this order:
        latitude, longitude, property_name, address, price_range,
        bedroom, bathroom, room_area, amenities
        (all strings as found on the page, or None when missing)

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status
    """

    headers = {"User-Agent":
        "Coco, Juno, Feihong and Chongyu from UChicago"}
    response = requests.get(single_property_url, headers = headers, timeout = 300)
    # Without this an error page would be parsed and yield all-None features
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # latitude and longitude (kept as the strings found in the meta tags)
    latitude = _meta_content(soup, "place:location:latitude")
    longitude = _meta_content(soup, "place:location:longitude")

    # property name
    name_tag = soup.find('h1', class_ = 'propertyName')
    property_name = None if name_tag is None else name_tag.get_text().strip()

    # address: built from the first three <span>s of the address container
    address = None
    address_box = soup.find('div', class_ = 'propertyAddressContainer')
    if address_box is not None:
        address_parts = [span.get_text() for span in address_box.find_all('span')[0:3]]
        if len(address_parts) == 3:
            street = address_parts[0]
            city_state_zip = (address_parts[1] + address_parts[2]).replace('\n', ' ').strip()
            address = street + ', ' + city_state_zip

    # price range, bedroom, bathroom, room area: the first four
    # rentInfoDetail paragraphs, in that order
    rent_info_lst = soup.find_all('p', class_ = 'rentInfoDetail')
    price_range = _text_at(rent_info_lst, 0)
    bedroom = _text_at(rent_info_lst, 1)
    bathroom = _text_at(rent_info_lst, 2)
    room_area = _text_at(rent_info_lst, 3)

    # amenities: all amenity labels joined into one comma-separated string
    amenities = ', '.join(
        amenity.get_text() for amenity in soup.find_all('p', class_ = 'amenityLabel')
    )

    # security: later computations with crime data

    return latitude, longitude, property_name, address, price_range, bedroom, bathroom, room_area, amenities

In [13]:
# Test code for first five properties
for property_link in all_properties[0:5]:
    # Unpack in the order property_page_crawler returns its values
    # (latitude first, then longitude, name, address, ...).
    (latitude, longitude, property_name, address, price_range,
     bedroom, bathroom, room_area, amenities) = property_page_crawler(property_link)
    print(latitude, longitude, property_name, address, price_range,
          bedroom, bathroom, room_area, amenities)

41.90801 -87.64773 The Residences at NewCity 1457 N Halsted St, Chicago IL 60642 $2,126 - $4,215 Studio - 2 bd 1 - 2 ba 590 - 1,315 sq ft Pool, Fitness Center, Elevator, Clubhouse, Controlled Access, Recycling, Business Center, Grill, In Unit Washer & Dryer, Air Conditioning, Dishwasher, High Speed Internet Access, Hardwood Floors, Walk-In Closets, Microwave, Refrigerator
41.9276 -87.65243 Elevate 930 W Altgeld St, Chicago IL 60614 $1,910 - $14,435 Studio - 3 bd 1 - 2.5 ba 464 - 2,616 sq ft Pool, Fitness Center, Elevator, Clubhouse, In Unit Washer & Dryer, Air Conditioning, Wi-Fi, Fireplace
41.86377 -87.6236 1407 On Michigan 1407 S Michigan Ave, Chicago IL 60605 $1,750 - $5,650 Studio - 3 bd 1 - 2 ba 446 - 1,687 sq ft Pool, Fitness Center, Elevator, Doorman, Clubhouse, Roof Terrace, Recycling, Business Center, In Unit Washer & Dryer, Air Conditioning, Dishwasher, Washer/Dryer Hookup, High Speed Internet Access, Hardwood Floors, Walk-In Closets, Island Kitchen
41.97831 -87.84335 The Pav

In [13]:
# Load the crime records, keeping only the case id and its coordinates,
# and discard every row in which any of those three values is missing.
columns = ["CASE#", "LATITUDE", "LONGITUDE"]
crime = pd.read_csv("Crimes_Data.csv", usecols=columns).dropna()
crime

CASE#           0
LATITUDE     2884
LONGITUDE    2884
dtype: int64


Unnamed: 0,CASE#,LATITUDE,LONGITUDE
2,JE266628,41.748486,-87.602675
3,JE266536,41.880661,-87.731186
5,JE267466,41.871540,-87.705839
6,JE266473,41.780851,-87.649674
7,JE267222,41.859989,-87.735995
...,...,...,...
205762,JE492545,41.902703,-87.629950
205763,JF103181,41.731960,-87.636205
205764,JF102974,41.904582,-87.714655
205765,JF103076,41.987483,-87.712672


In [22]:

# Check whether a crime incident lies within a given radius of a property,
# using the haversine great-circle distance.
# Formula from: https://www.geeksforgeeks.org/program-distance-two-points-earth/#:~:text=For%20this%20divide%20the%20values,is%20the%20radius%20of%20Earth.
def if_crime_in_neighborhood(lo1, la1, lo2, la2, radius):
    """
    Tell whether two points on Earth are at most `radius` miles apart.

    Input:
        lo1, la1: longitude and latitude of the first point, in degrees
        lo2, la2: longitude and latitude of the second point, in degrees
        radius: distance threshold in miles (inclusive)

        Coordinates may be numbers or numeric strings; the property crawler
        returns them as strings, so they are converted with float() here.

    Output:
        True if the great-circle distance between the two points is
        less than or equal to radius, False otherwise
    """
    lo1, la1, lo2, la2 = (
        math.radians(float(value)) for value in (lo1, la1, lo2, la2)
    )

    lo_dist = lo1 - lo2
    la_dist = la1 - la2

    # Haversine term
    a = math.sin(la_dist/2)**2 + math.cos(la1) * math.cos(la2) * math.sin(lo_dist/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1],
    # which would make sqrt/asin raise a domain error.
    a = min(1.0, max(0.0, a))

    # Central angle between the two points, in radians
    c = 2 * math.asin(math.sqrt(a))

    earth_r_in_mile = 3956

    return c * earth_r_in_mile <= radius


def find_lo_range(lo1, la1):
    """
    Placeholder: not implemented yet.

    NOTE(review): the name suggests computing a longitude range around the
    point (lo1, la1), but the body was left empty -- confirm the intended
    behaviour before implementing.

    Raises:
        NotImplementedError: always, until the function is written
    """
    # A def without a body is a syntax error that stops the whole cell
    # (including if_crime_in_neighborhood) from running, so fail explicitly
    # at call time instead.
    raise NotImplementedError("find_lo_range is not implemented yet")



In [None]:
#data visualization