In [36]:
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
import pandas as pd
import numpy as np
import time

# Upper bound for the parcel IDs (PIDs) queried on the Chelsea website. See 'link' in get_soup below.
max_pid = 6200
# PIDs to request: 1 .. max_pid - 1 (np.arange leaves out the stop value, so max_pid itself is not included)
pids = np.arange(1, max_pid)
# Names of the fields to extract
cols = [
    'pid', 'mblu', 'year', 'style', 'heat',
    'ac', 'wall', 'roof', 'area',
]

# Function to download one parcel page and parse it into a soup object
def get_soup(pid):
    """Fetch the parcel page for ``pid`` and return it as a BeautifulSoup object.

    Parameters
    ----------
    pid : value convertible with ``str()``
        Parcel ID appended to the Parcel.aspx URL as the ``pid`` query value.

    Returns
    -------
    BeautifulSoup or list
        The parsed page, or an empty list when the page contains a
        ``MainContent_lblMessage`` span (presumably the site's "no such
        parcel" message -- confirm against the live site). The empty list
        is falsy, so callers can test the result with ``if soup:``.

    Raises
    ------
    Whatever ``urllib.request.urlopen`` raises for network/HTTP failures, and
    ``UnicodeDecodeError`` if the response body is not valid UTF-8.
    """
    link = "https://gis.vgsi.com/chelseama/Parcel.aspx?pid=" + str(pid) # main link + pid

    # Use the response as a context manager so the connection is closed even
    # if reading or decoding fails; the scrape issues thousands of requests.
    with urllib.request.urlopen(link) as response:
        html = response.read().decode("utf8")

    # Create soup object
    soup = BeautifulSoup(html, 'html.parser')
    if soup.find("span", id="MainContent_lblMessage"):
        return []
    return soup

# Function to retrieve MBLU from soup
def get_mblu(soup):
    """Return the parcel's MBLU identifier in dash-separated form.

    Reads the text of the ``MainContent_lblMblu`` span and normalizes it:
    each slash followed by two spaces becomes "-", any remaining slash is
    removed, each backslash becomes "-", and the last two characters are
    dropped.

    Returns "" when the span is not present, instead of raising
    ``AttributeError`` on the missing tag.
    """
    mblu_span = soup.find("span", id="MainContent_lblMblu")
    if mblu_span is None:
        return ""
    normalized = mblu_span.text.replace("/  ", "-").replace("/", "").replace("\\", "-")
    # Drop the final two characters, exactly as the original expression did.
    return normalized[:-2]

# Function to retrieve year from soup
def get_year(soup):
    """Return the text of the ``MainContent_ctl01_lblYearBuilt`` span.

    The text is returned unchanged as a string. When the span is not present
    the function returns 0 (the default the original code declared but never
    used) instead of raising ``AttributeError`` on the missing tag.
    """
    year_span = soup.find("span", id="MainContent_ctl01_lblYearBuilt")
    if year_span is None:
        return 0
    return year_span.text

# Function to retrieve style from soup
def get_style(soup):
    """Return the text of the ``<td>`` that follows the "Style:" label cell.

    Returns "" when the label cell, or the ``<td>`` sibling after it, is not
    present, instead of raising ``AttributeError`` on the missing tag.
    """
    label_cell = soup.find("td", text="Style:")
    if label_cell is None:
        return ""
    value_cell = label_cell.find_next_sibling("td")
    if value_cell is None:
        return ""
    return value_cell.text

# Function to retrieve the occupancy (units) value from soup
def get_units(soup):
    """Return the text of the ``<td>`` following the "Occupancy" label cell.

    Falls back to "1" when no "Occupancy" label cell is found.
    """
    label_cell = soup.find("td", text="Occupancy")
    if not label_cell:
        return "1"
    return label_cell.find_next_sibling("td").text

def get_tot_units(soup):
    """Return the text of the ``<td>`` following the "Residential Units:" label cell.

    Falls back to "1" when no such label cell is found.
    """
    label_cell = soup.find("td", text="Residential Units:")
    if not label_cell:
        return "1"
    value_cell = label_cell.find_next_sibling("td")
    return value_cell.text

# Function to retrieve heating type from soup
def get_heat(soup):
    """Return the text of the ``<td>`` following the "Heating Type" label cell.

    Returns "" when no "Heating Type" label cell is found.
    """
    label_cell = soup.find("td", text="Heating Type")
    return label_cell.find_next_sibling("td").text if label_cell else ""

# Function to retrieve heating fuel from soup
def get_heat_fuel(soup):
    """Return the text of the ``<td>`` following the "Heating Fuel" label cell.

    Returns "" when no "Heating Fuel" label cell is found.
    """
    fuel_label = soup.find("td", text="Heating Fuel")
    if not fuel_label:
        return ""
    return fuel_label.find_next_sibling("td").text
# Function to retrieve AC type from soup
def get_ac(soup):
    """Return the text of the ``<td>`` following the 'AC Type:' label cell.

    Returns "" when no 'AC Type:' label cell is found.
    """
    ac_label = soup.find("td", text='AC Type:')
    if ac_label:
        return ac_label.find_next_sibling("td").text
    return ""

# # Function to retrieve wall type from soup
# def get_wall(soup):
#     wall = ""
#     if soup.find("td", text='Exterior Wall 1'):
#         wall = soup.find("td", text='Exterior Wall 1').find_next_sibling("td").text
#     if soup.find("td", text='Exterior Wall 1:'):
#         wall = soup.find("td", text='Exterior Wall 1:').find_next_sibling("td").text
#     return wall

# # Function to retrieve roof type from soup
# def get_roof(soup):
#     roof = ""
#     if soup.find("td", text='Roof Cover'):
#         roof = soup.find("td", text='Roof Cover').find_next_sibling("td").text
#     return roof

# Function to retrieve area from soup
def get_area(soup):
    """Return the text of the ``MainContent_ctl01_lblBldArea`` span without commas.

    The value is returned as a string with every "," removed. When the span
    is not present the function returns 0 (the default the original code
    declared but never used) instead of raising ``AttributeError`` on the
    missing tag.
    """
    area_span = soup.find("span", id="MainContent_ctl01_lblBldArea")
    if area_span is None:
        return 0
    # Strip thousands separators, e.g. "12,200" -> "12200".
    return area_span.text.replace(",", "")

# Main function: scrape one parcel page and return its fields as a one-row DataFrame
def main(pid):
    """Collect the fields for parcel ``pid`` into a single-row DataFrame.

    Returns a DataFrame with the columns pid, mblu, year, style, units,
    tot_units, heat, heat_fuel, ac and area, or None when ``get_soup``
    returns a falsy value for this PID.
    """
    soup = get_soup(pid)
    if not soup:
        return None

    # Wall and roof extraction is currently disabled (get_wall / get_roof are commented out).
    record = {
        'pid': pid,
        'mblu': get_mblu(soup),
        'year': get_year(soup),
        'style': get_style(soup),
        'units': get_units(soup),
        'tot_units': get_tot_units(soup),
        'heat': get_heat(soup),
        'heat_fuel': get_heat_fuel(soup),
        'ac': get_ac(soup),
        'area': get_area(soup),
    }
    return pd.DataFrame([record])


In [37]:
# SECURITY NOTE: the assignment below replaces the factory urllib uses for its default
# HTTPS context with the non-verifying one, so every later urlopen() call in this kernel
# skips TLS certificate and hostname verification.
# NOTE(review): presumably a workaround for certificate errors when fetching the parcel
# pages -- confirm it is still needed; fixing the local CA bundle would be safer.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


In [38]:
# Scrape every PID: main() gives a one-row DataFrame, or None when get_soup finds no parcel page
tax_list = [main(ipid) for ipid in pids]
# Stack the per-parcel frames into one table with a fresh 0..n-1 index
# (pd.concat silently drops the None entries)
tax = pd.concat(tax_list, ignore_index=True)

In [40]:
# Display the scraped parcel table (the rendered output below elides the middle rows)
tax

Unnamed: 0,pid,mblu,year,style,units,tot_units,heat,heat_fuel,ac,area
0,2,5--1A,1900,Warehouse4,1.00,1,Hot Water,Gas,,12200
1,3,5--2,,Vacant Land,,1,,,,0
2,4,5--5,,Vacant Land,,1,,,,0
3,6,6--5,,Outbuildings,,1,,,,0
4,7,6--10,1926,Light Indust,1.00,1,Hot Water,Gas,,1185
...,...,...,...,...,...,...,...,...,...,...
5507,6170,86--122,1950,Other Municip,4.00,1,Hot Water,Gas,,4076
5508,6171,12--77A,1930,3 Family,3,1,,,,2382
5509,6172,29--127B,1940,3 Family,3,1,,,,3816
5510,6173,29--127C,1940,3 Family,3,1,,,,3339


In [42]:
# Save DataFrame to CSV (path is relative to the working directory);
# index=False keeps the DataFrame's row index out of the file
tax.to_csv('chelsea_tax_0612.csv', index=False)