# Scrape EG data from NE Reval website

epq 3/26/2021

In [1]:
import numpy as np
import scipy as sc
import pandas as pd
import pickle
import time
import requests
import os
import lxml
from bs4 import BeautifulSoup 
from datetime import datetime, timedelta, date

### Set parameters for extract

- lowest NE Reval account number
- highest NE Reval account number
- length of pause between requests (so we don't flood the server)
- fixed text that precedes the account number in the URL
- fixed text that follows the account number in the URL
- path to directory to store the output files

In [2]:
# Range of NE Reval account numbers to request (lowest and highest).
start = 1
stop  = 5600

# Seconds to wait between http requests (don't overwhelm the server).
pause_seconds = 5

# The page URL is  url_part1 + <account number> + url_part2.
url_part1 = 'https://data.nereval.com/PropertyDetail.aspx?town=East%20Greenwich&accountnumber='
url_part2 = '&card=1'

# Path to the output directory (directory must exist).
path = 'data/eg03252021/'


### Download the pages for accounts in the range specified

In [3]:
# Download the detail page of every account number in [start, stop] and store
# the raw response body as a pickled byte string in  <path>/acct<number>.pkl.
# Account numbers that do not return a page (status != 200 or empty body) are skipped.

request_timeout = 30                                     #seconds to wait on one request before giving up

os.makedirs(path, exist_ok=True)                         #make sure the output directory is there

for acct in range(start, stop + 1):                      #range of account numbers to look for
    url = url_part1 + str(acct) + url_part2              #construct URL for this account
    try:
        r = requests.get(url, timeout=request_timeout)   #load the page (never wait forever)
    except requests.exceptions.RequestException as err:
        # A network hiccup on one account must not abort a multi-hour run.
        print('Request failed for account', acct, '-', err)
        r = None
    time.sleep(pause_seconds)                            #wait pause_seconds, also after a failure
    if r is None or r.status_code != 200:                #status code is 200 if page was retrieved
        continue
    t = r.content                                        #content is a python byte string
    if len(t) > 0:                                       #make sure there is something to process
        fn = os.path.join(path, 'acct' + str(acct) + '.pkl')    #build file name  acctxxxx.pkl
        with open(fn, 'wb') as binary_file:              #open the file write binary
            pickle.dump(t, binary_file, fix_imports=False)      #dump the bytes to a pickle file

### Get a list of the filenames for accounts that were found

In [4]:
# List the account files in the output directory.
#
# Only the top level of `path` is used. Looping over the whole os.walk() would
# leave `dirname` / `filenames` pointing at the LAST directory visited, so a
# single subdirectory would silently replace the real file list. The default
# tuple covers a missing directory (os.walk then yields nothing).
dirname, dirnames, filenames = next(os.walk(path), (path, [], []))

# print path to all subdirectories first.
for subdirname in dirnames:
    print(os.path.join(dirname, subdirname))

# Keep only the files written by the download step (acct<number>.pkl); anything
# else in the directory would make the later pickle.load fail.
filenames = [name for name in filenames
             if name.startswith('acct') and name.endswith('.pkl')]
len(filenames)

5165

### Parse the html information with Beautiful Soup

In [6]:
# Parse every downloaded page into a nested dictionary:
#
#     nerd[filename][table_id] -> values scraped from the HTML table with that id
#
# Each table is handled by a small parser below. A table that is missing from
# the page, or whose cells are not laid out as expected, is stored as an empty
# dictionary, so one unusual page cannot stop the whole run.


def _row_value(row):
    """Return the <font> text of the second cell of a label/value table row."""
    return row.find_all('td')[1].font.text


def _dollars(text):
    """Remove the leading '$' and the thousands separators from a money string."""
    return text.lstrip('$').replace(',', '')


def _unchanged(text):
    """Return the text as it is (used for grid columns that need no clean-up)."""
    return text


def _parse_parcel_id(table):
    """ParcelID table: values are read from cells 1, 3, 5 and 7."""
    cells = table.find_all('td')
    names = ('ParcelID', 'account', 'State', 'Card')
    return {name: cells[2 * k + 1].font.text.strip() for k, name in enumerate(names)}


def _parse_assessment(table):
    """Assessment table: dollar amounts are read from cells 1, 3, 5 and 7."""
    cells = table.find_all('td')
    names = ('Land', 'Building', 'Card_total', 'Parcel_total')
    return {name: _dollars(cells[2 * k + 1].font.text).strip() for k, name in enumerate(names)}


def _parse_location_owner(table):
    """Location/owner table: one value per row, commas removed."""
    rows = table.find_all('tr')
    names = ('Location', 'Owner', 'Owner2', 'Owner3', 'Address', 'Address2', 'Address3')
    return {name: _row_value(rows[k]).replace(',', '').strip() for k, name in enumerate(names)}


_BUILDING_FIELDS = ('Design', 'Year_built', 'Heat', 'Fireplaces', 'Rooms',
                    'Bedrooms', 'Bathrooms', 'Above_Ground_SF')


def _parse_building(table):
    """Building table: one value per row; every field is '' if rows or cells are missing."""
    rows = table.find_all('tr')
    try:
        info = {}
        for k, name in enumerate(_BUILDING_FIELDS):
            text = _row_value(rows[k])
            if name == 'Above_Ground_SF':                 #only the area carries thousands separators
                text = text.replace(',', '')
            info[name] = text.strip()
        return info
    except IndexError:
        # Table is present but has fewer rows/cells than expected: keep the
        # keys, with empty values.
        return dict.fromkeys(_BUILDING_FIELDS, '')


def _parse_land(table):
    """Land table: cells 1, 3, 5 and 7, stored unstripped; {} if there are fewer than 8 cells."""
    cells = table.find_all('td')
    if len(cells) <= 7:
        return {}
    names = ('Land_area', 'Zoning', 'View', 'Neighborhood')
    return {name: cells[2 * k + 1].font.text for k, name in enumerate(names)}


def _parse_grid(table, columns):
    """Parse a grid table into {row_number: {column_name: text}}.

    The first row (index 0) is skipped; `columns` is a sequence of
    (name, clean_function) pairs, one per cell, in cell order.
    """
    rows = table.find_all('tr')
    grid = {}
    for n in range(1, len(rows)):
        cells = rows[n].find_all('td')
        grid[n] = {name: clean(cells[k].font.text) for k, (name, clean) in enumerate(columns)}
    return grid


def _parse_prior(table):
    """Prior-information grid: fiscal year plus four dollar columns."""
    return _parse_grid(table, (('FY', _unchanged),
                               ('Land_value', _dollars),
                               ('Building_value', _dollars),
                               ('Outbuilding_value', _dollars),
                               ('Total_value', _dollars)))


def _parse_sales(table):
    """Sale-information grid: date, price, legal reference and instrument."""
    return _parse_grid(table, (('Sale_date', _unchanged),
                               ('Sale_price', _dollars),
                               ('Legal_reference', _unchanged),
                               ('Instrument', _unchanged)))


def _parse_pairs(table):
    """Read the cells two at a time as key/value pairs (sub-area and yard-item tables)."""
    cells = table.find_all('td')
    pairs = {}
    if len(cells) > 1:
        for k in range(0, len(cells), 2):
            pairs[cells[k].font.text.strip()] = cells[k + 1].font.text.strip()
    return pairs


def _parse_section(soup, table_id, parser):
    """Run `parser` on the table with id `table_id`; return {} if it cannot be parsed.

    AttributeError covers a missing table or a cell without a <font> element
    (the lookup returns None); IndexError covers a table with fewer rows or
    cells than the parser expects.
    """
    try:
        return parser(soup.find('table', id=table_id))
    except (AttributeError, IndexError):
        return {}


# Table id -> parser, in the order the sections are stored for each account.
_SECTIONS = (
    ('ParcelID_ParcelID',            _parse_parcel_id),
    ('Assessment_Assessment',        _parse_assessment),
    ('LocationOwner_Location',       _parse_location_owner),
    ('BuildingInformation_Building', _parse_building),
    ('LandInformation_Land',         _parse_land),
    ('PriorInformation_GridView2',   _parse_prior),
    ('SaleInformation_Sales',        _parse_sales),
    ('SubArea_SubArea',              _parse_pairs),
    ('YardItems_GridView1',          _parse_pairs),
)

# Initialize the dictionary
nerd = {}

# Read html files
for filename in filenames:
    # NOTE: pickle.load can execute arbitrary code - only load files that were
    # written by the download step of this notebook.
    with open(os.path.join(path, filename), 'rb') as handle:
        txt = pickle.load(handle)

    soup = BeautifulSoup(txt, "lxml")

    nerd[filename] = {table_id: _parse_section(soup, table_id, parser)
                      for table_id, parser in _SECTIONS}

print('Number of accounts: ',len(nerd))

Number of accounts:  5165


### Save the dictionary to a pickle file

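Filename is:

**NE_Reval_dictionary_M_D_YYYY.pkl** (today's date; month and day are not zero-padded, e.g. `NE_Reval_dictionary_3_26_2021.pkl`). The file is written to the parent of the current working directory.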

In [8]:
# Build the dated file name (month_day_year of today, no zero padding) and
# pickle the whole dictionary into the parent directory.
current_date = date.today()
date_stamp = '_'.join(str(part) for part in (current_date.month,
                                             current_date.day,
                                             current_date.year))
fname = '../NE_Reval_dictionary_' + date_stamp + '.pkl'

with open(fname, 'wb') as handle:
    pickle.dump(nerd, handle)

print('Dictionary with ',len(nerd),' accounts saved to file: ',fname)

Dictionary with  5165  accounts saved to file:  ../NE_Reval_dictionary_3_26_2021.pkl
