### MWRA Scraping

Goal: get list of towns with MWRA water service

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import urllib
import pandas as pd
import pickle

In [2]:
def try_url(url, timeout=30):
    '''
    Attempt to access a webpage and parse it.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout
        error. Defaults to 30.

    Returns
    -------
    BeautifulSoup or int
        The page parsed with the 'lxml' parser when the response status is
        200; otherwise the integer HTTP status code.
    '''
    # Without a timeout, requests.get can block indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    status = response.status_code
    if status != 200:
        return status
    return BeautifulSoup(response.text, 'lxml')

In [3]:
# MWRA page whose tables are scraped below for towns and their services
url = "http://www.mwra.state.ma.us/02org/html/whatis.htm"

In [4]:
soup = try_url(url)

# try_url returns the integer HTTP status code instead of parsed HTML when the
# response status is not 200; stop here with a clear message rather than
# failing later on soup.find_all.
if isinstance(soup, int):
    raise RuntimeError('Request for {} failed with HTTP status {}'.format(url, soup))

In [5]:
# The town / service listing is read from the ninth <table> on the page;
# collect all of its rows.
tables = soup.find_all('table')
customers = tables[8].find_all('tr')

In [6]:
# Collect one (town, service) pair per table row and build the frame in a
# single call. Appending to a DataFrame inside the loop copies the whole frame
# on every iteration, and DataFrame.append was removed in pandas 2.0.
records = []
for row in customers[2:]:  # the first two rows of the table are skipped
    cells = row.find_all('td')
    # Keep the cell text as text: encoding to UTF-8 yields bytes on Python 3,
    # which never compare equal to the str literals used in the cleaning cells.
    records.append((cells[0].text.upper(), cells[1].text.upper()))

# Columns are labelled 0 (town) and 1 (service) until they are renamed.
mwra = pd.DataFrame(records)

In [7]:
# Upper-case every cell of the scraped frame
mwra = mwra.applymap(lambda cell: cell.upper())

In [8]:
# Give the positional columns 0 and 1 their final names
columns = dict(enumerate(['TOWN', 'SERVICES']))

mwra = mwra.rename(columns=columns)

### MWRA Cleaning

Goal: classify each town's water service as full, partial, or none (towns with emergency back-up only are classified as none)

In [9]:
# Tally the distinct service descriptions found in the scraped table
mwra.SERVICES.value_counts()

WATER AND SEWER                          23
SEWER                                    10
WATER                                    10
SEWER, WATER (PARTIALLY SUPPLIED)         6
WATER (PARTIALLY SUPPLIED)                4
WATER (PARTIALLY SUPPLIED), SEWER         3
 SEWER, WATER (PARTIALLY SUPPLIED)        1
WATER (PARTIALLY SUPPLIED), SEWER         1
WATER (EMERGENCY BACK-UP ONLY)            1
WATER (EMERGENCY BACK-UP ONLY)            1
WATER (EMERGENCY BACKUP ONLY), SEWER      1
Name: SERVICES, dtype: int64

In [10]:
# Towns whose SERVICES value is exactly 'WATER' or 'WATER AND SEWER' are marked
# as fully supplied. Every other row is left without a WATER_SERVICE value and
# is removed by the dropna below.
# NOTE: the 'WATER (PARTIALLY SUPPLIED)' -> 'PARTIAL' step is not applied here,
# so partially supplied towns are dropped as well.
is_full = mwra['SERVICES'].isin(['WATER', 'WATER AND SEWER'])
mwra.loc[is_full, 'WATER_SERVICE'] = 'FULL'
mwra = mwra.dropna()

In [11]:
# Preview the first ten remaining rows
mwra.iloc[:10]

Unnamed: 0,TOWN,SERVICES,WATER_SERVICE
0,ARLINGTON,WATER AND SEWER,FULL
3,BELMONT,WATER AND SEWER,FULL
4,BOSTON,WATER AND SEWER,FULL
6,BROOKLINE,WATER AND SEWER,FULL
10,CHELSEA,WATER AND SEWER,FULL
11,CHICOPEE,WATER,FULL
12,CLINTON,WATER AND SEWER,FULL
14,EVERETT,WATER AND SEWER,FULL
15,FRAMINGHAM,WATER AND SEWER,FULL
20,LEXINGTON,WATER AND SEWER,FULL


In [12]:
# Confirm which WATER_SERVICE classes remain after cleaning
mwra.WATER_SERVICE.value_counts()

FULL    33
Name: WATER_SERVICE, dtype: int64

### Add the remaining columns needed to match the PWS DataFrame
[Source](http://www.mwra.state.ma.us/04water/html/watsys.htm)
* PWS_ID: from pwscont
* PWS_NAME: from pwscont
* TOWN: from pwscont
* PWS_CLASS: COM
* TOTAL_INTAKES: 3
* GW_RATIO: 0.0
* MWRA: 1

### Add PWS_ID, PWS_NAME, TOWN from pwscont csv

In [13]:
# usecols only selects columns; read_csv returns them in file order, not in the
# order listed. Rename by source name and then fix the column order explicitly
# so a reordered CSV cannot silently mislabel the fields.
pws_column_names = {
    'CITY/TOWN': 'TOWN',
    'PWSID': 'PWS_ID',
    'PWS NAME': 'PWS_NAME',
    'CLASS': 'PWS_CLASS',
}

pws_ids = pd.read_csv('../data/external/MassDEP/pwscont.csv',
                      usecols=list(pws_column_names),
                      dtype='str')

pws_ids = pws_ids.rename(columns=pws_column_names)[['TOWN', 'PWS_ID', 'PWS_NAME', 'PWS_CLASS']]

In [14]:
mwra_keywords = ['MWRA', 'DIST', 'DEP', 'DIV']

# Keep the PWS rows whose name contains any keyword (joined into one regex
# alternation). na=False treats a missing PWS_NAME as "no match"; without it
# the mask would contain NaN and boolean indexing would raise.
keyword_pattern = '|'.join(mwra_keywords)
mwra_ids = pws_ids.loc[pws_ids['PWS_NAME'].str.contains(keyword_pattern, na=False)]

In [15]:
# Left join on TOWN keeps every MWRA town, including those with no PWS match
mwra_with_ids = pd.merge(mwra, mwra_ids, on='TOWN', how='left')

In [16]:
# Show the towns that picked up no PWS_ID in the merge
missing_id = mwra_with_ids['PWS_ID'].isnull()
mwra_with_ids[missing_id]

Unnamed: 0,TOWN,SERVICES,WATER_SERVICE,PWS_ID,PWS_NAME,PWS_CLASS
12,LYNNFIELD WATER DISTRICT,WATER,FULL,,,
27,SOUTH HADLEY FIRE DISTRICT #1,WATER,FULL,,,


In [17]:
# Fill in the PWS details by hand for the two towns that picked up no PWS_ID in
# the merge. Rows are selected by TOWN and columns by name, rather than by
# hard-coded positions, so the values cannot land on the wrong town or field
# if the row or column order ever changes.
id_columns = ['PWS_ID', 'PWS_NAME', 'PWS_CLASS']
manual_ids = {
    'LYNNFIELD WATER DISTRICT': ['3164001', 'LYNNFIELD WATER DIST. (MWRA)', 'COM'],
    'SOUTH HADLEY FIRE DISTRICT #1': ['1275000', 'SOUTH HADLEY FD #1 (MWRA)', 'COM'],
}

for town_name, values in manual_ids.items():
    # Literal substring match (regex=False) tolerates stray whitespace around
    # the scraped town name; na=False keeps the mask strictly boolean.
    is_town = mwra_with_ids['TOWN'].str.contains(town_name, regex=False, na=False)
    for column, value in zip(id_columns, values):
        mwra_with_ids.loc[is_town, column] = value

In [18]:
# Drop MWRA overall and Deer Island (the rows labelled 3 and 4)
for row_label in (3, 4):
    mwra_with_ids.drop(row_label, inplace=True)

In [19]:
# PWS_CLASS is not set here; it comes from the merged pwscont columns.
# Every remaining row gets the same constant values for these three columns.
# NOTE(review): the markdown list above says TOTAL_INTAKES: 3 but 4 is assigned
# here -- confirm which value is correct.
for column_name, constant in [('TOTAL_INTAKES', 4), ('GW_RATIO', 0.0), ('MWRA', 1)]:
    mwra_with_ids[column_name] = constant

In [20]:
# The service description columns are no longer needed
mwra_with_ids = mwra_with_ids.drop(['SERVICES', 'WATER_SERVICE'], axis=1)

In [21]:
# Save the finished frame to the interim data folder as a pickle
with open('../data/interim/mwra.pkl', 'wb') as out_file:
    out_file.write(pickle.dumps(mwra_with_ids))