In [1]:
from bs4 import BeautifulSoup as bs
import requests
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from shapely.geometry import Point
import pandas as pd
import geopandas as gpd
import re
import numpy as np
import time
import concurrent.futures

In [30]:
import sys
sys.path.append('/content')
from src.fluvius import USGS_Water_DB

%load_ext autoreload
%autoreload 2

db = USGS_Water_DB()
db.get_station_df()

[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache






Unnamed: 0,site_no,site_name,Latitude,Longitude,geometry
0,01472157,"French Creek near Phoenixville, PA",40.1515,-75.6013,POINT (-75.60130 40.15150)
1,01478245,"White Clay Creek near Strickersville, PA",39.7475,-75.7708,POINT (-75.77080 39.74750)
2,01480617,"West Branch Brandywine Creek at Modena, PA",39.9618,-75.8013,POINT (-75.80130 39.96180)
3,01480870,East Branch Brandywine Creek below Downingtown...,39.9687,-75.6733,POINT (-75.67330 39.96870)
4,01481000,"Brandywine Creek at Chadds Ford, PA",39.8698,-75.5933,POINT (-75.59330 39.86980)
...,...,...,...,...,...
160,393806095273700,"Atchison County Lake near Horton, KS",39.6350,-95.4603,POINT (-95.46030 39.63500)
161,393817095260100,"Clear Creek at Decator Road near Horton, KS",39.6381,-95.4336,POINT (-95.43360 39.63810)
162,394126096073500,Black Vermillion River Tributary above Central...,39.6906,-96.1264,POINT (-96.12640 39.69060)
163,394146096085500,"Centralia Lake near Centralia, KS",39.6961,-96.1486,POINT (-96.14860 39.69610)


In [2]:
def create_driver():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--log-level=OFF')
    driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options=chrome_options)
    return driver

def get_url_text(driver, url, verbose=False):
    driver.get(url)
    result = requests.get(url, allow_redirects=False)
    if result.status_code==200:
        if verbose:
            print('Data found!')
        soup = bs(result.text, 'html.parser') 
        return soup
    else:
        if verbose:
            print('Data does not exist')
        return None
        
def get_marker_info(marker_text, return_geodataframe=True):
    site_no = marker_text.split('site_no=')[1].split('>')[0].replace('"','')
    point = [float(p) for p in marker_text.split('[')[1].split(']')[0].split(',')]
    lat = point[0]
    lon = point[1]
    site_name = marker_text.split('<hr>')[1].split('<br')[0]
    df = pd.DataFrame([{'site_no':site_no,'site_name':site_name,'Latitude':lat,'Longitude':lon}])
    if return_geodataframe:
        return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude,df.Latitude))
    return df

def get_station_df(driver=None):
    if driver is None:
        driver = create_driver()
    #need to get the metadata for the site too...
    site_url = 'https://nrtwq.usgs.gov'
    soup = get_url_text(driver, site_url)
    js = str(soup.findAll('script')[6])
    marker_text_raw = js.split('L.marker')[1:-1]
    marker_df = pd.concat([get_marker_info(m) for m in marker_text_raw]).reset_index(drop=True)
    #error here with reindexing...
    return marker_df

def process_soup(soup):
    data_raw = str(soup).split('\n')
    data_raw = [elem for elem in data_raw if not ('#' in elem)]
    data_split = [d.split('\t') for d in data_raw]
    y = (i for i,v in enumerate(data_split) if ('' in v))
    stop = next(y) #identify end of the string of continuous data
    cols = data_split[0]
    units = data_split[1]
    columns = [f'{c} ({u})' if ' ' not in u else f'{c}' for c,u in zip(cols,units) ]
    data = data_split[2:stop]
    df = pd.DataFrame(data=data, columns=columns)
    return df

def get_water_url(site_no, attribute, year):
    pcode_list = {'discharge':'00060',\
          'turbidity':'63680',\
          'temperature':'00010',\
          'dissolved_oxygen':'00300',\
          'ssd':'99409'}
    url_header = 'https://nrtwq.usgs.gov/explore/datatable?'
    timestep = 'uv'
    period = f'{year}_all'
    l = {'url_header':url_header, 'site_no':site_no, 'timestep':timestep}
    l['period'] = period
    l['pcode'] = pcode_list[attribute]
    url = f"{l['url_header']}site_no={l['site_no']}&pcode={l['pcode']}&period={l['period']}&timestep={l['timestep']}&format=rdb&is_verbose=y"
    return url
 
def get_water_attribute(driver, site_no, attribute, year):
    water_url = get_water_url(site_no, attribute, year)
    textsoup = get_url_text(driver, water_url, verbose=False)
    out = None
    if textsoup is not None:
        out = process_soup(textsoup)
    return out 


In [3]:
driver = create_driver()
df = get_station_df(driver)


[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - There is no [linux64] chromedriver for browser 90.0.4430 in cache
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Trying to download new driver from https://chromedriver.storage.googleapis.com/90.0.4430.24/chromedriver_linux64.zip
[WDM] - Driver has been saved in cache [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24]


In [4]:
df.to_csv('/content/data/station_metadata.csv', index=False)

In [34]:
df.site_no[80]

'07144000'

In [None]:
#things to consider 
# year, maybe check the years first?
# pcode variables...
# we can find out if everything has discharge and if everything has turbidity?
# let's acquire the data one by one for the years available
# define a function to get a specific pcode

In [None]:
#with concurrent.futures.ProcessPoolExecutor() as executor:
#    for site_no, out in zip(df.site_no, executor.map(get_attribute_years, df.site_no)):
#        site_meta.append({'site_no':site_no,\
#                          'ssd_record_years':out[0],\
#                          'discharge_record_years':out[1]})

for site_no in df.site_no[80:]:
    d = []
    for year in np.arange(2013, 2022):
        try:
            time.sleep(10)
            ssd_df = get_water_attribute(driver, site_no, 'ssd', year)
            time.sleep(10)
            d_df = get_water_attribute(driver, site_no, 'discharge', year)
            merged = d_df.merge(ssd_df, on='Date-Time')
            if d_df is not None:
                d.append(merged)
        except:
            print(f'Timed out for {site_no}, year {year}!')
            continue #could time out due to no data available
    if d:
        a_df = pd.concat(d).dropna()
        sitefile = f'/content/data/{site_no}_data.csv' 
        a_df.to_csv(sitefile, index=False)
        print(f'Wrote {sitefile}!')

Timed out for 07144000, year 2013!
Timed out for 07144000, year 2014!
Timed out for 07144000, year 2015!
Timed out for 07144000, year 2016!
Timed out for 07144000, year 2017!
Timed out for 07144000, year 2018!
Timed out for 07144000, year 2019!
Timed out for 07144000, year 2020!
Timed out for 07144000, year 2021!
Timed out for 07144100, year 2021!
Wrote /content/data/07144100_data.csv!
Timed out for 07144200, year 2013!
Timed out for 07144200, year 2014!
Timed out for 07144200, year 2015!
Timed out for 07144200, year 2016!
Timed out for 07144200, year 2017!
Timed out for 07144200, year 2018!
Timed out for 07144200, year 2019!
Timed out for 07144200, year 2020!
Timed out for 07144200, year 2021!
Timed out for 07144780, year 2013!
Timed out for 07144780, year 2014!
Timed out for 07144780, year 2015!
Timed out for 07144780, year 2016!
Timed out for 07144780, year 2017!
Timed out for 07144780, year 2018!
Timed out for 07144780, year 2019!
Timed out for 07144780, year 2020!
Timed out for 07

In [349]:
#now we have this list, we have a list of urls that we can use. 
#start concatenating data into a single DataFrame

In [None]:
class USGS_Water_Data:
    def __init__(self, df):
        self.df = df
        self.driver = create_driver()
    def create_driver(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
    def get_data(self):
        pass
    def process_soup(soup):
        data_raw = str(soup).split('\n')
        data_raw = [elem for elem in data_raw if not ('#' in elem)]
        data_split = [d.split('\t') for d in data_raw]
        y = (i for i,v in enumerate(data_split) if ('' in v))
        stop = next(y)
        cols = data_split[0]
        units = data_split[1]
        columns = [f'{c} ({u})' if ' ' not in u else f'{c}' for c,u in zip(cols,units) ]
        data = data_split[2:stop]
        df = pd.DataFrame(data=data, columns=columns)
        return df
    def get_marker_info(marker_text, return_geodataframe=True):
        site_no = marker_text.split('site_no=')[1].split('>')[0].replace('"','')
        point = [float(p) for p in marker_text.split('[')[1].split(']')[0].split(',')]
        lat = point[0]
        lon = point[1]
        site_name = marker_text.split('<hr>')[1].split('<br')[0]
        df = pd.DataFrame([{'site_no':site_no,'site_name':site_name,'Latitude':lat,'Longitude':lon}])
        if return_geodataframe:
            return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude,df.Latitude))
        return df