In [386]:
from bs4 import BeautifulSoup as bs
import requests
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from shapely.geometry import Point
import pandas as pd
import geopandas as gpd
import re
import numpy as np
import time
import concurrent.futures

In [378]:
def create_driver():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(ChromeDriverManager(print_first_line=False).install(), options=chrome_options)
    return driver

def get_url_text(driver, url, verbose=False):
    driver.get(url)
    result = requests.get(url, allow_redirects=False)
    if result.status_code==200:
        if verbose:
            print('Data found!')
        soup = bs(result.text, 'html.parser') 
        return soup
    else:
        if verbose:
            print('Data does not exist')
        return None
        
def get_marker_info(marker_text, return_geodataframe=True):
    site_no = marker_text.split('site_no=')[1].split('>')[0].replace('"','')
    point = [float(p) for p in marker_text.split('[')[1].split(']')[0].split(',')]
    lat = point[0]
    lon = point[1]
    site_name = marker_text.split('<hr>')[1].split('<br')[0]
    df = pd.DataFrame([{'site_no':site_no,'site_name':site_name,'Latitude':lat,'Longitude':lon}])
    if return_geodataframe:
        return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude,df.Latitude))
    return df

def get_station_df():
    driver = create_driver()
    #need to get the metadata for the site too...
    site_url = 'https://nrtwq.usgs.gov'
    soup = get_url_text(driver, site_url)
    js = str(soup.findAll('script')[6])
    marker_text_raw = js.split('L.marker')[1:-1]
    marker_df = pd.concat([get_marker_info(m) for m in marker_text_raw]).reset_index(drop=True)
    return marker_df

def process_ssc_soup(soup):
    data_raw = str(soup).split('\n')
    data_raw = [elem for elem in data_raw if not ('#' in elem)]
    data_split = [d.split('\t') for d in data_raw]
    y = (i for i,v in enumerate(data_split) if ('' in v))
    stop = next(y) #identify end of the string of continuous data
    cols = data_split[0]
    units = data_split[1]
    columns = [f'{c} ({u})' if ' ' not in u else f'{c}' for c,u in zip(cols,units) ]
    data = data_split[2:stop]
    df = pd.DataFrame(data=data, columns=columns)
    return df

def make_url(l):
    url = f"{l['url_header']}site_no={l['site_no']}&pcode={l['pcode']}&period={l['period']}&timestep={l['timestep']}&format=rdb&is_verbose=y"
    return url

In [366]:
df = get_station_df()

[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache


In [None]:
#things to consider 
# year, maybe check the years first?
# pcode variables...
# we can find out if everything has discharge and if everything has turbidity?

In [None]:
pcode_list = {'discharge':'00060',\
              'turbidity':'63680',\
              'temperature':'00010',\
              'dissolved_oxygen':'00300',\
              'ssd':'99409'}
              
def get_attribute_years(site_no):
    driver = create_driver()
    year_range = list(np.arange(2000,2022))
    year_range.append('ytd')
    timestep = 'uv'
    url_header = 'https://nrtwq.usgs.gov/explore/datatable?'
    discharge_record_years = []
    ssd_record_years = []
    labels = {'url_header':url_header, 'site_no':site_no, 'timestep':timestep}
    for year in year_range:
        period = f'{year}_all'
        labels['period'] = period
        labels['pcode'] = pcode_list['discharge']
        discharge_url = make_url(labels)
        labels['pcode'] = pcode_list['ssd']
        ssd_url = make_url(labels)
        #get the discharge
        driver.get(url)
        result = requests.get(discharge_url, allow_redirects=False)
        if result.status_code==200:
            discharge_record_years.append(year)
        #get ssd
        result = requests.get(ssd_url, allow_redirects=False)
        if result.status_code==200:
            ssd_record_years.append(year)
        time.sleep(10)

    ssd_record_years = [2021 if i=='ytd' else i for i in ssd_record_years]
    discharge_record_years = [2021 if i=='ytd' else i for i in discharge_record_years]
    return ssd_record_years, discharge_record_years

site_meta = []

with concurrent.futures.ProcessPoolExecutor() as executor:
    for site_no, out in zip(df.site_no, executor.map(get_attribute_years, df.site_no)):
        site_meta.append({'site_no':site_no,\
                          'ssd_record_years':out[0],\
                          'discharge_record_years':out[1]})

[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Trying to download new driver from https://chromedriver.storage.googleapis.com/90.0.4430.24/chromedriver_linux64.zip
[WDM] - Driver has been saved in cache [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24]
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90

[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current

[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache
[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver 

In [396]:
site_meta

[{'site_no': '01472157',
  'ssd_record_years': [],
  'discharge_record_years': [2015, 2016, 2017, 2018, 2019, 2020, 2021]},
 {'site_no': '01478245',
  'ssd_record_years': [],
  'discharge_record_years': [2015, 2016, 2017, 2018, 2019, 2020, 2021]}]

In [349]:
#now we have this list, we have a list of urls that we can use. 
#start concatenating data into a single DataFrame

In [190]:
class USGS_Water_Data:
    def __init__(self, df):
        self.df = df
        create_driver()
    def create_driver(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
    def get_data(self):
        pass
    def process_soup(soup):
        data_raw = str(soup).split('\n')
        data_raw = [elem for elem in data_raw if not ('#' in elem)]
        data_split = [d.split('\t') for d in data_raw]
        y = (i for i,v in enumerate(data_split) if ('' in v))
        stop = next(y)
        cols = data_split[0]
        units = data_split[1]
        columns = [f'{c} ({u})' if ' ' not in u else f'{c}' for c,u in zip(cols,units) ]
        data = data_split[2:stop]
        df = pd.DataFrame(data=data, columns=columns)
        return df
    def get_marker_info(marker_text, return_geodataframe=True):
        site_no = marker_text.split('site_no=')[1].split('>')[0].replace('"','')
        point = [float(p) for p in marker_text.split('[')[1].split(']')[0].split(',')]
        lat = point[0]
        lon = point[1]
        site_name = marker_text.split('<hr>')[1].split('<br')[0]
        df = pd.DataFrame([{'site_no':site_no,'site_name':site_name,'Latitude':lat,'Longitude':lon}])
        if return_geodataframe:
            return gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude,df.Latitude))
        return df

In [281]:
driver = create_driver()
pcode_list = {'discharge':'00060QSTGQ',\
              'turbidity':'63680TS213',\
              'temperature':'00010',\
              'dissolved_oxygen':'00300',\
              'ssd':'99409SED15'}
header_url = 'https://nrtwq.usgs.gov/explore/datatable'
site_no = '0205551460'
pcode = pcode_list['ssd']
year = 2020
period =f'{year}_all'
timestep='uv'
format ='rdb'
url = f'{header_url}?site_no={site_no}&pcode={pcode}&period={period}&timestep={timestep}&format={format}'
print(url)
soup = get_url_text(driver, url)

[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/root/.wdm/drivers/chromedriver/linux64/90.0.4430.24/chromedriver] found in cache




https://nrtwq.usgs.gov/explore/datatable?site_no=0205551460&pcode=99409SED15&period=2020_all&timestep=uv&format=rdb
