In [None]:
# This program automates URL generation and site collection from the USGS water services website for all 50 U.S. states.
#
# For the purposes of this project, every generated URL uses the same fixed filters: instantaneous data
# (hasDataTypeCd=iv), stream sites (siteType=ST), and a maximum drainage area of 40 square miles (drainAreaMax=40).
#
# https://waterservices.usgs.gov/nwis/site/?format=rdb&stateCd=ia&startDT=2001-01-01&endDT=2001-12-31&siteOutput=expanded&siteType=ST&siteStatus=all&hasDataTypeCd=iv&drainAreaMax=40
#
# Version 1.3: the years are also fixed, covering 2000 through 2020.

In [2]:
# Import necessary modules
import os
import shutil
import urllib.error
import urllib.request

import pandas as pd

# Directory setup
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine.
Direct = "C:/Users/mgalib/Box/Finley_UH_work/Data/"

# List of valid state codes
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

# Define years (kept as strings because they are spliced into file names and URLs)
years = [str(y) for y in range(2000, 2021)]


def build_site_url(state, year):
    """Return the USGS site-service URL for one state and one calendar year.

    Parameters
    ----------
    state : str
        Two-letter state code, inserted as the ``stateCd`` parameter.
    year : str or int
        Calendar year; the query spans ``{year}-01-01`` through ``{year}-12-31``.

    Returns
    -------
    str
        Request URL with the project's fixed filters: RDB output, expanded site
        output, site type ``ST``, any site status, data type ``iv`` and
        ``drainAreaMax=40``.
    """
    return (
        'https://waterservices.usgs.gov/nwis/site/?format=rdb'
        f'&stateCd={state}&startDT={year}-01-01&endDT={year}-12-31'
        '&siteOutput=expanded&siteType=ST&siteStatus=all&hasDataTypeCd=iv&drainAreaMax=40'
    )


def extract_site_numbers(rdb_lines):
    """Extract the site-number column from the lines of an RDB download.

    The RDB layout is expected to be: ``#`` comment lines, one tab-separated
    column-name row, one column-format row (e.g. ``5s 15s ...``), then one
    tab-separated data row per site with the site number in the second field.
    NOTE(review): layout taken from the USGS RDB format description -- confirm
    against a downloaded file.

    Parameters
    ----------
    rdb_lines : iterable of str
        Lines of the downloaded text (a list of lines or an open text file).

    Returns
    -------
    list of str
        Site numbers in file order, kept as strings so leading zeros survive.
        Empty when the input holds no data rows.
    """
    # Keep only table rows: comments and blank lines carry no site data.
    table_rows = [
        line.rstrip('\r\n')
        for line in rdb_lines
        if line.strip() and not line.startswith('#')
    ]

    site_numbers = []
    # The first two table rows are the column names and column formats, not
    # sites. The earlier slice-based filter (physical line index > 2) let both
    # through as junk entries.
    for row in table_rows[2:]:
        fields = row.split('\t')
        # Splitting on tabs works for any agency-code length and tolerates
        # short/malformed rows instead of raising ValueError.
        if len(fields) > 1 and fields[1]:
            site_numbers.append(fields[1])
    return site_numbers


# The guard is always true when this cell runs in a notebook or as a script; it
# only keeps the helpers above importable without triggering ~1000 downloads.
if __name__ == '__main__':
    print(f'Sites will be retrieved from {years[0]} through {years[-1]}.')

    # Loop through each state in the states list
    for state in states:
        # Create state results directories if they do not already exist
        state_text_dir = Direct + state + '/text'
        state_sites_dir = Direct + state + '/sites'
        os.makedirs(state_text_dir, exist_ok=True)
        os.makedirs(state_sites_dir, exist_ok=True)

        # Loop through each year
        for year in years:
            file_name = f'{state}_{year}.txt'
            sites_text = f'{state}_{year}_sites.txt'
            text_file_path = os.path.join(state_text_dir, file_name)
            sites_file_path = os.path.join(state_sites_dir, sites_text)

            # Both files must exist to skip: a raw download without its site
            # list means an earlier run was interrupted and must be redone.
            # NOTE(review): site files written by the earlier version of this
            # cell likely start with two junk rows (column header/format);
            # delete them to regenerate, or confirm downstream code copes.
            if os.path.exists(text_file_path) and os.path.exists(sites_file_path):
                print(f'Data for {state} in {year} already downloaded. Skipping.')
                continue

            try:
                # Create URL for the given state and year
                url = build_site_url(state, year)

                # Download the text file
                urllib.request.urlretrieve(url, text_file_path)

                # Read the text file and extract sites. Undecodable bytes are
                # replaced rather than raised; only the ASCII site-number
                # column is used, so this cannot corrupt the result.
                with open(text_file_path, 'r', encoding='utf-8', errors='replace') as file:
                    sites = extract_site_numbers(file)

                # Convert the sites list to a DataFrame and save as a text file
                sites_df = pd.DataFrame(sites)
                sites_df.to_csv(sites_file_path, sep='\t', index=False, header=False)

            except urllib.error.URLError as e:
                # URLError is the base class of HTTPError and also covers
                # connection-level failures, so one bad request no longer
                # aborts the remaining states and years.
                print(f'Failed to download data for {state} in {year}: {e}')

    print('Site retrieval and storage is complete for all states.')


Sites will be retrieved from 2000 through 2020.
Data for AL in 2000 already downloaded. Skipping.
Data for AL in 2001 already downloaded. Skipping.
Data for AL in 2002 already downloaded. Skipping.
Data for AL in 2003 already downloaded. Skipping.
Data for AL in 2004 already downloaded. Skipping.
Data for AL in 2005 already downloaded. Skipping.
Data for AL in 2006 already downloaded. Skipping.
Data for AL in 2007 already downloaded. Skipping.
Data for AL in 2008 already downloaded. Skipping.
Data for AL in 2009 already downloaded. Skipping.
Data for AL in 2010 already downloaded. Skipping.
Data for AL in 2011 already downloaded. Skipping.
Data for AL in 2012 already downloaded. Skipping.
Data for AL in 2013 already downloaded. Skipping.
Data for AL in 2014 already downloaded. Skipping.
Data for AL in 2015 already downloaded. Skipping.
Data for AL in 2016 already downloaded. Skipping.
Data for AL in 2017 already downloaded. Skipping.
Data for AL in 2018 already downloaded. Skipping.
Da