# Analysis of Stream Gauges

This code was created to calculate slopes and perform our drought analysis on each individual wells used in this study.

To run this code, you need to follow all of the DataPreprocessing instructions.

Total time to run on my machine: TBA

In [1]:
# Packages and paths

import os
import numpy as np
import pandas as pd
#import earthpy as et
import scipy.stats as sp

# Local paths
inputpath = '../../Data/Input/RawData'
outputpath = '../../Data/Output/Local'

# Function to calculate slope using linear regression
def calculate_slope(y):
    x = np.arange(len(y))
    slope, _ = np.polyfit(x, y, 1)
    return slope

In [2]:
# ==== Reading in the data ====
filename_ts = 'USGSStreamgauges.txt'
filepath = os.path.join(inputpath, filename_ts)

# Open the file and read the first 10 lines
print("Supplemental Information about this file:")
with open(filepath, 'r') as file:
    for i in range(301):
        print(file.readline().strip())

stream_db = pd.read_csv(filepath, sep='\t',
                        # header=300,
                        skiprows=299,  # Adjust as needed
                        on_bad_lines="skip"
                    )
stream_db

# Some of the data that you have obtained from this U.S. Geological Survey database
# may not have received Director's approval. Any such data values are qualified
# as provisional and are subject to revision. Provisional data are released on the
# condition that neither the USGS nor the United States Government may be held liable
# for any damages resulting from its use.
#
# Additional info: https://waterdata.usgs.gov/provisional-data-statement/
#
# Contact:   gs-w_waterdata_support@usgs.gov
# retrieved: 2025-01-15 00:29:48 EST       (nadww02)
#
# Data for the following 271 site(s) are contained in this file
#    USGS 09379025 CHINLE CREEK AT CHINLE, AZ
#    USGS 09379050 LUKACHUKAI CREEK NEAR LUKACHUKAI, AZ
#    USGS 09379180 LAGUNA CREEK AT DENNEHOTSO, AZ
#    USGS 09379200 CHINLE CREEK NEAR MEXICAN WATER, AZ
#    USGS 09379910 COLORADO RIVER BELOW GLEN CANYON DAM, AZ
#    USGS 09380000 COLORADO RIVER AT LEES FERRY, AZ
#    USGS 09382000 PARIA RIVER AT LEES FERRY, AZ
#    USGS 09383

  stream_db = pd.read_csv(filepath, sep='\t',


Unnamed: 0,agency_cd,site_no,datetime,tz_cd,6121_00060,6121_00060_cd
0,5s,15s,20d,6s,14n,10s
1,USGS,09379025,2000-01-01 00:00,MST,4.30,A:[91]
2,USGS,09379025,2000-01-01 00:15,MST,4.30,A:[91]
3,USGS,09379025,2000-01-01 00:30,MST,4.60,A:[91]
4,USGS,09379025,2000-01-01 00:45,MST,3.60,A:[91]
...,...,...,...,...,...,...
140178204,USGS,352025111332401,2014-10-13 22:45,MST,0.0,A
140178205,USGS,352025111332401,2014-10-13 23:00,MST,0.0,A
140178206,USGS,352025111332401,2014-10-13 23:15,MST,0.0,A
140178207,USGS,352025111332401,2014-10-13 23:30,MST,0.0,A


In [4]:
stream_db = stream_db[1:140178208]
stream_db = stream_db.rename(columns={"agency_cd": "Agency",
                   "site_no": "ID",
                   "6121_00060": "Discharge (cfs)"}, errors="raise")
stream_db.head()

Unnamed: 0,Agency,ID,datetime,tz_cd,Discharge (cfs),6121_00060_cd
1,USGS,9379025,2000-01-01 00:00,MST,4.3,A:[91]
2,USGS,9379025,2000-01-01 00:15,MST,4.3,A:[91]
3,USGS,9379025,2000-01-01 00:30,MST,4.6,A:[91]
4,USGS,9379025,2000-01-01 00:45,MST,3.6,A:[91]
5,USGS,9379025,2000-01-01 01:00,MST,4.3,A:[91]


Basically when I tried to convert the date column into datetime, I noticed that metadata is being repeated for each station deep in the tables.  

In [6]:
stream_db.iloc[315825]

Agency                 agency_cd
ID                       site_no
datetime                datetime
tz_cd                      tz_cd
Discharge (cfs)       6122_00060
6121_00060_cd      6122_00060_cd
Name: 315826, dtype: object

In [10]:
# del stream_db.iloc[315825]
stream_db.iloc[315810:3158228]

Unnamed: 0,Agency,ID,datetime,tz_cd,Discharge (cfs),6121_00060_cd
315811,USGS,09379025,2025-01-14 21:30,MST,0.00,P
315812,USGS,09379025,2025-01-14 21:45,MST,0.00,P
315813,USGS,09379025,2025-01-14 22:00,MST,0.00,P
315814,#,,,,,
315815,# Data provided for site 09379050,,,,,
...,...,...,...,...,...,...
3158224,USGS,9382000,2011-11-22 17:45,MST,9.9,A
3158225,USGS,9382000,2011-11-22 18:00,MST,10.7,A
3158226,USGS,9382000,2011-11-22 18:15,MST,9.51,A
3158227,USGS,9382000,2011-11-22 18:30,MST,11.1,A


So to account for this I ran the following pre-processing of the original stream database file and renamed it to "filtered_streamgauges_test.csv".|

In [20]:
import csv

filepath = os.path.join(inputpath, filename_ts)  # Replace with your file path
output_file = "filtered_streamgauges_test.csv"  # Adjust output file extension if needed

# Define the index of the column to check for NaN (adjust index as needed)
column_to_check = 2  # Replace with the zero-based index of the column you want to check

with open(filepath, 'r') as infile, open(output_file, 'w', newline='') as outfile:
    reader = csv.reader(infile, delimiter='\t')  # Set delimiter to tab
    writer = csv.writer(outfile, delimiter='\t')  # Set delimiter to tab

    # Skip the first 299 lines
    for _ in range(299):
        next(reader)

    # Write header (if the 300th line is the header)
    header = next(reader)
    writer.writerow(header)

    # Process each line
    for row in reader:
        # Skip empty or malformed rows
        if len(row) <= column_to_check:  # Check if the row has enough columns
            print(f"Skipping malformed row: {row}")
            continue

        # Check if the target column is not empty (no NaN)
        if row[column_to_check].strip() != '':
            writer.writerow(row)


Skipping malformed row: ['#']
Skipping malformed row: ['# Data provided for site 09379050']
Skipping malformed row: ['#            TS   parameter     Description']
Skipping malformed row: ['#          6122       00060     Discharge, cubic feet per second']
Skipping malformed row: ['#']
Skipping malformed row: ['# Data-value qualification codes included in this output:']
Skipping malformed row: ['#     A  Approved for publication -- Processing and review completed.']
Skipping malformed row: ['#    90  Daily mean calculated from data on this day matches published daily mean to nearest hundredth']
Skipping malformed row: ['#    91  Daily mean calculated from data on this day matches published daily mean within 1 percent']
Skipping malformed row: ['#    92  Daily mean calculated from data on this day matches published daily mean within 5 percent']
Skipping malformed row: ['#    93  Daily mean calculated from data on this day matches published daily mean within 10 percent']
Skipping malform

Now Reading in the new table, hopefully without the metadata

In [2]:
filtered_streamdb = pd.read_csv("filtered_streamgauges_test.csv"
                                ,sep='\t'
                                ,on_bad_lines='skip')
filtered_streamdb.head()

  filtered_streamdb = pd.read_csv("filtered_streamgauges_test.csv"


Unnamed: 0,agency_cd,site_no,datetime,tz_cd,6121_00060,6121_00060_cd
0,5s,15s,20d,6s,14n,10s
1,USGS,09379025,2000-01-01 00:00,MST,4.30,A:[91]
2,USGS,09379025,2000-01-01 00:15,MST,4.30,A:[91]
3,USGS,09379025,2000-01-01 00:30,MST,4.60,A:[91]
4,USGS,09379025,2000-01-01 00:45,MST,3.60,A:[91]


In [4]:
filtered_streamdb.iloc[315811:3158228]

Unnamed: 0,agency_cd,site_no,datetime,tz_cd,6121_00060,6121_00060_cd
315811,USGS,09379025,2025-01-14 21:30,MST,0.00,P
315812,USGS,09379025,2025-01-14 21:45,MST,0.00,P
315813,USGS,09379025,2025-01-14 22:00,MST,0.00,P
315814,agency_cd,site_no,datetime,tz_cd,6122_00060,6122_00060_cd
315815,5s,15s,20d,6s,14n,10s
...,...,...,...,...,...,...
3158223,USGS,9382000,2011-11-23 11:30,MST,12.0,A
3158224,USGS,9382000,2011-11-23 11:45,MST,13.5,A
3158225,USGS,9382000,2011-11-23 12:00,MST,11.6,A
3158226,USGS,9382000,2011-11-23 12:15,MST,10.7,A


So we still have repeated headers and that "5s 15s" thing that I don't know what it means.  We're going to run this code on the filtered data to get rid of those lines.

In [5]:
import csv

input_file = "filtered_streamgauges_test.csv"  # Replace with the filtered file path
output_file = "cleaned_streamgauges.csv"       # Final output file name

# Define a set of known "weird" values to filter out (e.g., "15s")
weird_values = {"5s","15s","20d","6s","14n","10s"}  # Add more values here if needed

with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
    reader = csv.reader(infile, delimiter='\t')  # Adjust delimiter as needed
    writer = csv.writer(outfile, delimiter='\t')  # Adjust delimiter as needed

    # Read the first header row
    header = next(reader)
    writer.writerow(header)  # Write the initial header to the output file

    # Process each line
    for row in reader:
        # Check if the row matches the header exactly
        if row == header:
            print(f"Skipping repeated header row: {row}")
            continue  # Skip repeated header rows

        # Check if the row contains "weird" values
        if any(cell in weird_values for cell in row):
            print(f"Skipping weird row: {row}")
            continue  # Skip rows with weird values

        # Write all other rows
        writer.writerow(row)


Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s', '15s', '20d', '6s', '14n', '10s']
Skipping weird row: ['5s

In [2]:
cleaned_streamdb = pd.read_csv("cleaned_streamgauges.csv"
                                ,sep='\t'
                                ,on_bad_lines='skip')
cleaned_streamdb.head()

  cleaned_streamdb = pd.read_csv("cleaned_streamgauges.csv"


Unnamed: 0,agency_cd,site_no,datetime,tz_cd,6121_00060,6121_00060_cd
0,USGS,9379025,2000-01-01 00:00,MST,4.3,A:[91]
1,USGS,9379025,2000-01-01 00:15,MST,4.3,A:[91]
2,USGS,9379025,2000-01-01 00:30,MST,4.6,A:[91]
3,USGS,9379025,2000-01-01 00:45,MST,3.6,A:[91]
4,USGS,9379025,2000-01-01 01:00,MST,4.3,A:[91]


In [4]:
# Check the problem rows
cleaned_streamdb.iloc[315811:3158225]

Unnamed: 0,agency_cd,site_no,datetime,tz_cd,6121_00060,6121_00060_cd
315811,USGS,09379025,2025-01-14 21:45,MST,0.00,P
315812,USGS,09379025,2025-01-14 22:00,MST,0.00,P
315813,agency_cd,site_no,datetime,tz_cd,6122_00060,6122_00060_cd
315814,USGS,09379050,2000-01-26 00:00,MST,3.30,A:[91]
315815,USGS,09379050,2000-01-26 00:15,MST,3.60,A:[91]
...,...,...,...,...,...,...
3158220,USGS,9382000,2011-11-23 12:30,MST,9.51,A
3158221,USGS,9382000,2011-11-23 12:45,MST,12.0,A
3158222,USGS,9382000,2011-11-23 13:00,MST,9.9,A
3158223,USGS,9382000,2011-11-23 13:15,MST,9.9,A


The whole header line thing didn't work so we'll just delete it here

In [5]:
df = cleaned_streamdb[cleaned_streamdb['agency_cd'] != 'agency_cd']
df.iloc[315811:3158225]


Unnamed: 0,agency_cd,site_no,datetime,tz_cd,6121_00060,6121_00060_cd
315811,USGS,09379025,2025-01-14 21:45,MST,0.00,P
315812,USGS,09379025,2025-01-14 22:00,MST,0.00,P
315814,USGS,09379050,2000-01-26 00:00,MST,3.30,A:[91]
315815,USGS,09379050,2000-01-26 00:15,MST,3.60,A:[91]
315816,USGS,09379050,2000-01-26 00:30,MST,3.60,A:[91]
...,...,...,...,...,...,...
3158226,USGS,9382000,2011-11-23 14:00,MST,11.6,A
3158227,USGS,9382000,2011-11-23 14:15,MST,10.3,A
3158228,USGS,9382000,2011-11-23 14:30,MST,9.9,A
3158229,USGS,9382000,2011-11-23 14:45,MST,10.7,A


In [11]:
df.to_csv("streamgauges_bettercleaning.csv")

In [12]:
cleaned_streamdb = pd.read_csv("streamgauges_bettercleaning.csv"
                                ,sep='\t'
                                ,on_bad_lines='skip')
cleaned_streamdb.head()

: 

In [6]:
df['datetime'] = pd.to_datetime(df.datetime)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = pd.to_datetime(df.datetime)


Unnamed: 0,agency_cd,site_no,datetime,tz_cd,6121_00060,6121_00060_cd
0,USGS,9379025,2000-01-01 00:00:00,MST,4.3,A:[91]
1,USGS,9379025,2000-01-01 00:15:00,MST,4.3,A:[91]
2,USGS,9379025,2000-01-01 00:30:00,MST,4.6,A:[91]
3,USGS,9379025,2000-01-01 00:45:00,MST,3.6,A:[91]
4,USGS,9379025,2000-01-01 01:00:00,MST,4.3,A:[91]


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 140174942 entries, 0 to 140175174
Data columns (total 6 columns):
 #   Column         Dtype         
---  ------         -----         
 0   agency_cd      object        
 1   site_no        object        
 2   datetime       datetime64[ns]
 3   tz_cd          object        
 4   6121_00060     object        
 5   6121_00060_cd  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 7.3+ GB


In [9]:
df = df.rename(columns={"agency_cd": "Agency",
                   "site_no": "ID",
                   "6121_00060": "Discharge (cfs)"}, errors="raise")
df

MemoryError: Unable to allocate 5.22 GiB for an array with shape (5, 140174942) and data type object