In [None]:
####### This Notebook is for Calculating Accuracy Assessment #######
## First, pull all available USGS temperature gage data. Final results only use gages that occur within the same Landsat scenes as another gage ##

In [None]:
# Import the packages needed
import numpy as np
from dataretrieval import nwis
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
import os
import glob
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import datetime as dt
from sklearn.linear_model import LinearRegression

In [None]:
################## Prepare the Data ##################

In [None]:
# Get Dam AOI File
# Dam areas of interest (AOIs), read as a GeoDataFrame. A later cell spatially joins the
# gage points against these polygons to keep only gages that fall within an AOI.
# NOTE(review): the 'Assgn_dam' column used further down presumably comes from this file -- confirm.
DamAOIs = gpd.read_file(r"F:\Insert_File_Path_of_Shapefile_with_Dam_AOIs.shp") # Update this file path

In [None]:
### Select USGS Site Numbers for the Study ###
# Query settings: USGS parameter code (renamed to 'Temperature' after download) and study period
P_Code = '00010'
StartDate = '2013-03-01'
EndDate = '2023-12-31'

## Candidate gages -- list was pulled from the USGS website
# site_no is read as text so the padded zeros are kept
Unique_Gages = pd.read_excel(
    r"F:\Insert_File_Path_of_All_Possible_Gage_Site_Nos.xlsx",  # Update this file path here
    sheet_name='Sheet1',
    converters={'site_no': str},
)
# Keep only the gages that have both a longitude and a latitude
Unique_Gages = Unique_Gages.dropna(subset=['dec_long_va', 'dec_lat_va'])
Unique_Gages.head()

In [None]:
### Need distances later to make sure these nodes are near our larger rivers ###
# Turn the gage table into a GeoDataFrame with one point per gage (longitude = x, latitude = y)
Unique_Gages_gdf = gpd.GeoDataFrame(
    Unique_Gages,
    geometry=gpd.points_from_xy(x=Unique_Gages.dec_long_va, y=Unique_Gages.dec_lat_va),
)

# SWORD nodes; the width column gets a source-specific name
SWORD_Nodes = gpd.read_file(r"F:\Insert_File_Path_of_the_Shapefile_Containing_the_Selected_SWORD_Nodes.shp") # Update this file path
SWORD_Nodes = SWORD_Nodes.rename(columns={"width": "SWD_wid"})

In [None]:
## Define Nearest Node Function For Later ##
def Closest_Nodes(gdA, gdB):
    """Attach to every point of gdA the attributes of its nearest point in gdB.

    Parameters
    ----------
    gdA, gdB : frames with a ``geometry`` column of point objects exposing ``.x`` and ``.y``.

    Returns
    -------
    A frame with one row per row of gdA (index reset): the gdA columns, the columns of the
    matched gdB row (without its geometry), and ``dist`` -- the straight-line distance
    between the two points in the units of the input coordinates.
    """
    # Coordinate arrays, one (x, y) row per point
    coords_a = np.array([(pt.x, pt.y) for pt in gdA.geometry])
    coords_b = np.array([(pt.x, pt.y) for pt in gdB.geometry])

    # k-d tree over gdB, queried for the single nearest neighbour of each gdA point
    nearest_dist, nearest_pos = cKDTree(coords_b).query(coords_a, k=1)

    # Rows of gdB in match order; geometry dropped so it does not clash with gdA's
    matched_b = gdB.iloc[nearest_pos].drop(columns="geometry").reset_index(drop=True)
    left_side = gdA.reset_index(drop=True)
    distances = pd.Series(nearest_dist, name='dist')

    return pd.concat([left_side, matched_b, distances], axis=1)

In [None]:
# Set CRS (My Data is WGS 84, USGS is NAD 83), Match Them, Find points within AOIs
# Declare the gage coordinates as NAD 83, then reproject them to WGS 84
Unique_Gages_gdf_crs = Unique_Gages_gdf.set_crs('EPSG:4269').to_crs('EPSG:4326')

# Keep gages that lie within a dam AOI; a gage matched more than once keeps its last match only
NN_Gage_Filt = (
    gpd.sjoin(Unique_Gages_gdf_crs, DamAOIs, predicate='within')
    .drop_duplicates(subset='site_no', keep='last')
)

# Save File and Preview
NN_Gage_Filt.to_csv(r"F:\Insert_File_Path_for_List_USGS_Gages_SWORD.csv")  # Update this file path
print("Gages within AOIs: " + str(len(NN_Gage_Filt)))
NN_Gage_Filt.head()

In [None]:
### Pull USGS Data ###
# Site numbers of the gages that fall within the AOIs, as a plain Python list
Selected_Sites_List = list(NN_Gage_Filt['site_no'])

In [None]:
##### Pull Gage Data from USGS and Save it #####
### Takes a long time -- Running only once is recommended ####

# Folder that receives one CSV per gage
Outputpath = r"F:\Insert_File_Path_for_Saving_Gage_Data"  # Update this file path

# Site numbers whose request or export failed, kept so they can be inspected or retried
Failed_Sites = []

# Each gage is written to disk so the data is kept and later cells can reload it quickly (Big Datasets)
for i in Selected_Sites_List:
    try:
        TempGage = nwis.get_record(sites=i, service='iv', parameterCd=P_Code, start=StartDate, end=EndDate)
        TempGage = TempGage.rename(columns={'00010': 'Temperature'})
        # The file name has no extension on purpose: later cells look for "Gage_<site_no>"
        TempGage.to_csv(os.path.join(Outputpath, "Gage_" + str(i)))
        print("Gage " + str(i) + " CSV Exported")
    except Exception as err:
        # Best effort: move on to the next gage, but report the reason instead of hiding it.
        # (A bare `except:` would also swallow KeyboardInterrupt and make the loop unstoppable.)
        Failed_Sites.append(i)
        print("Gage " + str(i) + " skipped: " + repr(err))

In [None]:
# Get the List of Gage Data
Outputpath = r"F:\Insert_File_Path_of_Folder_of_Saved_Gage_Data"  # Update this file path
Gage_Files = glob.glob(os.path.join(Outputpath, "Gage_*"))
SplitString = "Gage_"  # file-name prefix; the text after it is the gage number

# Gage numbers of the files that actually contain data (some gages export empty)
Gage_List = []

for i in Gage_Files:
    # File size is in bytes; anything at or below 1 KB (1000 bytes) is treated as empty
    check_file = os.path.getsize(i)
    if check_file > 1000:
        # Read the gage number from the file name only. Splitting the full path on "Gage_"
        # returns the wrong piece when a folder name also contains "Gage_"
        # (for example "...Saved_Gage_Data").
        gage_no = os.path.basename(i)[len(SplitString):]
        Gage_List.append(gage_no)

print("Number of Gages Exported: "+ str(len(Gage_Files)))
print( "Total Gages with Data: " + str(len(Gage_List)))

In [None]:
## For reference: these missing gages are because of the study period mismatch ##
# AOI gages whose site number is not among the gages with a non-empty data file
EmptyGages = NN_Gage_Filt.loc[~NN_Gage_Filt['site_no'].isin(Gage_List)]

In [None]:
###  Get which Dam's RWCT to pull in -- Data is stored by dam
# Distinct dam identifiers attached to the selected gages, in ascending order
RWCT_AA_List = sorted(NN_Gage_Filt['Assgn_dam'].drop_duplicates().tolist())
print("Number of Assiocated Dams:  " + str(len(RWCT_AA_List)))
print(RWCT_AA_List)

In [None]:
################## Look at the 5 Closest Temperature Nodes ##################

In [None]:
## Want to pull the accuracy like the analysis does (temperature is the average of 5 closest nodes)
# Per-image result frames are collected in a list and concatenated once after the loops
# (growing a DataFrame with pd.concat inside the loop re-copies it on every iteration).
GageTemps_5n_parts = []

for i in RWCT_AA_List:
    try:
        # Pull in Temp Points based on the dam
        RWCT_Data = pd.read_csv(r"Insert_File_Path_of_the_Folder_of_Temperature_Outputs_from_RWC\Dam_"+ str(int(i)) +r"RWCT.csv" ,
                                usecols = ['Clst_Dam', 'GEE_temp', 'flag_cldShadow', 'flag_cloud', 'flag_water', 'image_id', 'latitude', 'longitude', 'RWC_wid', 'Date_Time', 'Date', 'Month', 'Year', 'Season'], # Keep only the columns needed for accuracy assessment
                                engine='python')
        geometry = [Point(xy) for xy in zip(RWCT_Data['longitude'], RWCT_Data['latitude'])]
        RWCT_Data_gdf = gpd.GeoDataFrame(RWCT_Data, geometry=geometry, crs="EPSG:4326")

        # Gages assigned to this dam, reduced to the columns needed and to sites with saved data
        Dam_Gages = NN_Gage_Filt[NN_Gage_Filt['Assgn_dam']== i]
        Dam_Gages = Dam_Gages[['agency_cd', 'site_no', 'station_nm', 'site_tp_cd', 'dec_lat_va',  'dec_long_va', 'huc_cd']]
        Dam_Gages = Dam_Gages[(Dam_Gages['site_no'].isin(Gage_List))]
        geometry2 = [Point(xy) for xy in zip(Dam_Gages[ 'dec_long_va'], Dam_Gages[ 'dec_lat_va'])]
        Dam_Gages_gdf = gpd.GeoDataFrame(Dam_Gages, geometry=geometry2, crs="EPSG:4326")

        if Dam_Gages_gdf.empty:
            # The nearest-neighbour search needs at least one gage to match against
            print("Skipped Dam: " + str(i) + " -- no gages with data")
            continue

        # Process one image (unique date/time) at a time
        RWCT_Times = RWCT_Data_gdf['Date_Time'].drop_duplicates().tolist()
        for j in RWCT_Times:
            OneImage = RWCT_Data_gdf[RWCT_Data_gdf['Date_Time']== j]
            # Pair every RWC point of this image with its nearest gage
            Image_Dist = Closest_Nodes(OneImage, Dam_Gages_gdf)
            # Nearest Distance is calculated in Degrees b/c both gdf are in WGS 84 --> (Convert Degrees to meters * 111139)
            Image_Dist['NDist_m'] = Image_Dist['dist']*111139

            ## Filtering to the closest nodes for averaging temps ##
            # For each gage, the 5 RWC points with the smallest distance in this image
            Nearest_RWCT_grp = Image_Dist.groupby(['site_no','Date_Time'])['dist'].nsmallest(5)
            Nearest_RWCT_grp = Nearest_RWCT_grp.reset_index()

            # 'level_2' holds the row labels of Image_Dist; its index was reset inside
            # Closest_Nodes, so the labels can be used as positions with iloc
            Node_list5 = Nearest_RWCT_grp['level_2'].tolist()
            Select_ImageNodes = Image_Dist.iloc[Node_list5]

            # Remove extra nodes -- If it is farther than 200m drop it
            Image_Dist_filt = Select_ImageNodes[Select_ImageNodes['NDist_m'] <= 200]

            #### Get the Temperature Averages ####
            ## Group by gage and image date/time -- average temperature and width of the kept nodes
            Avg_Near_Temps = Image_Dist_filt.groupby(['site_no','Date_Time']).agg({'GEE_temp': ['mean'],'RWC_wid': ['mean']})
            Avg_Near_Temps.columns = ['Avg_Temp', 'Avg_RWC_Wid']
            Avg_Near_Temps = Avg_Near_Temps.reset_index()

            # Convert to Dataframe
            AvgTemps_df = pd.DataFrame(Avg_Near_Temps)

            # Join the gage attributes back onto the averages
            AvgMonTemps_xy = pd.merge(AvgTemps_df, Dam_Gages_gdf, on='site_no', how='inner')

            GageTemps_5n_parts.append(AvgMonTemps_xy)

        print("Finished Dam: "+ str(i))
    except Exception as err:
        # Best effort: keep going with the next dam, but say which dam was lost and why
        # (for example a missing RWCT file) instead of dropping it silently.
        print("Skipped Dam: " + str(i) + " -- " + repr(err))

# Images processed before a failure are kept, as they were when results were concatenated per image
GageTemps_5n = pd.concat(GageTemps_5n_parts, axis=0) if GageTemps_5n_parts else pd.DataFrame()

GageTemps_5n

In [None]:
# Export to CSV
# Writes the GageTemps_5n table (averaged RWCT temperature and width per gage and image) to disk
GageTemps_5n.to_csv(r"F:\Insert_File_Path_for\RWCT_Gage_Temps_5n.csv") # Update this file path
print("Gage Temp 5n CSV Exported")

In [None]:
## Update some Fields for Assessment  ##
# Work on an independent copy so the columns changed below leave GageTemps_5n untouched
# (pd.DataFrame(df) does not guarantee an independent copy of the data)
GageTemps_5n_forjoin = GageTemps_5n.copy()

# Round DateTime to Nearest Hour -- to pair up gage data
GageTemps_5n_forjoin['Date_Time'] = pd.to_datetime(GageTemps_5n_forjoin['Date_Time'], format='mixed')
GageTemps_5n_forjoin['Round_time'] = GageTemps_5n_forjoin['Date_Time'].dt.round('60min')

print('Total Sites with Available RWCT Avg Data: '+ str(len(GageTemps_5n_forjoin['site_no'].drop_duplicates().tolist())))

In [None]:
# Loop through the gages and add temperatures
# One merged frame per gage is collected here and concatenated once after the loop
Gage_RS_Temp_parts = []

for i in Gage_List:
    # Pull in and set up gage records
    Gage_Record = pd.read_csv(r"F:\Insert_File_Path_of_Folder_of_Saved_Gage_Data\Gage_" + str(i), converters={'site_no': str}, engine = 'python')  # Update this file path
    # The gage timestamp is the join key against the hour-rounded image time
    # NOTE(review): the merge needs both 'Round_time' columns to agree on timezone awareness --
    # confirm the gage 'datetime' strings and the image Date_Time values are compatible.
    Gage_Record['Round_time'] =  pd.to_datetime(Gage_Record['datetime'])
    Gage_Record['site_no'] = Gage_Record['site_no'].astype(str)

    # Satellite-side rows (averaged nearest-node temperatures) for this site
    SIteInfo =  GageTemps_5n_forjoin[(GageTemps_5n_forjoin['site_no'] == i)]

    # Keep only the timestamps present on both sides
    MergedTemps_5n = pd.merge(SIteInfo, Gage_Record, on=['site_no', 'Round_time'], how='inner')

    Gage_RS_Temp_parts.append(MergedTemps_5n)

# Single concatenation; an empty gage list still yields an (empty) DataFrame
Gage_RS_Temp_5n = pd.concat(Gage_RS_Temp_parts, axis=0) if Gage_RS_Temp_parts else pd.DataFrame()
print("Combined: Measurements for all sites with available gage and satellite observations")

Gage_RS_Temp_5n.head()

In [None]:
### Gage data has some messy naming conventions that need to be cleaned ###
# Strip spaces and punctuation out of the column names, one character at a time
cleaned_cols = Gage_RS_Temp_5n.columns
for unwanted in (' ', ',', '[', ']', '/'):
    cleaned_cols = cleaned_cols.str.replace(unwanted, '')
Gage_RS_Temp_5n.columns = cleaned_cols

# Intermediate file -- every gage observation that matched a Landsat observation
Gage_RS_Temp_5n.to_csv(r"F:\Insert_File_Path_For\Matched_Gage_Temps_5n.csv")  # Update this file path

# Some columns have non-uniform naming conventions. We want to standardize them
# Rows where the standard "Temperature" column is empty (kept for reference)
NonTraditional_Temps = Gage_RS_Temp_5n[Gage_RS_Temp_5n[['Temperature']].isna().any(axis=1)]

## Clean up columns to get one single temperature column ##
# Name fragments that mark an alternate temperature column
ALT_TEMP_TOKENS = [
    '_ysi', '_up', '_top', '_left', '_right', '_center', '_float', '_p4', '_cs',
    'thermistor', '_tdg',
    # More specific column types
    '_downstreamhydro', '_exo1depthat3feet', '_hrecos', 'pier',
]

# The matching "..._cd" columns hold codes rather than temperatures, so they are left out
suff = '_cd'

# One pass over the columns: each column is listed at most once even if it matches several
# fragments, so no column can be counted twice in the row mean below
alttemp_cols = [
    col for col in NonTraditional_Temps.columns
    if any(token in col for token in ALT_TEMP_TOKENS) and not col.endswith(suff)
]

print(alttemp_cols)

# All temperature columns; only those present in this data set are used, because a fixed
# name such as '00010_2' is missing when no downloaded gage reports it
USGS_Temp_Cols = [
    col for col in dict.fromkeys(alttemp_cols + ['00010_2'] + ['Temperature'])
    if col in Gage_RS_Temp_5n.columns
]

# Get the Temps in one column -- Average because each row only has one temp value, rest are nulls
Gage_RS_Temp_5n['USGS_Temp'] = Gage_RS_Temp_5n[USGS_Temp_Cols].mean(axis=1).astype(float)

# Export the fixed matches to a CSV
Gage_RS_Temp_5n.to_csv(r"F:\Insert_File_Path_for\Matched_Gage_Temps_ColumnFix_5n.csv")  # Update this file path
print("Matched Gage Temp 5n -- Fixed Columns CSV Exported")

## For following analyses -- use a simplified DF
# Keep the comparison columns only, and drop rows whose gage temperature is negative (error values)
QuickView_5n = Gage_RS_Temp_5n.loc[
    Gage_RS_Temp_5n['USGS_Temp'] >= 0,
    ['site_no', 'huc_cd', 'Date_Time', 'Avg_Temp', 'Avg_RWC_Wid', 'USGS_Temp'],
]
QuickView_5n

In [None]:
############### Get Accuracy #################

In [None]:
## Define Error Metrics ##
# Small offset that keeps _percentage_error from dividing by exactly zero
EPSILON = 1e-10

def _error(actual: np.ndarray, predicted: np.ndarray):
    """ Simple (signed) error: predicted minus actual """
    return predicted - actual

def _percentage_error(actual: np.ndarray, predicted: np.ndarray):
    """
    Percentage error
    Note: result is NOT multiplied by 100
    """
    return _error(actual, predicted) / (actual + EPSILON)

def mse(actual: np.ndarray, predicted: np.ndarray):
    """ Mean Squared Error """
    return np.mean(np.square(_error(actual, predicted)))

def rmse(actual: np.ndarray, predicted: np.ndarray):
    """ Root Mean Squared Error """
    return np.sqrt(mse(actual, predicted))

def nrmse(actual: np.ndarray, predicted: np.ndarray):
    """ Normalized Root Mean Squared Error (RMSE divided by the range of `actual`) """
    return rmse(actual, predicted) / (actual.max() - actual.min())

def mae(actual: np.ndarray, predicted: np.ndarray):
    """ Mean Absolute Error """
    return np.mean(np.abs(_error(actual, predicted)))

def mbe(actual: np.ndarray, predicted: np.ndarray):
    """
    Mean Bias Error (mean of the signed error)
    Uses np.mean like the other metrics, so a pandas Series with missing values is
    averaged over the same rows by every metric (sum / len counted missing rows too).
    """
    return np.mean(_error(actual, predicted))

# Metric name -> function; the insertion order here is the order used in reports
METRICS = {
    'rmse': rmse,
    'nrmse': nrmse,
    'mae': mae,
    'mbe':mbe
}

def evaluate(actual: np.ndarray, predicted: np.ndarray, metrics=('rmse', 'nrmse', 'mae', 'mbe')):
    """
    Compute the named metrics and return them as {name: value}, in the order given.
    A metric that raises is reported and stored as NaN so the remaining ones still run.
    """
    results = {}
    for name in metrics:
        try:
            results[name] = METRICS[name](actual, predicted)
        except Exception as err:
            results[name] = np.nan
            print('Unable to compute metric {0}: {1}'.format(name, err))
    return results

def evaluate_all(actual: np.ndarray, predicted: np.ndarray):
    """
    Compute every metric registered in METRICS.
    The names are passed as a tuple in dictionary order: iterating a set of strings can
    change order between Python sessions, which reshuffled the columns of exported tables.
    """
    return evaluate(actual, predicted, metrics=tuple(METRICS))

In [None]:
##############################
### Check Wider & Error  Points ###
#############################

In [None]:
## Check wider points ##
# Get a copy of the data
# Note: we use Date_Time because the timestamps are needed to match images later
QuickView_5n = Gage_RS_Temp_5n[['site_no','huc_cd','Date_Time','Avg_Temp', 'Avg_RWC_Wid', 'USGS_Temp']]

# Remove error gage values; .copy() makes the result an independent frame so the column
# added below is not an assignment on a slice of Gage_RS_Temp_5n
QuickView_5n = QuickView_5n[QuickView_5n['USGS_Temp']>= 0].copy()

# Define the width bin edges: data minimum, fixed break points, data maximum.
# pd.cut requires strictly increasing edges, so the edges are sorted and de-duplicated
# (the data minimum can be at or above 100 and the maximum at or below 1500), and a
# missing min/max (no usable widths) is dropped.
width_edges = [QuickView_5n['Avg_RWC_Wid'].min(), 100,  300, 500, 1500, QuickView_5n['Avg_RWC_Wid'].max()]
bins = np.unique([edge for edge in width_edges if pd.notna(edge)])

# Bin the width column; include_lowest keeps the narrowest observation inside the first bin
QuickView_5n['Width_bin'] = pd.cut(QuickView_5n['Avg_RWC_Wid'], bins, include_lowest=True)

# Remove observations < 100m in width
QuickView_5n = QuickView_5n[ (QuickView_5n['Avg_RWC_Wid']>=100)]

# Preview
QuickView_5n

In [None]:
###########################################################
################## Get RELATIVE ACCURACIES  ##################
###########################################################

In [None]:
### We want to compare relative accuracies -- Landsat delta vs Gage delta within a given image 
## So we need to figure out which of the above gages are within the same landsat images as each other

In [None]:
## Make a copy of the gages
# .copy() gives an independent frame; a full slice ([:]) still shares its data with QuickView_5n
Filtered = QuickView_5n.copy()

## Get List of Gages -- Used to Compare the location to Landsat Scene footprints in Arc
potential_sites = Filtered['site_no'].drop_duplicates().to_list()
print("Number of Potential Sites: " + str(len(potential_sites)))
print("Site Numbers: ")
print(potential_sites)

In [None]:
########## GAGE PAIRS WERE DETERMINED MANUALLY IN ARCGIS ##########
## Output is a csv file -- USGS Site numbers have leading zeros. Be cautious of opening the CSV file in Excel ##

In [None]:
## Use all the possible pair options -- Created in ArcGIS and populated in a CSV file  ##
# 'Site 1' and 'Site 2' are read as text so the leading zeros of the site numbers are kept.
# NOTE(review): the next cell reads the pair by column position (0 and 1) -- confirm that
# 'Site 1' and 'Site 2' are the first two columns of this file.
GagePairs = pd.read_csv(r"F:\Insert_File_Path_of_Matched_Gages\Site_Image_Match.csv", converters={'Site 1': str, 'Site 2': str }, engine = 'python') # Update this file path
GagePairs.head()

In [None]:
## Join The gage and landsat Information with the corresponding sites
# Columns kept from the self-join (_x = first site of the pair, _y = second site)
pair_cols = ["site_no_x", "Date_Time", "Avg_Temp_x","Avg_RWC_Wid_x", "USGS_Temp_x","site_no_y", "Avg_Temp_y", "Avg_RWC_Wid_y","USGS_Temp_y"]

# One frame per gage pair, concatenated once after the loop
pair_frames = []

# The first two columns of GagePairs hold the two site numbers of each pair
for site_a, site_b in zip(GagePairs.iloc[:, 0], GagePairs.iloc[:, 1]):
    PairTest_1 = Filtered[Filtered['site_no']== site_a]
    PairTest_2 = Filtered[Filtered['site_no']== site_b]

    # Rows where both gages were observed at the same image time
    JoinSitePair = pd.merge(PairTest_1,PairTest_2, on= "Date_Time")
    # .copy() so the difference columns are added to an independent frame, not a slice
    JoinSitePair = JoinSitePair[pair_cols].copy()
    # Between-site difference as seen by the imagery and by the gages
    JoinSitePair["Image_Diff"] = JoinSitePair["Avg_Temp_x"]-JoinSitePair["Avg_Temp_y"]
    JoinSitePair["Gage_Diff"] = JoinSitePair["USGS_Temp_x"]-JoinSitePair["USGS_Temp_y"]

    pair_frames.append(JoinSitePair)

# With no pairs at all, return an empty frame that still has the expected columns
if pair_frames:
    AllSite_ImageOverlap = pd.concat(pair_frames, ignore_index=True)
else:
    AllSite_ImageOverlap = pd.DataFrame(columns=pair_cols + ["Image_Diff", "Gage_Diff"])

AllSite_ImageOverlap.head()

In [None]:
## Get Number of Usable Gages ##
Site1List = AllSite_ImageOverlap["site_no_x"].to_list()
Site2List = AllSite_ImageOverlap["site_no_y"].to_list()
# Distinct sites that appear on either side of a pair; sorted so the printed list is the
# same on every run (the iteration order of a set of strings is not stable across sessions)
Crossover_Sites = sorted(set(Site1List + Site2List))
print("Number of gages with crossover: ", str(len(Crossover_Sites)))
print(Crossover_Sites) # Used to make a final shapefile for study area map. Created in ArcGIS

In [None]:
### Check the temperature difference between image and gage measurement ##

In [None]:
# NOTE(review): `stats` is not referenced anywhere in this cell.
from scipy import stats
import seaborn as sns

## Calculate the Error Metrics
# Gage-to-gage differences are treated as the reference ("actual") and the
# image-derived differences as the estimate ("predicted")
ErrorMetrics_Cross = evaluate_all(actual= AllSite_ImageOverlap.Gage_Diff, predicted=AllSite_ImageOverlap.Image_Diff )
print(ErrorMetrics_Cross)

# Get the labels for the plots (metrics rounded to one decimal place)
RMSE_Label =  str(round(ErrorMetrics_Cross.get('rmse'),1))
MAE_Label = str(round(ErrorMetrics_Cross.get('mae'),1))
Bias_Label = str(round(ErrorMetrics_Cross.get('mbe'),1))

### Plot the Results ###

# Get the data density for symbology/color ramp
# A 2-D kernel density estimate is fitted to the (Gage_Diff, Image_Diff) points and then
# evaluated at those same points, giving one density value per observation
# NOTE(review): rows with missing differences are not removed before this step -- confirm
# the inputs contain no NaN.
from scipy.stats import gaussian_kde
xy = np.vstack([AllSite_ImageOverlap["Gage_Diff"], AllSite_ImageOverlap["Image_Diff"]])
z = gaussian_kde(xy)(xy)

# Plot
# Despite its name, `ax` is the grid object returned by sns.jointplot (it exposes
# ax_joint, set_axis_labels and fig below), not a single matplotlib Axes
# NOTE(review): alpha = 0 presumably hides the filled KDE so only the scatter is visible -- confirm.
ax = sns.jointplot(data=AllSite_ImageOverlap, x="Gage_Diff", y="Image_Diff", xlim = (-10,12), ylim = (-10,12), space= 1,
                  kind="kde", fill = True, zorder = 1, alpha = 0)

# Points coloured by their density value, drawn above the KDE layer (zorder 2)
sns.scatterplot(data=AllSite_ImageOverlap, x="Gage_Diff", y="Image_Diff", marker = "o", edgecolor=None, s = 20, hue = z, palette = "cividis",  zorder = 2, legend = False)
# Dashed 1:1 reference line across the plotted range
plt.plot([-10, 12], [-10, 12], zorder = 1,  color = 'grey', linestyle='dashed')

# Label settings
ax.set_axis_labels('Difference between gage measurements (°C)', 'Difference between TIR measurements (°C)', fontsize=12) 
ax.ax_joint.set_xticks([-10, -5, 0, 5, 10])
ax.ax_joint.set_yticks([-10, -5, 0, 5, 10])

# Color Bar
# Built by hand from the density range, using the same colormap name as the scatter palette
norm = plt.Normalize(z.min(), z.max())
sm = plt.cm.ScalarMappable(cmap="cividis", norm=norm)
sm.set_array([])
plt.subplots_adjust(left=0.2, right=0.8, top=0.8, bottom=0.2)  # shrink figure  so cbar is visible
# make new ax object for the cbar
# NOTE(review): newer seaborn versions expose the figure as `.figure`; `.fig` may be deprecated -- confirm.
cbar_ax = ax.fig.add_axes([.85, .25, .05, .4])  # x, y, width, height
plt.colorbar(sm, cax=cbar_ax)

# Add the Labels to the figure
# NOTE(review): these x/y positions are hand-tuned and are interpreted in the data
# coordinates of whichever axes is current at this point (presumably the colorbar axes
# created just above) -- re-check the placement if the data or the layout changes.
plt.text(x= -12.5, y=0.11, s= "RMSE: "+ str(RMSE_Label) + "°C", fontsize=12, color='black')
plt.text(x= -12.5, y=0.10, s= "MAE: "+ str(MAE_Label) + "°C", fontsize=12, color='black')
plt.text(x= -12.5, y=0.09, s= "Bias: "+ str(Bias_Label) + "°C", fontsize=12, color='black')
plt.text(x= -0.3 , y=0.113, s= "Data \nDensity", fontsize=12, color='black')

# Save the figure 
plt.savefig(r"F:\Insert_File_Path_for_Graphic_Output\Accuracy_Assessment.png", bbox_inches="tight", pad_inches=0.25, dpi=1200)  # Update this file path

In [None]:
########### Look At Accuracy Comparisons by Width ###########

In [None]:
## Create a copy of the df
# .copy() gives an independent frame, so the columns added below are not assignments on a
# slice of AllSite_ImageOverlap (a full slice [:] is not a copy)
Width_Accuracy = AllSite_ImageOverlap.copy()

# Get the minimum width between the two locations (row-wise minimum of the two site widths)
Width_Accuracy['Min_Width'] = Width_Accuracy[['Avg_RWC_Wid_x','Avg_RWC_Wid_y']].min(axis=1)

# Define the bin edges -- Min Width
Min_bins = [100, 150, 200, 250, 300, 1500]

# bin the Min Width column
# Intervals are open on the left and closed on the right, so a minimum width of exactly 100
# or above 1500 gets no bin (NaN) and is left out of the per-bin metrics
Width_Accuracy['Min_Width_bin'] = pd.cut(Width_Accuracy['Min_Width'], Min_bins)

# Export the widths -- will need for comparison graphic 2C
Width_Accuracy.to_csv(r"F:\Insert_File_Path_For\Widths_AA.csv")


In [None]:
## Get Accuracy by the Width Bins
# One single-row frame per width bin, concatenated once after the loop
metric_rows = []

## Look at accuracy by minimum-width bin -- only bins that contain observations
groups = Width_Accuracy.groupby('Min_Width_bin', observed= True)

for name, group in groups:
    # Rows with a value in every column
    CompleteRows = group.dropna()
    if CompleteRows.empty:
        print("Skipped width bin " + str(name) + ": no complete rows")
        continue
    try:
        # Linear fit of image difference on gage difference, used for R-squared only
        model = LinearRegression()
        X, y = CompleteRows[['Gage_Diff']], CompleteRows.Image_Diff
        model.fit(X, y)
        r_squared = model.score(X, y)

        # Error metrics, gage difference as reference
        ErrorMetrics = evaluate_all(actual=CompleteRows.Gage_Diff, predicted=CompleteRows.Image_Diff)
        # Convert Dictionary to a one-row DF
        Error_Metric_df = pd.DataFrame(ErrorMetrics, index = [0])
        # Attach the width bin
        Error_Metric_df['Min_Width_bin'] = name

        # Get Number of Points
        Error_Metric_df['No_Points'] = len(CompleteRows)

        # Keep the R-squared that was computed above; added as the last column so the
        # existing columns keep their positions
        Error_Metric_df['r_squared'] = r_squared

        metric_rows.append(Error_Metric_df)
    except Exception as err:
        # Best effort: continue with the next bin, but report the failure instead of hiding it
        print("Skipped width bin " + str(name) + ": " + repr(err))

ErrorMetric_wid = pd.concat(metric_rows, axis = 0) if metric_rows else pd.DataFrame()

ErrorMetric_wid

In [None]:
# Save the Width AA table
# Writes the per-width-bin accuracy metrics (ErrorMetric_wid) to disk
ErrorMetric_wid.to_csv(r"F:\Insert_File_Path_For\AA_by_widths.csv")