# Kachemak Bay Turbidity - Exploratory Data Analysis 
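This notebook: <br>
1. Downloads the most recent turbidity data from the AOOS ERDDAP server for the Seldovia and Homer surface-water and deep-water sensors.
2. Performs exploratory data analysis on the downloaded files (data resolution, data quality, statistical metrics, and time-series plots).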

In [1]:
import requests
import pandas as pd
import os
import matplotlib.pyplot as plt

### Download the most recent data

In [2]:
# Map each local CSV file name to the ERDDAP tabledap query that returns the
# time, turbidity and aggregate turbidity QC flag columns for that sensor.
data_dict = {'seldovia_swq.csv' : 'https://erddap.aoos.org/erddap/tabledap/nerrs_kacsswq.csv?time%2Csea_water_turbidity%2Csea_water_turbidity_qc_agg',
            'seldovia_dwq.csv' : 'https://erddap.aoos.org/erddap/tabledap/nerrs_kacsdwq.csv?time%2Csea_water_turbidity%2Csea_water_turbidity_qc_agg',
            'homer_dwq.csv' : 'https://erddap.aoos.org/erddap/tabledap/nerrs_kachdwq.csv?time%2Csea_water_turbidity%2Csea_water_turbidity_qc_agg',
            'homer_swq.csv' : 'https://erddap.aoos.org/erddap/tabledap/nerrs_kach3wq.csv?time%2Csea_water_turbidity%2Csea_water_turbidity_qc_agg',
            }

# Create the output directory up front so this cell also works on a fresh
# checkout where data/ does not exist yet.
os.makedirs('data', exist_ok=True)

print()
# Download every sensor file; a failure for one sensor is reported and the
# loop moves on to the next one.
for i, (csv_file_name, url) in enumerate(data_dict.items(), start=1):
    try:
        # Without a timeout requests waits indefinitely on an unresponsive server.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        print(f"Failed to download CSV file for sensor {i}. Error: {exc}\n")
        continue

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Write the raw CSV bytes to a file in the data directory
        with open(os.path.join('data', csv_file_name), 'wb') as csv_file:
            csv_file.write(response.content)

        print(f"CSV file for sensor {i} saved successfully as '{csv_file_name}' in the /data directory.\n")
    else:
        print(f"Failed to download CSV file for sensor {i}. Status code: {response.status_code}\n")


CSV file for sensor 1 saved successfully as 'seldovia_swq.csv' in the /data directory.

CSV file for sensor 2 saved successfully as 'seldovia_dwq.csv' in the /data directory.

CSV file for sensor 3 saved successfully as 'homer_dwq.csv' in the /data directory.

CSV file for sensor 4 saved successfully as 'homer_swq.csv' in the /data directory.



### Load in CSV Files

In [None]:
# List all csv files in the data directory.
# os.listdir() returns names in arbitrary order, so the list is sorted to keep
# the per-file reports in the same order on every run.
csv_files = sorted(f for f in os.listdir('data') if f.endswith('.csv'))

### Data Resolution

In [None]:
def data_resolution(files=None, data_dir='data'):
    """Print the time span and average sampling interval of each sensor CSV.

    Args:
        files: Iterable of CSV file names to inspect. When omitted, the
            notebook-level ``csv_files`` list is used.
        data_dir: Directory that contains the CSV files.

    For every file the first and last measurement timestamps and the mean gap
    between consecutive measurements are printed. The average gap is reported
    in total hours and minutes, so gaps of a day or more are not truncated.
    Files without any measurement are reported instead of raising.
    """
    if files is None:
        files = csv_files

    for csv in files:
        # The line right after the header holds units, not data, so it is
        # skipped while reading instead of being dropped afterwards.
        df = pd.read_csv(os.path.join(data_dir, csv), skiprows=[1], low_memory=False)

        # Parse the timestamps and order them chronologically so consecutive
        # differences are real sampling gaps.
        times = pd.to_datetime(df['time']).dropna().sort_values()

        if times.empty:
            print(f"No measurements found in {csv}.")
            print("\n")
            continue

        # Get the first and last measurement dates
        first_measurement_date = times.iloc[0]
        last_measurement_date = times.iloc[-1]

        # Display the results
        print(f"First Measurement Date for {csv}:", first_measurement_date.strftime('%Y-%m-%d %H:%M:%S'))
        print(f"Last Measurement Date for {csv}:", last_measurement_date.strftime('%Y-%m-%d %H:%M:%S'))

        if len(times) > 1:
            avg_interval = times.diff().mean()
            # Timedelta.components.hours only holds the 0-23 remainder, so the
            # hours are derived from the total duration to keep whole days.
            total_minutes = int(avg_interval.total_seconds() // 60)
            hours, minutes = divmod(total_minutes, 60)
            print(f"Average Measurement Interval for {csv}: {hours} hours, {minutes} minutes")
        else:
            # A single timestamp has no interval to average.
            print(f"Average Measurement Interval for {csv}: not available (single measurement)")
        print("\n")


# Print the first/last measurement dates and the average measurement interval
# for every file listed in csv_files
data_resolution()

### Data Quality

In [None]:
def data_quality(files=None, data_dir='data'):
    """Print the share of each turbidity QC flag for every sensor CSV.

    Args:
        files: Iterable of CSV file names to inspect. When omitted, the
            notebook-level ``csv_files`` list is used.
        data_dir: Directory that contains the CSV files.

    For every file the total row count, the percentage of rows carrying each
    QC flag and the PASS/FAIL ratio are printed. The ratio is reported as
    ``inf`` when a file has no FAIL rows. Files without rows are reported
    instead of raising a division-by-zero error.
    """
    # From https://erddap.aoos.org/erddap/tabledap/nerrs_kacsswq.html
    qc_flag_meanings =  {
                        'PASS'          : 1,
                        'NOT_EVALUATED' : 2,
                        'SUSPECT'       : 3,
                        'FAIL'          : 4,
                        'MISSING'       : 9
                        }

    if files is None:
        files = csv_files

    for csv in files:
        # The line right after the header holds units, not data, so it is
        # skipped while reading instead of being dropped afterwards.
        df = pd.read_csv(os.path.join(data_dir, csv), skiprows=[1], low_memory=False)

        # Count how often each integer flag code occurs
        counts = df['sea_water_turbidity_qc_agg'].astype(int).value_counts()

        # Get the total number of rows
        total_rows = len(df)

        print("------------------------------------------------------------")
        print(f"Quality of {csv}:")
        print(f"Total entries: {total_rows}")

        if total_rows == 0:
            # Percentages are undefined without rows
            print("No entries to summarize.")
            continue

        # counts.get(code, 0) falls back to 0 for flags that never occur
        for flag_name, flag_code in qc_flag_meanings.items():
            share = counts.get(flag_code, 0) / total_rows * 100
            print(f"Percentage of {flag_name} entries: {share:.1f}%")

        fail_count = counts.get(qc_flag_meanings['FAIL'], 0)
        if fail_count > 0:
            pass_fail_rate = counts.get(qc_flag_meanings['PASS'], 0) / fail_count
        else:
            pass_fail_rate = float('inf')  # To handle the case where there are no fail entries
        print(f"Pass/Fail rate: {pass_fail_rate:.1f}")


# Print the QC-flag percentages and the pass/fail rate for every file listed
# in csv_files
data_quality()

### Statistical Metrics

In [None]:
def statistical_metrics(files=None, data_dir='data'):
    """Print summary statistics of the PASS-flagged turbidity in each CSV.

    Args:
        files: Iterable of CSV file names to inspect. When omitted, the
            notebook-level ``csv_files`` list is used.
        data_dir: Directory that contains the CSV files.

    Only rows whose ``sea_water_turbidity_qc_agg`` flag equals 1 (PASS) are
    used. For every file the minimum, maximum, mean, median, mode, quartiles,
    range, variance and standard deviation are printed. Files without any
    usable PASS value are reported instead of raising.
    """
    if files is None:
        files = csv_files

    for csv in files:
        # The line right after the header holds units, not data, so it is
        # skipped while reading instead of being dropped afterwards.
        df = pd.read_csv(os.path.join(data_dir, csv), skiprows=[1], low_memory=False)

        # Rows with a missing or non-numeric flag become NaN and therefore
        # never compare equal to 1, so they are excluded rather than crashing.
        flags = pd.to_numeric(df['sea_water_turbidity_qc_agg'], errors='coerce')

        # Keep the PASS rows only, as floats, without missing values
        values = df.loc[flags == 1, 'sea_water_turbidity'].astype(float).dropna()

        print(f"Statistical metrics for {csv}:")

        if values.empty:
            # mode()[0] and the formatted prints below need at least one value
            print(f"No PASS-flagged turbidity measurements in {csv}.")
            print("\n")
            continue

        # Calculate statistics
        min_value = values.min()
        max_value = values.max()
        mean = values.mean()
        median = values.median()
        mode = values.mode()[0]  # mode() returns a Series; the first entry is reported
        percentiles = values.quantile([0.25, 0.5, 0.75])
        range_ = max_value - min_value
        variance = values.var()
        std_dev = values.std()

        # Print the results
        print(f"Lowest Value: {min_value:.2f}")
        print(f"Highest Value: {max_value:.2f}")
        print(f"Mean: {mean:.2f}")
        print(f"Median: {median:.2f}")
        print(f"Mode: {mode:.2f}")
        print(f"25th Percentile: {percentiles[0.25]:.2f}")
        print(f"50th Percentile (Median): {percentiles[0.5]:.2f}")
        print(f"75th Percentile: {percentiles[0.75]:.2f}")
        print(f"Range: {range_:.2f}")
        print(f"Variance: {variance:.2f}")
        print(f"Standard Deviation: {std_dev:.2f}")
        print("\n")


# Print summary statistics of the PASS-flagged turbidity values for every file
# listed in csv_files
statistical_metrics()

### Graphs

In [None]:
def plot_turbidity_vs_time(files=None, data_dir='data'):
    """Plot PASS-flagged sea water turbidity against time, one figure per CSV.

    Args:
        files: Iterable of CSV file names to plot. When omitted, the
            notebook-level ``csv_files`` list is used.
        data_dir: Directory that contains the CSV files.

    Only rows whose ``sea_water_turbidity_qc_agg`` flag equals 1 (PASS) are
    drawn. Each figure is shown and then closed so figures do not accumulate
    while looping over the files.
    """
    if files is None:
        files = csv_files

    for csv in files:
        # The line right after the header holds units, not data, so it is
        # skipped while reading instead of being dropped afterwards.
        df = pd.read_csv(os.path.join(data_dir, csv), skiprows=[1], low_memory=False)

        # Rows with a missing or non-numeric flag become NaN and therefore
        # never compare equal to 1, so they are excluded rather than crashing.
        flags = pd.to_numeric(df['sea_water_turbidity_qc_agg'], errors='coerce')
        passed = df.loc[flags == 1, ['time', 'sea_water_turbidity']]

        # Build new Series instead of assigning into the filtered slice, which
        # would trigger SettingWithCopyWarning and may leave 'time' as object dtype.
        times = pd.to_datetime(passed['time'])
        turbidity = passed['sea_water_turbidity'].astype(float)

        # Plot turbidity over time
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(times, turbidity, linestyle='-', color='b')
        ax.set_xlabel('Time')
        ax.set_ylabel('Sea Water Turbidity (NTU)')
        ax.set_title(f'Sea Water Turbidity Over Time for {csv}')
        ax.grid(True)
        ax.tick_params(axis='x', labelrotation=45)  # Rotate x-axis labels for better readability
        fig.tight_layout()  # Adjust layout to fit labels
        plt.show()
        plt.close(fig)


# Draw one turbidity-over-time figure per file listed in csv_files, using only
# the PASS-flagged rows
plot_turbidity_vs_time()