# Get IFCB Data

This is from a notebook written by Gulce Kurtay during OceanHackWeek 2024. [notebook](https://github.com/oceanhackweek/ohw24_proj_pace_us/blob/main/final_notebooks/Hypercoast(PACE)_with_IFCB.ipynb)

Woods Hole Dashboard, https://ifcb-data.whoi.edu/dashboard <br>

This notebook designed to get in-situ flow cytometry group distribution data.<br>
Classified IFCB data is limited, so make sure to search the database then find a group distribution that matches with your interest. 



STEP 1  download the phytoplankton group distribution and metadata from the IFCB dashboard, it only downloads the autoclass files not the images<br>
STEP 2.1 organize the csv files in group distribution format aligns with lat and long info<br>
STEP 2.2 summarize the information into daily format to match up with PACE data <br>
STEP 3   Download the PACE data with hypercoast<br>
STEP 4   interactive map with spatial chl-a and insitu group distribution


In [1]:
#STEP 1.1 
#Functions that needed for the downloading the csv files
#STEP-1 Download the csv files and lat and long
import os
import requests
import csv
from concurrent.futures import ThreadPoolExecutor

def collect_bin_ids(start_bin_id, end_bin_id, base_url, dataset, instrument, prefix):
    """
    Collects all bin IDs starting with a given prefix and stops at the end bin ID.

    :param start_bin_id: The starting bin ID to begin the search.
    :param end_bin_id: The bin ID at which to stop collecting.
    :param base_url: The base URL of the API.
    :param dataset: The dataset name to filter.
    :param instrument: The instrument name to filter.
    :param prefix: The prefix to match bin IDs against (e.g., "D2024").
    :return: A list of matching bin IDs.
    """
    bin_ids = []
    current_bin_id = start_bin_id
    
    while True:
        url = f"{base_url}/api/bin/{current_bin_id}"
        params = {
            "dataset": dataset,
            "instrument": instrument,
        }
        response = requests.get(url, params=params, timeout=10)
        if response.status_code != 200:
            print(f"Failed to retrieve data for bin: {current_bin_id}, Status Code: {response.status_code}")
            break
        
        data = response.json()
        if current_bin_id.startswith(prefix):
            bin_ids.append(current_bin_id)
            print(f"Collected bin ID: {current_bin_id}")
        
        # Check if we've reached the end bin ID
        if current_bin_id == end_bin_id:
            print(f"Reached the end bin ID: {end_bin_id}")
            break
        
        next_bin_id = data.get('next_bin_id')
        if not next_bin_id or not next_bin_id.startswith(prefix):
            break
        
        current_bin_id = next_bin_id
    
    print(f"Total bin IDs collected: {len(bin_ids)}")
    return bin_ids

def download_file(file_url, output_dir):
    """
    Downloads a specific file from the given URL.

    :param file_url: The full URL to the file.
    :param output_dir: The directory where the file will be saved.
    """
    file_name = os.path.basename(file_url)
    output_file = os.path.join(output_dir, file_name)
    os.makedirs(output_dir, exist_ok=True)

    try:
        response = requests.get(file_url, timeout=10)
        response.raise_for_status()
        with open(output_file, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {output_file}")
        return True

    except requests.exceptions.HTTPError as err:
        if response.status_code == 404:
            print(f"File not found (404): {file_url}")
        else:
            print(f"HTTP error occurred: {err} - URL: {file_url}")
        return False
    except requests.exceptions.ConnectionError as err:
        print(f"Connection error occurred: {err}")
        return False
    except requests.exceptions.Timeout as err:
        print(f"Timeout error occurred: {err}")
        return False
    except requests.exceptions.RequestException as err:
        print(f"An error occurred: {err}")
        return False

def fetch_lat_lon(bin_id, base_url, dataset):
    """
    Fetches latitude and longitude for a specific bin ID.

    :param bin_id: The bin ID to fetch lat/lon for.
    :param base_url: The base URL where the dataset is located.
    :param dataset: The dataset name.
    :return: A dictionary with latitude and longitude.
    """
    url = f"{base_url}/api/bin/{bin_id}"
    params = {
        "dataset": dataset,
    }
    response = requests.get(url, params=params, timeout=10)
    if response.status_code == 200:
        data = response.json()
        return {
            "bin_id": bin_id,
            "latitude": data.get('lat'),
            "longitude": data.get('lng')
        }
    else:
        print(f"Failed to fetch lat/lon for bin: {bin_id}, Status Code: {response.status_code}")
        return None

def download_autoclass_csvs_and_lat_lon(start_bin_id, end_bin_id, base_url, dataset, instrument, prefix, output_dir, max_workers=5):
    """
    Collects bin IDs, downloads all _class_scores.csv files, and fetches latitude and longitude for each bin.

    :param start_bin_id: The starting bin ID to begin the search.
    :param end_bin_id: The bin ID at which to stop collecting.
    :param base_url: The base URL where the dataset is located.
    :param dataset: The dataset name to filter.
    :param instrument: The instrument name to filter.
    :param prefix: The prefix to match bin IDs against (e.g., "D2024").
    :param output_dir: The directory where the files will be saved.
    :param max_workers: The maximum number of parallel downloads.
    """
    bin_ids = collect_bin_ids(start_bin_id, end_bin_id, base_url, dataset, instrument, prefix)
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        lat_lon_results = []
        
        for bin_id in bin_ids:
            # Download the _class_scores.csv file
            csv_file_url = f"{base_url}/{dataset}/{bin_id}_class_scores.csv"
            futures.append(executor.submit(download_file, csv_file_url, output_dir))
            
            # Fetch latitude and longitude
            lat_lon_results.append(fetch_lat_lon(bin_id, base_url, dataset))
        
        # Wait for all downloads to complete
        for future in futures:
            future.result()
        
        # Extract date from start_bin_id for the CSV filename
        date_str = start_bin_id.split('T')[0][1:]  # Extracts "DYYYYMMDD" and removes the "D"
        lat_lon_file = os.path.join(output_dir, f"{date_str}.csv")
        
        # Write lat/lon to CSV
        if lat_lon_results:
            with open(lat_lon_file, 'w', newline='') as csvfile:
                fieldnames = ['bin_id', 'latitude', 'longitude']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                
                writer.writeheader()
                for result in lat_lon_results:
                    if result:
                        writer.writerow(result)
            print(f"Latitude and Longitude data saved to {lat_lon_file}")




In [2]:
#STEP 1.2 application
#To get the community composition information from the dashboard, you will need to know specific information about the dataset:
#I have added stat and end bin ids because it will take too much time, so you can limit it 
# Example usage
base_url = "https://ifcb-data.whoi.edu"
dataset = "mvco" #Dataset name: Name should be as it spelled in the url 
instrument = "IFCB10" #Instrument number
# Roll over the timeline and then you can copy the bin number
start_bin_id = "D20240215T150055_IFCB010"  # Start with a known valid bin ID 
end_bin_id = "D20240215T191619_IFCB010"  # End at this bin ID
#end_bin_id = "D20241227T181716_IFCB010"  # End at this bin ID
prefix = "D2024"  # Prefix to match bin IDs
output_dir = r"../../data/mvco"  # Directory to save the files

# Download all _class_scores.csv files and fetch lat/lon data, stopping at end_bin_id
download_autoclass_csvs_and_lat_lon(start_bin_id, end_bin_id, base_url, dataset, instrument, prefix, output_dir)

Collected bin ID: D20240215T150055_IFCB010
Collected bin ID: D20240215T152410_IFCB010
Collected bin ID: D20240215T154723_IFCB010
Collected bin ID: D20240215T161037_IFCB010
Collected bin ID: D20240215T163352_IFCB010
Collected bin ID: D20240215T165704_IFCB010
Collected bin ID: D20240215T172017_IFCB010
Collected bin ID: D20240215T174329_IFCB010
Collected bin ID: D20240215T180642_IFCB010
Collected bin ID: D20240215T182954_IFCB010
Collected bin ID: D20240215T185306_IFCB010
Collected bin ID: D20240215T191619_IFCB010
Reached the end bin ID: D20240215T191619_IFCB010
Total bin IDs collected: 12
Downloaded: ../../data/mvco/D20240215T152410_IFCB010_class_scores.csv
Downloaded: ../../data/mvco/D20240215T154723_IFCB010_class_scores.csv
Downloaded: ../../data/mvco/D20240215T161037_IFCB010_class_scores.csv
Downloaded: ../../data/mvco/D20240215T163352_IFCB010_class_scores.csv
Downloaded: ../../data/mvco/D20240215T165704_IFCB010_class_scores.csv
Downloaded: ../../data/mvco/D20240215T150055_IFCB010_clas

In [5]:
#STEP 2.1, SPATIAL GROUPING FOR ONE DAY
#Choose the group with highest score for eVery images
import os
import pandas as pd

# Define the directory containing the CSV files
if_files_root = r"../../data/mvco" # this is the folder pathway to your local computer 

# List to store summary data for each file
all_summaries = []

# Iterate over all files in the directory
for file_name in os.listdir(if_files_root):
    if file_name.startswith('D2024') and file_name.endswith('.csv'):
        file_path = os.path.join(if_files_root, file_name)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Keep the 'pid' column and process the numeric columns for max values
        pid_column = df['pid']
        numeric_df = df.iloc[:, 1:].fillna(-float('inf'))  # Process the numeric columns, excluding 'pid'
        
        # Find the max value and corresponding column
        df['max_value_column'] = numeric_df.idxmax(axis=1)
        df['max_value'] = numeric_df.max(axis=1)
        
        # Create a summary DataFrame, including 'pid', 'max_value_column', and 'max_value'
        summary_df = pd.DataFrame({
            'pid': pid_column,
            'max_value_column': df['max_value_column'],
            'max_value': df['max_value']
        })
        
        # Extract the date from the file name and remove the leading "D"
        summary_df['date'] = file_name.split('T')[0][1:]
        
        # Append the summary to the list
        all_summaries.append(summary_df)

# Concatenate all summary DataFrames into one
final_summary = pd.concat(all_summaries, ignore_index=True)

# Print the final summary (or save it to a file if needed)
print(final_summary)

# Optionally, save the final summary to a CSV file
final_summary.to_csv('../../data/final_summary.csv', index=False)


                                  pid   max_value_column  max_value      date
0      D20240215T152410_IFCB010_00001              fiber     1.0000  20240215
1      D20240215T152410_IFCB010_00002              fiber     1.0000  20240215
2      D20240215T152410_IFCB010_00004   nanoplankton_mix     0.9920  20240215
3      D20240215T152410_IFCB010_00005  Bacillariophyceae     0.2783  20240215
4      D20240215T152410_IFCB010_00006              fiber     0.9850  20240215
...                               ...                ...        ...       ...
71019  D20240215T191619_IFCB010_06417        Cryptophyta     0.8765  20240215
71020  D20240215T191619_IFCB010_06418   nanoplankton_mix     0.9960  20240215
71021  D20240215T191619_IFCB010_06419              fiber     0.9940  20240215
71022  D20240215T191619_IFCB010_06421              fiber     0.9604  20240215
71023  D20240215T191619_IFCB010_06422   nanoplankton_mix     0.9985  20240215

[71024 rows x 4 columns]


In [4]:
# Now load the data
difcb=pd.read_csv("../../data/mvco_2024.csv")
# Convert the cleaned date strings to datetime objects
difcb['date'] = pd.to_datetime(difcb['date'], format='%Y%m%d')
difcb.head()


FileNotFoundError: [Errno 2] No such file or directory: '../../data/mvco_2024.csv'

# Now go to Hypercoast notebook

To see how they got some PACE with hypercoast package--if you want.