In [13]:
import pandas as pd
from datetime import datetime
import json
import requests
import os

def get_stations_from_networks(timeout=30):
    """
    Build a station list by using IEM networks.

    One GeoJSON document is requested per state ASOS network
    ("<STATE>_ASOS") and the "sid" property of every feature is collected.

    Parameters:
    - timeout: Seconds to wait on each HTTP request. Without a timeout a
      single stalled connection would block the whole run indefinitely.

    Returns:
    - A list of station IDs. Networks that cannot be downloaded, or whose
      payload does not have the expected layout, are reported on stdout
      and skipped.
    """
    states = (
        "AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN "
        "MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT "
        "WA WI WV WY"
    )

    stations = []
    for state in states.split():
        network = f"{state}_ASOS"
        uri = f"https://mesonet.agron.iastate.edu/geojson/network/{network}.geojson"
        try:
            response = requests.get(uri, timeout=timeout)
            response.raise_for_status()  # Turn HTTP 4xx/5xx into an exception
            features = response.json()["features"]
            network_stations = [site["properties"]["sid"] for site in features]
        except requests.RequestException as e:
            print(f"Failed to retrieve data for network {network}: {e}")
        except (ValueError, KeyError, TypeError) as e:
            # Body was not JSON, or not the expected GeoJSON layout; skip this
            # network instead of aborting the remaining ones.
            print(f"Failed to retrieve data for network {network}: {e!r}")
        else:
            stations.extend(network_stations)

    return stations

def download_station_data(station_list, output_dir, base_url=None, timeout=60):
    """
    Download data for each station and save it as "<station>_data.csv".

    Parameters:
    - station_list: Iterable of station IDs. Falsy entries (None or an empty
      string) are skipped.
    - output_dir: Directory the CSV files are written to; created if missing.
    - base_url: Request URL containing the literal placeholder "STATION_ID",
      which is replaced by each station ID. Defaults to the IEM ASOS request
      used by this script.
    - timeout: Seconds to wait on each HTTP request, so one stalled
      connection cannot hang the whole batch.

    Failed downloads are reported on stdout and do not stop the loop.
    """
    # Set a default base URL if none is provided
    if base_url is None:
        base_url = (
            "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
            "station=STATION_ID&data=feel&year1=1993&month1=1&day1=1&year2=2023&month2=1&day2=1&"
            "tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3"
        )

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for station in station_list:
        # None comes from get_cleaned_station_id for rejected IDs; an empty
        # string would produce a meaningless request and a "_data.csv" file.
        if not station:
            continue

        url = base_url.replace("STATION_ID", station)
        output_file = os.path.join(output_dir, f"{station}_data.csv")

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Turn HTTP 4xx/5xx into an exception
        except requests.RequestException as e:
            print(f"Failed to download data for station: {station}. Error: {e}")
            continue

        # Written as raw bytes so the payload is stored exactly as received.
        with open(output_file, "wb") as file:
            file.write(response.content)

        print(f"Downloaded data for station: {station}")

def get_cleaned_station_id(station_id):
    """
    Apply cleaning to a station ID and return the cleaned version.

    Parameters:
    - station_id: Raw station identifier string.

    Returns:
    - None when the ID is missing/empty or starts with a digit.
    - The ID unchanged when it is exactly 4 characters long.
    - The ID prefixed with "K" otherwise. Note that this applies to every
      other length, i.e. to IDs longer than 4 characters as well as to
      shorter ones.
    """
    # Guard first: indexing [0] on an empty string would raise IndexError.
    if not station_id:
        return None
    if station_id[0].isdigit():  # Exclude IDs that start with a number
        return None
    if len(station_id) == 4:  # Already a 4-character identifier
        return station_id
    return f"K{station_id}"

def get_file_paths(directory):
    """
    Collect the paths of all CSV files below a directory.

    The directory tree is walked recursively and a file is kept when its
    name ends with ".csv" (case-insensitive). Progress is printed for every
    directory and file visited.

    Parameters:
    - directory: The path to the directory to search in.

    Returns:
    - A list of paths (directory joined with file name) of the CSV files
      found, in the order os.walk yields them.
    """
    print(f"Searching in directory: {directory}")
    csv_paths = []
    for current_dir, _subdirs, filenames in os.walk(directory):
        print(f"Current directory: {current_dir}")
        for filename in filenames:
            print(f"Found file: {filename}")
            if not filename.lower().endswith(".csv"):
                continue
            csv_paths.append(os.path.join(current_dir, filename))
    print(f"Total CSV files found: {len(csv_paths)}")
    return csv_paths

# Function to handle ICAO call sign extraction and data loading
def handle_station_data(file_path):
    """
    Read a station CSV file and derive an ICAO call sign from it.

    Parameters:
    - file_path: Path of the CSV file to read.

    Returns:
    - A tuple (icao_code, dataframe). The code is "K" followed by the value
      in the first row of the first column. When the file has no data rows
      (zero bytes, or a header only) the placeholder "KUNKNOWN" and an empty
      DataFrame are returned instead.
    """
    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse, so read_csv raises rather
        # than returning an empty frame; treat it like any other empty file.
        df = pd.DataFrame()

    if df.empty:
        print(f"Warning: The file {file_path} is empty. Returning a default ICAO code.")
        return "KUNKNOWN", pd.DataFrame()  # Placeholder code, no data

    # NOTE(review): "K" is prepended unconditionally here, whereas
    # get_cleaned_station_id leaves 4-character IDs untouched - confirm
    # which behavior is intended.
    icao_code = f"K{df.iloc[0, 0]}"
    return icao_code, df

def load_and_preprocess_data(file_path):
    """
    Load the "valid" and "feel" columns from a CSV file and fix their types.

    Parameters:
    - file_path: Path of the CSV file to read.

    Returns:
    - A DataFrame with "valid" parsed as datetimes and "feel" as numbers.
      Values that cannot be parsed become NaT / NaN.
    """
    # Select by name rather than position: the columns are accessed by name
    # below, so a file with extra columns in between must still load.
    data = pd.read_csv(file_path, usecols=["valid", "feel"])
    data["valid"] = pd.to_datetime(data["valid"], errors="coerce")
    data["feel"] = pd.to_numeric(data["feel"], errors="coerce")
    return data

def convert_to_timezone(data, time_zone):
    """
    Shift the "valid" column from UTC to local wall-clock time.

    The naive timestamps are interpreted as UTC, converted to time_zone and
    then made naive again. The column is replaced on the DataFrame that was
    passed in, and that same DataFrame is returned.
    """
    as_utc = data["valid"].dt.tz_localize("UTC")
    as_local = as_utc.dt.tz_convert(time_zone)
    data["valid"] = as_local.dt.tz_localize(None)
    return data

def filter_daylight_hours(data, start_hour=6, end_hour=18):
    """
    Keep only rows whose "valid" hour lies in [start_hour, end_hour).

    Parameters:
    - data: DataFrame with a datetime "valid" column.
    - start_hour: First hour of day to keep (inclusive). Defaults to 6 (6 am).
    - end_hour: Hour of day at which to stop (exclusive). Defaults to 18 (6 pm).

    Returns:
    - The rows of data that fall inside the window.
    """
    hour_of_day = data["valid"].dt.hour
    return data[(hour_of_day >= start_hour) & (hour_of_day < end_hour)]

def filter_ideal_temps(data, min_temp=50, max_temp=63.5):
    """
    Keep only rows whose "feel" value lies in [min_temp, max_temp].

    Parameters:
    - data: DataFrame with a numeric "feel" column.
    - min_temp: Lowest value to keep (inclusive). Defaults to 50.
    - max_temp: Highest value to keep (inclusive). Defaults to 63.5.

    Returns:
    - The rows of data inside the range. Rows with a missing "feel" value
      never match and are dropped.
    """
    feels_like = data["feel"]
    return data[(feels_like >= min_temp) & (feels_like <= max_temp)]

def get_complete_years(data):
    """
    List the years whose data reaches from January through December.

    A year qualifies when the earliest month present in its "valid"
    timestamps is 1 and the latest is 12; months in between are not checked.
    Years are returned in the order they first appear in the data.
    """
    years = data["valid"].dt.year
    months = data["valid"].dt.month

    spanning_years = []
    for yr in years.unique():
        months_seen = months[years == yr]
        starts_in_january = months_seen.min() == 1
        ends_in_december = months_seen.max() == 12
        if starts_in_january and ends_in_december:
            spanning_years.append(yr)
    return spanning_years

def filter_complete_years(data, complete_years):
    """Return the rows of data whose "valid" year is listed in complete_years."""
    in_complete_year = data["valid"].dt.year.isin(complete_years)
    return data.loc[in_complete_year]

def calculate_average_ideal_days_per_year(filtered_data, complete_years):
    """
    Average the number of distinct calendar days over the complete years.

    Counts the unique dates in the "valid" column of filtered_data and
    divides by the number of entries in complete_years. Returns 0 when
    complete_years is empty.
    """
    distinct_days = filtered_data["valid"].dt.date.nunique()
    year_count = len(complete_years)
    if year_count > 0:
        return distinct_days / year_count
    return 0

# Main function to execute the full pipeline
def average_annual_ideal_run_temp_days(file_path, time_zone):
    """
    Calculate the average annual ideal run temperature days for the dataset.

    Parameters:
    - file_path: Path of the station CSV file.
    - time_zone: Time zone name the UTC timestamps are converted to before
      the daylight-hours filter is applied.

    Returns:
    - The number of distinct days with at least one daylight observation in
      the ideal temperature range, averaged over the complete years; 0 when
      the file holds no data or no year is complete.
    """
    _icao_code, raw = handle_station_data(file_path)
    if raw.empty:
        return 0  # Nothing to process

    data = load_and_preprocess_data(file_path)
    data = convert_to_timezone(data, time_zone)

    # Judge year completeness on every usable observation, BEFORE the
    # temperature filter. Judging it afterwards only accepts years in which
    # ideal temperatures occurred in both January and December, which drops
    # essentially every year for stations with cold winters.
    observed = data.dropna(subset=["valid", "feel"])
    complete_years = get_complete_years(observed)

    ideal = filter_ideal_temps(filter_daylight_hours(data))
    ideal_in_complete_years = filter_complete_years(ideal, complete_years)
    return calculate_average_ideal_days_per_year(ideal_in_complete_years, complete_years)

def write_all_results_to_csv(dataset, output_path=None, data_dir=None):
    """
    Compute the average annual ideal run temp days per station and write them to a CSV file.

    Parameters:
    - dataset: Mapping of station ID -> time zone name.
    - output_path: Destination CSV path. Defaults to "results_<timestamp>.csv"
      in the current directory.
    - data_dir: Directory holding the downloaded "<station>_data.csv" files
      (the naming used by download_station_data). When None, the mapping key
      itself is passed on as the file path, which is the historical behavior.
      NOTE(review): with None the key serves as both station ID and file
      path, which cannot both be right - callers should pass data_dir.

    Stations whose ID is rejected by get_cleaned_station_id, or whose data
    file is missing, are skipped. The output always carries the
    "ICAO,aairtd" header, even when no station produced a result.
    """
    # Set default output path with date and time if none is provided
    if output_path is None:
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        output_path = f"results_{timestamp}.csv"

    results = []

    for station_id, time_zone in dataset.items():
        cleaned_station = get_cleaned_station_id(station_id)
        if not cleaned_station:  # Rejected ID
            continue

        if data_dir is None:
            file_path = station_id
        else:
            file_path = os.path.join(data_dir, f"{station_id}_data.csv")

        try:
            aairtd = average_annual_ideal_run_temp_days(file_path, time_zone)
        except FileNotFoundError as e:
            # A station whose download failed has no file; report it and keep
            # going so the results already computed are not lost.
            print(f"Skipping station {station_id}: {e}")
            continue

        results.append({"ICAO": cleaned_station, "aairtd": aairtd})

    # Explicit columns keep the header in place when results is empty.
    results_df = pd.DataFrame(results, columns=["ICAO", "aairtd"])
    results_df.to_csv(output_path, index=False)
    print(f"Results written to {output_path}")

# Show which CSV files are present under "station_data".
print(get_file_paths(directory="station_data"))

Searching in directory: station_data
Total CSV files found: 0
[]
