In [1]:
import pandas as pd
import numpy as np

In [2]:
def standardize_shark_data(input_filename='shark_data.csv', output_filename='standardized_shark_data_daily.csv'):
    """
    Collapse raw shark tracking pings into one averaged position per shark per day.

    The raw CSV is read with its second line as the header row (``header=1``)
    and malformed lines are skipped. Column names are stripped and lower-cased,
    coordinates are parsed as numbers (a decimal comma is accepted), and rows
    lacking a valid latitude, longitude, date or deployid are discarded. Each
    shark's remaining pings are resampled to calendar days: latitude and
    longitude are averaged, while area_tagged, sex and maturity take the
    'first' aggregate for that day. Days without pings are dropped and the
    result is written to ``output_filename``.

    Args:
        input_filename (str): The name of the raw CSV file.
        output_filename (str): The name for the new, daily-averaged CSV file.

    Returns:
        None. Progress is reported with print(). If the input file is missing,
        or no row survives cleaning, a message is printed and no output file
        is written.

    Raises:
        KeyError: If a required column is absent from the input file.
    """
    coordinate_columns = ['latitude', 'longitude']

    # Mean for numeric columns, first entry for categorical ones. Defined once
    # because the rules are identical for every shark.
    aggregation_rules = {
        'latitude': 'mean',
        'longitude': 'mean',
        'area_tagged': 'first',
        'sex': 'first',
        'maturity': 'first'
    }
    expected_columns = ['deployid', 'date'] + list(aggregation_rules)

    try:
        df = pd.read_csv(input_filename, header=1, on_bad_lines='skip')
    except FileNotFoundError:
        print(f"Error: '{input_filename}' not found. Please ensure the file is in the correct directory.")
        return
    print(f"Successfully loaded '{input_filename}'.")

    df.columns = df.columns.str.strip().str.lower()

    # Fail fast with a message naming every absent column, instead of a bare
    # KeyError raised later from deep inside the cleaning/aggregation steps.
    missing_columns = [col for col in expected_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"'{input_filename}' is missing required column(s): {missing_columns}")

    # Convert coordinate columns to numeric, handling comma decimals.
    # regex=False keeps the replacement literal regardless of pandas version.
    for col in coordinate_columns:
        df[col] = pd.to_numeric(
            df[col].astype(str).str.replace(',', '.', regex=False),
            errors='coerce',
        )

    # Convert the date column; unparseable values become NaT and are dropped
    # together with rows that lack coordinates or a shark ID.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=coordinate_columns + ['date', 'deployid'])

    # Resampling requires a datetime index.
    df = df.set_index('date').sort_index()
    print(f"Initial data cleaned. Starting with {len(df)} valid data points.")

    # With no rows there are no groups, and pd.concat([]) would raise
    # ValueError, so stop here instead.
    if df.empty:
        print("No valid data points remain after cleaning; nothing to standardize.")
        return

    grouped = df.groupby('deployid')
    print(f"\nStandardizing data for {len(grouped)} individual sharks by daily average...")

    # Resample each shark separately to a daily ('D') interval; the shark's ID
    # is not part of the aggregation output, so it is added back afterwards.
    all_sharks_daily = [
        shark_df.resample('D').agg(aggregation_rules).assign(deployid=shark_id)
        for shark_id, shark_df in grouped
    ]

    # Combine all sharks, drop the days with no pings (NaN coordinates) and
    # turn the 'date' index back into a regular column.
    daily_averaged_df = (
        pd.concat(all_sharks_daily)
        .dropna(subset=coordinate_columns)
        .reset_index()
    )

    daily_averaged_df.to_csv(output_filename, index=False)

    print("\nPreprocessing complete.")
    print(f"Daily averaged dataset with {len(daily_averaged_df)} data points saved to '{output_filename}'.")






In [3]:
# --- Execute the function ---
# Calls standardize_shark_data() with no arguments, so the default input and
# output file names from its signature are used. The guard restricts the call
# to when this code runs as the top-level program rather than being imported.
if __name__ == "__main__":
    standardize_shark_data()

Successfully loaded 'shark_data.csv'.
Initial data cleaned. Starting with 19278 valid data points.

Standardizing data for 34 individual sharks by daily average...

Preprocessing complete.
Daily averaged dataset with 4752 data points saved to 'standardized_shark_data_daily.csv'.
