In [None]:
# Install necessary libraries
!pip install netCDF4 xarray matplotlib pandas scikit-learn tensorflow

# Import libraries
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from tensorflow import keras
import seaborn as sns
from google.colab import drive

# Mount Google Drive to access your files
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#############################################################
# SECTION 1: SETUP AND CONFIGURATION
#############################################################

import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
from tqdm import tqdm  # For progress tracking

# Base directory for the Tunisia PFZ Model (Google Drive mount on Colab)
base_dir = '/content/drive/MyDrive/Tunisia_PFZ_Model/'

# Output locations, all nested under <base_dir>/Results
results_dir = os.path.join(base_dir, 'Results')       # cropped NetCDF files
processed_dir = os.path.join(results_dir, 'processed')  # per-date CSV tables
analysis_dir = os.path.join(results_dir, 'Analysis')    # figures and reports
model_dir = os.path.join(results_dir, 'Models')         # trained models

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
for _output_dir in (results_dir, processed_dir, analysis_dir, model_dir):
    os.makedirs(_output_dir, exist_ok=True)

# Tunisia bounding box: lat: 30-38°N, lon: 7-12°E
tunisia_lat_min, tunisia_lat_max = 30, 38
tunisia_lon_min, tunisia_lon_max = 7, 12

print("Tunisia PFZ Model - Starting processing pipeline...")
print(f"Base directory: {base_dir}")
print(f"Tunisia region: Lat {tunisia_lat_min}°-{tunisia_lat_max}°N, Lon {tunisia_lon_min}°-{tunisia_lon_max}°E")

Tunisia PFZ Model - Starting processing pipeline...
Base directory: /content/drive/MyDrive/Tunisia_PFZ_Model/
Tunisia region: Lat 30°-38°N, Lon 7°-12°E


In [None]:
#############################################################
# SECTION 2: FILE READING FUNCTIONS
#############################################################

def read_satellite_files(directory):
    """Open every ``*.nc`` file in ``directory`` with xarray.

    Files are processed in sorted filename order so that the returned
    dictionary (and everything derived from its iteration order) is
    reproducible; ``glob`` alone returns files in filesystem order.

    Each file is tried with xarray's default engine first and then with
    'netcdf4', 'h5netcdf' and 'scipy'. A file that no engine can open is
    reported and skipped.

    Args:
        directory: Folder to scan (non-recursive) for ``*.nc`` files.

    Returns:
        dict mapping file basename -> opened dataset. The datasets are
        returned open; nothing in this function closes them.
    """
    files = sorted(glob.glob(os.path.join(directory, '*.nc')))
    data_dict = {}

    print(f"Found {len(files)} files in {directory}")

    for file in files:
        filename = os.path.basename(file)
        print(f"Reading file: {filename}")

        # Try multiple engines to handle different NetCDF formats;
        # None means "let xarray pick its default engine".
        data = None
        last_error = None
        for engine in (None, 'netcdf4', 'h5netcdf', 'scipy'):
            try:
                if engine is None:
                    data = xr.open_dataset(file)
                else:
                    data = xr.open_dataset(file, engine=engine)
                break
            except Exception as e:
                # Remember the failure and fall through to the next engine
                last_error = e

        if data is None:
            # Every engine failed: report the last error and move on
            print(f"Error reading file {filename}: {last_error}")
            continue

        # Store in dictionary with filename as key
        data_dict[filename] = data

        # Print basic info
        print(f"Dimensions: {dict(data.sizes)}")
        print(f"Variables: {list(data.variables)}")
        print("-------------------------")

    return data_dict

# Read all data from each directory
def read_all_data():
    """Load the four input collections found under ``base_dir``.

    Returns:
        Tuple ``(sst_monthly, chl_monthly, sst_daily, chl_daily)``; each
        element is whatever ``read_satellite_files`` returns for the
        matching sub-folder (SST_MO, CHL_MO, SST_DL, CHL_DL).
    """
    # (progress message, sub-folder) in the order of the returned tuple
    collections = (
        ("Reading SST monthly data...", 'SST_MO'),
        ("\nReading CHL monthly data...", 'CHL_MO'),
        ("\nReading SST daily data...", 'SST_DL'),
        ("\nReading CHL daily data...", 'CHL_DL'),
    )

    loaded = []
    for message, subfolder in collections:
        print(message)
        loaded.append(read_satellite_files(os.path.join(base_dir, subfolder)))

    return tuple(loaded)

In [None]:
#############################################################
# SECTION 3: DATA CROPPING FUNCTIONS
#############################################################

def crop_to_tunisia(data_dict):
    """Cut every dataset down to the Tunisia bounding box and save it.

    The box comes from the module-level ``tunisia_lat_min/max`` and
    ``tunisia_lon_min/max`` values. For each dataset the 'lat' and 'lon'
    coordinate values are compared against the box and the index range
    from the first to the last hit is selected with ``isel``. The subset
    is written to ``results_dir`` as ``Tunisia_<filename>``; if the
    default writer fails the 'scipy' engine is tried once.

    Args:
        data_dict: Mapping filename -> dataset exposing 'lat' and 'lon'.

    Returns:
        dict filename -> cropped dataset. Files with no point inside the
        box, or that raise while being cropped, are left out.
    """
    tunisia_subsets = {}

    for name, dataset in data_dict.items():
        print(f"Cropping file: {name}")

        try:
            latitudes = dataset['lat'].values
            longitudes = dataset['lon'].values

            # Positions of the coordinates that fall inside the box
            in_lat_band = (latitudes >= tunisia_lat_min) & (latitudes <= tunisia_lat_max)
            in_lon_band = (longitudes >= tunisia_lon_min) & (longitudes <= tunisia_lon_max)
            lat_hits = np.nonzero(in_lat_band)[0]
            lon_hits = np.nonzero(in_lon_band)[0]

            if lat_hits.size == 0 or lon_hits.size == 0:
                print(f"No data points found in Tunisia region for {name}")
                continue

            # Contiguous index window from the first to the last hit
            lat_window = slice(lat_hits.min(), lat_hits.max() + 1)
            lon_window = slice(lon_hits.min(), lon_hits.max() + 1)
            subset = dataset.isel(lat=lat_window, lon=lon_window)

            tunisia_subsets[name] = subset
            print(f"Cropped dimensions: {dict(subset.sizes)}")

            target_path = os.path.join(results_dir, f"Tunisia_{name}")

            # A failed save is reported but does not discard the subset
            try:
                subset.to_netcdf(target_path)
            except Exception as primary_error:
                print(f"Error saving NetCDF file: {primary_error}")
                print("Trying alternative save method...")
                try:
                    # Second attempt with the scipy engine
                    subset.to_netcdf(target_path, engine='scipy')
                except Exception as fallback_error:
                    print(f"Could not save file: {fallback_error}")
                else:
                    print(f"Saved Tunisia data to {target_path} using scipy engine")
            else:
                print(f"Saved Tunisia data to {target_path}")

            print("-------------------------")
        except Exception as crop_error:
            print(f"Error cropping file {name}: {crop_error}")

    return tunisia_subsets

# Crop all data to Tunisia region
def crop_all_data(sst_mo_data, chl_mo_data, sst_dl_data, chl_dl_data):
    """Run ``crop_to_tunisia`` over the four input collections.

    Returns:
        Tuple ``(sst_monthly, chl_monthly, sst_daily, chl_daily)`` of the
        cropped collections, in the same order as the arguments.
    """
    # (progress message, collection) in argument order
    stages = (
        ("\nCropping SST monthly data to Tunisia...", sst_mo_data),
        ("\nCropping CHL monthly data to Tunisia...", chl_mo_data),
        ("\nCropping SST daily data to Tunisia...", sst_dl_data),
        ("\nCropping CHL daily data to Tunisia...", chl_dl_data),
    )

    cropped = []
    for banner, collection in stages:
        print(banner)
        cropped.append(crop_to_tunisia(collection))

    return tuple(cropped)

In [None]:
#############################################################
# SECTION 4: DATA CLASSIFICATION FUNCTIONS
#############################################################

def _month_from_date_label(date_label):
    """Extract the calendar month number from a date label.

    Accepted shapes: 'Monthly_YYYYMM', 'Daily_YYYYMMDD', bare 'YYYYMM' or
    'YYYYMMDD' (separators such as '-' are ignored), or a short label whose
    last two digits are the month. Raises ValueError when the label holds
    no digits.
    """
    digits = ''.join(ch for ch in str(date_label) if ch.isdigit())
    if len(digits) >= 6:
        # YYYYMM[DD]: the month is always characters 5-6
        return int(digits[4:6])
    return int(digits[-2:])


def classify_fishing_zone(row):
    """Classify fishing zone potential based on SST and chlorophyll values.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'sst' (°C),
            'chlorophyll' (mg/m³) and 'date' (see _month_from_date_label).

    Returns:
        'HIGH', 'MEDIUM' or 'LOW'. The thresholds depend on the season
        derived from the month: summer (Jun-Aug), winter (Dec-Feb),
        anything else is treated as the spring/fall transition.
    """
    sst = row['sst']
    chl = row['chlorophyll']

    # For Tunisia, typical SST ranges from ~13°C in winter to ~30°C in summer
    # Chlorophyll concentration varies widely, but typically 0.1-0.5 mg/m³ is good

    # The month used to be sliced from the wrong end of the label (the
    # century digits '20'), so every row fell into the Spring/Fall branch.
    month = _month_from_date_label(row['date'])

    # Different criteria by season
    if month in (6, 7, 8):  # Summer
        # Summer: higher temperatures, prefer moderate chlorophyll
        if chl > 0.2 and 24.0 < sst < 29.0:
            return 'HIGH'
        elif chl > 0.1 and 22.0 < sst < 30.0:
            return 'MEDIUM'
        else:
            return 'LOW'
    elif month in (12, 1, 2):  # Winter
        # Winter: prefer warmer spots with high chlorophyll
        if chl > 0.4 and sst > 16.0:
            return 'HIGH'
        elif chl > 0.25 and sst > 15.0:
            return 'MEDIUM'
        else:
            return 'LOW'
    else:  # Spring/Fall
        # Spring/Fall transition: balanced conditions
        if chl > 0.3 and 18.0 < sst < 26.0:
            return 'HIGH'
        elif chl > 0.15 and 16.0 < sst < 28.0:
            return 'MEDIUM'
        else:
            return 'LOW'

In [None]:
#############################################################
# SECTION 5: DATA EXTRACTION FUNCTIONS
#############################################################

def _as_2d_grid(values, expected_shape, name):
    """Return ``values`` as a float (lat, lon) array with masked cells as NaN.

    Leading length-1 axes (e.g. a single time step) are dropped. Raises
    ValueError when the result does not have ``expected_shape``.
    """
    grid = np.ma.filled(np.ma.asarray(values, dtype=float), np.nan)
    while grid.ndim > 2 and grid.shape[0] == 1:
        grid = grid[0]
    if grid.shape != expected_shape:
        raise ValueError(
            f"{name} grid has shape {grid.shape}, expected {expected_shape} (lat, lon)"
        )
    return grid


def extract_to_dataframe(sst_data, chl_data, date_label):
    """Extract SST and Chlorophyll data to a pandas DataFrame.

    Reads 'sst', 'lat' and 'lon' from ``sst_data`` and 'chlor_a' from
    ``chl_data``, keeps the grid cells where both values are present,
    classifies each cell with ``classify_fishing_zone`` and writes a CSV
    (``processed_dir``) plus a pie chart and a map (``analysis_dir``).

    Args:
        sst_data: Dataset-like object exposing ``['sst']``, ``['lat']`` and
            ``['lon']`` with a ``.values`` attribute.
        chl_data: Dataset-like object exposing ``['chlor_a'].values``.
        date_label: Label stored in the 'date' column and used in the
            output file names.

    Returns:
        DataFrame with columns latitude, longitude, sst, chlorophyll, date,
        FishingZone and Label (0=LOW, 1=MEDIUM, 2=HIGH), or None when the
        grids do not match, no valid point exists, or any step fails (the
        error is printed).
    """
    try:
        # Extract the data variables and coordinates
        sst_raw = sst_data['sst'].values
        chl_raw = chl_data['chlor_a'].values
        lat = np.asarray(sst_data['lat'].values, dtype=float)
        lon = np.asarray(sst_data['lon'].values, dtype=float)

        print("Data shapes:")
        print(f"SST shape: {sst_raw.shape}")
        print(f"Chlorophyll shape: {chl_raw.shape}")
        print(f"Latitude shape: {lat.shape}")
        print(f"Longitude shape: {lon.shape}")

        # Fail loudly on mismatched grids instead of silently skipping
        # every pixel that raises inside a per-pixel loop.
        # NOTE(review): only the shapes are compared; the chlorophyll file is
        # assumed to share the SST file's lat/lon coordinates - confirm.
        grid_shape = (len(lat), len(lon))
        sst = _as_2d_grid(sst_raw, grid_shape, 'SST')
        chlorophyll = _as_2d_grid(chl_raw, grid_shape, 'Chlorophyll')

        # Vectorised selection; boolean indexing walks the grid row by row
        print("Processing data points...")
        valid = ~(np.isnan(sst) | np.isnan(chlorophyll))
        lat_grid, lon_grid = np.meshgrid(lat, lon, indexing='ij')
        df = pd.DataFrame({
            'latitude': lat_grid[valid],
            'longitude': lon_grid[valid],
            'sst': sst[valid],
            'chlorophyll': chlorophyll[valid],
            'date': date_label,
        })
        print(f"Created DataFrame with {len(df)} valid data points")

        if df.empty:
            print(f"No valid data points for {date_label}; nothing to classify")
            return None

        # Display head and data statistics
        print("\nFirst few rows:")
        print(df.head())

        print("\nData statistics:")
        print(df.describe())

        # Check for any remaining issues in the data
        print("\nChecking for infinite values:")
        inf_counts = np.isinf(df[['sst', 'chlorophyll']]).sum()
        print(inf_counts)

        # Remove any infinities if present
        if inf_counts.any():
            print("Removing infinite values")
            df = df.replace([np.inf, -np.inf], np.nan).dropna()
            print(f"DataFrame now has {len(df)} valid data points")
            if df.empty:
                print(f"No valid data points for {date_label}; nothing to classify")
                return None

        # Apply classification to dataset
        print("\nClassifying fishing zones...")
        df['FishingZone'] = df.apply(classify_fishing_zone, axis=1)

        # Convert to numeric for model training
        df['Label'] = df['FishingZone'].map({'HIGH': 2, 'MEDIUM': 1, 'LOW': 0})

        # Check distribution of classes
        class_counts = df['FishingZone'].value_counts()
        print("\nClass distribution:")
        print(class_counts)
        print(f"Percentage distribution:\n{100 * class_counts / len(df)}")

        # Save the DataFrame to CSV
        csv_file = os.path.join(processed_dir, f'Tunisia_PFZ_{date_label}.csv')
        df.to_csv(csv_file, index=False)
        print(f"Saved DataFrame to {csv_file}")

        # Pie chart of the class distribution. Colours are looked up per
        # class, because value_counts() orders slices by frequency and a
        # positional colour list would recolour classes between dates.
        zone_colors = {'HIGH': 'forestgreen', 'MEDIUM': 'gold', 'LOW': 'red'}
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.pie(class_counts, labels=class_counts.index, autopct='%1.1f%%',
               colors=[zone_colors[zone] for zone in class_counts.index],
               startangle=90)
        ax.axis('equal')
        ax.set_title(f'Distribution of Fishing Zone Classifications - {date_label}')
        fig.savefig(os.path.join(analysis_dir, f'FishingZones_Pie_{date_label}.png'), dpi=300)
        plt.close(fig)

        # Map the classes spatially; vmin/vmax pin the colour scale to 0..2
        # even when a date contains only some of the classes.
        fig, ax = plt.subplots(figsize=(12, 10))
        scatter = ax.scatter(df['longitude'], df['latitude'], c=df['Label'],
                             cmap='viridis', alpha=0.7, s=10, vmin=0, vmax=2)
        fig.colorbar(scatter, ax=ax, label='Fishing Zone Potential (0=Low, 1=Medium, 2=High)')
        ax.set_title(f'Spatial Distribution of Fishing Zones - {date_label}')
        ax.set_xlabel('Longitude (°E)')
        ax.set_ylabel('Latitude (°N)')
        ax.grid(alpha=0.3)
        fig.savefig(os.path.join(analysis_dir, f'FishingZones_Map_{date_label}.png'), dpi=300)
        plt.close(fig)

        return df
    except Exception as e:
        print(f"Error extracting data to DataFrame: {e}")
        return None

In [None]:
#############################################################
# SECTION 6: PAIRED DATA PROCESSING
#############################################################

def find_matching_pairs(sst_files_dict, chl_files_dict, is_monthly=True):
    """Pair SST and CHL filenames that carry the same date stamp.

    The stamp is the second dot-separated field of the filename; for
    monthly files only the part before the first underscore is used.

    Returns:
        List of ``(sst_filename, chl_filename, stamp)`` tuples, ordered by
        SST filename first and CHL filename second (dictionary order).
    """
    def stamp_of(filename):
        # <sensor>.<stamp>.<...>  ->  <stamp>, trimmed at '_' for monthly files
        field = filename.split('.')[1]
        return field.split('_')[0] if is_monthly else field

    matches = []
    for sst_name in sst_files_dict:
        sst_stamp = stamp_of(sst_name)
        matches.extend(
            (sst_name, chl_name, sst_stamp)
            for chl_name in chl_files_dict
            if stamp_of(chl_name) == sst_stamp
        )

    return matches

def process_paired_data(sst_mo_tunisia, chl_mo_tunisia, sst_dl_tunisia, chl_dl_tunisia):
    """Build one DataFrame per matching SST/CHL pair, monthly then daily.

    Returns:
        Tuple ``(monthly_dfs, daily_dfs)``. Each is a dict keyed by the
        date stamp returned by ``find_matching_pairs``; the values are
        whatever ``extract_to_dataframe`` returns for that pair.
    """
    def frames_for(pairs, sst_lookup, chl_lookup, announce, label_for):
        # One extract_to_dataframe call per pair; a later pair with the
        # same stamp replaces the earlier entry.
        frames = {}
        for sst_name, chl_name, stamp in pairs:
            print(announce(stamp))
            frames[stamp] = extract_to_dataframe(
                sst_lookup[sst_name],
                chl_lookup[chl_name],
                label_for(stamp),
            )
        return frames

    # Monthly pairs: the "Monthly_" prefix is only added to 6-character stamps
    monthly_pairs = find_matching_pairs(sst_mo_tunisia, chl_mo_tunisia, is_monthly=True)
    print(f"\nFound {len(monthly_pairs)} matching monthly pairs")
    monthly_dfs = frames_for(
        monthly_pairs, sst_mo_tunisia, chl_mo_tunisia,
        lambda date: f"\nProcessing monthly pair for {date}",
        lambda date: f"Monthly_{date}" if len(date) == 6 else f"{date}",
    )

    # Daily pairs: always prefixed with "Daily_"
    daily_pairs = find_matching_pairs(sst_dl_tunisia, chl_dl_tunisia, is_monthly=False)
    print(f"\nFound {len(daily_pairs)} matching daily pairs")
    daily_dfs = frames_for(
        daily_pairs, sst_dl_tunisia, chl_dl_tunisia,
        lambda date: f"\nProcessing daily pair for {date}",
        lambda date: f"Daily_{date}",
    )

    return monthly_dfs, daily_dfs

In [None]:
#############################################################
# SECTION 7: PFZ ANALYSIS FUNCTIONS
#############################################################

def _min_max_normalize(values):
    """Rescale ``values`` to [0, 1] using its finite entries.

    Returns an all-zero array of the same shape when there are no finite
    values or they are all equal.
    """
    finite = values[np.isfinite(values)]
    if finite.size == 0 or finite.max() <= finite.min():
        return np.zeros_like(values)
    return (values - finite.min()) / (finite.max() - finite.min())


def calculate_pfz_index(df, output_prefix):
    """Calculate PFZ index and create visualizations.

    Points are averaged onto a 0.05° grid anchored at the minimum
    latitude/longitude. The index of a cell is
    ``0.6 * normalized SST-gradient magnitude + 0.4 * normalized chlorophyll``.
    The grids are then handed to ``create_pfz_visualizations`` and
    ``identify_potential_fishing_zones``.

    Args:
        df: DataFrame with 'latitude', 'longitude', 'sst', 'chlorophyll'.
        output_prefix: Tag forwarded to the two output helpers.

    Returns:
        2-D array (lat cells x lon cells) of index values, or None when
        ``df`` holds no usable (finite) points.
    """
    if len(df) == 0:
        print("No data points available for PFZ calculation.")
        return None

    # Work on plain arrays; rows with a NaN/inf would poison a whole cell
    points = df[['latitude', 'longitude', 'sst', 'chlorophyll']].to_numpy(dtype=float)
    points = points[np.isfinite(points).all(axis=1)]
    if points.shape[0] == 0:
        print("No data points available for PFZ calculation.")
        return None
    lat, lon, sst, chl = points.T

    # Create grid with 0.05 degree resolution
    lat_resolution = 0.05
    lon_resolution = 0.05
    lat_min, lon_min = lat.min(), lon.min()

    # Cell index of every point. Sizing the grid from the largest index
    # guarantees that points on the max edge get a cell (they could be
    # dropped before) and that the grid always has at least one cell.
    lat_idx = np.floor((lat - lat_min) / lat_resolution).astype(int)
    lon_idx = np.floor((lon - lon_min) / lon_resolution).astype(int)
    n_lat = int(lat_idx.max()) + 1
    n_lon = int(lon_idx.max()) + 1

    # Lower edge of every cell, used as the cell coordinate downstream
    lat_bins = lat_min + lat_resolution * np.arange(n_lat)
    lon_bins = lon_min + lon_resolution * np.arange(n_lon)

    # Sum and count per cell in one pass (replaces the iterrows loop)
    flat_idx = lat_idx * n_lon + lon_idx
    n_cells = n_lat * n_lon
    grid_count = np.bincount(flat_idx, minlength=n_cells).reshape(n_lat, n_lon)
    sst_sum = np.bincount(flat_idx, weights=sst, minlength=n_cells).reshape(n_lat, n_lon)
    chl_sum = np.bincount(flat_idx, weights=chl, minlength=n_cells).reshape(n_lat, n_lon)

    # Average values where there are points; empty cells stay NaN
    occupied = grid_count > 0
    grid_sst = np.full((n_lat, n_lon), np.nan)
    grid_chl = np.full((n_lat, n_lon), np.nan)
    grid_sst[occupied] = sst_sum[occupied] / grid_count[occupied]
    grid_chl[occupied] = chl_sum[occupied] / grid_count[occupied]

    # Calculate SST gradient. np.gradient needs at least two samples along
    # an axis, so a single row/column contributes a zero gradient there.
    dy = np.gradient(grid_sst, axis=0) if n_lat > 1 else np.zeros_like(grid_sst)
    dx = np.gradient(grid_sst, axis=1) if n_lon > 1 else np.zeros_like(grid_sst)
    gradient_magnitude = np.sqrt(dx**2 + dy**2)

    # Normalize values between 0 and 1
    norm_gradient = _min_max_normalize(gradient_magnitude)
    norm_chl = _min_max_normalize(grid_chl)

    # Calculate PFZ index (weighted sum)
    pfz_index = 0.6 * norm_gradient + 0.4 * norm_chl

    # Create visualizations
    create_pfz_visualizations(
        lat_bins, lon_bins,
        grid_sst, grid_chl, norm_gradient, pfz_index,
        output_prefix
    )

    # Identify potential fishing zones
    identify_potential_fishing_zones(
        lat_bins, lon_bins,
        pfz_index, grid_sst, grid_chl,
        output_prefix
    )

    return pfz_index

def create_pfz_visualizations(lat_bins, lon_bins, grid_sst, grid_chl, norm_gradient, pfz_index, output_prefix):
    """Draw the 2x2 PFZ overview figure and save it as a PNG.

    Panels: SST, chlorophyll-a, normalized SST gradient and the PFZ index,
    each as a pcolormesh over ``lon_bins`` x ``lat_bins`` with its own
    colorbar. The figure is written to
    ``<analysis_dir>/PFZ_Analysis_<output_prefix>.png`` and then closed.
    """
    fig, axs = plt.subplots(2, 2, figsize=(16, 14))

    # Custom colormap for the PFZ panel
    pfz_cmap = LinearSegmentedColormap.from_list('pfz',
                                               [(0, 'darkblue'),
                                                (0.25, 'blue'),
                                                (0.5, 'lightgreen'),
                                                (0.75, 'yellow'),
                                                (1, 'red')])

    # (axes, grid, colormap, title, colorbar label) for each panel
    panels = (
        (axs[0, 0], grid_sst, 'viridis',
         f'Sea Surface Temperature (°C) - {output_prefix}', 'Temperature (°C)'),
        (axs[0, 1], grid_chl, 'plasma',
         f'Chlorophyll-a Concentration (mg/m³) - {output_prefix}', 'Chlorophyll-a (mg/m³)'),
        (axs[1, 0], norm_gradient, 'hot_r',
         f'Normalized SST Gradient - {output_prefix}', 'Gradient Magnitude (normalized)'),
        (axs[1, 1], pfz_index, pfz_cmap,
         f'Potential Fishing Zone Index - {output_prefix}', 'PFZ Index'),
    )

    colorbars = []
    for panel_ax, grid, colormap, title, bar_label in panels:
        mesh = panel_ax.pcolormesh(lon_bins, lat_bins, grid, cmap=colormap, shading='auto')
        panel_ax.set_title(title)
        panel_ax.set_xlabel('Longitude (°E)')
        panel_ax.set_ylabel('Latitude (°N)')
        colorbars.append(plt.colorbar(mesh, ax=panel_ax, label=bar_label))

    # Only the PFZ colorbar (last panel) gets qualitative tick labels
    pfz_bar = colorbars[-1]
    pfz_bar.set_ticks([0, 0.25, 0.5, 0.75, 1.0])
    pfz_bar.set_ticklabels(['Very Low', 'Low', 'Medium', 'High', 'Very High'])

    # Same extent and a dashed grid on every panel
    lon_lo, lon_hi = min(lon_bins), max(lon_bins)
    lat_lo, lat_hi = min(lat_bins), max(lat_bins)
    for panel_ax in axs.flat:
        panel_ax.set_xlim(lon_lo, lon_hi)
        panel_ax.set_ylim(lat_lo, lat_hi)
        panel_ax.grid(True, linestyle='--', alpha=0.5)

    plt.tight_layout()

    # Save the figure
    fig_path = os.path.join(analysis_dir, f'PFZ_Analysis_{output_prefix}.png')
    plt.savefig(fig_path, dpi=300, bbox_inches='tight')
    print(f"Saved analysis figure to {fig_path}")

    # Close the figure to free memory
    plt.close(fig)

def identify_potential_fishing_zones(lat_bins, lon_bins, pfz_index, grid_sst, grid_chl, output_prefix):
    """Identify and save potential fishing zones.

    A grid cell qualifies when its PFZ index is above 0.7; NaN cells never
    qualify. Qualifying cells are sorted by index (highest first), written
    to ``<analysis_dir>/PFZ_Locations_<output_prefix>.csv`` and the best
    five are printed.

    Args:
        lat_bins, lon_bins: Coordinate of each grid row / column.
        pfz_index, grid_sst, grid_chl: 2-D grids indexed [lat, lon].
        output_prefix: Tag used in the CSV file name.

    Returns:
        DataFrame with columns latitude, longitude, pfz_index, sst and
        chlorophyll; it has no rows when nothing qualifies (no CSV is
        written in that case).
    """
    pfz_index = np.asarray(pfz_index, dtype=float)

    # Only look at cells that have both a coordinate and an index value
    n_lat = min(len(lat_bins), pfz_index.shape[0])
    n_lon = min(len(lon_bins), pfz_index.shape[1])
    candidate_index = pfz_index[:n_lat, :n_lon]

    # High PFZ index threshold; comparing NaN yields False, so empty cells drop out
    with np.errstate(invalid='ignore'):
        lat_idx, lon_idx = np.nonzero(candidate_index > 0.7)

    # Build the table in one go instead of appending row by row through
    # the private DataFrame._append (quadratic and not a public API).
    pfz_df = pd.DataFrame({
        'latitude': np.asarray(lat_bins)[lat_idx],
        'longitude': np.asarray(lon_bins)[lon_idx],
        'pfz_index': candidate_index[lat_idx, lon_idx],
        'sst': np.asarray(grid_sst)[lat_idx, lon_idx],
        'chlorophyll': np.asarray(grid_chl)[lat_idx, lon_idx],
    })

    # Sort by PFZ index (highest first)
    if len(pfz_df) > 0:
        pfz_df = pfz_df.sort_values('pfz_index', ascending=False)

        # Save the top PFZ locations
        pfz_path = os.path.join(analysis_dir, f'PFZ_Locations_{output_prefix}.csv')
        pfz_df.to_csv(pfz_path, index=False)
        print(f"Saved top {len(pfz_df)} potential fishing zones to {pfz_path}")

        # Print the top 5 PFZ locations, numbered by rank (the row index
        # still holds the pre-sort position and is not a ranking).
        print("\nTop 5 Potential Fishing Zones:")
        for rank, (_, row) in enumerate(pfz_df.head(5).iterrows(), start=1):
            print(f"Location {rank}: Lat {row['latitude']:.4f}°N, Lon {row['longitude']:.4f}°E (PFZ Index: {row['pfz_index']:.4f})")
    else:
        print("\nNo high-potential fishing zones identified.")

    return pfz_df

In [None]:
#############################################################
# SECTION 8: MACHINE LEARNING MODEL
#############################################################

def train_pfz_prediction_model(all_dfs):
    """Train machine learning models to predict fishing zones based on SST and chlorophyll.

    A Random Forest and a small dense neural network are trained on the
    'sst' / 'chlorophyll' features of all usable frames, evaluated on a
    20 % hold-out split, and saved to ``model_dir``. Figures and a text
    report are written to ``analysis_dir``.

    Args:
        all_dfs: Mapping of label -> DataFrame (or None) with the columns
            'sst', 'chlorophyll' and 'Label' (0=Low, 1=Medium, 2=High).

    Returns:
        ``(best_model, predict_function, model_type)`` where ``model_type``
        is "RandomForest" or "NeuralNetwork" (higher weighted F1, ties go
        to the Random Forest) and ``predict_function(data)`` takes an
        (n_samples, 2) array of raw [sst, chlorophyll] values and returns
        ``(predictions, probabilities)``. ``(None, None, None)`` is
        returned when there is no usable data.
    """
    print("\n--- TRAINING PFZ PREDICTION MODEL ---")

    # Keep only usable frames. This is checked before the heavy imports so
    # that an empty run returns immediately, and it covers the case where
    # every entry is None/empty (pd.concat([]) would raise).
    valid_frames = [df for df in all_dfs.values() if df is not None and len(df) > 0]
    if not valid_frames:
        print("No valid dataframes available for model training")
        return None, None, None

    import numpy as np
    import pandas as pd
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
    import matplotlib.pyplot as plt
    import seaborn as sns
    from tensorflow.keras.utils import to_categorical
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.callbacks import EarlyStopping
    from joblib import dump

    class_ids = [0, 1, 2]
    class_names = ['Low', 'Medium', 'High']
    feature_columns = ['sst', 'chlorophyll']

    # Combine all valid dataframes
    combined_df = pd.concat(valid_frames, ignore_index=True)
    print(f"Combined dataset has {len(combined_df)} data points")

    # Check for and handle any remaining NaN or infinite values
    needed = combined_df[feature_columns + ['Label']].replace([np.inf, -np.inf], np.nan)
    unusable = needed.isna().any(axis=1)
    if unusable.any():
        print("Removing any remaining NaN or infinite values")
        combined_df = combined_df.loc[~unusable].reset_index(drop=True)
        print(f"Dataset now has {len(combined_df)} valid points")
    if len(combined_df) == 0:
        print("No valid dataframes available for model training")
        return None, None, None

    # Prepare features and target
    X = combined_df[feature_columns].to_numpy(dtype=float)
    y = combined_df['Label'].astype(int)

    # Standardize with statistics learned once from the training data and
    # reuse them at prediction time. Re-standardizing each prediction batch
    # with its own mean/std would feed the models inputs on a different
    # scale than the one they were trained on.
    # NOTE(review): the scaler is fitted on the full dataset before the
    # train/test split (as the standardization was before); fit on the
    # training split only if a leakage-free evaluation is needed.
    scaler = preprocessing.StandardScaler().fit(X)

    def _preprocess(features):
        """Standardize with the training statistics, then L2-normalize each row."""
        scaled = scaler.transform(np.asarray(features, dtype=float))
        # NOTE(review): row-wise L2 normalization of two standardized
        # features keeps only their direction - confirm this is intended.
        return preprocessing.normalize(scaled, norm='l2')

    X_normalized = _preprocess(X)

    # Stratify when possible; a class with a single member cannot be split
    stratify_labels = y if y.value_counts().min() >= 2 else None
    if stratify_labels is None:
        print("Warning: a class has fewer than 2 samples; splitting without stratification")

    # Split data for model training
    X_train, X_test, y_train, y_test = train_test_split(
        X_normalized, y,
        test_size=0.2, random_state=42,
        stratify=stratify_labels
    )

    # One-hot encode the already-split labels for the neural network
    y_train_cat = to_categorical(y_train, num_classes=3)
    y_test_cat = to_categorical(y_test, num_classes=3)

    print(f"Training set size: {len(X_train)} samples")
    print(f"Testing set size: {len(X_test)} samples")

    # Check class distribution
    train_class_counts = pd.Series(y_train).value_counts()
    test_class_counts = pd.Series(y_test).value_counts()

    print("\nClass distribution in training set:")
    print(train_class_counts)
    print(f"Percentage: {100 * train_class_counts / len(y_train)}")

    print("\nClass distribution in test set:")
    print(test_class_counts)
    print(f"Percentage: {100 * test_class_counts / len(y_test)}")

    # Class distribution visualization. Slices are ordered by class id and
    # labelled from it; value_counts() alone is ordered by frequency and
    # may contain fewer than three classes.
    pie_colors = ['skyblue', 'lightgreen', 'salmon']
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, counts, title in (
        (axes[0], train_class_counts, 'Training Set Class Distribution'),
        (axes[1], test_class_counts, 'Test Set Class Distribution'),
    ):
        counts = counts.sort_index()
        ax.pie(counts,
               labels=[class_names[c] for c in counts.index],
               autopct='%1.1f%%',
               colors=[pie_colors[c] for c in counts.index])
        ax.set_title(title)
    fig.tight_layout()
    class_dist_path = os.path.join(analysis_dir, 'PFZ_Class_Distribution.png')
    fig.savefig(class_dist_path, dpi=300)
    plt.close(fig)

    def _save_confusion_matrix(matrix, title, filename):
        """Save a labelled confusion-matrix heatmap into analysis_dir."""
        plt.figure(figsize=(8, 6))
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title(title)
        plt.savefig(os.path.join(analysis_dir, filename), dpi=300)
        plt.close()

    # 1. Random Forest Model - without grid search
    print("\nTraining Random Forest Classifier (simplified version)...")

    rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
    rf.fit(X_train, y_train)

    # Predictions and metrics
    rf_pred = rf.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_pred)
    rf_f1 = f1_score(y_test, rf_pred, average='weighted', zero_division=0)

    print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
    print(f"Random Forest F1 Score (weighted): {rf_f1:.4f}")

    # Feature importance
    feature_importances = pd.DataFrame({
        'feature': ['SST', 'Chlorophyll'],
        'importance': rf.feature_importances_
    })
    feature_importances = feature_importances.sort_values('importance', ascending=False)

    plt.figure(figsize=(8, 5))
    sns.barplot(x='feature', y='importance', data=feature_importances)
    plt.title('Feature Importance - Random Forest')
    plt.ylabel('Importance')
    plt.tight_layout()
    fi_path = os.path.join(analysis_dir, 'PFZ_RF_Feature_Importance.png')
    plt.savefig(fi_path, dpi=300)
    plt.close()

    # Confusion matrix and report always cover all three classes, even if
    # one of them is missing from the test split or the predictions.
    rf_cm = confusion_matrix(y_test, rf_pred, labels=class_ids)
    _save_confusion_matrix(rf_cm, 'Confusion Matrix - Random Forest', 'PFZ_RF_Confusion_Matrix.png')

    rf_report = classification_report(y_test, rf_pred, labels=class_ids,
                                      target_names=class_names, zero_division=0)
    print("\nRandom Forest Classification Report:")
    print(rf_report)

    # 2. Neural Network Model - simpler and faster
    print("\nTraining Neural Network (simplified version)...")

    # Define model architecture - simpler network
    model = Sequential([
        Dense(32, activation='relu', input_shape=(2,)),
        Dense(16, activation='relu'),
        Dense(3, activation='softmax')
    ])

    # Compile model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Stop when the validation loss has not improved for 5 epochs
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    # Train model
    history = model.fit(
        X_train, y_train_cat,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate model
    nn_loss, nn_accuracy = model.evaluate(X_test, y_test_cat)
    print(f"Neural Network Accuracy: {nn_accuracy:.4f}")

    # Plot training history
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='lower right')

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

    plt.tight_layout()
    history_path = os.path.join(analysis_dir, 'PFZ_NN_Training_History.png')
    plt.savefig(history_path, dpi=300)
    plt.close()

    # Generate predictions
    nn_pred_prob = model.predict(X_test)
    nn_pred = np.argmax(nn_pred_prob, axis=1)
    nn_true = np.argmax(y_test_cat, axis=1)

    # Calculate F1 score
    nn_f1 = f1_score(nn_true, nn_pred, average='weighted', zero_division=0)
    print(f"Neural Network F1 Score (weighted): {nn_f1:.4f}")

    # Confusion matrix
    nn_cm = confusion_matrix(nn_true, nn_pred, labels=class_ids)
    _save_confusion_matrix(nn_cm, 'Confusion Matrix - Neural Network', 'PFZ_NN_Confusion_Matrix.png')

    # Classification report
    nn_report = classification_report(nn_true, nn_pred, labels=class_ids,
                                      target_names=class_names, zero_division=0)
    print("\nNeural Network Classification Report:")
    print(nn_report)

    # Compare models
    plt.figure(figsize=(8, 6))
    model_names = ['Random Forest', 'Neural Network']
    accuracies = [rf_accuracy, nn_accuracy]
    f1_scores = [rf_f1, nn_f1]

    x = np.arange(len(model_names))
    width = 0.35

    plt.bar(x - width/2, accuracies, width, label='Accuracy')
    plt.bar(x + width/2, f1_scores, width, label='F1 Score')

    plt.xlabel('Model')
    plt.ylabel('Score')
    plt.title('Model Performance Comparison')
    plt.xticks(x, model_names)
    plt.ylim(0, 1.0)
    plt.legend()
    plt.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    comparison_path = os.path.join(analysis_dir, 'PFZ_Model_Comparison.png')
    plt.savefig(comparison_path, dpi=300)
    plt.close()

    # Save classification reports
    report_path = os.path.join(analysis_dir, 'PFZ_Classification_Reports.txt')
    with open(report_path, 'w') as f:
        f.write("TUNISIA PFZ MODEL - CLASSIFICATION REPORTS\n")
        f.write("=======================================\n\n")
        f.write(f"Total data points: {len(combined_df)}\n")
        f.write(f"Training samples: {len(X_train)}\n")
        f.write(f"Testing samples: {len(X_test)}\n\n")

        f.write("RANDOM FOREST MODEL\n")
        f.write("------------------\n")
        f.write(f"Parameters: n_estimators=100, max_depth=10\n")
        f.write(f"Accuracy: {rf_accuracy:.4f}\n")
        f.write(f"F1 Score (weighted): {rf_f1:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(rf_report)
        f.write("\n\n")

        f.write("NEURAL NETWORK MODEL\n")
        f.write("------------------\n")
        f.write(f"Accuracy: {nn_accuracy:.4f}\n")
        f.write(f"F1 Score (weighted): {nn_f1:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(nn_report)

    print(f"Saved classification reports to {report_path}")

    # Save models
    rf_model_path = os.path.join(model_dir, 'random_forest_model.joblib')
    dump(rf, rf_model_path)

    nn_model_path = os.path.join(model_dir, 'neural_network_model.keras')
    model.save(nn_model_path)

    print(f"Saved Random Forest model to {rf_model_path}")
    print(f"Saved Neural Network model to {nn_model_path}")

    # Pick the better model (ties go to the Random Forest)
    use_random_forest = rf_f1 >= nn_f1
    if use_random_forest:
        best_model, model_type = rf, "RandomForest"
    else:
        best_model, model_type = model, "NeuralNetwork"

    def predict_function(data):
        """Predict fishing zones for raw [sst, chlorophyll] rows with the best model."""
        features = _preprocess(data)
        if use_random_forest:
            predictions = best_model.predict(features)
            probabilities = best_model.predict_proba(features)
        else:
            probabilities = best_model.predict(features)
            predictions = np.argmax(probabilities, axis=1)
        return predictions, probabilities

    print(f"\nThe best model is: {model_type}")
    return best_model, predict_function, model_type

In [None]:
#############################################################
# SECTION 9: PREDICTION AND VALIDATION
#############################################################

def generate_pfz_predictions(model, predict_function, model_type, df, date_label):
    """Predict fishing-zone classes for one dataset and score them against its labels.

    Parameters
    ----------
    model : object
        The trained model. Not used directly here; predictions are obtained
        through ``predict_function``.
    predict_function : callable
        Called with the ``(n_samples, 2)`` array of ``['sst', 'chlorophyll']``
        values; must return ``(predictions, probabilities)``.
    model_type : str
        Name of the model type. Accepted for interface symmetry; not used here.
    df : pandas.DataFrame
        Must contain the columns ``latitude``, ``longitude``, ``sst``,
        ``chlorophyll``, ``FishingZone`` and ``Label``.
        NOTE: the frame is modified in place - the columns ``Predicted_Label``
        and ``Predicted_Zone`` are added (or overwritten).
    date_label : str
        Label used in plot titles and in the names of the files written.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` where ``f1`` is the weighted F1 score.

    Side effects
    ------------
    Writes a confusion-matrix PNG, a prediction-map PNG and (when at least one
    point is predicted HIGH) a high-potential-zone PNG to ``analysis_dir``, and
    a predictions CSV to ``processed_dir``.
    """
    print(f"\nGenerating PFZ predictions for {date_label}...")

    # Fixed class order shared by the confusion matrix and its tick labels.
    class_ids = [0, 1, 2]
    class_names = ['Low', 'Medium', 'High']

    # Extract features
    X = df[['sst', 'chlorophyll']].values
    y_true = df['Label'].values

    # Get predictions (class probabilities are returned too but not needed here)
    predictions, _probabilities = predict_function(X)

    # Add predictions to dataframe (in-place on the caller's frame, see docstring)
    df['Predicted_Label'] = predictions
    df['Predicted_Zone'] = df['Predicted_Label'].map({0: 'LOW', 1: 'MEDIUM', 2: 'HIGH'})

    # Calculate accuracy and F1 score
    from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
    accuracy = accuracy_score(y_true, predictions)
    f1 = f1_score(y_true, predictions, average='weighted')

    print(f"Prediction accuracy: {accuracy:.4f}")
    print(f"F1 score (weighted): {f1:.4f}")

    # Create confusion matrix.
    # Pin the label set so the matrix is always 3x3: without it, a class that is
    # absent from both y_true and predictions (HIGH is very rare) shrinks the
    # matrix and the three tick labels below no longer line up with its cells.
    cm = confusion_matrix(y_true, predictions, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
               xticklabels=class_names,
               yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'{date_label} - Actual vs Predicted Fishing Zones')
    cm_path = os.path.join(analysis_dir, f'PFZ_Prediction_CM_{date_label}.png')
    plt.savefig(cm_path, dpi=300)
    plt.close()

    # Map the predicted classes spatially
    plt.figure(figsize=(12, 10))
    scatter = plt.scatter(df['longitude'], df['latitude'],
                         c=df['Predicted_Label'], cmap='viridis',
                         alpha=0.7, s=10)
    plt.colorbar(scatter, label='Predicted Fishing Zone Potential (0=Low, 1=Medium, 2=High)')
    plt.title(f'Predicted Fishing Zones - {date_label}')
    plt.xlabel('Longitude (°E)')
    plt.ylabel('Latitude (°N)')
    plt.grid(alpha=0.3)
    map_path = os.path.join(analysis_dir, f'PFZ_Prediction_Map_{date_label}.png')
    plt.savefig(map_path, dpi=300)
    plt.close()

    # Save prediction results
    pred_df = df[['latitude', 'longitude', 'sst', 'chlorophyll',
                 'FishingZone', 'Label', 'Predicted_Zone', 'Predicted_Label']]
    pred_path = os.path.join(processed_dir, f'Tunisia_PFZ_Predictions_{date_label}.csv')
    pred_df.to_csv(pred_path, index=False)
    print(f"Saved prediction results to {pred_path}")

    # Create high potential fishing zone map for operational use
    high_potential = df[df['Predicted_Label'] == 2]
    if len(high_potential) > 0:
        plt.figure(figsize=(10, 8))
        # All points in light gray as background, HIGH predictions in red on top.
        plt.scatter(df['longitude'], df['latitude'], c='lightgray', alpha=0.3, s=5)
        plt.scatter(high_potential['longitude'], high_potential['latitude'],
                  c='red', alpha=0.8, s=20)
        plt.title(f'High Potential Fishing Zones - {date_label}')
        plt.xlabel('Longitude (°E)')
        plt.ylabel('Latitude (°N)')
        plt.grid(True, alpha=0.3)
        # Fix the extent to the configured Tunisia bounding box.
        plt.xlim(tunisia_lon_min, tunisia_lon_max)
        plt.ylim(tunisia_lat_min, tunisia_lat_max)
        high_path = os.path.join(analysis_dir, f'High_PFZ_{date_label}.png')
        plt.savefig(high_path, dpi=300)
        plt.close()
        print(f"Saved high potential fishing zones map to {high_path}")
    else:
        print("No high potential fishing zones identified in this dataset.")

    return accuracy, f1

In [None]:
#############################################################
# SECTION 10: MAIN EXECUTION
#############################################################

def _collect_pfz_summaries(dfs, data_type):
    """Run calculate_pfz_index on each non-empty frame and return per-date summary rows.

    ``dfs`` maps a date string to a DataFrame (or None); ``data_type`` is the
    'Monthly' / 'Daily' prefix used in log messages and analysis labels.
    Frames that are None or empty, and frames for which calculate_pfz_index
    returns None, contribute no row.
    """
    results = []
    for date, df in dfs.items():
        if df is None or len(df) == 0:
            continue

        print(f"\nAnalyzing {data_type} data for {date}")
        pfz_index = calculate_pfz_index(df, f"{data_type}_{date}")
        if pfz_index is None:
            continue

        results.append({
            'date': date,
            'data_points': len(df),
            'sst_min': df['sst'].min(),
            'sst_max': df['sst'].max(),
            'chl_min': df['chlorophyll'].min(),
            'chl_max': df['chlorophyll'].max()
        })
    return results


def _save_summary_report(results, data_type, min_points_for_plot=1):
    """Write the summary rows to '<data_type>_PFZ_Summary.csv' and plot their trends.

    Does nothing when ``results`` is empty. The trend plot is only produced
    when there are at least ``min_points_for_plot`` rows.
    """
    if not results:
        return

    summary_df = pd.DataFrame(results)
    summary_path = os.path.join(analysis_dir, f'{data_type}_PFZ_Summary.csv')
    summary_df.to_csv(summary_path, index=False)
    print(f"Saved {data_type.lower()} summary to {summary_path}")

    if len(summary_df) >= min_points_for_plot:
        plot_time_series(summary_df, data_type)


def _plot_validation_metrics(validation_df):
    """Save a grouped bar chart of accuracy and F1 score per validation month."""
    plt.figure(figsize=(10, 6))
    x = np.arange(len(validation_df))
    width = 0.35

    # Two bars per month, centred on the tick position.
    plt.bar(x - width/2, validation_df['accuracy'], width, label='Accuracy')
    plt.bar(x + width/2, validation_df['f1_score'], width, label='F1 Score')

    plt.xlabel('Validation Month')
    plt.ylabel('Score')
    plt.title('Model Validation Metrics')
    plt.xticks(x, validation_df['date'])
    plt.ylim(0, 1.0)
    plt.legend()
    plt.grid(axis='y', alpha=0.3)

    val_path = os.path.join(analysis_dir, 'Validation_Metrics.png')
    plt.savefig(val_path, dpi=300)
    plt.close()


def main():
    """Run the whole Tunisia PFZ pipeline end to end.

    Steps: read the satellite data, crop it to the Tunisia region, build the
    paired SST/chlorophyll DataFrames, compute the PFZ index per date, write
    monthly/daily summary reports, train the prediction model and score it on
    a few selected months. Results are written under ``analysis_dir``.

    NOTE(review): the months scored in Step 7 are taken from the same
    ``monthly_dfs`` that is passed to training, so these scores are presumably
    not an independent hold-out estimate - confirm against the training code.
    """
    # Step 1: Read all satellite data
    print("\n--- STEP 1: READING SATELLITE DATA ---")
    sst_mo_data, chl_mo_data, sst_dl_data, chl_dl_data = read_all_data()

    # Step 2: Crop data to Tunisia region
    print("\n--- STEP 2: CROPPING DATA TO TUNISIA REGION ---")
    sst_mo_tunisia, chl_mo_tunisia, sst_dl_tunisia, chl_dl_tunisia = crop_all_data(
        sst_mo_data, chl_mo_data, sst_dl_data, chl_dl_data
    )

    # Step 3: Process paired data
    print("\n--- STEP 3: PROCESSING PAIRED DATA ---")
    monthly_dfs, daily_dfs = process_paired_data(
        sst_mo_tunisia, chl_mo_tunisia, sst_dl_tunisia, chl_dl_tunisia
    )

    # Step 4: Analyze data and calculate PFZ (all monthly dates first, then daily)
    print("\n--- STEP 4: ANALYZING DATA AND CALCULATING PFZ ---")
    monthly_results = _collect_pfz_summaries(monthly_dfs, 'Monthly')
    daily_results = _collect_pfz_summaries(daily_dfs, 'Daily')

    # Step 5: Create summary reports
    print("\n--- STEP 5: CREATING SUMMARY REPORTS ---")
    _save_summary_report(monthly_results, 'Monthly')
    # Daily trends are only plotted if there are enough points
    _save_summary_report(daily_results, 'Daily', min_points_for_plot=3)

    # Step 6: Train prediction model
    print("\n--- STEP 6: TRAINING PREDICTION MODEL ---")
    best_model, predict_function, model_type = train_pfz_prediction_model(monthly_dfs)

    # Step 7: Generate predictions for validation
    print("\n--- STEP 7: GENERATING PREDICTIONS FOR VALIDATION ---")
    validation_results = []

    # Select a few months for validation
    validation_months = ['20240101', '20240601', '20241201']  # Winter, Summer, Winter
    for month in validation_months:
        month_df = monthly_dfs.get(month)
        # Skip months that are missing or empty (same guard as Step 4);
        # an empty frame cannot be scaled or scored.
        if month_df is None or len(month_df) == 0:
            continue

        print(f"\nValidating model on {month} data...")
        accuracy, f1 = generate_pfz_predictions(
            best_model, predict_function, model_type,
            month_df, f"Monthly_{month}"
        )
        validation_results.append({
            'date': month,
            'accuracy': accuracy,
            'f1_score': f1
        })

    # Save validation results
    if validation_results:
        validation_df = pd.DataFrame(validation_results)
        validation_path = os.path.join(analysis_dir, 'Model_Validation_Results.csv')
        validation_df.to_csv(validation_path, index=False)
        print(f"Saved validation results to {validation_path}")

        # Plot validation metrics
        _plot_validation_metrics(validation_df)

    print("\nTunisia PFZ Model processing complete!")

def plot_time_series(df, data_type):
    """Plot SST and chlorophyll min/max trends over time and save them as a PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary rows with a ``date`` column of ``YYYYMMDD`` strings and the
        numeric columns ``sst_min``, ``sst_max``, ``chl_min`` and ``chl_max``.
        The frame is not modified.
    data_type : str
        Label used in the plot titles and in the output file name
        ``<data_type>_PFZ_Trends.png`` (written to ``analysis_dir``).

    Plotting is best-effort: any exception is reported with a printed message
    rather than raised, and the figure is closed in every case.
    """
    fig = None
    try:
        fig = plt.figure(figsize=(14, 10))

        # Convert date strings to datetime for proper ordering. ``assign``
        # returns a new frame, so the caller's ``date`` column is untouched.
        df = df.assign(
            date=pd.to_datetime(df['date'], format='%Y%m%d')
        ).sort_values('date')

        # SST min/max plot
        plt.subplot(2, 1, 1)
        plt.plot(df['date'], df['sst_min'], 'b-', marker='o', label='Min SST')
        plt.plot(df['date'], df['sst_max'], 'r-', marker='o', label='Max SST')
        plt.fill_between(df['date'], df['sst_min'], df['sst_max'], alpha=0.2, color='purple')
        plt.title(f'{data_type} Sea Surface Temperature Trend')
        plt.xlabel('Date')
        plt.ylabel('Temperature (°C)')
        plt.grid(True, linestyle='--', alpha=0.5)
        plt.legend()

        # Chlorophyll min/max plot
        plt.subplot(2, 1, 2)
        plt.plot(df['date'], df['chl_min'], 'g-', marker='o', label='Min Chlorophyll')
        plt.plot(df['date'], df['chl_max'], 'm-', marker='o', label='Max Chlorophyll')
        plt.fill_between(df['date'], df['chl_min'], df['chl_max'], alpha=0.2, color='olive')
        plt.title(f'{data_type} Chlorophyll Concentration Trend')
        plt.xlabel('Date')
        plt.ylabel('Chlorophyll (mg/m³)')
        plt.grid(True, linestyle='--', alpha=0.5)
        plt.legend()

        plt.tight_layout()
        plt.savefig(os.path.join(analysis_dir, f'{data_type}_PFZ_Trends.png'), dpi=300)
        print(f"Saved {data_type} trends visualization")
    except Exception as e:
        print(f"Error creating time series plots: {e}")
    finally:
        # Close the figure even when plotting failed, so a caught error does
        # not leave an open figure behind in the kernel.
        if fig is not None:
            plt.close(fig)

# Run the full pipeline when this cell/script is executed as the top-level
# module; the guard keeps main() from running if the code is imported instead.
if __name__ == "__main__":
    main()


--- STEP 1: READING SATELLITE DATA ---
Reading SST monthly data...
Found 12 files in /content/drive/MyDrive/Tunisia_PFZ_Model/SST_MO
Reading file: AQUA_MODIS.20240201_20240229.L3m.MO.SST.sst.4km.nc
Error reading file AQUA_MODIS.20240201_20240229.L3m.MO.SST.sst.4km.nc: Error: /content/drive/MyDrive/Tunisia_PFZ_Model/SST_MO/AQUA_MODIS.20240201_20240229.L3m.MO.SST.sst.4km.nc is not a valid NetCDF 3 file
            If this is a NetCDF4 file, you may need to install the
            netcdf4 library, e.g.,

            $ pip install netcdf4
            
Reading file: AQUA_MODIS.20240301_20240331.L3m.MO.SST.sst.4km.nc
Dimensions: {'lat': 4320, 'lon': 8640, 'rgb': 3, 'eightbitcolor': 256}
Variables: ['sst', 'qual_sst', 'lat', 'lon', 'palette']
-------------------------
Reading file: AQUA_MODIS.20240101_20240131.L3m.MO.SST.sst.4km.nc
Dimensions: {'lat': 4320, 'lon': 8640, 'rgb': 3, 'eightbitcolor': 256}
Variables: ['sst', 'qual_sst', 'lat', 'lon', 'palette']
-------------------------
Reading f

Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4323.00it/s]

Created DataFrame with 5769 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  16.315001     0.287264  20240301
1  37.979164   7.062506  16.240000     0.289395  20240301
2  37.979164   7.104172  16.164999     0.292652  20240301
3  37.979164   7.145839  16.094999     0.283431  20240301
4  37.979164   7.187506  16.039999     0.281166  20240301

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5769.000000  5769.000000  5769.000000  5769.000000
mean     36.245228    10.392307    16.386505     0.720012
std       1.474039     1.421338     0.639513     2.038677
min      33.020832     7.020839    14.730000     0.141760
25%      34.854164     9.437506    15.915000     0.248858
50%      36.895832    10.937506    16.209999     0.281840
75%      37.479164    11.479173    16.715000     0.327565
max      37.979164    11.979173    21.930000    42.434219

Checking for infinite values:
sst            0
c




Saved DataFrame to /content/drive/MyDrive/Tunisia_PFZ_Model/Results/processed/Tunisia_PFZ_20240301.csv

Processing monthly pair for 20240101
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4614.91it/s]

Created DataFrame with 5789 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  17.000000     0.280274  20240101
1  37.979164   7.062506  17.004999     0.277238  20240101
2  37.979164   7.104172  17.070000     0.284001  20240101
3  37.979164   7.145839  17.090000     0.284520  20240101
4  37.979164   7.187506  17.094999     0.282164  20240101

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5789.000000  5789.000000  5789.000000  5789.000000
mean     36.238306    10.395040    16.358435     0.851346
std       1.476993     1.418413     0.680068     2.104607
min      33.020832     7.020839    13.025000     0.220535
25%      34.854164     9.479173    16.014999     0.326390
50%      36.895832    10.937506    16.455000     0.358842
75%      37.479164    11.479173    16.709999     0.451985
max      37.979164    11.979173    18.250000    55.051014

Checking for infinite values:
sst            0
c




Saved DataFrame to /content/drive/MyDrive/Tunisia_PFZ_Model/Results/processed/Tunisia_PFZ_20240101.csv

Processing monthly pair for 20240501
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:01<00:00, 160.71it/s]


Created DataFrame with 5740 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  19.029999     0.126862  20240501
1  37.979164   7.062506  19.260000     0.125697  20240501
2  37.979164   7.104172  19.959999     0.129296  20240501
3  37.979164   7.145839  20.004999     0.132227  20240501
4  37.979164   7.187506  19.865000     0.134365  20240501

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5740.000000  5740.000000  5740.000000  5740.000000
mean     36.252176    10.390475    20.114596     0.735185
std       1.470216     1.423675     0.897305     2.768498
min      33.020832     7.020839    18.344999     0.082944
25%      34.885415     9.437506    19.494999     0.138558
50%      36.895832    10.937506    19.814999     0.153171
75%      37.520832    11.479173    20.625000     0.196250
max      37.979164    11.979173    26.629999    62.710205

Checking for infinite values:
sst            0
c

Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4490.39it/s]

Created DataFrame with 5735 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  17.574999     0.160252  20240401
1  37.979164   7.062506  17.340000     0.164753  20240401
2  37.979164   7.104172  17.350000     0.179476  20240401
3  37.979164   7.145839  17.355000     0.174007  20240401
4  37.979164   7.187506  17.320000     0.168333  20240401

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5735.000000  5735.000000  5735.000000  5735.000000
mean     36.255873    10.392439    17.840170     0.622810
std       1.467358     1.422424     0.955714     2.140978
min      33.020832     7.020839    16.404999     0.081892
25%      34.895832     9.437506    17.115000     0.154124
50%      36.895832    10.937506    17.545000     0.166137
75%      37.520832    11.479173    18.549999     0.210183
max      37.979164    11.979173    21.869999    54.874355

Checking for infinite values:
sst            0
c




Saved DataFrame to /content/drive/MyDrive/Tunisia_PFZ_Model/Results/processed/Tunisia_PFZ_20240401.csv

Processing monthly pair for 20240601
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4778.25it/s]

Created DataFrame with 5688 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  23.115000     0.107691  20240601
1  37.979164   7.062506  23.184999     0.105442  20240601
2  37.979164   7.104172  22.894999     0.112560  20240601
3  37.979164   7.145839  22.834999     0.113489  20240601
4  37.979164   7.187506  22.809999     0.114113  20240601

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5688.000000  5688.000000  5688.000000  5688.000000
mean     36.260298    10.388792    23.108095     0.697095
std       1.468582     1.427366     0.853458     2.236595
min      33.020832     7.020839    21.029999     0.073988
25%      34.895832     9.395839    22.564999     0.109625
50%      36.937500    10.937506    22.949999     0.124485
75%      37.520832    11.479173    23.389999     0.149689
max      37.979164    11.979173    27.559999    31.682673

Checking for infinite values:
sst            0
c





Processing monthly pair for 20240701
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4682.23it/s]

Created DataFrame with 5746 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  26.129999     0.084055  20240701
1  37.979164   7.062506  26.115000     0.084388  20240701
2  37.979164   7.104172  26.105000     0.083880  20240701
3  37.979164   7.145839  26.125000     0.084091  20240701
4  37.979164   7.187506  26.119999     0.084742  20240701

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5746.000000  5746.000000  5746.000000  5746.000000
mean     36.244872    10.393838    26.549858     0.982676
std       1.475461     1.422605     1.071937     3.463466
min      33.020832     7.020839    23.824999     0.068108
25%      34.854164     9.437506    25.799999     0.094687
50%      36.895832    10.937506    26.285000     0.103510
75%      37.520832    11.479173    27.019999     0.144657
max      37.979164    11.979173    31.174999    52.282475

Checking for infinite values:
sst            0
c





Processing monthly pair for 20240801
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4750.82it/s]

Created DataFrame with 5773 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  27.539999     0.076360  20240801
1  37.979164   7.062506  27.535000     0.075609  20240801
2  37.979164   7.104172  27.574999     0.076290  20240801
3  37.979164   7.145839  27.564999     0.077819  20240801
4  37.979164   7.187506  27.529999     0.079200  20240801

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5773.000000  5773.000000  5773.000000  5773.000000
mean     36.246062    10.392100    28.364567     1.287635
std       1.473591     1.421815     0.958547     3.953377
min      33.020832     7.020839    26.724998     0.063483
25%      34.854164     9.437506    27.535000     0.089072
50%      36.895832    10.937506    28.135000     0.098727
75%      37.479164    11.479173    29.054998     0.166314
max      37.979164    11.979173    31.824999    44.965679

Checking for infinite values:
sst            0
c





Processing monthly pair for 20241001
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 3754.87it/s]

Created DataFrame with 5780 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  23.514999     0.125374  20241001
1  37.979164   7.062506  23.490000     0.125542  20241001
2  37.979164   7.104172  23.439999     0.125233  20241001
3  37.979164   7.145839  23.424999     0.124658  20241001
4  37.979164   7.187506  23.394999     0.124830  20241001

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5780.000000  5780.000000  5780.000000  5780.000000
mean     36.239611    10.393669    24.633437     1.377638
std       1.476470     1.421096     1.273980     4.598810
min      33.020832     7.020839    22.010000     0.100648
25%      34.854164     9.437506    23.654999     0.124075
50%      36.895832    10.937506    24.344999     0.144102
75%      37.479164    11.479173    25.906249     0.326979
max      37.979164    11.979173    27.570000    81.774239

Checking for infinite values:
sst            0
c





Processing monthly pair for 20240901
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 2097.95it/s]

Created DataFrame with 5762 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  26.049999     0.105173  20240901
1  37.979164   7.062506  26.010000     0.104158  20240901
2  37.979164   7.104172  26.010000     0.103175  20240901
3  37.979164   7.145839  25.959999     0.102714  20240901
4  37.979164   7.187506  26.004999     0.103340  20240901

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5762.000000  5762.000000  5762.000000  5762.000000
mean     36.248979    10.392455    27.013080     1.483195
std       1.470981     1.422532     1.212991     4.811378
min      33.020832     7.020839    24.809999     0.065116
25%      34.854164     9.437506    26.135000     0.115807
50%      36.895832    10.937506    26.594999     0.129615
75%      37.479164    11.479173    28.118749     0.284656
max      37.979164    11.979173    30.494999    79.111908

Checking for infinite values:
sst            0
c





Class distribution:
FishingZone
LOW       4742
MEDIUM    1017
HIGH         3
Name: count, dtype: int64
Percentage distribution:
FishingZone
LOW       82.297813
MEDIUM    17.650121
HIGH       0.052065
Name: count, dtype: float64
Saved DataFrame to /content/drive/MyDrive/Tunisia_PFZ_Model/Results/processed/Tunisia_PFZ_20240901.csv

Processing monthly pair for 20241201
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4606.99it/s]

Created DataFrame with 5795 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  17.469999     0.264480  20241201
1  37.979164   7.062506  17.750000     0.269194  20241201
2  37.979164   7.104172  17.900000     0.269820  20241201
3  37.979164   7.145839  18.010000     0.377011  20241201
4  37.979164   7.187506  18.055000     0.413914  20241201

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5795.000000  5795.000000  5795.000000  5795.000000
mean     36.234709    10.394696    18.195203     1.232353
std       1.479253     1.418744     1.164273     3.813551
min      33.020832     7.020839    13.134999     0.153487
25%      34.854164     9.479173    17.445000     0.270995
50%      36.895832    10.937506    17.945000     0.351198
75%      37.479164    11.479173    18.984999     0.620143
max      37.979164    11.979173    21.359999    82.118927

Checking for infinite values:
sst            0
c





Processing monthly pair for 20241101
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4589.01it/s]

Created DataFrame with 5771 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll      date
0  37.979164   7.020839  20.980000     0.171023  20241101
1  37.979164   7.062506  21.080000     0.155617  20241101
2  37.979164   7.104172  21.080000     0.165561  20241101
3  37.979164   7.145839  21.010000     0.169781  20241101
4  37.979164   7.187506  20.994999     0.174501  20241101

Data statistics:
          latitude    longitude          sst  chlorophyll
count  5771.000000  5771.000000  5771.000000  5771.000000
mean     36.242053    10.394041    21.928560     1.459623
std       1.476031     1.420513     0.882918     4.488988
min      33.020832     7.020839    18.959999     0.138181
25%      34.854164     9.458339    21.244999     0.180633
50%      36.895832    10.937506    21.635000     0.225446
75%      37.479164    11.479173    22.667500     0.554646
max      37.979164    11.979173    23.889999    75.200935

Checking for infinite values:
sst            0
c





Found 7 matching daily pairs

Processing daily pair for 20250419
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 5340.90it/s]

Created DataFrame with 3242 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll            date
0  37.979164   8.145839  15.990000     0.261728  Daily_20250419
1  37.979164   8.187506  17.049999     0.267158  Daily_20250419
2  37.979164   8.229173  17.094999     0.268612  Daily_20250419
3  37.979164   8.270839  17.195000     0.254668  Daily_20250419
4  37.979164   8.312506  17.215000     0.249075  Daily_20250419

Data statistics:
          latitude    longitude          sst  chlorophyll
count  3242.000000  3242.000000  3242.000000  3242.000000
mean     36.392516    10.888989    17.883120     0.487192
std       1.240946     1.001598     0.952558     1.543923
min      33.187500     8.145839    15.740000     0.087971
25%      35.697916    10.604173    17.150000     0.148618
50%      36.729164    11.187506    17.684999     0.177824
75%      37.395832    11.604173    18.275000     0.256980
max      37.979164    11.979173    21.404999    23.104805

Checking for





Processing daily pair for 20250418
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4491.22it/s]

Created DataFrame with 3675 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll            date
0  37.979164   7.020839  16.734999     0.216179  Daily_20250418
1  37.979164   7.062506  16.734999     0.205943  Daily_20250418
2  37.979164   7.104172  16.834999     0.212590  Daily_20250418
3  37.979164   7.145839  16.730000     0.224532  Daily_20250418
4  37.979164   7.187506  16.785000     0.231402  Daily_20250418

Data statistics:
          latitude    longitude          sst  chlorophyll
count  3675.000000  3675.000000  3675.000000  3675.000000
mean     36.255186    10.650590    17.559948     0.492123
std       1.531576     1.303396     1.450778     1.335603
min      33.020832     7.020839    13.929999     0.089045
25%      34.687500    10.187506    16.504999     0.150391
50%      37.062500    11.145839    16.955000     0.191990
75%      37.583332    11.604173    18.612499     0.235812
max      37.979164    11.979173    21.809999    17.274824

Checking for





Processing daily pair for 20250417
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 3509.76it/s]

Created DataFrame with 3289 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll            date
0  37.979164   7.187506  17.125000     0.206933  Daily_20250417
1  37.979164   7.229172  16.895000     0.205385  Daily_20250417
2  37.979164   7.270839  16.529999     0.201602  Daily_20250417
3  37.979164   7.312506  17.004999     0.198594  Daily_20250417
4  37.979164   7.354172  17.100000     0.201561  Daily_20250417

Data statistics:
          latitude    longitude          sst  chlorophyll
count  3289.000000  3289.000000  3289.000000  3289.000000
mean     35.652812    10.616524    17.532324     0.607799
std       1.523826     1.313105     1.274796     1.279850
min      33.020832     7.020839    14.380000     0.036085
25%      34.270832    10.229173    16.484999     0.154481
50%      35.562500    11.104173    17.285000     0.186719
75%      37.229164    11.520839    18.285000     0.285630
max      37.979164    11.979173    21.740000    14.562012

Checking for





Processing daily pair for 20250416
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 6251.75it/s]


Created DataFrame with 0 valid data points

First few rows:
Empty DataFrame
Columns: []
Index: []

Data statistics:
Error extracting data to DataFrame: Cannot describe a DataFrame without columns

Processing daily pair for 20250415
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 4681.85it/s]

Created DataFrame with 2530 valid data points

First few rows:
    latitude  longitude        sst  chlorophyll            date
0  37.979164   7.020839  17.334999     0.210796  Daily_20250415
1  37.979164   7.062506  17.334999     0.210964  Daily_20250415
2  37.979164   7.104172  17.369999     0.203291  Daily_20250415
3  37.979164   7.145839  17.369999     0.194784  Daily_20250415
4  37.979164   7.187506  17.180000     0.188726  Daily_20250415

Data statistics:
          latitude    longitude          sst  chlorophyll
count  2530.000000  2530.000000  2530.000000  2530.000000
mean     36.886840     9.266837    18.468399     0.430439
std       1.381624     1.358763     1.198681     1.801915
min      33.395832     7.020839    15.705000     0.145752
25%      37.104164     8.104173    17.520000     0.189572
50%      37.437500     9.145839    18.250000     0.209890
75%      37.729164    10.354173    19.295000     0.256091
max      37.979164    11.979173    22.855000    57.078804

Checking for





Processing daily pair for 20250414
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 3169.94it/s]


Created DataFrame with 0 valid data points

First few rows:
Empty DataFrame
Columns: []
Index: []

Data statistics:
Error extracting data to DataFrame: Cannot describe a DataFrame without columns

Processing daily pair for 20250413
Data shapes:
SST shape: (192, 120)
Chlorophyll shape: (192, 120)
Latitude shape: (192,)
Longitude shape: (120,)
Processing data points...


Processing rows: 100%|██████████| 192/192 [00:00<00:00, 3324.15it/s]

Created DataFrame with 0 valid data points

First few rows:
Empty DataFrame
Columns: []
Index: []

Data statistics:
Error extracting data to DataFrame: Cannot describe a DataFrame without columns

--- STEP 4: ANALYZING DATA AND CALCULATING PFZ ---

Analyzing Monthly data for 20240301





Saved analysis figure to /content/drive/MyDrive/Tunisia_PFZ_Model/Results/Analysis/PFZ_Analysis_Monthly_20240301.png

No high-potential fishing zones identified.

Analyzing Monthly data for 20240101
Saved analysis figure to /content/drive/MyDrive/Tunisia_PFZ_Model/Results/Analysis/PFZ_Analysis_Monthly_20240101.png
Saved top 1 potential fishing zones to /content/drive/MyDrive/Tunisia_PFZ_Model/Results/Analysis/PFZ_Locations_Monthly_20240101.csv

Top 5 Potential Fishing Zones:
Location 1: Lat 35.1708°N, Lon 11.1208°E (PFZ Index: 0.7138)

Analyzing Monthly data for 20240501
Saved analysis figure to /content/drive/MyDrive/Tunisia_PFZ_Model/Results/Analysis/PFZ_Analysis_Monthly_20240501.png

No high-potential fishing zones identified.

Analyzing Monthly data for 20240401
Saved analysis figure to /content/drive/MyDrive/Tunisia_PFZ_Model/Results/Analysis/PFZ_Analysis_Monthly_20240401.png

No high-potential fishing zones identified.

Analyzing Monthly data for 20240601
Saved analysis figure to

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.6862 - loss: 0.8142 - val_accuracy: 0.6995 - val_loss: 0.7644
Epoch 2/50
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7041 - loss: 0.7541 - val_accuracy: 0.7021 - val_loss: 0.7552
Epoch 3/50
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7005 - loss: 0.7552 - val_accuracy: 0.7034 - val_loss: 0.7473
Epoch 4/50
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7057 - loss: 0.7450 - val_accuracy: 0.7014 - val_loss: 0.7457
Epoch 5/50
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.7019 - loss: 0.7433 - val_accuracy: 0.7012 - val_loss: 0.7413
Epoch 6/50
[1m1267/1267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7063 - loss: 0.7361 - val_accuracy: 0.7005 - val_loss: 0.7358
Epoch 7/50
[1m1267/1267[0