In [None]:
import os
import glob
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import cartopy.crs as ccrs
import cartopy.feature as cfeature

In [None]:
def load_and_sort_predictions(output_dir='/output/predictions'):
    """Load every prediction CSV under ``output_dir`` into one cleaned frame.

    The directory is searched recursively for ``*.csv`` files. All files are
    concatenated, the ``window_start`` / ``window_end`` columns are parsed as
    datetimes, ``airport_name`` is cast to ``str``, rows that repeat the same
    (``window_start``, ``window_end``, ``airport_name``) key are dropped, and
    the result is ordered by ``window_end`` with a fresh 0..n-1 index.

    A short textual summary (``info``, ``describe``, a small random sample and
    the list of unique airports) is printed before returning.

    Parameters
    ----------
    output_dir : str
        Root directory that is searched recursively for CSV files.

    Returns
    -------
    pandas.DataFrame
        The merged, de-duplicated and sorted predictions. An empty DataFrame
        is returned when no CSV file is found or when reading/merging fails
        (the error is printed, not raised).
    """
    # glob makes no ordering promise; sorting keeps the concat order (and
    # therefore which duplicate row survives) reproducible between runs.
    csv_files = sorted(glob.glob(os.path.join(output_dir, '**/*.csv'), recursive=True))

    if not csv_files:
        print(f"No CSV files found in {output_dir}")
        return pd.DataFrame()

    try:
        df = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

        # Normalise the key columns *before* de-duplicating so that rows whose
        # keys only differ in raw representation (e.g. int vs. str airport
        # name coming from different files) are recognised as duplicates.
        df['window_start'] = pd.to_datetime(df['window_start'])
        df['window_end'] = pd.to_datetime(df['window_end'])
        df['airport_name'] = df['airport_name'].astype(str)

        df = (
            df.drop_duplicates(subset=['window_start', 'window_end', 'airport_name'])
            # mergesort is stable: rows sharing a window_end keep file order.
            .sort_values(by='window_end', kind='mergesort')
            .reset_index(drop=True)
        )
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty frame
        # so downstream plotting helpers can skip gracefully.
        print(f"Error while reading or merging CSV files: {e}")
        return pd.DataFrame()

    # Diagnostics run only once the frame exists. They are printed explicitly
    # because bare expressions inside a function are not displayed.
    df.info()
    print(df.describe(include='all'))
    # Never ask for more rows than are available (sample() would raise).
    print(df.sample(n=min(5, len(df))))

    print("\nNumber of unique airports: ", df['airport_name'].nunique())
    print(sorted(df['airport_name'].unique()))

    return df

def plot_time_analysis(df):
    """Draw prediction accuracy and flight volume against window end time.

    Accuracy (``accuracy_percent``) is drawn on the left y-axis and the
    flight count (``total_flights``) on a twin right y-axis; both share the
    ``window_end`` x-axis. The figure is shown immediately.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing ``window_end``, ``accuracy_percent`` and
        ``total_flights`` columns. When it is empty a message is printed and
        nothing is drawn.

    Returns
    -------
    None
    """
    if df.empty:
        print("DataFrame is empty, cannot plot time analysis.")
        return

    accuracy_colour = 'tab:blue'
    flights_colour = 'tab:orange'
    window_times = df['window_end']

    fig, accuracy_ax = plt.subplots(figsize=(14, 6))

    # Left axis: accuracy.
    accuracy_ax.set_xlabel('Window time')
    accuracy_ax.set_ylabel('Prediction accuracy (%)', color=accuracy_colour)
    accuracy_ax.plot(
        window_times,
        df['accuracy_percent'],
        marker='.',
        color=accuracy_colour,
        label='Prediction accuracy (%)',
    )
    accuracy_ax.tick_params(axis='y', labelcolor=accuracy_colour)
    accuracy_ax.grid(True)

    # Right axis: flight volume, sharing the same x-axis.
    flights_ax = accuracy_ax.twinx()
    flights_ax.set_ylabel('Total flights in window', color=flights_colour)
    flights_ax.plot(
        window_times,
        df['total_flights'],
        marker='x',
        color=flights_colour,
        alpha=0.5,
        label='Total flights in window',
    )
    flights_ax.tick_params(axis='y', labelcolor=flights_colour)

    # Merge the entries of both axes into a single legend on the left axis.
    handles = []
    captions = []
    for axis in (accuracy_ax, flights_ax):
        axis_handles, axis_captions = axis.get_legend_handles_labels()
        handles = handles + axis_handles
        captions = captions + axis_captions
    accuracy_ax.legend(handles, captions, loc='upper left')

    plt.title('Prediction accuracy and total flights in time windows')
    plt.show()

def create_geodataframe(df):
    """Build a point GeoDataFrame holding one row per airport.

    For every ``airport_name`` only the last row (in the frame's current row
    order) is kept, and a point geometry is created from its ``LONGITUDE`` /
    ``LATITUDE`` values. The result is tagged with CRS ``EPSG:4326``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing ``airport_name``, ``LONGITUDE`` and ``LATITUDE``
        columns. Presumably already ordered by time so that "last" means the
        most recent window -- verify against the caller.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per airport with a ``geometry`` column; an empty
        GeoDataFrame (after printing a message) when ``df`` is empty.
    """
    if df.empty:
        print("DataFrame is empty, cannot create GeoDataFrame.")
        return gpd.GeoDataFrame()

    latest_per_airport = df.drop_duplicates(subset=['airport_name'], keep='last')

    # Shapely points take (x, y), i.e. (longitude, latitude).
    airport_points = []
    coordinate_pairs = zip(latest_per_airport['LONGITUDE'], latest_per_airport['LATITUDE'])
    for lon, lat in coordinate_pairs:
        airport_points.append(Point(lon, lat))

    return gpd.GeoDataFrame(latest_per_airport, geometry=airport_points, crs="EPSG:4326")



def plot_geographical_analysis(gdf):
    """Show airports on a map, coloured by accuracy and sized by traffic.

    Each row is drawn as a scatter marker at (``LONGITUDE``, ``LATITUDE``) on
    a Plate Carree map with land, ocean, coastline and border layers. Marker
    colour encodes ``accuracy_percent`` (``RdYlGn`` colormap) and marker area
    is ``total_flights / 5`` with a floor of 50. The figure is shown
    immediately.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Frame providing ``LONGITUDE``, ``LATITUDE``, ``accuracy_percent`` and
        ``total_flights`` columns. When it is empty a message is printed and
        nothing is drawn.

    Returns
    -------
    None
    """
    if gdf.empty:
        print("GeoDataFrame is empty, cannot plot geographical analysis.")
        return

    def _marker_area(flight_count):
        # Floor of 50 keeps low-traffic airports visible.
        return max(50, flight_count / 5)

    gdf = gdf.to_crs(epsg=4326)

    plt.figure(figsize=(18, 10))
    map_ax = plt.axes(projection=ccrs.PlateCarree())

    # Background layers, added in drawing order, each with its extra styling.
    background_layers = (
        (cfeature.LAND, {'facecolor': 'lightgrey'}),
        (cfeature.OCEAN, {}),
        (cfeature.COASTLINE, {}),
        (cfeature.BORDERS, {'linestyle': '-'}),
    )
    for layer, styling in background_layers:
        map_ax.add_feature(layer.with_scale('50m'), **styling)

    marker_areas = gdf['total_flights'].apply(_marker_area)

    airport_markers = map_ax.scatter(
        gdf['LONGITUDE'],
        gdf['LATITUDE'],
        c=gdf['accuracy_percent'],
        cmap='RdYlGn',
        s=marker_areas,
        alpha=0.8,
        transform=ccrs.PlateCarree(),
    )

    accuracy_bar = plt.colorbar(airport_markers, orientation="horizontal", pad=0.05)
    accuracy_bar.set_label("Prediction accuracy (%)")

    map_ax.set_title("Geographical Analysis of ML Prediction Accuracy")

    plt.show()

In [None]:
# Driver cell: load the prediction CSVs, then produce the two figures.
# The loader returns an empty DataFrame when no CSV files are found.
df = load_and_sort_predictions('/output/predictions')
# Accuracy and flight volume over time (prints a message and skips if df is empty).
plot_time_analysis(df)
# Reduce to one row per airport with point geometry, then draw the map.
gdf = create_geodataframe(df)
plot_geographical_analysis(gdf)