In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster, HeatMap
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.express as px
import plotly.graph_objects as go
from datetime import timedelta

# Set plotting styles
sns.set(style="whitegrid")
%matplotlib inline

In [2]:
# Read the cleaned AIS extract; the path has no directory part, so it is resolved
# against the notebook's working directory
parquet_file = "ais_data_20240911_cleaned.parquet"
df = pd.read_parquet(parquet_file)

# Preview the leading records as a rich table
print("First 5 rows of the cleaned dataset:")
display(df.head(5))

# Schema summary: dtypes, non-null counts and memory footprint
print("\nDataFrame Info:")
df.info()

# Dimensions as (rows, columns)
print(f"\nShape of the DataFrame: {df.shape}")

First 5 rows of the cleaned dataset:


Unnamed: 0,mmsi,vessel_name,time,lat,lon,heading,rot,sog,cog,nas,mi,imo,call_sign
0,538011058,AGIA CHARIS,2024-09-11 00:00:01,-24.12966,-46.271113,98.0,0.0,0.0,170.9,0,0,,V7A6515
1,710003185,SVITZER JOAQUIM R.,2024-09-11 00:00:01,-23.874703,-46.373788,,-128.0,0.0,196.9,8,0,,PU3490
2,538010074,OINOUSSIAN COURAGE,2024-09-11 00:00:01,-23.988802,-46.292782,37.0,0.0,5.6,45.7,0,0,,V7A5555
3,636018551,TRANSMERIDIAN,2024-09-11 00:00:01,-23.953395,-46.30716,333.0,5.0,7.5,330.0,0,0,,D5QF5
4,710028580,WS SCORPIP,2024-09-11 00:00:01,-23.9682,-46.29047,,-128.0,0.0,173.1,0,0,,PO2303



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300707 entries, 0 to 300706
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   mmsi         300707 non-null  int64         
 1   vessel_name  300707 non-null  string        
 2   time         300707 non-null  datetime64[ns]
 3   lat          300629 non-null  float64       
 4   lon          300633 non-null  float64       
 5   heading      128060 non-null  float64       
 6   rot          283138 non-null  float64       
 7   sog          296026 non-null  float64       
 8   cog          296079 non-null  float64       
 9   nas          300707 non-null  string        
 10  mi           283137 non-null  Int64         
 11  imo          0 non-null       string        
 12  call_sign    300707 non-null  string        
dtypes: Int64(1), datetime64[ns](1), float64(6), int64(1), string(4)
memory usage: 30.1 MB

Shape of the DataFrame: (300707, 13)

In [3]:
# List every column with its dtype, read from the frame's dtype table
print("Column names and data types:")
for name, dtype in df.dtypes.items():
    print(f"{name}: {dtype}")

# Compare the frame's columns against the expected schema
expected_columns = [
    "mmsi", "vessel_name", "time", "lat", "lon", "heading", "rot",
    "sog", "cog", "nas", "mi", "imo", "call_sign",
]
missing_columns = [col for col in expected_columns if col not in df.columns]
if not missing_columns:
    print("\nAll expected columns are present.")
else:
    print(f"\nMissing columns: {missing_columns}")

# Per-column count of null entries
print("\nMissing values in each column:")
display(df.isna().sum())

Column names and data types:
mmsi: int64
vessel_name: string
time: datetime64[ns]
lat: float64
lon: float64
heading: float64
rot: float64
sog: float64
cog: float64
nas: string
mi: Int64
imo: string
call_sign: string

All expected columns are present.

Missing values in each column:


mmsi                0
vessel_name         0
time                0
lat                78
lon                74
heading        172647
rot             17569
sog              4681
cog              4628
nas                 0
mi              17570
imo            300707
call_sign           0
dtype: int64

In [15]:
# Function to update heatmap
def update_heatmap(time_window: int) -> None:
    """Redraw the position heatmap for the trailing ``time_window`` minutes.

    The window is anchored to the most recent timestamp in the global ``df``
    (not the wall clock), so ``time_window=5`` means "the last five minutes of
    data in the file". Rows with a missing ``lat`` or ``lon`` are skipped.
    Everything is rendered into the global ``output_heatmap`` widget, whose
    previous content is cleared first.

    Args:
        time_window: Length of the trailing window, in minutes.

    Returns:
        None. Either a folium map is displayed, or a short message is printed
        when the window holds no usable positions.
    """
    with output_heatmap:
        clear_output(wait=True)

        latest_time = df["time"].max()
        window_start = latest_time - timedelta(minutes=time_window)

        # Select only the two columns that are plotted. dropna() already returns
        # a new frame, so no explicit copy of the window slice is needed.
        positions = df.loc[df["time"] >= window_start, ["lat", "lon"]].dropna()

        if positions.empty:
            print(f"No data in the last {time_window} minutes.")
            return

        # Centre the map on the mean position of the plotted points
        center = [positions["lat"].mean(), positions["lon"].mean()]
        heatmap_map = folium.Map(location=center, zoom_start=10)
        HeatMap(data=positions.to_numpy(), radius=15).add_to(heatmap_map)
        display(heatmap_map)

# Create slider widget (trailing window length in minutes).
# - description_width="initial" keeps the long label from being truncated.
# - continuous_update=False fires the callback only when the handle is released,
#   so the map is not rebuilt for every intermediate value while dragging.
time_slider = widgets.IntSlider(
    value=5,
    min=1,
    max=60,
    step=1,
    description="Time Window (min):",
    style={"description_width": "initial"},
    continuous_update=False,
)
# Output area that update_heatmap renders into
output_heatmap = widgets.Output()


def _on_time_window_change(change):
    """Redraw the heatmap with the slider's new value."""
    update_heatmap(change["new"])


# Link slider to update function by observing its value directly. This replaces a
# widgets.interactive(...) call whose returned container was discarded and never shown.
time_slider.observe(_on_time_window_change, names="value")

# Display widgets
display(time_slider)
display(output_heatmap)

# Initial update, read from the slider so the first render always matches the control
update_heatmap(time_slider.value)

IntSlider(value=5, description='Time Window (min):', max=60, min=1)

Output()

In [10]:
# Check for invalid ranges after cleaning.
# Each frame holds the rows whose value lies outside the accepted bounds. NaN compares
# False against both bounds, so missing readings are not counted as invalid here.
invalid_lat = df[(df["lat"] < -90) | (df["lat"] > 90)]
invalid_lon = df[(df["lon"] < -180) | (df["lon"] > 180)]
# NOTE(review): 50 is the plausibility ceiling used for speed over ground — presumably knots; confirm
invalid_sog = df[(df["sog"] < 0) | (df["sog"] > 50)]
# NOTE(review): a course of exactly 360 passes this check; AIS feeds commonly use 360.0
# as "not available" — confirm whether the upper bound should be exclusive
invalid_cog = df[(df["cog"] < 0) | (df["cog"] > 360)]

print("Invalid latitude count:", len(invalid_lat))
print("Invalid longitude count:", len(invalid_lon))
print("Invalid speed count:", len(invalid_sog))
print("Invalid course count:", len(invalid_cog))

# Check MMSI consistency: every MMSI should map to exactly one vessel name
mmsi_vessel_counts = df.groupby("mmsi")["vessel_name"].nunique()
inconsistent_mmsi = mmsi_vessel_counts[mmsi_vessel_counts > 1]
if inconsistent_mmsi.empty:
    print("\nNo MMSI with multiple vessel names.")
else:
    # Anything listed here is still inconsistent in the cleaned data, so the message
    # must not claim it was resolved.
    print("\nMMSI with multiple vessel names (still present after cleaning):")
    display(inconsistent_mmsi)

Invalid latitude count: 0
Invalid longitude count: 0
Invalid speed count: 0
Invalid course count: 0

No MMSI with multiple vessel names.
