# IMPORT LIBRARIES

In [None]:
import json
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd

# Cap how much of a DataFrame is rendered in cell output so wide or long
# frames stay readable: at most 50 columns and 20 rows.
for _option_name, _display_limit in (
    ("display.max_columns", 50),
    ("display.max_rows", 20),
):
    pd.set_option(_option_name, _display_limit)


: 

## Exploratory Data Analysis

This notebook explores the MC2 graph dataset from the VAST Challenge 2024. The following sections document how the data is loaded, which entities and relationships it contains, what temporal patterns appear in the events, and how the overall network is structured.

### Load Graph Data

In [None]:
# Location of the MC2 graph JSON file, relative to the notebook's working directory.
DATA_PATH = Path('../Data/MC2/mc2.json')

# JSON text is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's locale default (e.g. cp1252 on Windows).
with DATA_PATH.open(encoding='utf-8') as f:
    raw_graph = json.load(f)

# The parsed document is indexed by 'nodes' and 'links'; each becomes one frame.
nodes_df = pd.DataFrame(raw_graph['nodes'])
links_df = pd.DataFrame(raw_graph['links'])

# Quick sanity check of the frame shapes; only the first 10 column names are shown.
print(f"Nodes: {nodes_df.shape[0]:,} rows | {nodes_df.shape[1]} columns")
print(f"Links: {links_df.shape[0]:,} rows | {links_df.shape[1]} columns")
print(f"Sample node columns: {nodes_df.columns.tolist()[:10]}")
print(f"Sample link columns: {links_df.columns.tolist()[:10]}")

### Node-Level Overview

In [None]:
# Frequency of each node type, most common first, as a two-column frame
# (node_type, count).
node_type_frequency = nodes_df['type'].value_counts()
node_type_counts = node_type_frequency.rename_axis('node_type').reset_index(name='count')
node_type_counts

In [None]:
# Build a single `display_name` column: take `name` and fall back to `Name`
# where `name` is missing. Either column may be absent from the frame, and
# `DataFrame.get` returns None for an absent column, so a bare
# `.get(...).fillna(.get(...))` chain would raise; only the columns that
# actually exist are combined here.
name_sources = [nodes_df[column] for column in ('name', 'Name') if column in nodes_df.columns]
if name_sources:
    display_name = name_sources[0]
    for fallback in name_sources[1:]:
        display_name = display_name.fillna(fallback)
else:
    # Neither column exists: the new column is entirely missing.
    display_name = None
nodes_df['display_name'] = display_name

# Percentage of missing values per column (display_name included), worst first;
# show up to 15 columns that have at least one missing value.
node_missing_pct = nodes_df.isna().mean().sort_values(ascending=False) * 100
node_missing_pct[node_missing_pct > 0].head(15)

In [None]:
# Isolate delivery-report nodes and coerce their quantity/date fields to proper
# dtypes; values that cannot be parsed become NaN / NaT.
is_delivery_report = nodes_df['type'] == 'Entity.Document.DeliveryReport'
delivery_reports = nodes_df.loc[is_delivery_report].assign(
    qty_tons=lambda frame: pd.to_numeric(frame['qty_tons'], errors='coerce'),
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'),
)

# Distribution of reported quantities, with extra tail percentiles.
delivery_reports[['qty_tons']].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

In [None]:
# Select every node whose type starts with 'Entity.Vessel'. `na=False` treats
# nodes with a missing type as non-vessels; without it the mask would contain
# NaN and boolean indexing with `.loc` would raise.
vessel_mask = nodes_df['type'].str.startswith('Entity.Vessel', na=False)
vessel_df = nodes_df.loc[vessel_mask].copy()

# Summarise only the columns that are actually present, so the summary uses
# the same presence guard as the numeric conversion instead of raising
# KeyError on a missing column.
vessel_summary = {}
if 'flag_country' in vessel_df:
    # Ten most common flag countries among vessels.
    vessel_summary['flag_country_top'] = vessel_df['flag_country'].value_counts().head(10)
for column in ['length_overall', 'tonnage']:
    if column in vessel_df:
        # Non-numeric entries become NaN and are ignored by describe().
        vessel_df[column] = pd.to_numeric(vessel_df[column], errors='coerce')
        vessel_summary[f'{column}_stats'] = vessel_df[column].describe()
vessel_summary

### Link / Event Overview

In [None]:
# Parse both timestamp columns in place; values that cannot be parsed become NaT.
# NOTE(review): with errors='coerce', any value that does not match the format
# pandas settles on is silently turned into NaT - confirm the NaT share of
# `time` / `date` is what you expect for this dataset.
for stamp_column in ('time', 'date'):
    links_df[stamp_column] = pd.to_datetime(links_df[stamp_column], errors='coerce')

# Frequency of each link/event type, most common first, as (event_type, count).
event_type_frequency = links_df['type'].value_counts()
link_type_counts = event_type_frequency.rename_axis('event_type').reset_index(name='count')
link_type_counts

In [None]:
# Per event type: number of rows plus the earliest/latest value seen in each of
# the two timestamp columns (`time` and `date`).
coverage_spec = {
    'n_events': ('type', 'size'),
    'earliest_time': ('time', 'min'),
    'latest_time': ('time', 'max'),
    'earliest_date': ('date', 'min'),
    'latest_date': ('date', 'max'),
}
time_coverage = links_df.groupby('type').agg(**coverage_spec).reset_index()
time_coverage

In [None]:
# Count how often each node id appears as a link source and as a link target.
outgoing_counts = links_df.groupby('source').size().rename('outgoing')
incoming_counts = links_df.groupby('target').size().rename('incoming')

# The outer join keeps ids that only ever appear on one side of a link; the
# resulting gaps are counted as zero.
node_activity = outgoing_counts.to_frame().join(incoming_counts, how='outer').fillna(0)
node_activity['total'] = node_activity['incoming'] + node_activity['outgoing']

# Ten most active ids by combined incoming + outgoing link count.
interaction_counts = node_activity.sort_values('total', ascending=False).head(10)
interaction_counts

In [None]:
# Ten most frequently repeated (source, target, type) combinations.
relationship_keys = ['source', 'target', 'type']
relationship_frequency = links_df.value_counts(subset=relationship_keys).rename('count')
top_relationships = (
    relationship_frequency
    .reset_index()
    .sort_values('count', ascending=False)
    .head(10)
)
top_relationships

In [None]:
# Keep only links that carry a `time` value and bucket them by calendar month.
# The `.dt` accessor requires `time` to already be a datetime column.
has_timestamp = links_df['time'].notna()
ping_events = links_df[has_timestamp].copy()
ping_events['month'] = ping_events['time'].dt.to_period('M')

# Long-format counts per (type, month), then reshaped to one row per month and
# one column per event type; combinations that never occur are filled with 0.
events_per_type_month = ping_events.groupby(['type', 'month']).size().reset_index(name='count')
monthly_event_counts = (
    events_per_type_month
    .pivot(index='month', columns='type', values='count')
    .fillna(0)
    .astype(int)
)
monthly_event_counts

In [None]:
# Dwell values of transponder-ping links as numbers; non-numeric entries
# become NaN and are excluded from the summary statistics.
is_transponder_ping = links_df['type'] == 'Event.TransportEvent.TransponderPing'
ping_dwell = pd.to_numeric(links_df.loc[is_transponder_ping, 'dwell'], errors='coerce')
ping_dwell.describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95])

### Network Connectivity

In [None]:
# Turn every link row into one directed edge. A MultiDiGraph keeps repeated
# source/target pairs as parallel edges instead of collapsing them into one.
graph = nx.from_pandas_edgelist(
    links_df,
    'source',
    'target',
    create_using=nx.MultiDiGraph(),
)

# Basic size and connectivity figures for the resulting graph.
num_nodes, num_edges = graph.number_of_nodes(), graph.number_of_edges()
weak_components = nx.number_weakly_connected_components(graph)
strong_components = nx.number_strongly_connected_components(graph)

print(f'Graph has {num_nodes:,} nodes and {num_edges:,} edges')
print(f'Weakly connected components: {weak_components}')
print(f'Strongly connected components: {strong_components}')

In [None]:
# Degree of every node in `graph`, keyed by node id, followed by its
# distribution with median, 90th and 99th percentiles.
node_degrees = {node: degree for node, degree in graph.degree()}
degree_series = pd.Series(node_degrees)
degree_summary = degree_series.describe(percentiles=[0.5, 0.9, 0.99])
degree_summary

In [None]:
# The 15 nodes with the highest degree, as a one-column frame named 'degree'.
highest_degree_first = degree_series.sort_values(ascending=False)
top_degree = highest_degree_first.head(15).to_frame('degree')
top_degree

### Follow-Up Ideas

- Investigate anomalies in delivery reports by cross-referencing unusually high or low `qty_tons` with vessel activity.
- Map vessels with the highest degree centrality to their flag countries and owners to surface potentially suspicious behavior.
- Visualize temporal spikes in transponder pings vs. transactions to detect coordinated campaigns.
- Enrich the graph with geospatial coordinates for cities/regions to support mapping and route analysis.