# IMPORT LIBRARIES

In [178]:
import json 
from pathlib import Path

import pandas as pd
import numpy as np
import networkx as nx

# DATA LOADING

In [179]:
# Read the five cleaned tables in one pass; the names on the left line up
# one-to-one with the parquet paths listed below.
links, vessels, locations, deliveries, fish = (
    pd.read_parquet(path)
    for path in (
        "data_clean/links.parquet",
        "data_clean/vessels.parquet",
        "data_clean/locations.parquet",
        "data_clean/deliveries.parquet",
        "data_clean/fish.parquet",
    )
)

# INSPECTION

In [180]:
# Show the column labels of the vessel table.
vessels.columns

Index(['vessel_id', 'company', 'flag_country', 'length_overall', 'tonnage'], dtype='object')

In [181]:
# Show the column labels of the fish table.
fish.columns

Index(['id', 'entity_name'], dtype='object')

In [182]:
# Show the column labels of the deliveries table.
deliveries.columns

Index(['delivery_cargo_id', 'date', 'qty_tons'], dtype='object')

In [183]:
# Show the column labels of the locations table.
locations.columns

Index(['location_id', 'Activities', 'fish_species_present', 'kind'], dtype='object')

In [184]:
# Show the column labels of the links (edge) table.
links.columns

Index(['type', 'time', 'dwell', 'source', 'target', 'key', 'main_type',
       'week'],
      dtype='object')

In [185]:
# Pin every table to an explicit column list so later cells can rely on the
# schema. For the link table this leaves out "type" and "key".
links = links[
    ["source", "target", "main_type", "time", "dwell", "week"]
]
vessels = vessels[
    ["vessel_id", "company", "flag_country", "length_overall", "tonnage"]
]
locations = locations[
    ["location_id", "Activities", "fish_species_present", "kind"]
]
fish = fish[["id", "entity_name"]]
deliveries = deliveries[["delivery_cargo_id", "date", "qty_tons"]]

In [186]:
# Split the link table by event family; every subset is an independent copy
# so later edits never touch `links`.
ping, harbor, trans = (
    links[links["main_type"] == family].copy()
    for family in ("TransportEvent", "HarborReport", "Transaction")
)

# Merge

In [187]:
# Display the vessel table.
vessels

Unnamed: 0,vessel_id,company,flag_country,length_overall,tonnage
0,wavewranglerc2d,"Roth, Logan and Moreno",Oceanus,110.0,700.0
1,yellowfintunataker08b,Brown-Haas,Oceanus,110.0,4500.0
2,arcticgraylingangler094,"Smith, Davis and Acosta",Oceanus,80.0,1500.0
3,anchovyassaulterb1c,Cisneros-Meyer,Oceanus,60.0,400.0
4,oceanreaper44a,FlounderLeska Marine BV,Oceanus,50.0,300.0
...,...,...,...,...,...
273,seawayspectrumca2,Unknown,Kethanor,100.0,2600.0
274,oceanicomnipotenta4a,Unknown,Playa Solis,210.0,76300.0
275,seawaysage5ce,Unknown,Coralada,100.0,2100.0
276,nauticalnucleus107c4,Unknown,Zawalinda,100.0,2100.0


In [188]:
# Display the locations table.
locations

Unnamed: 0,location_id,Activities,fish_species_present,kind
0,City of Haacklee,"Tourism, Local shipping",Unknown,city
1,City of Lomark,"Deep sea fishing, Commercial fishing, Tourism,...",Unknown,city
2,City of Himark,"Recreation, tourism",Unknown,city
3,City of Paackland,"Industry, Fishing industry, Local shipping, To...",Unknown,city
4,City of South Paackland,"Industry, Fishing industry, Local shipping",Unknown,city
5,City of Port Grove,"Tourism, Research",Unknown,city
6,Exit West,"International shipping, Deep sea fishing",Unknown,buoy
7,Nav 3,Navigation,Unknown,buoy
8,Nav D,Navigation,Unknown,buoy
9,Nav B,Navigation,Unknown,buoy


In [189]:
# Display the HarborReport subset of the link table.
harbor

Unnamed: 0,source,target,main_type,time,dwell,week
32062,wavewranglerc2d,City of Haacklee,HarborReport,2035-09-14,0.0,37
32063,wavewranglerc2d,City of Himark,HarborReport,2035-08-16,0.0,33
32064,wavewranglerc2d,City of Lomark,HarborReport,2035-04-07,0.0,14
32065,wavewranglerc2d,City of South Paackland,HarborReport,2035-07-05,0.0,27
32066,yellowfintunataker08b,City of Haacklee,HarborReport,2035-09-03,0.0,36
...,...,...,...,...,...,...
34933,nauticalnucleus107c4,City of Haacklee,HarborReport,2035-11-01,0.0,44
34934,maritimemagnitude2e9,City of Himark,HarborReport,2035-02-28,0.0,9
34935,maritimemagnitude2e9,City of South Paackland,HarborReport,2035-08-05,0.0,31
34936,maritimemagnitude2e9,City of Lomark,HarborReport,2035-04-07,0.0,14


In [190]:
# Count the distinct vessel ids that also occur as a harbor-report source.
len(set(vessels["vessel_id"]).intersection(harbor["source"]))

278

In [191]:
# Collect harbor-report sources that have no row in the vessel table.
# The lookup set is built once; rebuilding it inside the loop repeated the
# same work for every unique source.
known_vessel_ids = set(vessels["vessel_id"])
mistakes = [
    source
    for source in harbor["source"].unique()
    if source not in known_vessel_ids
]


In [192]:
# Show the harbor-report sources that are missing from the vessel table.
mistakes

['webigailba7',
 'venerable89c',
 'sturdyd7f',
 'stout369',
 'inquisitive8c0',
 'dewie961',
 'himarkroyal032',
 'mrray9c4',
 'manatee17ea',
 'hewey2ef',
 'honorablea6e',
 'clownfishe3d',
 'louietheii3e0',
 'respectable717',
 'spiritofoceanus404',
 'louieda4',
 'athenad34',
 'heartofoceanusf11']

In [193]:
# Summarise dtypes and non-null counts of the vessel table.
vessels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   vessel_id       278 non-null    object 
 1   company         278 non-null    object 
 2   flag_country    278 non-null    object 
 3   length_overall  278 non-null    float64
 4   tonnage         278 non-null    float64
dtypes: float64(2), object(3)
memory usage: 11.0+ KB


In [194]:
# Summarise dtypes and non-null counts of the harbor-report table.
harbor.info()

<class 'pandas.core.frame.DataFrame'>
Index: 894 entries, 32062 to 34937
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   source     894 non-null    object        
 1   target     894 non-null    object        
 2   main_type  894 non-null    object        
 3   time       894 non-null    datetime64[ns]
 4   dwell      894 non-null    float64       
 5   week       894 non-null    UInt32        
dtypes: UInt32(1), datetime64[ns](1), float64(1), object(3)
memory usage: 46.3+ KB


In [195]:
# Spot-check one unmatched harbor source: look it up in the vessel table.
vessels.loc[vessels["vessel_id"] == "venerable89c"]

Unnamed: 0,vessel_id,company,flag_country,length_overall,tonnage


In [196]:
# In harbor reports the vessel is the source and the port is the target.
# Attach vessel attributes first, then port attributes. Left joins keep
# reports without a match; their joined columns are filled with NaN.
harbor = (
    harbor
    .merge(vessels, left_on="source", right_on="vessel_id", how="left")
    .merge(locations, left_on="target", right_on="location_id", how="left")
)



In [197]:
# Discard every harbor report that has a missing value in any column.
harbor = harbor.dropna(axis=0, how="any")

In [198]:
# Keep the report endpoints and timing, then the vessel attributes, then the
# port attributes; all other columns are left out.
harbor = harbor[
    [
        # report
        "source", "target", "time", "week",
        # vessel
        "company", "flag_country", "length_overall", "tonnage",
        # port
        "Activities", "fish_species_present", "kind",
    ]
]



In [199]:
# Transactions whose target is a known location (cargo -> harbor).
trans_harbor = trans.loc[trans["target"].isin(locations["location_id"])].copy()

# Transactions whose target is a known fish entity (cargo -> fish).
trans_fish = trans.loc[trans["target"].isin(fish["id"])].copy()

print("Cargo→Harbor:", len(trans_harbor))
print("Cargo→Fish:", len(trans_fish))


Cargo→Harbor: 5307
Cargo→Fish: 5307


In [200]:
# Pair each cargo's harbor row with its fish row. "source" holds the cargo id
# (cargo_2035_...); the inner join keeps only cargos present on both sides.
# Note: this rebinds `trans` to the combined table.
trans = trans_harbor.merge(
    trans_fish,
    how="inner",
    on="source",
    suffixes=("_harbor", "_fish"),
)

print("Record combinati (cargo→harbor→fish):", len(trans))

Record combinati (cargo→harbor→fish): 5307


In [201]:
# Preview the first rows of the combined cargo/harbor/fish table.
trans.head()

Unnamed: 0,source,target_harbor,main_type_harbor,time_harbor,dwell_harbor,week_harbor,target_fish,main_type_fish,time_fish,dwell_fish,week_fish
0,cargo_2035_2394778c,City of South Paackland,Transaction,2035-11-03,0.0,44,gadusnspecificatae4ba,Transaction,2035-11-03,0.0,44
1,cargo_2035_23956ba0,City of South Paackland,Transaction,2035-08-16,0.0,33,gadusnspecificatae4ba,Transaction,2035-08-16,0.0,33
2,cargo_2035_23957cfd,City of South Paackland,Transaction,2035-08-20,0.0,34,gadusnspecificatae4ba,Transaction,2035-08-20,0.0,34
3,cargo_2035_23958501,City of Paackland,Transaction,2035-11-07,0.0,45,gadusnspecificatae4ba,Transaction,2035-11-07,0.0,45
4,cargo_2035_23959ab6,City of South Paackland,Transaction,2035-08-24,0.0,34,gadusnspecificatae4ba,Transaction,2035-08-24,0.0,34


In [202]:
# Harbor details: rename the location columns so they are recognisable as
# harbor attributes once joined onto the transactions.
locations_ = locations.rename(columns={
    "location_id": "harbor_id",
    "Activities": "Activities_harbor",
    "fish_species_present": "fish_species_harbor",
    "kind": "kind_harbor"
})

# Fish details. DataFrame.rename skips labels that are not present, so one
# mapping works whether or not the table has a "name" column.
fish_ = fish.rename(columns={"id": "fish_id", "name": "fish_name"})

# Attach harbor and fish details; left joins keep every transaction row.
trans = (
    trans.merge(locations_, left_on="target_harbor", right_on="harbor_id", how="left")
         .merge(fish_, left_on="target_fish", right_on="fish_id", how="left")
)

In [203]:
# Display the transactions after harbor and fish details were joined in.
trans

Unnamed: 0,source,target_harbor,main_type_harbor,time_harbor,dwell_harbor,week_harbor,target_fish,main_type_fish,time_fish,dwell_fish,week_fish,harbor_id,Activities_harbor,fish_species_harbor,kind_harbor,fish_id,entity_name
0,cargo_2035_2394778c,City of South Paackland,Transaction,2035-11-03,0.0,44,gadusnspecificatae4ba,Transaction,2035-11-03,0.0,44,City of South Paackland,"Industry, Fishing industry, Local shipping",Unknown,city,gadusnspecificatae4ba,Cod/Gadus n.specificatae
1,cargo_2035_23956ba0,City of South Paackland,Transaction,2035-08-16,0.0,33,gadusnspecificatae4ba,Transaction,2035-08-16,0.0,33,City of South Paackland,"Industry, Fishing industry, Local shipping",Unknown,city,gadusnspecificatae4ba,Cod/Gadus n.specificatae
2,cargo_2035_23957cfd,City of South Paackland,Transaction,2035-08-20,0.0,34,gadusnspecificatae4ba,Transaction,2035-08-20,0.0,34,City of South Paackland,"Industry, Fishing industry, Local shipping",Unknown,city,gadusnspecificatae4ba,Cod/Gadus n.specificatae
3,cargo_2035_23958501,City of Paackland,Transaction,2035-11-07,0.0,45,gadusnspecificatae4ba,Transaction,2035-11-07,0.0,45,City of Paackland,"Industry, Fishing industry, Local shipping, To...",Unknown,city,gadusnspecificatae4ba,Cod/Gadus n.specificatae
4,cargo_2035_23959ab6,City of South Paackland,Transaction,2035-08-24,0.0,34,gadusnspecificatae4ba,Transaction,2035-08-24,0.0,34,City of South Paackland,"Industry, Fishing industry, Local shipping",Unknown,city,gadusnspecificatae4ba,Cod/Gadus n.specificatae
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5302,cargo_2035_3119010f,City of Paackland,Transaction,2035-08-24,0.0,34,habeaspisces4eb,Transaction,2035-08-24,0.0,34,City of Paackland,"Industry, Fishing industry, Local shipping, To...",Unknown,city,habeaspisces4eb,Beauvoir/Habeas pisces
5303,cargo_2035_3119118e,City of Paackland,Transaction,2035-08-25,0.0,34,habeaspisces4eb,Transaction,2035-08-25,0.0,34,City of Paackland,"Industry, Fishing industry, Local shipping, To...",Unknown,city,habeaspisces4eb,Beauvoir/Habeas pisces
5304,cargo_2035_3119237c,City of Paackland,Transaction,2035-08-28,0.0,35,habeaspisces4eb,Transaction,2035-08-28,0.0,35,City of Paackland,"Industry, Fishing industry, Local shipping, To...",Unknown,city,habeaspisces4eb,Beauvoir/Habeas pisces
5305,cargo_2035_311936c4,City of Paackland,Transaction,2035-08-30,0.0,35,habeaspisces4eb,Transaction,2035-08-30,0.0,35,City of Paackland,"Industry, Fishing industry, Local shipping, To...",Unknown,city,habeaspisces4eb,Beauvoir/Habeas pisces


In [205]:
# Reduce the combined table to the cargo id, its harbor and species, the
# harbor attributes with the harbor-side timestamp, and the fish key.
trans = trans[
    [
        "source",         # cargo id
        "target_harbor",  # harbor
        "target_fish",    # species
        "Activities_harbor",
        "fish_species_harbor",
        "kind_harbor",
        "time_harbor",
        "fish_id",
    ]
]

In [210]:
# Display the deliveries table.
deliveries

Unnamed: 0,delivery_cargo_id,date,qty_tons
0,cargo_2035_2394778c,2035-11-03,24.375
1,cargo_2035_23956ba0,2035-08-16,18.125
2,cargo_2035_23957cfd,2035-08-20,20.625
3,cargo_2035_23958501,2035-11-07,13.125
4,cargo_2035_23959ab6,2035-08-24,13.125
...,...,...,...
4987,cargo_2035_31187527,2035-08-21,7.500
4988,cargo_2035_3119010f,2035-08-24,10.000
4989,cargo_2035_3119118e,2035-08-25,15.000
4990,cargo_2035_311936c4,2035-08-30,19.000


In [211]:
# Attach delivery date and quantity by cargo id. The left join keeps
# transactions without a delivery record (their delivery columns are NaN).
trans = trans.merge(
    deliveries,
    how="left",
    left_on="source",
    right_on="delivery_cargo_id",
)

In [212]:
# Final transaction schema: cargo id, harbor and its attributes, fish key,
# then the delivery date and quantity.
cols = [
    "source", "target_harbor",
    "Activities_harbor", "fish_species_harbor", "kind_harbor",
    "fish_id",
    "date", "qty_tons",
]
trans = trans[cols]


In [216]:
# Drop transactions with any missing value. Rebind instead of mutating in
# place: `trans` was produced by a column selection, and an in-place dropna
# on such a frame can raise SettingWithCopyWarning.
trans = trans.dropna(axis=0)

In [221]:
# Ping links run from a location (source) to a vessel (target).
# Drop pings without a timestamp, renumber the rows, then attach location
# and vessel attributes. Left joins keep pings without a match.
# The former rename of an "id" column on vessels/locations is omitted: both
# tables were already restricted to columns that contain no "id".
ping = (
    ping.dropna(subset=["time"])
    .reset_index(drop=True)
    .merge(locations, left_on="source", right_on="location_id", how="left")
    .merge(vessels, left_on="target", right_on="vessel_id", how="left")
)

# Keep ping endpoints and timing, vessel attributes, then location attributes.
ping = ping[[
    "source", "target", "time", "dwell", "week",
    "company", "flag_country", "length_overall", "tonnage",
    "Activities", "fish_species_present", "kind"
]]

In [223]:
# Drop pings with any missing value. Rebind instead of mutating in place:
# `ping` was produced by a column selection, and an in-place dropna on such
# a frame can raise SettingWithCopyWarning.
ping = ping.dropna()

In [224]:
# DataFrame.info() prints its report itself and returns None, so call it
# directly; wrapping it in print() added a stray "None" line after each frame.
for frame in (ping, trans, harbor):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 218477 entries, 0 to 240709
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   source                218477 non-null  object        
 1   target                218477 non-null  object        
 2   time                  218477 non-null  datetime64[ns]
 3   dwell                 218477 non-null  float64       
 4   week                  218477 non-null  UInt32        
 5   company               218477 non-null  object        
 6   flag_country          218477 non-null  object        
 7   length_overall        218477 non-null  float64       
 8   tonnage               218477 non-null  float64       
 9   Activities            218477 non-null  object        
 10  fish_species_present  218477 non-null  object        
 11  kind                  218477 non-null  object        
dtypes: UInt32(1), datetime64[ns](1), float64(3), object(7)
memory u

In [225]:
# Write the four reference tables back to the same paths they were loaded
# from, without the index and using the fastparquet engine.
# NOTE(review): this overwrites the notebook's own inputs; presumably
# intentional, but confirm.
reference_outputs = [
    (fish, "data_clean/fish.parquet"),
    (vessels, "data_clean/vessels.parquet"),
    (locations, "data_clean/locations.parquet"),
    (deliveries, "data_clean/deliveries.parquet"),
]
for frame, path in reference_outputs:
    frame.to_parquet(path, index=False, engine="fastparquet")

In [226]:
# Persist the three enriched event tables, without the index and using the
# fastparquet engine.
event_outputs = [
    (ping, "data_clean/pings.parquet"),
    (harbor, "data_clean/harbors.parquet"),
    (trans, "data_clean/transactions.parquet"),
]
for frame, path in event_outputs:
    frame.to_parquet(path, index=False, engine="fastparquet")