From d6426f5640176063b39bba4232b68c4a4c3d7eba Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Tue, 18 Nov 2025 15:39:32 -0600 Subject: [PATCH 01/25] fix: track transient connections separately --- .../src/node/network_bridge/p2p_protoc.rs | 210 +++------ crates/core/src/ring/connection_manager.rs | 438 +++++++++++------- crates/core/src/ring/mod.rs | 80 ++-- 3 files changed, 377 insertions(+), 351 deletions(-) diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index 15d1b1534..55ccc6160 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -665,13 +665,13 @@ impl P2pConnManager { peer, tx, callback, - is_gw: transient, + is_gw: courtesy, } => { tracing::info!( tx = %tx, remote = %peer, remote_addr = %peer.addr, - transient, + courtesy, "NodeEvent::ConnectPeer received" ); ctx.handle_connect_peer( @@ -680,7 +680,7 @@ impl P2pConnManager { tx, &handshake_cmd_sender, &mut state, - transient, + courtesy, ) .await?; } @@ -691,7 +691,7 @@ impl P2pConnManager { .send(HandshakeCommand::ExpectInbound { peer: peer.clone(), transaction: None, - transient: false, + courtesy: false, }) .await { @@ -976,6 +976,13 @@ impl P2pConnManager { } => { tracing::debug!(%tx, %key, "local subscribe complete"); + // If this is a child operation, complete it and let the parent flow handle result delivery. + if op_manager.is_sub_operation(tx) { + tracing::info!(%tx, %key, "completing child subscribe operation"); + op_manager.completed(tx); + continue; + } + if !op_manager.is_sub_operation(tx) { let response = Ok(HostResponse::ContractResponse( ContractResponse::SubscribeResponse { key, subscribed }, @@ -1208,7 +1215,7 @@ impl P2pConnManager { tx: Transaction, handshake_commands: &HandshakeCommandSender, state: &mut EventListenerState, - transient: bool, + courtesy: bool, ) -> anyhow::Result<()> { let mut peer = peer; let mut peer_addr = peer.addr; @@ -1221,13 +1228,13 @@ impl P2pConnManager { tx = %tx, remote = %peer, fallback_addr = %peer_addr, - transient, + courtesy, "ConnectPeer provided unspecified address; using existing connection address" ); } else { tracing::debug!( tx = %tx, - transient, + courtesy, "ConnectPeer received unspecified address without existing connection reference" ); } @@ -1237,7 +1244,7 @@ impl P2pConnManager { tx = %tx, remote = %peer, remote_addr = %peer_addr, - transient, + courtesy, "Connecting to peer" ); if let Some(blocked_addrs) = &self.blocked_addresses { @@ -1273,7 +1280,7 @@ impl P2pConnManager { tracing::info!( tx = %tx, remote = %peer, - transient, + courtesy, "connect_peer: reusing existing transport / promoting transient if present" ); let connection_manager = &self.bridge.op_manager.ring.connection_manager; @@ -1289,8 +1296,9 @@ impl P2pConnManager { tracing::info!(tx = %tx, remote = %peer, "connect_peer: promoted transient"); } - // Return the remote peer we are connected to (not our own peer key). 
- let resolved_peer_id = peer.clone(); + let resolved_peer_id = connection_manager + .get_peer_key() + .expect("peer key should be set"); callback .send_result(Ok((resolved_peer_id, None))) .await @@ -1316,7 +1324,7 @@ impl P2pConnManager { tx = %tx, remote = %peer_addr, pending = callbacks.get().len(), - transient, + courtesy, "Connection already pending, queuing additional requester" ); callbacks.get_mut().push(callback); @@ -1325,7 +1333,7 @@ impl P2pConnManager { remote = %peer_addr, pending = callbacks.get().len(), pending_txs = ?txs_entry, - transient, + courtesy, "connect_peer: connection already pending, queued callback" ); return Ok(()); @@ -1336,7 +1344,7 @@ impl P2pConnManager { tracing::debug!( tx = %tx, remote = %peer_addr, - transient, + courtesy, "connect_peer: registering new pending connection" ); entry.insert(vec![callback]); @@ -1345,7 +1353,7 @@ impl P2pConnManager { remote = %peer_addr, pending = 1, pending_txs = ?txs_entry, - transient, + courtesy, "connect_peer: registered new pending connection" ); state.outbound_handler.expect_incoming(peer_addr); @@ -1356,14 +1364,14 @@ impl P2pConnManager { .send(HandshakeCommand::Connect { peer: peer.clone(), transaction: tx, - transient, + courtesy, }) .await { tracing::warn!( tx = %tx, remote = %peer.addr, - transient, + courtesy, ?error, "Failed to enqueue connect command" ); @@ -1378,7 +1386,7 @@ impl P2pConnManager { tx = %tx, remote = %peer_addr, callbacks = callbacks.len(), - transient, + courtesy, "Cleaning up callbacks after connect command failure" ); for mut cb in callbacks { @@ -1405,7 +1413,7 @@ impl P2pConnManager { tracing::debug!( tx = %tx, remote = %peer_addr, - transient, + courtesy, "connect_peer: handshake command dispatched" ); } @@ -1424,17 +1432,16 @@ impl P2pConnManager { transaction, peer, connection, - transient, + courtesy, } => { - tracing::info!(provided = ?peer, transient, tx = ?transaction, "InboundConnection event"); - let _conn_manager = &self.bridge.op_manager.ring.connection_manager; + let conn_manager = &self.bridge.op_manager.ring.connection_manager; let remote_addr = connection.remote_addr(); if let Some(blocked_addrs) = &self.blocked_addresses { if blocked_addrs.contains(&remote_addr) { tracing::info!( remote = %remote_addr, - transient, + courtesy, transaction = ?transaction, "Inbound connection blocked by local policy" ); @@ -1442,11 +1449,11 @@ impl P2pConnManager { } } - let _provided_peer = peer.clone(); + let provided_peer = peer.clone(); let peer_id = peer.unwrap_or_else(|| { tracing::info!( remote = %remote_addr, - transient, + courtesy, transaction = ?transaction, "Inbound connection arrived without matching expectation; accepting provisionally" ); @@ -1464,14 +1471,13 @@ impl P2pConnManager { tracing::info!( remote = %peer_id.addr, - transient, + courtesy, transaction = ?transaction, "Inbound connection established" ); - // Treat only transient connections as transient. Normal inbound dials (including - // gateway bootstrap from peers) should be promoted into the ring once established. 
- let is_transient = transient; + let is_transient = + conn_manager.is_gateway() && provided_peer.is_none() && transaction.is_none(); self.handle_successful_connection(peer_id, connection, state, None, is_transient) .await?; @@ -1480,11 +1486,11 @@ impl P2pConnManager { transaction, peer, connection, - transient, + courtesy, } => { tracing::info!( remote = %peer.addr, - transient, + courtesy, transaction = %transaction, "Outbound connection established" ); @@ -1495,11 +1501,11 @@ impl P2pConnManager { transaction, peer, error, - transient, + courtesy, } => { tracing::info!( remote = %peer.addr, - transient, + courtesy, transaction = %transaction, ?error, "Outbound connection failed" @@ -1521,7 +1527,7 @@ impl P2pConnManager { remote = %peer.addr, callbacks = callbacks.len(), pending_txs = ?pending_txs, - transient, + courtesy, "Notifying callbacks after outbound failure" ); @@ -1613,14 +1619,9 @@ impl P2pConnManager { current = connection_manager.transient_count(), "Transient connection budget exhausted; dropping inbound connection" ); - if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { - for mut cb in callbacks { - let _ = cb.send_result(Err(())).await; - } - } - state.awaiting_connection_txs.remove(&peer_id.addr); return Ok(()); } + let pending_txs = state .awaiting_connection_txs .remove(&peer_id.addr) @@ -1683,19 +1684,6 @@ impl P2pConnManager { // Only insert if connection doesn't already exist to avoid dropping existing channel let mut newly_inserted = false; if !self.connections.contains_key(&peer_id) { - if is_transient { - let cm = &self.bridge.op_manager.ring.connection_manager; - let current = cm.transient_count(); - if current >= cm.transient_budget() { - tracing::warn!( - remote = %peer_id.addr, - budget = cm.transient_budget(), - current, - "Transient connection budget exhausted; dropping inbound connection before insert" - ); - return Ok(()); - } - } let (tx, rx) = mpsc::channel(10); tracing::debug!(self_peer = %self.bridge.op_manager.ring.connection_manager.pub_key, %peer_id, conn_map_size = self.connections.len(), "[CONN_TRACK] INSERT: OutboundConnectionSuccessful - adding to connections HashMap"); self.connections.insert(peer_id.clone(), tx); @@ -1711,106 +1699,42 @@ impl P2pConnManager { tracing::debug!(self_peer = %self.bridge.op_manager.ring.connection_manager.pub_key, %peer_id, conn_map_size = self.connections.len(), "[CONN_TRACK] SKIP INSERT: OutboundConnectionSuccessful - connection already exists in HashMap"); } - let promote_to_ring = !is_transient || connection_manager.is_gateway(); - if newly_inserted { - tracing::info!(remote = %peer_id, is_transient, "handle_successful_connection: inserted new connection entry"); let pending_loc = connection_manager.prune_in_transit_connection(&peer_id); - if promote_to_ring { + if !is_transient { let loc = pending_loc.unwrap_or_else(|| Location::from_address(&peer_id.addr)); - tracing::info!( - remote = %peer_id, - %loc, - pending_loc_known = pending_loc.is_some(), - "handle_successful_connection: evaluating promotion to ring" - ); - // Re-apply admission logic on promotion to avoid bypassing capacity/heuristic checks. 
- let should_accept = connection_manager.should_accept(loc, &peer_id); - if !should_accept { - tracing::warn!( - %peer_id, - %loc, - "handle_successful_connection: promotion rejected by admission logic" - ); - return Ok(()); - } - let current = connection_manager.num_connections(); - if current >= connection_manager.max_connections { - tracing::warn!( - %peer_id, - current_connections = current, - max_connections = connection_manager.max_connections, - %loc, - "handle_successful_connection: rejecting new connection to enforce cap" - ); - return Ok(()); - } - tracing::info!(remote = %peer_id, %loc, "handle_successful_connection: promoting connection into ring"); self.bridge .op_manager .ring .add_connection(loc, peer_id.clone(), false) .await; - if is_transient { - connection_manager.drop_transient(&peer_id); - } } else { - let loc = pending_loc.unwrap_or_else(|| Location::from_address(&peer_id.addr)); - // Evaluate whether this transient should be promoted; gateways need routable peers. - let should_accept = connection_manager.should_accept(loc, &peer_id); - if should_accept { - connection_manager.drop_transient(&peer_id); - let current = connection_manager.num_connections(); - if current >= connection_manager.max_connections { - tracing::warn!( - %peer_id, - current_connections = current, - max_connections = connection_manager.max_connections, - %loc, - "handle_successful_connection: rejecting transient promotion to enforce cap" - ); - return Ok(()); - } - tracing::info!( - remote = %peer_id, - %loc, - pending_loc_known = pending_loc.is_some(), - "handle_successful_connection: promoting transient into ring" - ); - self.bridge - .op_manager - .ring - .add_connection(loc, peer_id.clone(), true) - .await; - } else { - // Keep the connection as transient; budget was reserved before any work. - connection_manager.try_register_transient(peer_id.clone(), pending_loc); - tracing::info!( - peer = %peer_id, - pending_loc_known = pending_loc.is_some(), - "Registered transient connection (not added to ring topology)" - ); - let ttl = connection_manager.transient_ttl(); - let drop_tx = self.bridge.ev_listener_tx.clone(); - let cm = connection_manager.clone(); - let peer = peer_id.clone(); - tokio::spawn(async move { - sleep(ttl).await; - if cm.drop_transient(&peer).is_some() { - tracing::info!(%peer, "Transient connection expired; dropping"); - if let Err(err) = drop_tx - .send(Right(NodeEvent::DropConnection(peer.clone()))) - .await - { - tracing::warn!( - %peer, - ?err, - "Failed to dispatch DropConnection for expired transient" - ); - } + // Update location now that we know it; budget was reserved before any work. + connection_manager.try_register_transient(peer_id.clone(), pending_loc); + tracing::info!( + peer = %peer_id, + "Registered transient connection (not added to ring topology)" + ); + let ttl = connection_manager.transient_ttl(); + let drop_tx = self.bridge.ev_listener_tx.clone(); + let cm = connection_manager.clone(); + let peer = peer_id.clone(); + tokio::spawn(async move { + sleep(ttl).await; + if cm.drop_transient(&peer).is_some() { + tracing::info!(%peer, "Transient connection expired; dropping"); + if let Err(err) = drop_tx + .send(Right(NodeEvent::DropConnection(peer.clone()))) + .await + { + tracing::warn!( + %peer, + ?err, + "Failed to dispatch DropConnection for expired transient" + ); } - }); - } + } + }); } } else if is_transient { // We reserved budget earlier, but didn't take ownership of the connection. 
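The next diff replaces the lock-guarded `pending_reservations` map with a pair of atomic counters (`open_connections` / `reserved_connections`). The accounting discipline is easier to see in isolation; below is a minimal, self-contained sketch of the same reserve-then-commit pattern, in which `AdmissionGuard`, its field names, and the simplified cap check are illustrative assumptions rather than the crate's actual API:

    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Two-counter admission guard: slots are first reserved, then either
    /// promoted into an open connection or released on failure.
    struct AdmissionGuard {
        open: AtomicUsize,
        reserved: AtomicUsize,
        max_connections: usize,
    }

    impl AdmissionGuard {
        /// Reserve a slot, or refuse when the cap (or the counter) would overflow.
        fn try_reserve(&self) -> bool {
            loop {
                let reserved = self.reserved.load(Ordering::SeqCst);
                if reserved == usize::MAX {
                    return false; // counter would overflow; refuse outright
                }
                let open = self.open.load(Ordering::SeqCst);
                if open + reserved + 1 > self.max_connections {
                    return false; // reservation would exceed the connection cap
                }
                // CAS so a slot is only granted against the value we just checked.
                if self
                    .reserved
                    .compare_exchange(reserved, reserved + 1, Ordering::SeqCst, Ordering::SeqCst)
                    .is_ok()
                {
                    return true;
                }
                // Another caller won the race; retry against the fresh value.
            }
        }

        /// Handshake completed: convert the reservation into an open connection.
        fn promote(&self) {
            self.reserved.fetch_sub(1, Ordering::SeqCst);
            self.open.fetch_add(1, Ordering::SeqCst);
        }

        /// Handshake failed: hand the reserved slot back.
        fn release(&self) {
            self.reserved.fetch_sub(1, Ordering::SeqCst);
        }
    }

    fn main() {
        let guard = AdmissionGuard {
            open: AtomicUsize::new(0),
            reserved: AtomicUsize::new(0),
            max_connections: 2,
        };
        assert!(guard.try_reserve());
        guard.promote();
        assert!(guard.try_reserve());
        assert!(!guard.try_reserve()); // 1 open + 1 reserved already fill the cap
        guard.release();
    }

The compare-exchange loop is what makes the reservation safe under concurrency: a slot is only granted against the counter value that was actually validated, so racing callers cannot overshoot the cap and then have to unwind, and every rejection or failed handshake pairs with exactly one decrement.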
diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index f910becf2..26c62b164 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -12,18 +12,17 @@ use super::*; #[derive(Clone)] pub(crate) struct TransientEntry { - /// Entry tracking a transient connection that hasn't been added to the ring topology yet. - /// Transient connections are typically unsolicited inbound connections to gateways. #[allow(dead_code)] pub opened_at: Instant, - /// Advertised location for the transient peer, if known at admission time. pub location: Option, } #[derive(Clone)] pub(crate) struct ConnectionManager { - pending_reservations: Arc>>, + open_connections: Arc, + reserved_connections: Arc, pub(super) location_for_peer: Arc>>, + pending_locations: Arc>>, pub(super) topology_manager: Arc>, connections_by_location: Arc>>>, /// Interim connections ongoing handshake or successfully open connections @@ -123,7 +122,9 @@ impl ConnectionManager { Self { connections_by_location: Arc::new(RwLock::new(BTreeMap::new())), location_for_peer: Arc::new(RwLock::new(BTreeMap::new())), - pending_reservations: Arc::new(RwLock::new(BTreeMap::new())), + pending_locations: Arc::new(RwLock::new(BTreeMap::new())), + open_connections: Arc::new(AtomicUsize::new(0)), + reserved_connections: Arc::new(AtomicUsize::new(0)), topology_manager, own_location: own_location.into(), peer_key: Arc::new(Mutex::new(peer_id)), @@ -146,8 +147,12 @@ impl ConnectionManager { /// Will panic if the node checking for this condition has no location assigned. pub fn should_accept(&self, location: Location, peer_id: &PeerId) -> bool { tracing::info!("Checking if should accept connection"); - let open = self.connection_count(); - let reserved_before = self.pending_reservations.read().len(); + let open = self + .open_connections + .load(std::sync::atomic::Ordering::SeqCst); + let reserved_before = self + .reserved_connections + .load(std::sync::atomic::Ordering::SeqCst); tracing::info!( %peer_id, @@ -160,6 +165,16 @@ impl ConnectionManager { "should_accept: evaluating direct acceptance guard" ); + if self.has_connection_or_pending(peer_id) { + tracing::debug!( + %peer_id, + open, + reserved_before, + "Peer already connected; rejecting duplicate reservation" + ); + return false; + } + if self.is_gateway && (open > 0 || reserved_before > 0) { tracing::info!( %peer_id, @@ -169,16 +184,34 @@ impl ConnectionManager { ); } - if self.location_for_peer.read().get(peer_id).is_some() { - // We've already accepted this peer (pending or active); treat as a no-op acceptance. 
- tracing::debug!(%peer_id, "Peer already pending/connected; acknowledging acceptance"); - return true; - } - - { - let mut pending = self.pending_reservations.write(); - pending.insert(peer_id.clone(), location); - } + let reserved_before = loop { + let current = self + .reserved_connections + .load(std::sync::atomic::Ordering::SeqCst); + if current == usize::MAX { + tracing::error!( + %peer_id, + "reserved connection counter overflowed; rejecting new connection" + ); + return false; + } + match self.reserved_connections.compare_exchange( + current, + current + 1, + std::sync::atomic::Ordering::SeqCst, + std::sync::atomic::Ordering::SeqCst, + ) { + Ok(_) => break current, + Err(actual) => { + tracing::debug!( + %peer_id, + expected = current, + actual, + "reserved connection counter changed concurrently; retrying" + ); + } + } + }; let total_conn = match reserved_before .checked_add(1) @@ -192,49 +225,70 @@ impl ConnectionManager { open, "connection counters would overflow; rejecting connection" ); - self.pending_reservations.write().remove(peer_id); + self.reserved_connections + .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); return false; } }; - if open == 0 { + let accepted = if open == 0 { tracing::debug!(%peer_id, "should_accept: first connection -> accepting"); - return true; - } - - let accepted = if total_conn < self.min_connections { - tracing::info!(%peer_id, total_conn, "should_accept: accepted (below min connections)"); true - } else if total_conn >= self.max_connections { - tracing::info!(%peer_id, total_conn, "should_accept: rejected (max connections reached)"); - false } else { - let accepted = self - .topology_manager - .write() - .evaluate_new_connection(location, Instant::now()) - .unwrap_or(true); + const GATEWAY_DIRECT_ACCEPT_LIMIT: usize = 2; + if self.is_gateway { + let direct_total = open + reserved_before; + if direct_total >= GATEWAY_DIRECT_ACCEPT_LIMIT { + tracing::info!( + %peer_id, + open, + reserved_before, + limit = GATEWAY_DIRECT_ACCEPT_LIMIT, + "Gateway reached direct-accept limit; forwarding join request instead" + ); + self.reserved_connections + .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); + tracing::info!(%peer_id, "should_accept: gateway direct-accept limit hit, forwarding instead"); + return false; + } + } - tracing::info!( - %peer_id, - total_conn, - accepted, - "should_accept: topology manager decision" - ); - accepted + if total_conn < self.min_connections { + tracing::info!(%peer_id, total_conn, "should_accept: accepted (below min connections)"); + true + } else if total_conn >= self.max_connections { + tracing::info!(%peer_id, total_conn, "should_accept: rejected (max connections reached)"); + false + } else { + let accepted = self + .topology_manager + .write() + .evaluate_new_connection(location, Instant::now()) + .unwrap_or(true); + + tracing::info!( + %peer_id, + total_conn, + accepted, + "should_accept: topology manager decision" + ); + accepted + } }; + tracing::info!( %peer_id, accepted, total_conn, open_connections = open, - reserved_connections = self.pending_reservations.read().len(), - max_connections = self.max_connections, - min_connections = self.min_connections, + reserved_connections = self + .reserved_connections + .load(std::sync::atomic::Ordering::SeqCst), "should_accept: final decision" ); if !accepted { - self.pending_reservations.write().remove(peer_id); + self.reserved_connections + .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); } else { tracing::info!(%peer_id, total_conn, "should_accept: accepted 
(reserving spot)"); self.record_pending_location(peer_id, location); @@ -244,11 +298,11 @@ impl ConnectionManager { /// Record the advertised location for a peer that we have decided to accept. /// - /// This makes the peer discoverable to the routing layer even before the connection - /// is fully established. The entry is removed automatically if the handshake fails - /// via `prune_in_transit_connection`. + /// Pending peers are tracked separately so that other operations cannot route through them + /// until the handshake is fully complete. Once the connection is established the entry is + /// removed automatically via `prune_in_transit_connection`. pub fn record_pending_location(&self, peer_id: &PeerId, location: Location) { - let mut locations = self.location_for_peer.write(); + let mut locations = self.pending_locations.write(); let entry = locations.entry(peer_id.clone()); match entry { Entry::Occupied(_) => { @@ -308,7 +362,6 @@ impl ConnectionManager { self.peer_key.lock().clone() } - #[allow(dead_code)] pub fn is_gateway(&self) -> bool { self.is_gateway } @@ -360,22 +413,25 @@ impl ConnectionManager { removed } - /// Check whether a peer is currently tracked as transient. pub fn is_transient(&self, peer: &PeerId) -> bool { self.transient_connections.contains_key(peer) } - /// Current number of tracked transient connections. + #[allow(dead_code)] + pub fn is_transient_addr(&self, addr: &SocketAddr) -> bool { + self.transient_connections + .iter() + .any(|entry| entry.key().addr == *addr) + } + pub fn transient_count(&self) -> usize { self.transient_in_use.load(Ordering::Acquire) } - /// Maximum transient slots allowed. pub fn transient_budget(&self) -> usize { self.transient_budget } - /// Time-to-live for transients before automatic drop. 
pub fn transient_ttl(&self) -> Duration { self.transient_ttl } @@ -402,31 +458,25 @@ impl ConnectionManager { pub fn add_connection(&self, loc: Location, peer: PeerId, was_reserved: bool) { tracing::info!(%peer, %loc, %was_reserved, "Adding connection to topology"); debug_assert!(self.get_peer_key().expect("should be set") != peer); - if was_reserved { - self.pending_reservations.write().remove(&peer); + { + let mut pending = self.pending_locations.write(); + pending.remove(&peer); } - let mut lop = self.location_for_peer.write(); - let previous_location = lop.insert(peer.clone(), loc); - drop(lop); - - if let Some(prev_loc) = previous_location { - tracing::info!( - %peer, - %prev_loc, - %loc, - "add_connection: replacing existing connection for peer" - ); - let mut cbl = self.connections_by_location.write(); - if let Some(prev_list) = cbl.get_mut(&prev_loc) { - if let Some(pos) = prev_list.iter().position(|c| c.location.peer == peer) { - prev_list.swap_remove(pos); - } - if prev_list.is_empty() { - cbl.remove(&prev_loc); + if was_reserved { + let old = self + .reserved_connections + .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); + #[cfg(debug_assertions)] + { + tracing::debug!(old, "Decremented reserved connections"); + if old == 0 { + panic!("Underflow of reserved connections"); } } + let _ = old; } - + let mut lop = self.location_for_peer.write(); + lop.insert(peer.clone(), loc); { let mut cbl = self.connections_by_location.write(); cbl.entry(loc).or_default().push(Connection { @@ -434,8 +484,12 @@ impl ConnectionManager { peer: peer.clone(), location: Some(loc), }, + open_at: Instant::now(), }); } + self.open_connections + .fetch_add(1, std::sync::atomic::Ordering::SeqCst); + std::mem::drop(lop); } pub fn update_peer_identity(&self, old_peer: &PeerId, new_peer: PeerId) -> bool { @@ -475,6 +529,7 @@ impl ConnectionManager { peer: new_peer, location: Some(loc), }, + open_at: Instant::now(), }); } @@ -485,63 +540,57 @@ impl ConnectionManager { let connection_type = if is_alive { "active" } else { "in transit" }; tracing::debug!(%peer, "Pruning {} connection", connection_type); - let mut locations_for_peer = self.location_for_peer.write(); - - let Some(loc) = locations_for_peer.remove(peer) else { - if is_alive { - tracing::debug!("no location found for peer, skip pruning"); - return None; - } else { - let removed = self.pending_reservations.write().remove(peer).is_some(); - if !removed { - tracing::warn!( - %peer, - "prune_connection: no pending reservation to release for in-transit peer" - ); + let loc = if is_alive { + let mut locations_for_peer = self.location_for_peer.write(); + match locations_for_peer.remove(peer) { + Some(loc) => { + let conns = &mut *self.connections_by_location.write(); + if let Some(conns) = conns.get_mut(&loc) { + if let Some(pos) = conns.iter().position(|c| &c.location.peer == peer) { + conns.swap_remove(pos); + } + } + loc + } + None => { + tracing::debug!("no location found for peer, skip pruning"); + return None; } } - return None; - }; - - let conns = &mut *self.connections_by_location.write(); - if let Some(conns) = conns.get_mut(&loc) { - if let Some(pos) = conns.iter().position(|c| &c.location.peer == peer) { - conns.swap_remove(pos); + } else { + match self.pending_locations.write().remove(peer) { + Some(loc) => loc, + None => { + tracing::debug!("no pending location found for peer, skip pruning"); + self.reserved_connections + .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); + return None; + } } - } + }; - if !is_alive { - 
self.pending_reservations.write().remove(peer); + if is_alive { + self.open_connections + .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); + } else { + self.reserved_connections + .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); } Some(loc) } - pub(crate) fn connection_count(&self) -> usize { - // Count only established connections tracked by location buckets. - self.connections_by_location - .read() - .values() - .map(|conns| conns.len()) - .sum() - } - - #[allow(dead_code)] pub(super) fn get_open_connections(&self) -> usize { - self.connection_count() + self.open_connections + .load(std::sync::atomic::Ordering::SeqCst) } - #[allow(dead_code)] pub(crate) fn get_reserved_connections(&self) -> usize { - self.pending_reservations.read().len() + self.reserved_connections + .load(std::sync::atomic::Ordering::SeqCst) } - pub fn has_connection_or_pending(&self, peer: &PeerId) -> bool { - self.location_for_peer.read().contains_key(peer) - || self.pending_reservations.read().contains_key(peer) - } - - pub(crate) fn get_connections_by_location(&self) -> BTreeMap> { + pub(super) fn get_connections_by_location(&self) -> BTreeMap> { self.connections_by_location.read().clone() } @@ -558,15 +607,6 @@ impl ConnectionManager { router: &Router, ) -> Option { let connections = self.connections_by_location.read(); - tracing::debug!( - total_locations = connections.len(), - self_peer = self - .get_peer_key() - .as_ref() - .map(|id| id.to_string()) - .unwrap_or_else(|| "unknown".into()), - "routing: considering connections" - ); let peers = connections.values().filter_map(|conns| { let conn = conns.choose(&mut rand::rng())?; if self.is_transient(&conn.location.peer) { @@ -582,40 +622,6 @@ impl ConnectionManager { router.select_peer(peers, target).cloned() } - pub fn routing_candidates( - &self, - target: Location, - requesting: Option<&PeerId>, - skip_list: impl Contains, - ) -> Vec { - let connections = self.connections_by_location.read(); - let mut candidates: Vec = connections - .values() - .flat_map(|conns| conns.iter()) - .filter(|conn| { - !self.is_transient(&conn.location.peer) - && (requesting != Some(&conn.location.peer)) - && !skip_list.has_element(conn.location.peer.clone()) - }) - .map(|conn| conn.location.clone()) - .collect(); - - candidates.sort_by(|a, b| { - let da = a - .location - .unwrap_or_else(|| Location::from_address(&a.peer.addr)) - .distance(target) - .as_f64(); - let db = b - .location - .unwrap_or_else(|| Location::from_address(&b.peer.addr)) - .distance(target) - .as_f64(); - da.partial_cmp(&db).unwrap_or(std::cmp::Ordering::Equal) - }); - candidates - } - pub fn num_connections(&self) -> usize { let connections = self.connections_by_location.read(); let total: usize = connections.values().map(|v| v.len()).sum(); @@ -632,4 +638,120 @@ impl ConnectionManager { let read = self.location_for_peer.read(); read.keys().cloned().collect::>().into_iter() } + + pub fn has_connection_or_pending(&self, peer: &PeerId) -> bool { + if self.location_for_peer.read().contains_key(peer) { + return true; + } + self.pending_locations.read().contains_key(peer) + } + + #[cfg(test)] + pub(crate) fn is_pending_connection(&self, peer: &PeerId) -> bool { + self.pending_locations.read().contains_key(peer) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::topology::rate::Rate; + use crate::transport::TransportKeypair; + use std::net::{IpAddr, Ipv4Addr, SocketAddr}; + use std::sync::atomic::{AtomicU64, Ordering}; + use std::time::Duration; + + fn make_connection_manager() -> 
ConnectionManager { + let keypair = TransportKeypair::new(); + let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 4100); + let own_location = Location::from_address(&addr); + let atomic_loc = AtomicU64::new(u64::from_le_bytes(own_location.as_f64().to_le_bytes())); + let self_peer = PeerId::new(addr, keypair.public().clone()); + ConnectionManager::init( + Rate::new_per_second(10_000.0), + Rate::new_per_second(10_000.0), + 1, + 32, + 4, + (keypair.public().clone(), Some(self_peer), atomic_loc), + false, + 4, + Duration::from_secs(30), + ) + } + + fn make_peer(port: u16) -> PeerId { + let keypair = TransportKeypair::new(); + let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), port); + PeerId::new(addr, keypair.public().clone()) + } + + #[test] + fn pending_connections_hidden_from_known_locations() { + let manager = make_connection_manager(); + let peer = make_peer(4200); + let location = Location::from_address(&peer.addr); + + assert!(manager.should_accept(location, &peer)); + assert!(manager.is_pending_connection(&peer)); + assert!( + !manager.get_known_locations().contains_key(&peer), + "pending connection leaked into established pool" + ); + + let restored = manager + .prune_in_transit_connection(&peer) + .expect("pending location should exist"); + assert_eq!(restored, location); + + manager.add_connection(restored, peer.clone(), false); + assert!( + !manager.is_pending_connection(&peer), + "pending slot should be cleared after promotion" + ); + + let known = manager.get_known_locations(); + assert_eq!(known.get(&peer), Some(&location)); + } + + #[test] + fn should_accept_does_not_leak_reservations_for_duplicate_peer() { + let keypair = TransportKeypair::new(); + let peer_addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 20_000); + let peer_id = PeerId::new(peer_addr, keypair.public().clone()); + let location = Location::from_address(&peer_addr); + + let manager = ConnectionManager::init( + Rate::new_per_second(1_000_000.0), + Rate::new_per_second(1_000_000.0), + Ring::DEFAULT_MIN_CONNECTIONS, + Ring::DEFAULT_MAX_CONNECTIONS, + Ring::DEFAULT_RAND_WALK_ABOVE_HTL, + (keypair.public().clone(), None, AtomicU64::new(0)), + false, + 32, + Duration::from_secs(30), + ); + + assert!(manager.should_accept(location, &peer_id)); + let after_first = manager.reserved_connections.load(Ordering::SeqCst); + assert_eq!(after_first, 1); + { + assert!( + manager.is_pending_connection(&peer_id), + "pending connection should be tracked separately after initial acceptance" + ); + } + + // Second attempt for the same peer should be rejected immediately. + assert!( + !manager.should_accept(location, &peer_id), + "duplicate peer should be rejected by should_accept" + ); + assert_eq!( + manager.reserved_connections.load(Ordering::SeqCst), + after_first, + "repeat should_accept calls should not leak reservations for an existing peer" + ); + } } diff --git a/crates/core/src/ring/mod.rs b/crates/core/src/ring/mod.rs index b23bf8ebe..6233d71e2 100644 --- a/crates/core/src/ring/mod.rs +++ b/crates/core/src/ring/mod.rs @@ -3,7 +3,7 @@ //! Mainly maintains a healthy and optimal pool of connections to other peers in the network //! and routes requests to the optimal peers. 
-use std::collections::{BTreeMap, BTreeSet, HashSet}; +use std::collections::{BTreeSet, HashSet}; use std::{ sync::{atomic::AtomicU64, Arc, Weak}, time::{Duration, Instant}, @@ -157,7 +157,7 @@ impl Ring { } pub fn open_connections(&self) -> usize { - self.connection_manager.connection_count() + self.connection_manager.get_open_connections() } async fn refresh_router(router: Arc>, register: ER) { @@ -385,6 +385,11 @@ impl Ring { tracing::info!("Initializing connection maintenance task"); let is_gateway = self.is_gateway; tracing::info!(is_gateway, "Connection maintenance task starting"); + #[cfg(not(test))] + const CONNECTION_AGE_THRESOLD: Duration = Duration::from_secs(60 * 5); + #[cfg(test)] + const CONNECTION_AGE_THRESOLD: Duration = Duration::from_secs(5); + #[cfg(not(test))] const CHECK_TICK_DURATION: Duration = Duration::from_secs(60); #[cfg(test)] @@ -455,7 +460,7 @@ impl Ring { error })?; if live_tx.is_none() { - let conns = self.connection_manager.connection_count(); + let conns = self.connection_manager.get_open_connections(); tracing::warn!( "acquire_new returned None - likely no peers to query through (connections: {})", conns @@ -472,50 +477,30 @@ impl Ring { } } - let current_connections = self.connection_manager.connection_count(); + let current_connections = self.connection_manager.get_open_connections(); let pending_connection_targets = pending_conn_adds.len(); - let peers = self.connection_manager.get_connections_by_location(); - let connections_considered: usize = peers.values().map(|c| c.len()).sum(); - - let mut neighbor_locations: BTreeMap<_, Vec<_>> = peers - .iter() - .map(|(loc, conns)| { - let conns: Vec<_> = conns - .iter() - .filter(|conn| !live_tx_tracker.has_live_connection(&conn.location.peer)) - .cloned() - .collect(); - (*loc, conns) - }) - .filter(|(_, conns)| !conns.is_empty()) - .collect(); - - if neighbor_locations.is_empty() && connections_considered > 0 { - tracing::warn!( - current_connections, - connections_considered, - live_tx_peers = live_tx_tracker.len(), - "Neighbor filtering removed all candidates; using all connections" + let neighbor_locations = { + let peers = self.connection_manager.get_connections_by_location(); + tracing::debug!( + "Maintenance task: current connections = {}, checking topology", + current_connections ); - - neighbor_locations = peers + peers .iter() - .map(|(loc, conns)| (*loc, conns.clone())) + .map(|(loc, conns)| { + let conns: Vec<_> = conns + .iter() + .filter(|conn| { + conn.open_at.elapsed() > CONNECTION_AGE_THRESOLD + && !live_tx_tracker.has_live_connection(&conn.location.peer) + }) + .cloned() + .collect(); + (*loc, conns) + }) .filter(|(_, conns)| !conns.is_empty()) - .collect(); - } - - if current_connections > self.connection_manager.max_connections { - // When over capacity, consider all connections for removal regardless of live_tx filter. 
- neighbor_locations = peers.clone(); - } - - tracing::debug!( - "Maintenance task: current connections = {}, candidates = {}, live_tx_peers = {}", - current_connections, - peers.len(), - live_tx_tracker.len() - ); + .collect() + }; let adjustment = self .connection_manager @@ -604,7 +589,7 @@ impl Ring { live_tx_tracker: &LiveTransactionTracker, op_manager: &Arc, ) -> anyhow::Result> { - let current_connections = self.connection_manager.connection_count(); + let current_connections = self.connection_manager.get_open_connections(); let is_gateway = self.is_gateway; tracing::info!( @@ -620,18 +605,13 @@ impl Ring { %ideal_location, num_connections, skip_list_size = skip_list.len(), - self_peer = %self.connection_manager.get_peer_key().as_ref().map(|id| id.to_string()).unwrap_or_else(|| "unknown".into()), "Looking for peer to route through" ); if let Some(target) = self.connection_manager .routing(ideal_location, None, skip_list, &router) { - tracing::info!( - query_target = %target, - %ideal_location, - "connection_maintenance selected routing target" - ); + tracing::debug!(query_target = %target, "Found routing target"); target } else { tracing::warn!( From 3511ae8085ba3ed132f4ef2212b4562765b0f6c9 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Tue, 18 Nov 2025 15:40:43 -0600 Subject: [PATCH 02/25] fix: tidy transient registry formatting --- crates/core/src/ring/connection_manager.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index 26c62b164..427fd8467 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -400,6 +400,14 @@ impl ConnectionManager { true } + /// Registers (or updates) a transient connection without performing budget checks. + /// Used when the caller already reserved budget via `try_register_transient`. + pub fn register_transient(&self, peer: PeerId, location: Option) { + if !self.try_register_transient(peer.clone(), location) { + tracing::warn!(%peer, "register_transient: budget exhausted while updating"); + } + } + /// Drops a transient connection and returns its metadata, if it existed. /// Also decrements the transient budget counter. 
pub fn drop_transient(&self, peer: &PeerId) -> Option { @@ -413,15 +421,16 @@ impl ConnectionManager { removed } + pub fn deregister_transient(&self, peer: &PeerId) -> Option { + self.drop_transient(peer) + } + pub fn is_transient(&self, peer: &PeerId) -> bool { self.transient_connections.contains_key(peer) } - #[allow(dead_code)] - pub fn is_transient_addr(&self, addr: &SocketAddr) -> bool { - self.transient_connections - .iter() - .any(|entry| entry.key().addr == *addr) + pub fn is_transient_peer(&self, peer: &PeerId) -> bool { + self.is_transient(peer) } pub fn transient_count(&self) -> usize { From 13851e78b62552ec7ebe30dac573b3223ef95899 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Tue, 18 Nov 2025 17:13:34 -0600 Subject: [PATCH 03/25] fix: clean transient promotion handling --- crates/core/src/node/network_bridge/p2p_protoc.rs | 12 +++++++++--- crates/core/src/ring/connection_manager.rs | 8 +++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index 55ccc6160..69ff66c2a 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -1296,9 +1296,8 @@ impl P2pConnManager { tracing::info!(tx = %tx, remote = %peer, "connect_peer: promoted transient"); } - let resolved_peer_id = connection_manager - .get_peer_key() - .expect("peer key should be set"); + // Return the remote peer we are connected to (not our own peer key). + let resolved_peer_id = peer.clone(); callback .send_result(Ok((resolved_peer_id, None))) .await @@ -1619,8 +1618,15 @@ impl P2pConnManager { current = connection_manager.transient_count(), "Transient connection budget exhausted; dropping inbound connection" ); + if let Some(callbacks) = state.awaiting_connection.remove(&peer_id.addr) { + for mut cb in callbacks { + let _ = cb.send_result(Err(())).await; + } + } + state.awaiting_connection_txs.remove(&peer_id.addr); return Ok(()); } + } let pending_txs = state .awaiting_connection_txs diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index 427fd8467..09c4a4666 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -14,6 +14,7 @@ use super::*; pub(crate) struct TransientEntry { #[allow(dead_code)] pub opened_at: Instant, + /// Advertised location for the transient peer, if known at admission time. pub location: Option, } @@ -400,7 +401,7 @@ impl ConnectionManager { true } - /// Registers (or updates) a transient connection without performing budget checks. + /// Record a transient connection for bookkeeping (kept out of routing/topology counts). /// Used when the caller already reserved budget via `try_register_transient`. pub fn register_transient(&self, peer: PeerId, location: Option) { if !self.try_register_transient(peer.clone(), location) { @@ -421,10 +422,12 @@ impl ConnectionManager { removed } + /// Remove transient tracking for a peer, returning any stored metadata. pub fn deregister_transient(&self, peer: &PeerId) -> Option { self.drop_transient(peer) } + /// Check whether a peer is currently tracked as transient. pub fn is_transient(&self, peer: &PeerId) -> bool { self.transient_connections.contains_key(peer) } @@ -433,14 +436,17 @@ impl ConnectionManager { self.is_transient(peer) } + /// Current number of tracked transient connections. 
pub fn transient_count(&self) -> usize { self.transient_in_use.load(Ordering::Acquire) } + /// Maximum transient slots allowed. pub fn transient_budget(&self) -> usize { self.transient_budget } + /// Time-to-live for transients before automatic drop. pub fn transient_ttl(&self) -> Duration { self.transient_ttl } From a262c272578e5da811f4a7939529c4694d997fe3 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Tue, 18 Nov 2025 17:16:14 -0600 Subject: [PATCH 04/25] fix: honor transient budget and promote correctly --- crates/core/src/node/network_bridge/p2p_protoc.rs | 15 +++++++++++++-- crates/core/src/ring/connection_manager.rs | 10 +++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index 69ff66c2a..f166f6968 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -1626,8 +1626,6 @@ impl P2pConnManager { state.awaiting_connection_txs.remove(&peer_id.addr); return Ok(()); } - } - let pending_txs = state .awaiting_connection_txs .remove(&peer_id.addr) @@ -1690,6 +1688,19 @@ impl P2pConnManager { // Only insert if connection doesn't already exist to avoid dropping existing channel let mut newly_inserted = false; if !self.connections.contains_key(&peer_id) { + if is_transient { + let cm = &self.bridge.op_manager.ring.connection_manager; + let current = cm.transient_count(); + if current >= cm.transient_budget() { + tracing::warn!( + remote = %peer_id.addr, + budget = cm.transient_budget(), + current, + "Transient connection budget exhausted; dropping inbound connection before insert" + ); + return Ok(()); + } + } let (tx, rx) = mpsc::channel(10); tracing::debug!(self_peer = %self.bridge.op_manager.ring.connection_manager.pub_key, %peer_id, conn_map_size = self.connections.len(), "[CONN_TRACK] INSERT: OutboundConnectionSuccessful - adding to connections HashMap"); self.connections.insert(peer_id.clone(), tx); diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index 09c4a4666..fa5adb2ea 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -12,6 +12,8 @@ use super::*; #[derive(Clone)] pub(crate) struct TransientEntry { + /// Entry tracking a transient connection that hasn't been added to the ring topology yet. + /// Transient connections are typically unsolicited inbound connections to gateways. #[allow(dead_code)] pub opened_at: Instant, /// Advertised location for the transient peer, if known at admission time. @@ -401,8 +403,8 @@ impl ConnectionManager { true } - /// Record a transient connection for bookkeeping (kept out of routing/topology counts). - /// Used when the caller already reserved budget via `try_register_transient`. + /// Registers a new transient connection that is not yet part of the ring topology. + /// Transient connections are tracked separately and subject to budget and TTL limits. pub fn register_transient(&self, peer: PeerId, location: Option) { if !self.try_register_transient(peer.clone(), location) { tracing::warn!(%peer, "register_transient: budget exhausted while updating"); @@ -422,7 +424,9 @@ impl ConnectionManager { removed } - /// Remove transient tracking for a peer, returning any stored metadata. + /// Deregisters a transient connection, removing it from tracking. + /// + /// Returns the removed entry if it existed. 
pub fn deregister_transient(&self, peer: &PeerId) -> Option { self.drop_transient(peer) } From e16fd8bce6d0f4ef2f5256f36e344a827aeb6b79 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Wed, 19 Nov 2025 13:02:23 -0600 Subject: [PATCH 05/25] fix: remove unused transient helpers --- crates/core/src/ring/connection_manager.rs | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index fa5adb2ea..e1f9d9512 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -403,14 +403,6 @@ impl ConnectionManager { true } - /// Registers a new transient connection that is not yet part of the ring topology. - /// Transient connections are tracked separately and subject to budget and TTL limits. - pub fn register_transient(&self, peer: PeerId, location: Option) { - if !self.try_register_transient(peer.clone(), location) { - tracing::warn!(%peer, "register_transient: budget exhausted while updating"); - } - } - /// Drops a transient connection and returns its metadata, if it existed. /// Also decrements the transient budget counter. pub fn drop_transient(&self, peer: &PeerId) -> Option { @@ -424,22 +416,11 @@ impl ConnectionManager { removed } - /// Deregisters a transient connection, removing it from tracking. - /// - /// Returns the removed entry if it existed. - pub fn deregister_transient(&self, peer: &PeerId) -> Option { - self.drop_transient(peer) - } - /// Check whether a peer is currently tracked as transient. pub fn is_transient(&self, peer: &PeerId) -> bool { self.transient_connections.contains_key(peer) } - pub fn is_transient_peer(&self, peer: &PeerId) -> bool { - self.is_transient(peer) - } - /// Current number of tracked transient connections. 
pub fn transient_count(&self) -> usize { self.transient_in_use.load(Ordering::Acquire) From adf89ad8a9f2888c12ee5bd224f7d8c4c959084d Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Wed, 19 Nov 2025 15:54:10 -0600 Subject: [PATCH 06/25] test: add large network soak test with diagnostic snapshots --- crates/core/Cargo.toml | 3 +- crates/core/tests/large_network.rs | 80 ++++++------------------------ 2 files changed, 15 insertions(+), 68 deletions(-) diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 18695cfc2..bfea19de1 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -65,7 +65,6 @@ xz2 = { version = "0.1" } reqwest = { version = "0.12", features = ["json"] } rsa = { version = "0.9", features = ["serde", "pem"] } pkcs8 = { version = "0.10", features = ["std", "pem"] } -sha2 = "0.10" # Tracing deps opentelemetry = "0.31" @@ -91,7 +90,7 @@ arbitrary = { features = ["derive"], version = "1" } chrono = { features = ["arbitrary"], workspace = true } freenet-stdlib = { features = ["net", "testing"], workspace = true } freenet-macros = { path = "../freenet-macros" } -freenet-test-network = "0.1.3" +freenet-test-network = { version = "0.1.1", path = "../../../../freenet-test-network" } httptest = "0.16" statrs = "0.18" tempfile = "3" diff --git a/crates/core/tests/large_network.rs b/crates/core/tests/large_network.rs index 53bf5eb46..b60c86798 100644 --- a/crates/core/tests/large_network.rs +++ b/crates/core/tests/large_network.rs @@ -37,10 +37,7 @@ use which::which; const DEFAULT_PEER_COUNT: usize = 38; const DEFAULT_SNAPSHOT_INTERVAL: Duration = Duration::from_secs(60); const DEFAULT_SNAPSHOT_ITERATIONS: usize = 5; -const DEFAULT_SNAPSHOT_WARMUP: Duration = Duration::from_secs(60); const DEFAULT_CONNECTIVITY_TARGET: f64 = 0.75; -const DEFAULT_MIN_CONNECTIONS: usize = 5; -const DEFAULT_MAX_CONNECTIONS: usize = 7; #[tokio::test(flavor = "multi_thread", worker_threads = 4)] #[ignore = "Large soak test - run manually (see file header for instructions)"] @@ -62,25 +59,10 @@ async fn large_network_soak() -> anyhow::Result<()> { .ok() .and_then(|val| val.parse::().ok()) .unwrap_or(DEFAULT_CONNECTIVITY_TARGET); - let snapshot_warmup = env::var("SOAK_SNAPSHOT_WARMUP_SECS") - .ok() - .and_then(|val| val.parse().ok()) - .map(Duration::from_secs) - .unwrap_or(DEFAULT_SNAPSHOT_WARMUP); - let min_connections = env::var("SOAK_MIN_CONNECTIONS") - .ok() - .and_then(|val| val.parse().ok()) - .unwrap_or(DEFAULT_MIN_CONNECTIONS); - let max_connections = env::var("SOAK_MAX_CONNECTIONS") - .ok() - .and_then(|val| val.parse().ok()) - .unwrap_or(DEFAULT_MAX_CONNECTIONS); let network = TestNetwork::builder() .gateways(2) .peers(peer_count) - .min_connections(min_connections) - .max_connections(max_connections) .require_connectivity(connectivity_target) .connectivity_timeout(Duration::from_secs(120)) .preserve_temp_dirs_on_failure(true) @@ -96,10 +78,6 @@ async fn large_network_soak() -> anyhow::Result<()> { peer_count, network.run_root().display() ); - println!( - "Min connections: {}, max connections: {} (override via SOAK_MIN_CONNECTIONS / SOAK_MAX_CONNECTIONS)", - min_connections, max_connections - ); let riverctl_path = which("riverctl") .context("riverctl not found in PATH; install via `cargo install riverctl`")?; @@ -111,13 +89,6 @@ async fn large_network_soak() -> anyhow::Result<()> { let snapshots_dir = network.run_root().join("large-soak"); fs::create_dir_all(&snapshots_dir)?; - // Allow topology maintenance to run before the first snapshot. 
- println!( - "Waiting {:?} before first snapshot to allow topology maintenance to converge", - snapshot_warmup - ); - sleep(snapshot_warmup).await; - let mut iteration = 0usize; let mut next_tick = Instant::now(); while iteration < snapshot_iterations { @@ -126,11 +97,6 @@ async fn large_network_soak() -> anyhow::Result<()> { let snapshot_path = snapshots_dir.join(format!("snapshot-{iteration:02}.json")); fs::write(&snapshot_path, to_string_pretty(&snapshot)?)?; - // Also capture ring topology for visualizing evolution over time. - let ring_snapshot = network.ring_snapshot().await?; - let ring_path = snapshots_dir.join(format!("ring-{iteration:02}.json")); - fs::write(&ring_path, to_string_pretty(&ring_snapshot)?)?; - let healthy = snapshot .peers .iter() @@ -281,47 +247,29 @@ impl RiverSession { .map(|_| ()) } - async fn run_riverctl(&self, user: RiverUser, args: &[&str]) -> anyhow::Result { + async fn run_riverctl<'a>(&self, user: RiverUser, args: &[&'a str]) -> anyhow::Result { let (url, config_dir) = match user { RiverUser::Alice => (&self.alice_url, self.alice_dir.path()), RiverUser::Bob => (&self.bob_url, self.bob_dir.path()), }; - const MAX_RETRIES: usize = 3; - const RETRY_DELAY: Duration = Duration::from_secs(5); - - for attempt in 1..=MAX_RETRIES { - let mut cmd = tokio::process::Command::new(&self.riverctl); - cmd.arg("--node-url").arg(url); - cmd.args(args); - cmd.env("RIVER_CONFIG_DIR", config_dir); + let mut cmd = tokio::process::Command::new(&self.riverctl); + cmd.arg("--node-url").arg(url); + cmd.args(args); + cmd.env("RIVER_CONFIG_DIR", config_dir); - let output = cmd - .output() - .await - .context("failed to execute riverctl command")?; - if output.status.success() { - return Ok(String::from_utf8_lossy(&output.stdout).to_string()); - } - - let stderr = String::from_utf8_lossy(&output.stderr).to_string(); - let retriable = stderr.contains("Timeout waiting for") - || stderr.contains("connection refused") - || stderr.contains("HTTP request failed"); - if attempt == MAX_RETRIES || !retriable { - bail!("riverctl failed (user {:?}): {}", user, stderr); - } - println!( - "riverctl attempt {}/{} failed for {:?}: {}; retrying in {}s", - attempt, - MAX_RETRIES, + let output = cmd + .output() + .await + .context("failed to execute riverctl command")?; + if !output.status.success() { + bail!( + "riverctl failed (user {:?}): {}", user, - stderr.trim(), - RETRY_DELAY.as_secs() + String::from_utf8_lossy(&output.stderr) ); - sleep(RETRY_DELAY).await; } - unreachable!("riverctl retry loop should always return or bail") + Ok(String::from_utf8_lossy(&output.stdout).to_string()) } } From 7f9016011cbe80214950dae0e4e9c398b2eca4d5 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Wed, 19 Nov 2025 15:55:20 -0600 Subject: [PATCH 07/25] test: add large network soak test with diagnostic snapshots --- Cargo.lock | 5 +---- crates/core/tests/large_network.rs | 2 +- crates/core/tests/test_network_integration.rs | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5f3d68c75..38a2c9f91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1686,7 +1686,6 @@ dependencies = [ "serde", "serde_json", "serde_with", - "sha2", "sqlx", "statrs", "stretto", @@ -1814,9 +1813,7 @@ dependencies = [ [[package]] name = "freenet-test-network" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5b74dc741d17bc57e55be2a2b2dc0b15bdb4299b77b3f779d371a379611cb13" +version = "0.1.1" dependencies = [ "anyhow", "chrono", diff --git 
a/crates/core/tests/large_network.rs b/crates/core/tests/large_network.rs index b60c86798..d84c2c2d3 100644 --- a/crates/core/tests/large_network.rs +++ b/crates/core/tests/large_network.rs @@ -247,7 +247,7 @@ impl RiverSession { .map(|_| ()) } - async fn run_riverctl<'a>(&self, user: RiverUser, args: &[&'a str]) -> anyhow::Result { + async fn run_riverctl(&self, user: RiverUser, args: &[&str]) -> anyhow::Result { let (url, config_dir) = match user { RiverUser::Alice => (&self.alice_url, self.alice_dir.path()), RiverUser::Bob => (&self.bob_url, self.bob_dir.path()), diff --git a/crates/core/tests/test_network_integration.rs b/crates/core/tests/test_network_integration.rs index e130d93fb..3373d5cd0 100644 --- a/crates/core/tests/test_network_integration.rs +++ b/crates/core/tests/test_network_integration.rs @@ -7,7 +7,7 @@ use freenet_test_network::TestNetwork; use testresult::TestResult; use tokio_tungstenite::connect_async; -// Build a fresh network for each test to avoid static Sync requirements +// Helper to get or create network async fn get_network() -> TestNetwork { TestNetwork::builder() .gateways(1) From d8742ab5d3c04d781ba68b6769fd0ef0e9e4b184 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Wed, 19 Nov 2025 16:12:45 -0600 Subject: [PATCH 08/25] test: harden soak riverctl retries --- crates/core/tests/large_network.rs | 44 +++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/crates/core/tests/large_network.rs b/crates/core/tests/large_network.rs index d84c2c2d3..20970a647 100644 --- a/crates/core/tests/large_network.rs +++ b/crates/core/tests/large_network.rs @@ -253,23 +253,41 @@ impl RiverSession { RiverUser::Bob => (&self.bob_url, self.bob_dir.path()), }; - let mut cmd = tokio::process::Command::new(&self.riverctl); - cmd.arg("--node-url").arg(url); - cmd.args(args); - cmd.env("RIVER_CONFIG_DIR", config_dir); + const MAX_RETRIES: usize = 3; + const RETRY_DELAY: Duration = Duration::from_secs(5); - let output = cmd - .output() - .await - .context("failed to execute riverctl command")?; - if !output.status.success() { - bail!( - "riverctl failed (user {:?}): {}", + for attempt in 1..=MAX_RETRIES { + let mut cmd = tokio::process::Command::new(&self.riverctl); + cmd.arg("--node-url").arg(url); + cmd.args(args); + cmd.env("RIVER_CONFIG_DIR", config_dir); + + let output = cmd + .output() + .await + .context("failed to execute riverctl command")?; + if output.status.success() { + return Ok(String::from_utf8_lossy(&output.stdout).to_string()); + } + + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + let retriable = stderr.contains("Timeout waiting for") + || stderr.contains("connection refused") + || stderr.contains("HTTP request failed"); + if attempt == MAX_RETRIES || !retriable { + bail!("riverctl failed (user {:?}): {}", user, stderr); + } + println!( + "riverctl attempt {}/{} failed for {:?}: {}; retrying in {}s", + attempt, + MAX_RETRIES, user, - String::from_utf8_lossy(&output.stderr) + stderr.trim(), + RETRY_DELAY.as_secs() ); + sleep(RETRY_DELAY).await; } - Ok(String::from_utf8_lossy(&output.stdout).to_string()) + unreachable!("riverctl retry loop should always return or bail") } } From 5f387beed4cb494a8c8d4edf5ef2e18bb41fb875 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Wed, 19 Nov 2025 21:35:00 -0600 Subject: [PATCH 09/25] refactor: rename courtesy links to transient --- .../core/src/node/network_bridge/handshake.rs | 18 +- .../src/node/network_bridge/p2p_protoc.rs | 64 ++--- crates/core/src/operations/connect.rs 
| 243 ++++++++---------- crates/core/src/ring/connection_manager.rs | 10 + crates/core/src/ring/mod.rs | 7 +- 5 files changed, 168 insertions(+), 174 deletions(-) diff --git a/crates/core/src/node/network_bridge/handshake.rs b/crates/core/src/node/network_bridge/handshake.rs index 99f527f77..c2f3c8b31 100644 --- a/crates/core/src/node/network_bridge/handshake.rs +++ b/crates/core/src/node/network_bridge/handshake.rs @@ -138,14 +138,14 @@ impl ExpectedInboundTracker { tx = ?transaction, "ExpectInbound: registering expectation" ); - let list = self.entries.entry(peer.addr.ip()).or_default(); - // Replace any existing expectation for the same peer/port to ensure the newest registration wins. - list.retain(|entry| entry.peer.addr.port() != peer.addr.port()); - list.push(ExpectedInbound { - peer, - transaction, - transient, - }); + self.entries + .entry(peer.addr.ip()) + .or_default() + .push(ExpectedInbound { + peer, + transaction, + transient, + }); } fn drop_peer(&mut self, peer: &PeerId) { @@ -184,7 +184,7 @@ impl ExpectedInboundTracker { #[cfg(test)] fn contains(&self, addr: SocketAddr) -> bool { - self.entries.contains_key(&addr.ip()) + self.entries.contains_key(&addr) } } diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index f166f6968..f3b0fcc51 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -665,13 +665,13 @@ impl P2pConnManager { peer, tx, callback, - is_gw: courtesy, + is_gw: transient, } => { tracing::info!( tx = %tx, remote = %peer, remote_addr = %peer.addr, - courtesy, + transient, "NodeEvent::ConnectPeer received" ); ctx.handle_connect_peer( @@ -680,7 +680,7 @@ impl P2pConnManager { tx, &handshake_cmd_sender, &mut state, - courtesy, + transient, ) .await?; } @@ -691,7 +691,7 @@ impl P2pConnManager { .send(HandshakeCommand::ExpectInbound { peer: peer.clone(), transaction: None, - courtesy: false, + transient: false, }) .await { @@ -1215,7 +1215,7 @@ impl P2pConnManager { tx: Transaction, handshake_commands: &HandshakeCommandSender, state: &mut EventListenerState, - courtesy: bool, + transient: bool, ) -> anyhow::Result<()> { let mut peer = peer; let mut peer_addr = peer.addr; @@ -1228,13 +1228,13 @@ impl P2pConnManager { tx = %tx, remote = %peer, fallback_addr = %peer_addr, - courtesy, + transient, "ConnectPeer provided unspecified address; using existing connection address" ); } else { tracing::debug!( tx = %tx, - courtesy, + transient, "ConnectPeer received unspecified address without existing connection reference" ); } @@ -1244,7 +1244,7 @@ impl P2pConnManager { tx = %tx, remote = %peer, remote_addr = %peer_addr, - courtesy, + transient, "Connecting to peer" ); if let Some(blocked_addrs) = &self.blocked_addresses { @@ -1280,7 +1280,7 @@ impl P2pConnManager { tracing::info!( tx = %tx, remote = %peer, - courtesy, + transient, "connect_peer: reusing existing transport / promoting transient if present" ); let connection_manager = &self.bridge.op_manager.ring.connection_manager; @@ -1323,7 +1323,7 @@ impl P2pConnManager { tx = %tx, remote = %peer_addr, pending = callbacks.get().len(), - courtesy, + transient, "Connection already pending, queuing additional requester" ); callbacks.get_mut().push(callback); @@ -1332,7 +1332,7 @@ impl P2pConnManager { remote = %peer_addr, pending = callbacks.get().len(), pending_txs = ?txs_entry, - courtesy, + transient, "connect_peer: connection already pending, queued callback" ); return Ok(()); @@ -1343,7 
+1343,7 @@ impl P2pConnManager { tracing::debug!( tx = %tx, remote = %peer_addr, - courtesy, + transient, "connect_peer: registering new pending connection" ); entry.insert(vec![callback]); @@ -1352,7 +1352,7 @@ impl P2pConnManager { remote = %peer_addr, pending = 1, pending_txs = ?txs_entry, - courtesy, + transient, "connect_peer: registered new pending connection" ); state.outbound_handler.expect_incoming(peer_addr); @@ -1363,14 +1363,14 @@ impl P2pConnManager { .send(HandshakeCommand::Connect { peer: peer.clone(), transaction: tx, - courtesy, + transient, }) .await { tracing::warn!( tx = %tx, remote = %peer.addr, - courtesy, + transient, ?error, "Failed to enqueue connect command" ); @@ -1385,7 +1385,7 @@ impl P2pConnManager { tx = %tx, remote = %peer_addr, callbacks = callbacks.len(), - courtesy, + transient, "Cleaning up callbacks after connect command failure" ); for mut cb in callbacks { @@ -1412,7 +1412,7 @@ impl P2pConnManager { tracing::debug!( tx = %tx, remote = %peer_addr, - courtesy, + transient, "connect_peer: handshake command dispatched" ); } @@ -1431,16 +1431,17 @@ impl P2pConnManager { transaction, peer, connection, - courtesy, + transient, } => { - let conn_manager = &self.bridge.op_manager.ring.connection_manager; + tracing::info!(provided = ?peer, transient = transient, tx = ?transaction, "InboundConnection event"); + let _conn_manager = &self.bridge.op_manager.ring.connection_manager; let remote_addr = connection.remote_addr(); if let Some(blocked_addrs) = &self.blocked_addresses { if blocked_addrs.contains(&remote_addr) { tracing::info!( remote = %remote_addr, - courtesy, + transient = transient, transaction = ?transaction, "Inbound connection blocked by local policy" ); @@ -1448,11 +1449,11 @@ impl P2pConnManager { } } - let provided_peer = peer.clone(); + let _provided_peer = peer.clone(); let peer_id = peer.unwrap_or_else(|| { tracing::info!( remote = %remote_addr, - courtesy, + transient = transient, transaction = ?transaction, "Inbound connection arrived without matching expectation; accepting provisionally" ); @@ -1470,13 +1471,14 @@ impl P2pConnManager { tracing::info!( remote = %peer_id.addr, - courtesy, + transient = transient, transaction = ?transaction, "Inbound connection established" ); - let is_transient = - conn_manager.is_gateway() && provided_peer.is_none() && transaction.is_none(); + // Treat only transient connections as transient. Normal inbound dials (including + // gateway bootstrap from peers) should be promoted into the ring once established. 
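+ // (In this series a "transient" link is a short-lived courtesy connection that stays
+ // outside the ring topology and is only counted against it on explicit promotion.)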
+ let is_transient = transient; self.handle_successful_connection(peer_id, connection, state, None, is_transient) .await?; @@ -1485,11 +1487,11 @@ impl P2pConnManager { transaction, peer, connection, - courtesy, + transient, } => { tracing::info!( remote = %peer.addr, - courtesy, + transient = transient, transaction = %transaction, "Outbound connection established" ); @@ -1500,11 +1502,11 @@ impl P2pConnManager { transaction, peer, error, - courtesy, + transient, } => { tracing::info!( remote = %peer.addr, - courtesy, + transient = transient, transaction = %transaction, ?error, "Outbound connection failed" @@ -1526,7 +1528,7 @@ impl P2pConnManager { remote = %peer.addr, callbacks = callbacks.len(), pending_txs = ?pending_txs, - courtesy, + transient, "Notifying callbacks after outbound failure" ); @@ -1717,9 +1719,11 @@ impl P2pConnManager { } if newly_inserted { + tracing::info!(remote = %peer_id, is_transient, "handle_successful_connection: inserted new connection entry"); let pending_loc = connection_manager.prune_in_transit_connection(&peer_id); if !is_transient { let loc = pending_loc.unwrap_or_else(|| Location::from_address(&peer_id.addr)); + tracing::info!(remote = %peer_id, %loc, "handle_successful_connection: promoting connection into ring"); self.bridge .op_manager .ring diff --git a/crates/core/src/operations/connect.rs b/crates/core/src/operations/connect.rs index 456ebd54a..0f55fab66 100644 --- a/crates/core/src/operations/connect.rs +++ b/crates/core/src/operations/connect.rs @@ -3,7 +3,7 @@ //! The legacy multi-stage connect operation has been removed; this module now powers the node’s //! connection and maintenance paths end-to-end. -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt; use std::net::SocketAddr; use std::sync::Arc; @@ -77,30 +77,21 @@ impl InnerMessage for ConnectMsg { impl fmt::Display for ConnectMsg { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - ConnectMsg::Request { - target, payload, .. - } => write!( + ConnectMsg::Request { target, payload, .. } => write!( f, "ConnectRequest {{ target: {target}, desired: {}, ttl: {}, joiner: {} }}", - payload.desired_location, payload.ttl, payload.joiner + payload.desired_location, + payload.ttl, + payload.joiner ), - ConnectMsg::Response { - sender, - target, - payload, - .. - } => write!( + ConnectMsg::Response { sender, target, payload, .. } => write!( f, - "ConnectResponse {{ sender: {sender}, target: {target}, acceptor: {} }}", + "ConnectResponse {{ sender: {sender}, target: {target}, acceptor: {}, transient: {} }}", payload.acceptor, + payload.transient ), - ConnectMsg::ObservedAddress { - target, address, .. - } => { - write!( - f, - "ObservedAddress {{ target: {target}, address: {address} }}" - ) + ConnectMsg::ObservedAddress { target, address, .. } => { + write!(f, "ObservedAddress {{ target: {target}, address: {address} }}") } } } @@ -135,6 +126,8 @@ pub(crate) struct ConnectRequest { pub(crate) struct ConnectResponse { /// The peer that accepted the join request. pub acceptor: PeerKeyLocation, + /// Whether this acceptance is a short-lived transient link. + pub transient: bool, } /// New minimal state machine the joiner tracks. 
@@ -161,6 +154,7 @@ pub(crate) struct RelayState { pub upstream: PeerKeyLocation, pub request: ConnectRequest, pub forwarded_to: Option<PeerKeyLocation>, + pub transient_hint: bool, pub observed_sent: bool, pub accepted_locally: bool, } @@ -179,8 +173,10 @@ pub(crate) trait RelayContext { &self, desired_location: Location, visited: &[PeerKeyLocation], - recency: &HashMap<PeerId, Instant>, ) -> Option<PeerKeyLocation>; + + /// Whether the acceptance should be treated as a short-lived transient link. + fn transient_hint(&self, acceptor: &PeerKeyLocation, joiner: &PeerKeyLocation) -> bool; } /// Result of processing a request at a relay. @@ -197,7 +193,6 @@ impl RelayState { &mut self, ctx: &C, observed_remote: &PeerKeyLocation, - recency: &HashMap<PeerId, Instant>, ) -> RelayActions { let mut actions = RelayActions::default(); push_unique_peer(&mut self.request.visited, observed_remote.clone()); @@ -220,35 +215,22 @@ if !self.accepted_locally && ctx.should_accept(&self.request.joiner) { self.accepted_locally = true; let acceptor = ctx.self_location().clone(); - let dist = ring_distance(acceptor.location, self.request.joiner.location); + let transient = ctx.transient_hint(&acceptor, &self.request.joiner); + self.transient_hint = transient; actions.accept_response = Some(ConnectResponse { acceptor: acceptor.clone(), + transient, }); actions.expect_connection_from = Some(self.request.joiner.clone()); - tracing::info!( - acceptor_peer = %acceptor.peer, - joiner_peer = %self.request.joiner.peer, - acceptor_loc = ?acceptor.location, - joiner_loc = ?self.request.joiner.location, - ring_distance = ?dist, - "connect: acceptance issued" - ); } if self.forwarded_to.is_none() && self.request.ttl > 0 { - match ctx.select_next_hop( - self.request.desired_location, - &self.request.visited, - recency, - ) { + match ctx.select_next_hop(self.request.desired_location, &self.request.visited) { Some(next) => { - let dist = ring_distance(next.location, Some(self.request.desired_location)); - tracing::info!( + tracing::debug!( target = %self.request.desired_location, ttl = self.request.ttl, next_peer = %next.peer, - next_loc = ?next.location, - ring_distance_to_target = ?dist, "connect: forwarding join request to next hop" ); let mut forward_req = self.request.clone(); @@ -260,7 +242,7 @@ actions.forward = Some((next, forward_snapshot)); } None => { - tracing::info!( + tracing::debug!( target = %self.request.desired_location, ttl = self.request.ttl, visited = ?self.request.visited, @@ -308,50 +290,27 @@ impl RelayContext for RelayEnv<'_> { &self, desired_location: Location, visited: &[PeerKeyLocation], - recency: &HashMap<PeerId, Instant>, ) -> Option<PeerKeyLocation> { let skip = VisitedPeerIds { peers: visited }; let router = self.op_manager.ring.router.read(); - let candidates = self.op_manager.ring.connection_manager.routing_candidates( desired_location, None, skip, ); - - // Prefer least recently forwarded peers. Missing recency wins; otherwise pick the oldest - // recency bucket, then let the router choose among that bucket. This keeps routing bias - // toward the target while avoiding hammering a single neighbor.
- let mut best_key: Option<Option<Instant>> = None; - let mut best: Vec<PeerKeyLocation> = Vec::new(); - for cand in candidates { - let key = recency.get(&cand.peer).cloned(); - match best_key { - None => { - best_key = Some(key); - best = vec![cand.clone()]; - } - Some(k) => { - if key < k { - best_key = Some(key); - best = vec![cand.clone()]; - } else if key == k { - best.push(cand.clone()); - } - } - } - } + self.op_manager + .ring + .connection_manager + .routing(desired_location, None, skip, &router) + } - if best.is_empty() { - None - } else { - router.select_peer(best.iter(), desired_location).cloned() - } + fn transient_hint(&self, _acceptor: &PeerKeyLocation, _joiner: &PeerKeyLocation) -> bool { + // Courtesy slots still piggyback on regular connections. Flag the first acceptance so the + // joiner can prioritise it, and keep the logic simple until dedicated transient tracking + // is wired in (see transient-connection-budget branch). + self.op_manager.ring.open_connections() == 0 } } #[derive(Debug)] pub struct AcceptedPeer { pub peer: PeerKeyLocation, + pub transient: bool, } #[derive(Debug, Default)] @@ -372,6 +331,7 @@ impl JoinerState { self.last_progress = now; acceptance.new_acceptor = Some(AcceptedPeer { peer: response.acceptor.clone(), + transient: response.transient, }); acceptance.assigned_location = self.accepted.len() == 1; } @@ -395,10 +355,6 @@ pub(crate) struct ConnectOp { pub(crate) gateway: Option<Box<PeerKeyLocation>>, pub(crate) backoff: Option<ExponentialBackoff>, pub(crate) desired_location: Option<Location>, - /// Tracks when we last forwarded this connect to a peer, to avoid hammering the same - /// neighbors when no acceptors are available. Peers without an entry are treated as - /// immediately eligible. - recency: HashMap<PeerId, Instant>, } impl ConnectOp { @@ -423,7 +379,6 @@ gateway: gateway.map(Box::new), backoff, desired_location: Some(desired_location), - recency: HashMap::new(), } } @@ -436,6 +391,7 @@ upstream, request, forwarded_to: None, + transient_hint: false, observed_sent: false, accepted_locally: false, })); @@ -445,7 +401,6 @@ gateway: None, backoff: None, desired_location: None, - recency: HashMap::new(), } } @@ -525,15 +480,7 @@ ) -> Option { match self.state.as_mut() { Some(ConnectState::WaitingForResponses(state)) => { - tracing::info!( - acceptor = %response.acceptor.peer, - acceptor_loc = ?response.acceptor.location, - "connect: joiner received ConnectResponse" - ); let result = state.register_acceptance(response, now); - if let Some(new_acceptor) = &result.new_acceptor { - self.recency.remove(&new_acceptor.peer.peer); - } if result.satisfied { self.state = Some(ConnectState::Completed); } @@ -560,6 +507,7 @@ upstream: upstream.clone(), request: request.clone(), forwarded_to: None, + transient_hint: false, observed_sent: false, accepted_locally: false, }))); @@ -570,7 +518,7 @@ state.upstream = upstream; state.request = request; let upstream_snapshot = state.upstream.clone(); - state.handle_request(ctx, &upstream_snapshot, &self.recency) + state.handle_request(ctx, &upstream_snapshot) } _ => RelayActions::default(), } @@ -655,8 +603,6 @@ impl Operation for ConnectOp { } if let Some((next, request)) = actions.forward { - // Record recency for this forward to avoid hammering the same neighbor.
- self.recency.insert(next.peer.clone(), Instant::now()); let forward_msg = ConnectMsg::Request { id: self.id, from: env.self_location().clone(), @@ -720,7 +666,7 @@ peer: new_acceptor.peer.peer.clone(), tx: self.id, callback, - is_gw: false, + is_gw: new_acceptor.transient, }) .await?; @@ -817,19 +763,11 @@ fn store_operation_state_with_msg(op: &mut ConnectOp, msg: Option<ConnectMsg>) - gateway: op.gateway.clone(), backoff: op.backoff.clone(), desired_location: op.desired_location, - recency: op.recency.clone(), })) }), } } -fn ring_distance(a: Option<Location>, b: Option<Location>) -> Option<f64> { - match (a, b) { - (Some(a), Some(b)) => Some(a.distance(b).as_f64()), - _ => None, - } -} - #[tracing::instrument(fields(peer = %op_manager.ring.connection_manager.pub_key), skip_all)] pub(crate) async fn join_ring_request( backoff: Option<ExponentialBackoff>, @@ -842,6 +780,15 @@ OpError::ConnError(ConnectionError::LocationUnknown) })?; + tracing::debug!( + peer = %gateway.peer, + reserved_connections = op_manager + .ring + .connection_manager + .get_reserved_connections(), + "join_ring_request: evaluating gateway connection attempt" + ); + if !op_manager .ring .connection_manager @@ -926,56 +873,71 @@ pub(crate) async fn initial_join_procedure( gateways.len() ); + let mut in_flight_gateways = HashSet::new(); + loop { let open_conns = op_manager.ring.open_connections(); let unconnected_gateways: Vec<_> = op_manager.ring.is_not_connected(gateways.iter()).collect(); + let available_gateways: Vec<_> = unconnected_gateways + .into_iter() + .filter(|gateway| !in_flight_gateways.contains(&gateway.peer)) + .collect(); tracing::debug!( - "Connection status: open_connections = {}, unconnected_gateways = {}", - open_conns, - unconnected_gateways.len() + open_connections = open_conns, + inflight_gateway_dials = in_flight_gateways.len(), + available_gateways = available_gateways.len(), + "Connection status before join attempt" ); - let unconnected_count = unconnected_gateways.len(); + let available_count = available_gateways.len(); - if open_conns < BOOTSTRAP_THRESHOLD && unconnected_count > 0 { + if open_conns < BOOTSTRAP_THRESHOLD && available_count > 0 { tracing::info!( "Below bootstrap threshold ({} < {}), attempting to connect to {} gateways", open_conns, BOOTSTRAP_THRESHOLD, - number_of_parallel_connections.min(unconnected_count) + number_of_parallel_connections.min(available_count) ); - let select_all = FuturesUnordered::new(); - for gateway in unconnected_gateways + let mut select_all = FuturesUnordered::new(); + for gateway in available_gateways .into_iter() .shuffle() .take(number_of_parallel_connections) { tracing::info!(%gateway, "Attempting connection to gateway"); + in_flight_gateways.insert(gateway.peer.clone()); let op_manager = op_manager.clone(); + let gateway_clone = gateway.clone(); select_all.push(async move { - (join_ring_request(None, gateway, &op_manager).await, gateway) + ( + join_ring_request(None, &gateway_clone, &op_manager).await, + gateway_clone, + ) }); } - select_all - .for_each(|(res, gateway)| async move { - if let Err(error) = res { - if !matches!( - error, - OpError::ConnError( - crate::node::ConnectionError::UnwantedConnection - ) - ) { - tracing::error!( - %gateway, - %error, - "Failed while attempting connection to gateway" - ); - } + while let Some((res, gateway)) = select_all.next().await { + if let Err(error) = res { + if !matches!( + error, + OpError::ConnError(crate::node::ConnectionError::UnwantedConnection) + ) { + tracing::error!( + %gateway,
%error, + "Failed while attempting connection to gateway" + ); } - }) - .await; + } + in_flight_gateways.remove(&gateway.peer); + } + } else if open_conns < BOOTSTRAP_THRESHOLD && available_count == 0 { + tracing::debug!( + open_connections = open_conns, + inflight = in_flight_gateways.len(), + "Below threshold but all gateways are already connected or in-flight" + ); } else if open_conns >= BOOTSTRAP_THRESHOLD { tracing::trace!( "Have {} connections (>= threshold of {}), not attempting gateway connections", @@ -1022,6 +984,7 @@ mod tests { self_loc: PeerKeyLocation, accept: bool, next_hop: Option, + transient: bool, } impl TestRelayContext { @@ -1030,6 +993,7 @@ mod tests { self_loc, accept: true, next_hop: None, + transient: false, } } @@ -1042,6 +1006,11 @@ mod tests { self.next_hop = hop; self } + + fn transient(mut self, transient: bool) -> Self { + self.transient = transient; + self + } } impl RelayContext for TestRelayContext { @@ -1057,10 +1026,13 @@ mod tests { &self, _desired_location: Location, _visited: &[PeerKeyLocation], - _recency: &HashMap, ) -> Option { self.next_hop.clone() } + + fn transient_hint(&self, _acceptor: &PeerKeyLocation, _joiner: &PeerKeyLocation) -> bool { + self.transient + } } fn make_peer(port: u16) -> PeerKeyLocation { @@ -1086,16 +1058,17 @@ mod tests { observed_addr: Some(joiner.peer.addr), }, forwarded_to: None, + transient_hint: false, observed_sent: false, accepted_locally: false, }; - let ctx = TestRelayContext::new(self_loc.clone()); - let recency = HashMap::new(); - let actions = state.handle_request(&ctx, &joiner, &recency); + let ctx = TestRelayContext::new(self_loc.clone()).transient(true); + let actions = state.handle_request(&ctx, &joiner); let response = actions.accept_response.expect("expected acceptance"); assert_eq!(response.acceptor.peer, self_loc.peer); + assert!(response.transient); assert_eq!(actions.expect_connection_from.unwrap().peer, joiner.peer); assert!(actions.forward.is_none()); } @@ -1115,6 +1088,7 @@ mod tests { observed_addr: Some(joiner.peer.addr), }, forwarded_to: None, + transient_hint: false, observed_sent: false, accepted_locally: false, }; @@ -1122,8 +1096,7 @@ mod tests { let ctx = TestRelayContext::new(self_loc) .accept(false) .next_hop(Some(next_hop.clone())); - let recency = HashMap::new(); - let actions = state.handle_request(&ctx, &joiner, &recency); + let actions = state.handle_request(&ctx, &joiner); assert!(actions.accept_response.is_none()); let (forward_to, request) = actions.forward.expect("expected forward"); @@ -1150,13 +1123,13 @@ mod tests { observed_addr: Some(observed_addr), }, forwarded_to: None, + transient_hint: false, observed_sent: false, accepted_locally: false, }; let ctx = TestRelayContext::new(self_loc); - let recency = HashMap::new(); - let actions = state.handle_request(&ctx, &joiner, &recency); + let actions = state.handle_request(&ctx, &joiner); let (target, addr) = actions .observed_address @@ -1178,11 +1151,13 @@ mod tests { let response = ConnectResponse { acceptor: acceptor.clone(), + transient: false, }; let result = state.register_acceptance(&response, Instant::now()); assert!(result.satisfied); let new = result.new_acceptor.expect("expected new acceptor"); assert_eq!(new.peer.peer, acceptor.peer); + assert!(!new.transient); } #[test] diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index e1f9d9512..a7f906f28 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -365,6 +365,7 
@@ impl ConnectionManager { self.peer_key.lock().clone() } + #[allow(dead_code)] pub fn is_gateway(&self) -> bool { self.is_gateway } @@ -607,6 +608,15 @@ impl ConnectionManager { router: &Router, ) -> Option<PeerKeyLocation> { let connections = self.connections_by_location.read(); + tracing::debug!( + total_locations = connections.len(), + self_peer = self + .get_peer_key() + .as_ref() + .map(|id| id.to_string()) + .unwrap_or_else(|| "unknown".into()), + "routing: considering connections" + ); let peers = connections.values().filter_map(|conns| { let conn = conns.choose(&mut rand::rng())?; if self.is_transient(&conn.location.peer) { diff --git a/crates/core/src/ring/mod.rs b/crates/core/src/ring/mod.rs index 6233d71e2..438602b41 100644 --- a/crates/core/src/ring/mod.rs +++ b/crates/core/src/ring/mod.rs @@ -605,13 +605,18 @@ impl Ring { %ideal_location, num_connections, skip_list_size = skip_list.len(), + self_peer = %self.connection_manager.get_peer_key().as_ref().map(|id| id.to_string()).unwrap_or_else(|| "unknown".into()), "Looking for peer to route through" ); if let Some(target) = self.connection_manager .routing(ideal_location, None, skip_list, &router) { - tracing::debug!(query_target = %target, "Found routing target"); + tracing::info!( + query_target = %target, + %ideal_location, + "connection_maintenance selected routing target" + ); target } else { tracing::warn!( From 107a7d191ee1d05de0b8798b111a0c0faecf0ba5 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Wed, 19 Nov 2025 21:36:13 -0600 Subject: [PATCH 10/25] test: fix ExpectedInboundTracker helper for transient rename --- crates/core/src/node/network_bridge/handshake.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/src/node/network_bridge/handshake.rs b/crates/core/src/node/network_bridge/handshake.rs index c2f3c8b31..ef516a0ae 100644 --- a/crates/core/src/node/network_bridge/handshake.rs +++ b/crates/core/src/node/network_bridge/handshake.rs @@ -184,7 +184,7 @@ impl ExpectedInboundTracker { #[cfg(test)] fn contains(&self, addr: SocketAddr) -> bool { - self.entries.contains_key(&addr) + self.entries.contains_key(&addr.ip()) } } From 985cb175795114b8c2e161c1639374375f5676b0 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Wed, 19 Nov 2025 22:24:04 -0600 Subject: [PATCH 11/25] feat: expose connection tuning and bump test harness --- Cargo.lock | 2 +- crates/core/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 38a2c9f91..6f2dfb7f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1813,7 +1813,7 @@ dependencies = [ [[package]] name = "freenet-test-network" -version = "0.1.1" +version = "0.1.2" dependencies = [ "anyhow", "chrono", diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index bfea19de1..98d266021 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -90,7 +90,7 @@ arbitrary = { features = ["derive"], version = "1" } chrono = { features = ["arbitrary"], workspace = true } freenet-stdlib = { features = ["net", "testing"], workspace = true } freenet-macros = { path = "../freenet-macros" } -freenet-test-network = { version = "0.1.1", path = "../../../../freenet-test-network" } +freenet-test-network = { version = "0.1.2", path = "../../../../freenet-test-network" } httptest = "0.16" statrs = "0.18" tempfile = "3" From 35b87c3a48801c7b1059d2392782547b4d3ef606 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Wed, 19 Nov 2025 23:08:46 -0600 Subject: [PATCH 12/25] test: instrument neighbor candidates and live tx tracking ---
crates/core/src/ring/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/core/src/ring/mod.rs b/crates/core/src/ring/mod.rs index 438602b41..af0f970f9 100644 --- a/crates/core/src/ring/mod.rs +++ b/crates/core/src/ring/mod.rs @@ -482,8 +482,10 @@ impl Ring { let neighbor_locations = { let peers = self.connection_manager.get_connections_by_location(); tracing::debug!( - "Maintenance task: current connections = {}, checking topology", - current_connections + "Maintenance task: current connections = {}, candidates = {}, live_tx_peers = {}", + current_connections, + peers.len(), + live_tx_tracker.len() ); peers .iter() From 3bdda4596a84cfe588946853ac3f84e9e758a023 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 20 Nov 2025 18:33:01 -0600 Subject: [PATCH 13/25] fix: transient connection handling and viz tooling --- Cargo.lock | 19 +- crates/core/Cargo.toml | 3 +- .../src/node/network_bridge/p2p_protoc.rs | 29 +- crates/core/src/operations/connect.rs | 24 +- crates/core/src/ring/connection_manager.rs | 384 +++++------------- crates/core/src/ring/mod.rs | 73 ++-- .../core/src/transport/connection_handler.rs | 4 +- 7 files changed, 217 insertions(+), 319 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6f2dfb7f5..611aeb8bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1227,7 +1227,7 @@ dependencies = [ "libc", "option-ext", "redox_users 0.5.2", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -1429,7 +1429,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -1686,6 +1686,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "sha2", "sqlx", "statrs", "stretto", @@ -1813,7 +1814,7 @@ dependencies = [ [[package]] name = "freenet-test-network" -version = "0.1.2" +version = "0.1.3" dependencies = [ "anyhow", "chrono", @@ -2429,7 +2430,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.1", + "socket2 0.5.10", "system-configuration", "tokio", "tower-service", @@ -2681,7 +2682,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -3199,7 +3200,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -4320,7 +4321,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -5193,7 +5194,7 @@ dependencies = [ "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -6443,7 +6444,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 98d266021..f86cfcc81 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -65,6 +65,7 @@ xz2 = { version = "0.1" } reqwest = { version = "0.12", features = ["json"] } rsa = { version = "0.9", features = ["serde", "pem"] } pkcs8 = { version = "0.10", features = ["std", "pem"] } +sha2 = "0.10" # Tracing 
deps opentelemetry = "0.31" @@ -90,7 +91,7 @@ arbitrary = { features = ["derive"], version = "1" } chrono = { features = ["arbitrary"], workspace = true } freenet-stdlib = { features = ["net", "testing"], workspace = true } freenet-macros = { path = "../freenet-macros" } -freenet-test-network = { version = "0.1.2", path = "../../../../freenet-test-network" } +freenet-test-network = { version = "0.1.3", path = "../../../../freenet-test-network" } httptest = "0.16" statrs = "0.18" tempfile = "3" diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index f3b0fcc51..acb899457 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -635,6 +635,12 @@ impl P2pConnManager { "Failed to enqueue DropConnection command" ); } + // Immediately prune topology counters so we don't leak open connection slots. + ctx.bridge + .op_manager + .ring + .prune_connection(peer.clone()) + .await; if let Some(conn) = ctx.connections.remove(&peer) { // TODO: review: this could potentially leave garbage tasks in the background with peer listener match timeout( @@ -1723,11 +1729,32 @@ impl P2pConnManager { let pending_loc = connection_manager.prune_in_transit_connection(&peer_id); if !is_transient { let loc = pending_loc.unwrap_or_else(|| Location::from_address(&peer_id.addr)); + // Re-apply admission logic on promotion to avoid bypassing capacity/heuristic checks. + let should_accept = connection_manager.should_accept(loc, &peer_id); + if !should_accept { + tracing::warn!( + %peer_id, + %loc, + "handle_successful_connection: promotion rejected by admission logic" + ); + return Ok(()); + } + let current = connection_manager.connection_count(); + if current >= connection_manager.max_connections { + tracing::warn!( + %peer_id, + current_connections = current, + max_connections = connection_manager.max_connections, + %loc, + "handle_successful_connection: rejecting new connection to enforce cap" + ); + return Ok(()); + } tracing::info!(remote = %peer_id, %loc, "handle_successful_connection: promoting connection into ring"); self.bridge .op_manager .ring - .add_connection(loc, peer_id.clone(), false) + .add_connection(loc, peer_id.clone(), true) .await; } else { // Update location now that we know it; budget was reserved before any work. 
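+ // (The transient branch deliberately skips add_connection: the link stays outside
+ // connections_by_location until connect_peer explicitly promotes it.)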
diff --git a/crates/core/src/operations/connect.rs b/crates/core/src/operations/connect.rs index 0f55fab66..0f277056a 100644 --- a/crates/core/src/operations/connect.rs +++ b/crates/core/src/operations/connect.rs @@ -215,6 +215,7 @@ impl RelayState { if !self.accepted_locally && ctx.should_accept(&self.request.joiner) { self.accepted_locally = true; let acceptor = ctx.self_location().clone(); + let dist = ring_distance(acceptor.location, self.request.joiner.location); let transient = ctx.transient_hint(&acceptor, &self.request.joiner); self.transient_hint = transient; actions.accept_response = Some(ConnectResponse { @@ -222,15 +223,27 @@ transient, }); actions.expect_connection_from = Some(self.request.joiner.clone()); + tracing::info!( + acceptor_peer = %acceptor.peer, + joiner_peer = %self.request.joiner.peer, + acceptor_loc = ?acceptor.location, + joiner_loc = ?self.request.joiner.location, + ring_distance = ?dist, + transient, + "connect: acceptance issued" + ); } if self.forwarded_to.is_none() && self.request.ttl > 0 { match ctx.select_next_hop(self.request.desired_location, &self.request.visited) { Some(next) => { - tracing::debug!( + let dist = ring_distance(next.location, Some(self.request.desired_location)); + tracing::info!( target = %self.request.desired_location, ttl = self.request.ttl, next_peer = %next.peer, + next_loc = ?next.location, + ring_distance_to_target = ?dist, "connect: forwarding join request to next hop" ); let mut forward_req = self.request.clone(); @@ -242,7 +255,7 @@ actions.forward = Some((next, forward_snapshot)); } None => { - tracing::debug!( + tracing::info!( target = %self.request.desired_location, ttl = self.request.ttl, visited = ?self.request.visited, @@ -768,6 +781,13 @@ } } +fn ring_distance(a: Option<Location>, b: Option<Location>) -> Option<f64> { + match (a, b) { + (Some(a), Some(b)) => Some(a.distance(b).as_f64()), + _ => None, + } +} + #[tracing::instrument(fields(peer = %op_manager.ring.connection_manager.pub_key), skip_all)] pub(crate) async fn join_ring_request( backoff: Option<ExponentialBackoff>, diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index a7f906f28..4e515f696 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -22,10 +22,8 @@ pub(crate) struct TransientEntry { #[derive(Clone)] pub(crate) struct ConnectionManager { - open_connections: Arc<AtomicUsize>, - reserved_connections: Arc<AtomicUsize>, + pending_reservations: Arc<RwLock<BTreeMap<PeerId, Location>>>, pub(super) location_for_peer: Arc<RwLock<BTreeMap<PeerId, Location>>>, - pending_locations: Arc<RwLock<BTreeMap<PeerId, Location>>>, pub(super) topology_manager: Arc<RwLock<TopologyManager>>, connections_by_location: Arc<RwLock<BTreeMap<Location, Vec<Connection>>>>, /// Interim connections ongoing handshake or successfully open connections @@ -125,9 +123,7 @@ impl ConnectionManager { Self { connections_by_location: Arc::new(RwLock::new(BTreeMap::new())), location_for_peer: Arc::new(RwLock::new(BTreeMap::new())), - pending_locations: Arc::new(RwLock::new(BTreeMap::new())), - open_connections: Arc::new(AtomicUsize::new(0)), - reserved_connections: Arc::new(AtomicUsize::new(0)), + pending_reservations: Arc::new(RwLock::new(BTreeMap::new())), topology_manager, own_location: own_location.into(), peer_key: Arc::new(Mutex::new(peer_id)), @@ -150,12 +146,8 @@ impl ConnectionManager { /// Will panic if the node checking for this condition has no location assigned.
pub fn should_accept(&self, location: Location, peer_id: &PeerId) -> bool { tracing::info!("Checking if should accept connection"); - let open = self - .open_connections - .load(std::sync::atomic::Ordering::SeqCst); - let reserved_before = self - .reserved_connections - .load(std::sync::atomic::Ordering::SeqCst); + let open = self.connection_count(); + let reserved_before = self.pending_reservations.read().len(); tracing::info!( %peer_id, @@ -168,16 +160,6 @@ impl ConnectionManager { "should_accept: evaluating direct acceptance guard" ); - if self.has_connection_or_pending(peer_id) { - tracing::debug!( - %peer_id, - open, - reserved_before, - "Peer already connected; rejecting duplicate reservation" - ); - return false; - } - if self.is_gateway && (open > 0 || reserved_before > 0) { tracing::info!( %peer_id, @@ -187,34 +169,16 @@ impl ConnectionManager { ); } - let reserved_before = loop { - let current = self - .reserved_connections - .load(std::sync::atomic::Ordering::SeqCst); - if current == usize::MAX { - tracing::error!( - %peer_id, - "reserved connection counter overflowed; rejecting new connection" - ); - return false; - } - match self.reserved_connections.compare_exchange( - current, - current + 1, - std::sync::atomic::Ordering::SeqCst, - std::sync::atomic::Ordering::SeqCst, - ) { - Ok(_) => break current, - Err(actual) => { - tracing::debug!( - %peer_id, - expected = current, - actual, - "reserved connection counter changed concurrently; retrying" - ); - } - } - }; + if self.location_for_peer.read().get(peer_id).is_some() { + // We've already accepted this peer (pending or active); treat as a no-op acceptance. + tracing::debug!(%peer_id, "Peer already pending/connected; acknowledging acceptance"); + return true; + } + + { + let mut pending = self.pending_reservations.write(); + pending.insert(peer_id.clone(), location); + } let total_conn = match reserved_before .checked_add(1) @@ -228,70 +192,66 @@ impl ConnectionManager { open, "connection counters would overflow; rejecting connection" ); - self.reserved_connections - .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); + self.pending_reservations.write().remove(peer_id); return false; } }; - let accepted = if open == 0 { + if open == 0 { tracing::debug!(%peer_id, "should_accept: first connection -> accepting"); - true - } else { - const GATEWAY_DIRECT_ACCEPT_LIMIT: usize = 2; - if self.is_gateway { - let direct_total = open + reserved_before; - if direct_total >= GATEWAY_DIRECT_ACCEPT_LIMIT { - tracing::info!( - %peer_id, - open, - reserved_before, - limit = GATEWAY_DIRECT_ACCEPT_LIMIT, - "Gateway reached direct-accept limit; forwarding join request instead" - ); - self.reserved_connections - .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); - tracing::info!(%peer_id, "should_accept: gateway direct-accept limit hit, forwarding instead"); - return false; - } - } - - if total_conn < self.min_connections { - tracing::info!(%peer_id, total_conn, "should_accept: accepted (below min connections)"); - true - } else if total_conn >= self.max_connections { - tracing::info!(%peer_id, total_conn, "should_accept: rejected (max connections reached)"); - false - } else { - let accepted = self - .topology_manager - .write() - .evaluate_new_connection(location, Instant::now()) - .unwrap_or(true); + return true; + } + const GATEWAY_DIRECT_ACCEPT_LIMIT: usize = 2; + if self.is_gateway { + let direct_total = open + reserved_before; + if direct_total >= GATEWAY_DIRECT_ACCEPT_LIMIT { tracing::info!( %peer_id, - total_conn, - accepted, - 
"should_accept: topology manager decision" + open, + reserved_before, + limit = GATEWAY_DIRECT_ACCEPT_LIMIT, + "Gateway reached direct-accept limit; forwarding join request instead" ); - accepted + self.pending_reservations.write().remove(peer_id); + tracing::info!(%peer_id, "should_accept: gateway direct-accept limit hit, forwarding instead"); + return false; } - }; + } + let accepted = if total_conn < self.min_connections { + tracing::info!(%peer_id, total_conn, "should_accept: accepted (below min connections)"); + true + } else if total_conn >= self.max_connections { + tracing::info!(%peer_id, total_conn, "should_accept: rejected (max connections reached)"); + false + } else { + let accepted = self + .topology_manager + .write() + .evaluate_new_connection(location, Instant::now()) + .unwrap_or(true); + + tracing::info!( + %peer_id, + total_conn, + accepted, + "should_accept: topology manager decision" + ); + accepted + }; tracing::info!( %peer_id, accepted, total_conn, open_connections = open, - reserved_connections = self - .reserved_connections - .load(std::sync::atomic::Ordering::SeqCst), + reserved_connections = self.pending_reservations.read().len(), + max_connections = self.max_connections, + min_connections = self.min_connections, "should_accept: final decision" ); if !accepted { - self.reserved_connections - .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); + self.pending_reservations.write().remove(peer_id); } else { tracing::info!(%peer_id, total_conn, "should_accept: accepted (reserving spot)"); self.record_pending_location(peer_id, location); @@ -301,11 +261,11 @@ impl ConnectionManager { /// Record the advertised location for a peer that we have decided to accept. /// - /// Pending peers are tracked separately so that other operations cannot route through them - /// until the handshake is fully complete. Once the connection is established the entry is - /// removed automatically via `prune_in_transit_connection`. + /// This makes the peer discoverable to the routing layer even before the connection + /// is fully established. The entry is removed automatically if the handshake fails + /// via `prune_in_transit_connection`. 
pub fn record_pending_location(&self, peer_id: &PeerId, location: Location) { - let mut locations = self.pending_locations.write(); + let mut locations = self.location_for_peer.write(); let entry = locations.entry(peer_id.clone()); match entry { Entry::Occupied(_) => { @@ -459,25 +419,31 @@ impl ConnectionManager { pub fn add_connection(&self, loc: Location, peer: PeerId, was_reserved: bool) { tracing::info!(%peer, %loc, %was_reserved, "Adding connection to topology"); debug_assert!(self.get_peer_key().expect("should be set") != peer); - { - let mut pending = self.pending_locations.write(); - pending.remove(&peer); - } if was_reserved { - let old = self - .reserved_connections - .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); - #[cfg(debug_assertions)] - { - tracing::debug!(old, "Decremented reserved connections"); - if old == 0 { - panic!("Underflow of reserved connections"); + self.pending_reservations.write().remove(&peer); + } + let mut lop = self.location_for_peer.write(); + let previous_location = lop.insert(peer.clone(), loc); + drop(lop); + + if let Some(prev_loc) = previous_location { + tracing::info!( + %peer, + %prev_loc, + %loc, + "add_connection: replacing existing connection for peer" + ); + let mut cbl = self.connections_by_location.write(); + if let Some(prev_list) = cbl.get_mut(&prev_loc) { + if let Some(pos) = prev_list.iter().position(|c| c.location.peer == peer) { + prev_list.swap_remove(pos); + } + if prev_list.is_empty() { + cbl.remove(&prev_loc); } } - let _ = old; } - let mut lop = self.location_for_peer.write(); - lop.insert(peer.clone(), loc); + { let mut cbl = self.connections_by_location.write(); cbl.entry(loc).or_default().push(Connection { @@ -485,12 +451,8 @@ impl ConnectionManager { peer: peer.clone(), location: Some(loc), }, - open_at: Instant::now(), }); } - self.open_connections - .fetch_add(1, std::sync::atomic::Ordering::SeqCst); - std::mem::drop(lop); } pub fn update_peer_identity(&self, old_peer: &PeerId, new_peer: PeerId) -> bool { @@ -530,7 +492,6 @@ impl ConnectionManager { peer: new_peer, location: Some(loc), }, - open_at: Instant::now(), }); } @@ -541,54 +502,45 @@ impl ConnectionManager { let connection_type = if is_alive { "active" } else { "in transit" }; tracing::debug!(%peer, "Pruning {} connection", connection_type); - let loc = if is_alive { - let mut locations_for_peer = self.location_for_peer.write(); - match locations_for_peer.remove(peer) { - Some(loc) => { - let conns = &mut *self.connections_by_location.write(); - if let Some(conns) = conns.get_mut(&loc) { - if let Some(pos) = conns.iter().position(|c| &c.location.peer == peer) { - conns.swap_remove(pos); - } - } - loc - } - None => { - tracing::debug!("no location found for peer, skip pruning"); - return None; - } - } - } else { - match self.pending_locations.write().remove(peer) { - Some(loc) => loc, - None => { - tracing::debug!("no pending location found for peer, skip pruning"); - self.reserved_connections - .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); - return None; + let mut locations_for_peer = self.location_for_peer.write(); + + let Some(loc) = locations_for_peer.remove(peer) else { + if is_alive { + tracing::debug!("no location found for peer, skip pruning"); + return None; + } else { + let removed = self.pending_reservations.write().remove(peer).is_some(); + if !removed { + tracing::warn!( + %peer, + "prune_connection: no pending reservation to release for in-transit peer" + ); } } + return None; }; - if is_alive { - self.open_connections - .fetch_sub(1, 
std::sync::atomic::Ordering::SeqCst); - } else { - self.reserved_connections - .fetch_sub(1, std::sync::atomic::Ordering::SeqCst); + let conns = &mut *self.connections_by_location.write(); + if let Some(conns) = conns.get_mut(&loc) { + if let Some(pos) = conns.iter().position(|c| &c.location.peer == peer) { + conns.swap_remove(pos); + } } - Some(loc) - } + if !is_alive { + self.pending_reservations.write().remove(peer); + } - pub(super) fn get_open_connections(&self) -> usize { - self.open_connections - .load(std::sync::atomic::Ordering::SeqCst) + Some(loc) } - pub(crate) fn get_reserved_connections(&self) -> usize { - self.reserved_connections - .load(std::sync::atomic::Ordering::SeqCst) + pub(crate) fn connection_count(&self) -> usize { + // Count only established connections tracked by location buckets. + self.connections_by_location + .read() + .values() + .map(|conns| conns.len()) + .sum() } pub(super) fn get_connections_by_location(&self) -> BTreeMap<Location, Vec<Connection>> { self.connections_by_location.read().clone() } @@ -648,120 +600,4 @@ let read = self.location_for_peer.read(); read.keys().cloned().collect::<Vec<_>>().into_iter() } - - pub fn has_connection_or_pending(&self, peer: &PeerId) -> bool { - if self.location_for_peer.read().contains_key(peer) { - return true; - } - self.pending_locations.read().contains_key(peer) - } - - #[cfg(test)] - pub(crate) fn is_pending_connection(&self, peer: &PeerId) -> bool { - self.pending_locations.read().contains_key(peer) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::topology::rate::Rate; - use crate::transport::TransportKeypair; - use std::net::{IpAddr, Ipv4Addr, SocketAddr}; - use std::sync::atomic::{AtomicU64, Ordering}; - use std::time::Duration; - - fn make_connection_manager() -> ConnectionManager { - let keypair = TransportKeypair::new(); - let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 4100); - let own_location = Location::from_address(&addr); - let atomic_loc = AtomicU64::new(u64::from_le_bytes(own_location.as_f64().to_le_bytes())); - let self_peer = PeerId::new(addr, keypair.public().clone()); - ConnectionManager::init( - Rate::new_per_second(10_000.0), - Rate::new_per_second(10_000.0), - 1, - 32, - 4, - (keypair.public().clone(), Some(self_peer), atomic_loc), - false, - 4, - Duration::from_secs(30), - ) - } - - fn make_peer(port: u16) -> PeerId { - let keypair = TransportKeypair::new(); - let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), port); - PeerId::new(addr, keypair.public().clone()) - } - - #[test] - fn pending_connections_hidden_from_known_locations() { - let manager = make_connection_manager(); - let peer = make_peer(4200); - let location = Location::from_address(&peer.addr); - - assert!(manager.should_accept(location, &peer)); - assert!(manager.is_pending_connection(&peer)); - assert!( - !manager.get_known_locations().contains_key(&peer), - "pending connection leaked into established pool" - ); - - let restored = manager - .prune_in_transit_connection(&peer) - .expect("pending location should exist"); - assert_eq!(restored, location); - - manager.add_connection(restored, peer.clone(), false); - assert!( - !manager.is_pending_connection(&peer), - "pending slot should be cleared after promotion" - ); - - let known = manager.get_known_locations(); - assert_eq!(known.get(&peer), Some(&location)); - } - - #[test] - fn should_accept_does_not_leak_reservations_for_duplicate_peer() { - let keypair = TransportKeypair::new(); - let peer_addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 20_000); - let peer_id =
PeerId::new(peer_addr, keypair.public().clone()); - let location = Location::from_address(&peer_addr); - - let manager = ConnectionManager::init( - Rate::new_per_second(1_000_000.0), - Rate::new_per_second(1_000_000.0), - Ring::DEFAULT_MIN_CONNECTIONS, - Ring::DEFAULT_MAX_CONNECTIONS, - Ring::DEFAULT_RAND_WALK_ABOVE_HTL, - (keypair.public().clone(), None, AtomicU64::new(0)), - false, - 32, - Duration::from_secs(30), - ); - - assert!(manager.should_accept(location, &peer_id)); - let after_first = manager.reserved_connections.load(Ordering::SeqCst); - assert_eq!(after_first, 1); - { - assert!( - manager.is_pending_connection(&peer_id), - "pending connection should be tracked separately after initial acceptance" - ); - } - - // Second attempt for the same peer should be rejected immediately. - assert!( - !manager.should_accept(location, &peer_id), - "duplicate peer should be rejected by should_accept" - ); - assert_eq!( - manager.reserved_connections.load(Ordering::SeqCst), - after_first, - "repeat should_accept calls should not leak reservations for an existing peer" - ); - } -} diff --git a/crates/core/src/ring/mod.rs b/crates/core/src/ring/mod.rs index af0f970f9..b23bf8ebe 100644 --- a/crates/core/src/ring/mod.rs +++ b/crates/core/src/ring/mod.rs @@ -3,7 +3,7 @@ //! Mainly maintains a healthy and optimal pool of connections to other peers in the network //! and routes requests to the optimal peers. -use std::collections::{BTreeSet, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::{ sync::{atomic::AtomicU64, Arc, Weak}, time::{Duration, Instant}, @@ -157,7 +157,7 @@ impl Ring { } pub fn open_connections(&self) -> usize { - self.connection_manager.get_open_connections() + self.connection_manager.connection_count() } async fn refresh_router(router: Arc<RwLock<Router>>, register: ER) { @@ -385,11 +385,6 @@ impl Ring { tracing::info!("Initializing connection maintenance task"); let is_gateway = self.is_gateway; tracing::info!(is_gateway, "Connection maintenance task starting"); - #[cfg(not(test))] - const CONNECTION_AGE_THRESOLD: Duration = Duration::from_secs(60 * 5); - #[cfg(test)] - const CONNECTION_AGE_THRESOLD: Duration = Duration::from_secs(5); - #[cfg(not(test))] const CHECK_TICK_DURATION: Duration = Duration::from_secs(60); #[cfg(test)] @@ -460,7 +455,7 @@ error })?; if live_tx.is_none() { - let conns = self.connection_manager.get_open_connections(); + let conns = self.connection_manager.connection_count(); tracing::warn!( "acquire_new returned None - likely no peers to query through (connections: {})", conns @@ -477,32 +472,50 @@ } } - let current_connections = self.connection_manager.get_open_connections(); + let current_connections = self.connection_manager.connection_count(); let pending_connection_targets = pending_conn_adds.len(); - let neighbor_locations = { - let peers = self.connection_manager.get_connections_by_location(); - tracing::debug!( - "Maintenance task: current connections = {}, candidates = {}, live_tx_peers = {}", current_connections, - peers.len(), - live_tx_tracker.len() + let peers = self.connection_manager.get_connections_by_location(); + let connections_considered: usize = peers.values().map(|c| c.len()).sum(); + + let mut neighbor_locations: BTreeMap<_, Vec<_>> = peers + .iter() + .map(|(loc, conns)| { + let conns: Vec<_> = conns + .iter() + .filter(|conn| !live_tx_tracker.has_live_connection(&conn.location.peer)) + .cloned() + .collect(); + (*loc, conns) + }) + .filter(|(_, conns)| !conns.is_empty()) + .collect(); + + if neighbor_locations.is_empty() && connections_considered > 0 {
tracing::warn!( current_connections, connections_considered, live_tx_peers = live_tx_tracker.len(), "Neighbor filtering removed all candidates; using all connections" ); + + neighbor_locations = peers .iter() - .map(|(loc, conns)| { - let conns: Vec<_> = conns - .iter() - .filter(|conn| { - conn.open_at.elapsed() > CONNECTION_AGE_THRESOLD - && !live_tx_tracker.has_live_connection(&conn.location.peer) - }) - .cloned() - .collect(); - (*loc, conns) - }) + .map(|(loc, conns)| (*loc, conns.clone())) .filter(|(_, conns)| !conns.is_empty()) - .collect() - }; + .collect(); + } + + if current_connections > self.connection_manager.max_connections { + // When over capacity, consider all connections for removal regardless of live_tx filter. + neighbor_locations = peers.clone(); + } + + tracing::debug!( "Maintenance task: current connections = {}, candidates = {}, live_tx_peers = {}", current_connections, peers.len(), live_tx_tracker.len() ); let adjustment = self .connection_manager @@ -591,7 +604,7 @@ live_tx_tracker: &LiveTransactionTracker, op_manager: &Arc<OpManager>, ) -> anyhow::Result<Option<Transaction>> { - let current_connections = self.connection_manager.get_open_connections(); + let current_connections = self.connection_manager.connection_count(); let is_gateway = self.is_gateway; tracing::info!( diff --git a/crates/core/src/transport/connection_handler.rs b/crates/core/src/transport/connection_handler.rs index 83e0650f1..7b26e75fd 100644 --- a/crates/core/src/transport/connection_handler.rs +++ b/crates/core/src/transport/connection_handler.rs @@ -445,9 +445,9 @@ impl UdpPacketsListener { } else { // Non-gateway peers: mark as expected and wait for the normal peer handshake flow. self.expected_non_gateway.insert(remote_addr.ip()); - tracing::warn!( + tracing::debug!( %remote_addr, - "unexpected peer intro from non-gateway; marking expected_non_gateway and continuing" + "unexpected peer intro; marking expected_non_gateway" ); continue; } From 74654250fd037dfcc6d14367869f3699e16e9acb Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 20 Nov 2025 19:02:55 -0600 Subject: [PATCH 14/25] fix: enforce caps on transient promotion and add cap repro test --- .../src/node/network_bridge/p2p_protoc.rs | 49 ++++++++++++++++++- crates/core/src/ring/connection_manager.rs | 16 ++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index acb899457..15101a95a 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -1294,10 +1294,57 @@ impl P2pConnManager { let loc = entry .location .unwrap_or_else(|| Location::from_address(&peer.addr)); + // Re-run admission + cap guard when promoting a transient connection.
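+ // (Mirrors the guard in handle_successful_connection so a promoted transient link
+ // cannot bypass topology admission or the max_connections cap.)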
+ let should_accept = connection_manager.should_accept(loc, &peer); + if !should_accept { + tracing::warn!( + tx = %tx, + %peer, + %loc, + "connect_peer: promotion rejected by admission logic" + ); + callback + .send_result(Err(())) + .await + .inspect_err(|err| { + tracing::debug!( + tx = %tx, + remote = %peer, + ?err, + "connect_peer: failed to notify rejected-promotion callback" + ); + }) + .ok(); + return Ok(()); + } + let current = connection_manager.connection_count(); + if current >= connection_manager.max_connections { + tracing::warn!( + tx = %tx, + %peer, + current_connections = current, + max_connections = connection_manager.max_connections, + %loc, + "connect_peer: rejecting transient promotion to enforce cap" + ); + callback + .send_result(Err(())) + .await + .inspect_err(|err| { + tracing::debug!( + tx = %tx, + remote = %peer, + ?err, + "connect_peer: failed to notify cap-rejection callback" + ); + }) + .ok(); + return Ok(()); + } self.bridge .op_manager .ring - .add_connection(loc, peer.clone(), false) + .add_connection(loc, peer.clone(), true) .await; tracing::info!(tx = %tx, remote = %peer, "connect_peer: promoted transient"); } diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index 4e515f696..e0260fd56 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -426,6 +426,22 @@ impl ConnectionManager { let previous_location = lop.insert(peer.clone(), loc); drop(lop); + // Enforce the global cap when adding a new peer (not a relocation). + if previous_location.is_none() && self.connection_count() >= self.max_connections { + tracing::warn!( + %peer, + %loc, + max = self.max_connections, + "add_connection: rejecting new connection to enforce cap" + ); + // Roll back bookkeeping since we're refusing the connection. 
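+ // (location_for_peer was already updated by the lop.insert above, so it must be unwound here.)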
+ self.location_for_peer.write().remove(&peer); + if was_reserved { + self.pending_reservations.write().remove(&peer); + } + return; + } + if let Some(prev_loc) = previous_location { tracing::info!( %peer, From 32de9239437df88f6476bb7525a79fe769e31fe4 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 20 Nov 2025 19:04:31 -0600 Subject: [PATCH 15/25] test: add small cap repro harness --- crates/core/tests/connection_cap.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/core/tests/connection_cap.rs b/crates/core/tests/connection_cap.rs index 4342244fe..82186c451 100644 --- a/crates/core/tests/connection_cap.rs +++ b/crates/core/tests/connection_cap.rs @@ -9,10 +9,12 @@ use freenet_test_network::{BuildProfile, FreenetBinary, NetworkBuilder}; #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn connection_cap_respected() -> anyhow::Result<()> { - let max_connections = freenet::config::DEFAULT_MAX_CONNECTIONS; + let max_connections = 6usize; let net = NetworkBuilder::new() .gateways(2) .peers(6) + .min_connections(4) + .max_connections(max_connections) .start_stagger(std::time::Duration::from_millis(300)) .require_connectivity(0.9) .connectivity_timeout(std::time::Duration::from_secs(40)) From 1034d91468b390de3a2cc6ed8a0c9a3133b1fccf Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Thu, 20 Nov 2025 19:23:25 -0600 Subject: [PATCH 16/25] fix: report ring connections in diagnostics and bound soak caps --- .../src/node/network_bridge/p2p_protoc.rs | 61 +++++++++++++------ crates/core/src/ring/connection_manager.rs | 2 +- crates/core/tests/large_network.rs | 16 +++++ 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index 15101a95a..e0fb5e52a 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -834,15 +834,23 @@ impl P2pConnManager { // Collect network information if config.include_network_info { - let connected_peers: Vec<_> = ctx - .connections - .keys() - .map(|p| (p.to_string(), p.addr.to_string())) - .collect(); + let cm = &op_manager.ring.connection_manager; + let connections_by_loc = cm.get_connections_by_location(); + let mut connected_peers = Vec::new(); + for conns in connections_by_loc.values() { + for conn in conns { + connected_peers.push(( + conn.location.peer.to_string(), + conn.location.peer.addr.to_string(), + )); + } + } + connected_peers.sort_by(|a, b| a.0.cmp(&b.0)); + connected_peers.dedup_by(|a, b| a.0 == b.0); response.network_info = Some(NetworkInfo { + active_connections: connected_peers.len(), connected_peers, - active_connections: ctx.connections.len(), }); } @@ -921,28 +929,43 @@ impl P2pConnManager { } } + // Collect topology-backed connection info (exclude transient transports). 
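+ // (ctx.connections also tracks transient transports, which previously inflated the
+ // active_connections count; the ring's connection manager is the authoritative view.)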
+ let cm = &op_manager.ring.connection_manager; + let connections_by_loc = cm.get_connections_by_location(); + let mut connected_peer_ids = Vec::new(); + if config.include_detailed_peer_info { + use freenet_stdlib::client_api::ConnectedPeerInfo; + for conns in connections_by_loc.values() { + for conn in conns { + connected_peer_ids.push(conn.location.peer.to_string()); + response.connected_peers_detailed.push( + ConnectedPeerInfo { + peer_id: conn.location.peer.to_string(), + address: conn.location.peer.addr.to_string(), + }, + ); + } + } + } else { + for conns in connections_by_loc.values() { + connected_peer_ids.extend( + conns.iter().map(|c| c.location.peer.to_string()), + ); + } + } + connected_peer_ids.sort(); + connected_peer_ids.dedup(); + // Collect system metrics if config.include_system_metrics { let seeding_contracts = op_manager.ring.all_network_subscriptions().len() as u32; response.system_metrics = Some(SystemMetrics { - active_connections: ctx.connections.len() as u32, + active_connections: connected_peer_ids.len() as u32, seeding_contracts, }); } - // Collect detailed peer information if requested - if config.include_detailed_peer_info { - use freenet_stdlib::client_api::ConnectedPeerInfo; - // Populate detailed peer information from actual connections - for peer in ctx.connections.keys() { - response.connected_peers_detailed.push(ConnectedPeerInfo { - peer_id: peer.to_string(), - address: peer.addr.to_string(), - }); - } - } - match timeout( Duration::from_secs(2), callback.send(QueryResult::NodeDiagnostics(response)), diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index e0260fd56..214f9715f 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -559,7 +559,7 @@ impl ConnectionManager { .sum() } - pub(super) fn get_connections_by_location(&self) -> BTreeMap<Location, Vec<Connection>> { + pub(crate) fn get_connections_by_location(&self) -> BTreeMap<Location, Vec<Connection>> { self.connections_by_location.read().clone() } diff --git a/crates/core/tests/large_network.rs b/crates/core/tests/large_network.rs index 20970a647..dc2f6d1d3 100644 --- a/crates/core/tests/large_network.rs +++ b/crates/core/tests/large_network.rs @@ -38,6 +38,8 @@ const DEFAULT_PEER_COUNT: usize = 38; const DEFAULT_SNAPSHOT_INTERVAL: Duration = Duration::from_secs(60); const DEFAULT_SNAPSHOT_ITERATIONS: usize = 5; const DEFAULT_CONNECTIVITY_TARGET: f64 = 0.75; +const DEFAULT_MIN_CONNECTIONS: usize = 5; +const DEFAULT_MAX_CONNECTIONS: usize = 7; #[tokio::test(flavor = "multi_thread", worker_threads = 4)] #[ignore = "Large soak test - run manually (see file header for instructions)"] async fn large_network_soak() -> anyhow::Result<()> { @@ -59,10 +61,20 @@ .ok() .and_then(|val| val.parse::<f64>().ok()) .unwrap_or(DEFAULT_CONNECTIVITY_TARGET); + let min_connections = env::var("SOAK_MIN_CONNECTIONS") .ok() .and_then(|val| val.parse().ok()) .unwrap_or(DEFAULT_MIN_CONNECTIONS); + let max_connections = env::var("SOAK_MAX_CONNECTIONS") .ok() .and_then(|val| val.parse().ok()) .unwrap_or(DEFAULT_MAX_CONNECTIONS); let network = TestNetwork::builder() .gateways(2) .peers(peer_count) + .min_connections(min_connections) + .max_connections(max_connections) .require_connectivity(connectivity_target) .connectivity_timeout(Duration::from_secs(120)) .preserve_temp_dirs_on_failure(true) @@ -78,6 +90,10 @@ peer_count, network.run_root().display() ); + println!( + "Min connections: {}, max connections: {}
+        min_connections, max_connections
+    );
 
     let riverctl_path = which("riverctl")
         .context("riverctl not found in PATH; install via `cargo install riverctl`")?;
 

From 47f69f277d0a33ac907468a348d9f536d6451879 Mon Sep 17 00:00:00 2001
From: Ian Clarke
Date: Thu, 20 Nov 2025 20:03:20 -0600
Subject: [PATCH 17/25] test: add warmup and ring snapshots to soak

---
 crates/core/tests/large_network.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/crates/core/tests/large_network.rs b/crates/core/tests/large_network.rs
index dc2f6d1d3..53bf5eb46 100644
--- a/crates/core/tests/large_network.rs
+++ b/crates/core/tests/large_network.rs
@@ -37,6 +37,7 @@ use which::which;
 const DEFAULT_PEER_COUNT: usize = 38;
 const DEFAULT_SNAPSHOT_INTERVAL: Duration = Duration::from_secs(60);
 const DEFAULT_SNAPSHOT_ITERATIONS: usize = 5;
+const DEFAULT_SNAPSHOT_WARMUP: Duration = Duration::from_secs(60);
 const DEFAULT_CONNECTIVITY_TARGET: f64 = 0.75;
 const DEFAULT_MIN_CONNECTIONS: usize = 5;
 const DEFAULT_MAX_CONNECTIONS: usize = 7;
@@ -61,6 +62,11 @@ async fn large_network_soak() -> anyhow::Result<()> {
         .ok()
         .and_then(|val| val.parse::<f64>().ok())
         .unwrap_or(DEFAULT_CONNECTIVITY_TARGET);
+    let snapshot_warmup = env::var("SOAK_SNAPSHOT_WARMUP_SECS")
+        .ok()
+        .and_then(|val| val.parse().ok())
+        .map(Duration::from_secs)
+        .unwrap_or(DEFAULT_SNAPSHOT_WARMUP);
     let min_connections = env::var("SOAK_MIN_CONNECTIONS")
         .ok()
         .and_then(|val| val.parse().ok())
@@ -105,6 +111,13 @@ async fn large_network_soak() -> anyhow::Result<()> {
     let snapshots_dir = network.run_root().join("large-soak");
     fs::create_dir_all(&snapshots_dir)?;
 
+    // Allow topology maintenance to run before the first snapshot.
+    println!(
+        "Waiting {:?} before first snapshot to allow topology maintenance to converge",
+        snapshot_warmup
+    );
+    sleep(snapshot_warmup).await;
+
     let mut iteration = 0usize;
     let mut next_tick = Instant::now();
     while iteration < snapshot_iterations {
@@ -113,6 +126,11 @@ async fn large_network_soak() -> anyhow::Result<()> {
         let snapshot_path = snapshots_dir.join(format!("snapshot-{iteration:02}.json"));
         fs::write(&snapshot_path, to_string_pretty(&snapshot)?)?;
 
+        // Also capture ring topology for visualizing evolution over time.
+ let ring_snapshot = network.ring_snapshot().await?; + let ring_path = snapshots_dir.join(format!("ring-{iteration:02}.json")); + fs::write(&ring_path, to_string_pretty(&ring_snapshot)?)?; + let healthy = snapshot .peers .iter() From aff9286d349a068b4ba33df80d8077ab4982dd64 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sat, 22 Nov 2025 10:11:30 -0600 Subject: [PATCH 18/25] chore: use published freenet-test-network --- crates/core/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index f86cfcc81..18695cfc2 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -91,7 +91,7 @@ arbitrary = { features = ["derive"], version = "1" } chrono = { features = ["arbitrary"], workspace = true } freenet-stdlib = { features = ["net", "testing"], workspace = true } freenet-macros = { path = "../freenet-macros" } -freenet-test-network = { version = "0.1.3", path = "../../../../freenet-test-network" } +freenet-test-network = "0.1.3" httptest = "0.16" statrs = "0.18" tempfile = "3" From 9cec60207738c1c24eabe8092f7d8dbada6a1570 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sat, 22 Nov 2025 10:39:14 -0600 Subject: [PATCH 19/25] fix: overwrite expected inbound for same peer --- crates/core/src/node/network_bridge/handshake.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/crates/core/src/node/network_bridge/handshake.rs b/crates/core/src/node/network_bridge/handshake.rs index ef516a0ae..99f527f77 100644 --- a/crates/core/src/node/network_bridge/handshake.rs +++ b/crates/core/src/node/network_bridge/handshake.rs @@ -138,14 +138,14 @@ impl ExpectedInboundTracker { tx = ?transaction, "ExpectInbound: registering expectation" ); - self.entries - .entry(peer.addr.ip()) - .or_default() - .push(ExpectedInbound { - peer, - transaction, - transient, - }); + let list = self.entries.entry(peer.addr.ip()).or_default(); + // Replace any existing expectation for the same peer/port to ensure the newest registration wins. 
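// Editorial sketch, not part of the patch: the lines below implement
// "newest registration wins" with retain-then-push. Distilled standalone,
// with a hypothetical Entry type standing in for ExpectedInbound:
struct Entry {
    port: u16,
    payload: u64, // stands in for the rest of the expectation data
}

fn upsert(list: &mut Vec<Entry>, new: Entry) {
    list.retain(|e| e.port != new.port); // drop any stale entry for the same key
    list.push(new);                      // the latest registration now wins
}
// The patch hunk continues below.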
+ list.retain(|entry| entry.peer.addr.port() != peer.addr.port()); + list.push(ExpectedInbound { + peer, + transaction, + transient, + }); } fn drop_peer(&mut self, peer: &PeerId) { From 210716f4bf9e521f9735cb006b5991c0d082ad42 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sat, 22 Nov 2025 10:49:19 -0600 Subject: [PATCH 20/25] test: avoid unsupported connection cap flags --- Cargo.lock | 18 ++++++++++-------- crates/core/tests/connection_cap.rs | 4 +--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 611aeb8bd..5f3d68c75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1227,7 +1227,7 @@ dependencies = [ "libc", "option-ext", "redox_users 0.5.2", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -1429,7 +1429,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -1815,6 +1815,8 @@ dependencies = [ [[package]] name = "freenet-test-network" version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5b74dc741d17bc57e55be2a2b2dc0b15bdb4299b77b3f779d371a379611cb13" dependencies = [ "anyhow", "chrono", @@ -2430,7 +2432,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.5.10", + "socket2 0.6.1", "system-configuration", "tokio", "tower-service", @@ -2682,7 +2684,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -3200,7 +3202,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -4321,7 +4323,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -5194,7 +5196,7 @@ dependencies = [ "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -6444,7 +6446,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.61.2", ] [[package]] diff --git a/crates/core/tests/connection_cap.rs b/crates/core/tests/connection_cap.rs index 82186c451..4342244fe 100644 --- a/crates/core/tests/connection_cap.rs +++ b/crates/core/tests/connection_cap.rs @@ -9,12 +9,10 @@ use freenet_test_network::{BuildProfile, FreenetBinary, NetworkBuilder}; #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn connection_cap_respected() -> anyhow::Result<()> { - let max_connections = 6usize; + let max_connections = freenet::config::DEFAULT_MAX_CONNECTIONS; let net = NetworkBuilder::new() .gateways(2) .peers(6) - .min_connections(4) - .max_connections(max_connections) .start_stagger(std::time::Duration::from_millis(300)) .require_connectivity(0.9) .connectivity_timeout(std::time::Duration::from_secs(40)) From 93f0222080832996d811af8e02e6ff26f88222af Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sat, 22 Nov 2025 11:51:02 -0600 Subject: [PATCH 21/25] fix: restore connection helper methods --- crates/core/src/ring/connection_manager.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git 
a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs
index 214f9715f..098bb0afb 100644
--- a/crates/core/src/ring/connection_manager.rs
+++ b/crates/core/src/ring/connection_manager.rs
@@ -559,6 +559,21 @@ impl ConnectionManager {
             .sum()
     }
 
+    #[allow(dead_code)]
+    pub(super) fn get_open_connections(&self) -> usize {
+        self.connection_count()
+    }
+
+    #[allow(dead_code)]
+    pub(crate) fn get_reserved_connections(&self) -> usize {
+        self.pending_reservations.read().len()
+    }
+
+    pub fn has_connection_or_pending(&self, peer: &PeerId) -> bool {
+        self.location_for_peer.read().contains_key(peer)
+            || self.pending_reservations.read().contains_key(peer)
+    }
+
     pub(crate) fn get_connections_by_location(&self) -> BTreeMap<Location, Vec<Connection>> {
         self.connections_by_location.read().clone()
     }

From 367d2e374c94be59e82330a345db405a1d332c43 Mon Sep 17 00:00:00 2001
From: Ian Clarke
Date: Sat, 22 Nov 2025 12:23:30 -0600
Subject: [PATCH 22/25] fix: promote transient gateway links

---
 .../core/src/node/network_bridge/p2p_protoc.rs |  7 ++++++-
 crates/core/src/ring/connection_manager.rs     | 17 -----------------
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs
index e0fb5e52a..5d7287b44 100644
--- a/crates/core/src/node/network_bridge/p2p_protoc.rs
+++ b/crates/core/src/node/network_bridge/p2p_protoc.rs
@@ -1794,10 +1794,12 @@ impl P2pConnManager {
             tracing::debug!(self_peer = %self.bridge.op_manager.ring.connection_manager.pub_key, %peer_id, conn_map_size = self.connections.len(), "[CONN_TRACK] SKIP INSERT: OutboundConnectionSuccessful - connection already exists in HashMap");
         }
 
+        let promote_to_ring = !is_transient || connection_manager.is_gateway();
+
         if newly_inserted {
             tracing::info!(remote = %peer_id, is_transient, "handle_successful_connection: inserted new connection entry");
             let pending_loc = connection_manager.prune_in_transit_connection(&peer_id);
-            if !is_transient {
+            if promote_to_ring {
                 let loc = pending_loc.unwrap_or_else(|| Location::from_address(&peer_id.addr));
                 // Re-apply admission logic on promotion to avoid bypassing capacity/heuristic checks.
                 let should_accept = connection_manager.should_accept(loc, &peer_id);
@@ -1826,6 +1828,9 @@ impl P2pConnManager {
                     .ring
                     .add_connection(loc, peer_id.clone(), true)
                     .await;
+                if is_transient {
+                    connection_manager.drop_transient(&peer_id);
+                }
             } else {
                 // Update location now that we know it; budget was reserved before any work.
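// Editorial sketch, not part of the patch: "budget was reserved before any work"
// refers to the transient budget pattern visible later in this series (PATCH 25
// context): optimistically increment, then undo if the increment raced past the cap.
use std::sync::atomic::{AtomicUsize, Ordering};

fn try_reserve(in_use: &AtomicUsize, budget: usize) -> bool {
    let prev = in_use.fetch_add(1, Ordering::SeqCst);
    if prev >= budget {
        in_use.fetch_sub(1, Ordering::SeqCst); // undo: we raced past the budget
        return false;
    }
    true
}
// The patch hunk continues below.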
connection_manager.try_register_transient(peer_id.clone(), pending_loc); diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs index 098bb0afb..897fd1871 100644 --- a/crates/core/src/ring/connection_manager.rs +++ b/crates/core/src/ring/connection_manager.rs @@ -202,23 +202,6 @@ impl ConnectionManager { return true; } - const GATEWAY_DIRECT_ACCEPT_LIMIT: usize = 2; - if self.is_gateway { - let direct_total = open + reserved_before; - if direct_total >= GATEWAY_DIRECT_ACCEPT_LIMIT { - tracing::info!( - %peer_id, - open, - reserved_before, - limit = GATEWAY_DIRECT_ACCEPT_LIMIT, - "Gateway reached direct-accept limit; forwarding join request instead" - ); - self.pending_reservations.write().remove(peer_id); - tracing::info!(%peer_id, "should_accept: gateway direct-accept limit hit, forwarding instead"); - return false; - } - } - let accepted = if total_conn < self.min_connections { tracing::info!(%peer_id, total_conn, "should_accept: accepted (below min connections)"); true From 7f8d1edf3b8bdf4b77bbc8ac8f920221a63780d9 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sat, 22 Nov 2025 13:14:13 -0600 Subject: [PATCH 23/25] test: add settle delay after mesh check --- crates/core/tests/connectivity.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/core/tests/connectivity.rs b/crates/core/tests/connectivity.rs index 9d06ec191..b06c09d66 100644 --- a/crates/core/tests/connectivity.rs +++ b/crates/core/tests/connectivity.rs @@ -420,6 +420,9 @@ async fn test_three_node_network_connectivity(ctx: &mut TestContext) -> TestResu ); } + // Allow a brief settling period before exercising contract operations. + tokio::time::sleep(Duration::from_secs(2)).await; + // Verify functionality with PUT/GET tracing::info!("Verifying network functionality with PUT/GET operations"); From 4d6fe20314ba77b9d32a98a841e56b6dece1ef20 Mon Sep 17 00:00:00 2001 From: Ian Clarke Date: Sat, 22 Nov 2025 15:48:07 -0600 Subject: [PATCH 24/25] fix: transient handshake and routing cleanup --- .../src/node/network_bridge/p2p_protoc.rs | 98 ++++++++---- crates/core/src/operations/connect.rs | 145 +++++++++++------- crates/core/src/ring/connection_manager.rs | 86 ++++++++--- 3 files changed, 218 insertions(+), 111 deletions(-) diff --git a/crates/core/src/node/network_bridge/p2p_protoc.rs b/crates/core/src/node/network_bridge/p2p_protoc.rs index 5d7287b44..4cc9d99d1 100644 --- a/crates/core/src/node/network_bridge/p2p_protoc.rs +++ b/crates/core/src/node/network_bridge/p2p_protoc.rs @@ -1509,7 +1509,7 @@ impl P2pConnManager { connection, transient, } => { - tracing::info!(provided = ?peer, transient = transient, tx = ?transaction, "InboundConnection event"); + tracing::info!(provided = ?peer, transient, tx = ?transaction, "InboundConnection event"); let _conn_manager = &self.bridge.op_manager.ring.connection_manager; let remote_addr = connection.remote_addr(); @@ -1517,7 +1517,7 @@ impl P2pConnManager { if blocked_addrs.contains(&remote_addr) { tracing::info!( remote = %remote_addr, - transient = transient, + transient, transaction = ?transaction, "Inbound connection blocked by local policy" ); @@ -1529,7 +1529,7 @@ impl P2pConnManager { let peer_id = peer.unwrap_or_else(|| { tracing::info!( remote = %remote_addr, - transient = transient, + transient, transaction = ?transaction, "Inbound connection arrived without matching expectation; accepting provisionally" ); @@ -1547,7 +1547,7 @@ impl P2pConnManager { tracing::info!( remote = %peer_id.addr, - transient = transient, + 
transient, transaction = ?transaction, "Inbound connection established" ); @@ -1567,7 +1567,7 @@ impl P2pConnManager { } => { tracing::info!( remote = %peer.addr, - transient = transient, + transient, transaction = %transaction, "Outbound connection established" ); @@ -1582,7 +1582,7 @@ impl P2pConnManager { } => { tracing::info!( remote = %peer.addr, - transient = transient, + transient, transaction = %transaction, ?error, "Outbound connection failed" @@ -1801,6 +1801,12 @@ impl P2pConnManager { let pending_loc = connection_manager.prune_in_transit_connection(&peer_id); if promote_to_ring { let loc = pending_loc.unwrap_or_else(|| Location::from_address(&peer_id.addr)); + tracing::info!( + remote = %peer_id, + %loc, + pending_loc_known = pending_loc.is_some(), + "handle_successful_connection: evaluating promotion to ring" + ); // Re-apply admission logic on promotion to avoid bypassing capacity/heuristic checks. let should_accept = connection_manager.should_accept(loc, &peer_id); if !should_accept { @@ -1832,32 +1838,62 @@ impl P2pConnManager { connection_manager.drop_transient(&peer_id); } } else { - // Update location now that we know it; budget was reserved before any work. - connection_manager.try_register_transient(peer_id.clone(), pending_loc); - tracing::info!( - peer = %peer_id, - "Registered transient connection (not added to ring topology)" - ); - let ttl = connection_manager.transient_ttl(); - let drop_tx = self.bridge.ev_listener_tx.clone(); - let cm = connection_manager.clone(); - let peer = peer_id.clone(); - tokio::spawn(async move { - sleep(ttl).await; - if cm.drop_transient(&peer).is_some() { - tracing::info!(%peer, "Transient connection expired; dropping"); - if let Err(err) = drop_tx - .send(Right(NodeEvent::DropConnection(peer.clone()))) - .await - { - tracing::warn!( - %peer, - ?err, - "Failed to dispatch DropConnection for expired transient" - ); - } + let loc = pending_loc.unwrap_or_else(|| Location::from_address(&peer_id.addr)); + // Evaluate whether this transient should be promoted; gateways need routable peers. + let should_accept = connection_manager.should_accept(loc, &peer_id); + if should_accept { + connection_manager.drop_transient(&peer_id); + let current = connection_manager.connection_count(); + if current >= connection_manager.max_connections { + tracing::warn!( + %peer_id, + current_connections = current, + max_connections = connection_manager.max_connections, + %loc, + "handle_successful_connection: rejecting transient promotion to enforce cap" + ); + return Ok(()); } - }); + tracing::info!( + remote = %peer_id, + %loc, + pending_loc_known = pending_loc.is_some(), + "handle_successful_connection: promoting transient into ring" + ); + self.bridge + .op_manager + .ring + .add_connection(loc, peer_id.clone(), true) + .await; + } else { + // Keep the connection as transient; budget was reserved before any work. 
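// Editorial sketch, not part of the patch: the admission flow this hunk builds,
// reduced to a pure decision function (names illustrative; the real code calls
// should_accept, add_connection, and try_register_transient as shown here):
enum LinkDecision {
    Promote,       // add the link to the ring topology
    KeepTransient, // hold as a short-lived courtesy link with a TTL
    Reject,        // admission passed but the hard cap is already reached
}

fn decide(should_accept: bool, open: usize, max: usize) -> LinkDecision {
    if !should_accept {
        LinkDecision::KeepTransient
    } else if open >= max {
        LinkDecision::Reject
    } else {
        LinkDecision::Promote
    }
}
// The patch hunk continues below.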
+                    connection_manager.try_register_transient(peer_id.clone(), pending_loc);
+                    tracing::info!(
+                        peer = %peer_id,
+                        pending_loc_known = pending_loc.is_some(),
+                        "Registered transient connection (not added to ring topology)"
+                    );
+                    let ttl = connection_manager.transient_ttl();
+                    let drop_tx = self.bridge.ev_listener_tx.clone();
+                    let cm = connection_manager.clone();
+                    let peer = peer_id.clone();
+                    tokio::spawn(async move {
+                        sleep(ttl).await;
+                        if cm.drop_transient(&peer).is_some() {
+                            tracing::info!(%peer, "Transient connection expired; dropping");
+                            if let Err(err) = drop_tx
+                                .send(Right(NodeEvent::DropConnection(peer.clone())))
+                                .await
+                            {
+                                tracing::warn!(
+                                    %peer,
+                                    ?err,
+                                    "Failed to dispatch DropConnection for expired transient"
+                                );
+                            }
+                        }
+                    });
+                }
             }
         } else if is_transient {
             // We reserved budget earlier, but didn't take ownership of the connection.
diff --git a/crates/core/src/operations/connect.rs b/crates/core/src/operations/connect.rs
index 0f277056a..42f568e6b 100644
--- a/crates/core/src/operations/connect.rs
+++ b/crates/core/src/operations/connect.rs
@@ -3,7 +3,7 @@
 //! The legacy multi-stage connect operation has been removed; this module now powers the node’s
 //! connection and maintenance paths end-to-end.
 
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::fmt;
 use std::net::SocketAddr;
 use std::sync::Arc;
@@ -77,21 +77,30 @@ impl InnerMessage for ConnectMsg {
 impl fmt::Display for ConnectMsg {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
-            ConnectMsg::Request { target, payload, .. } => write!(
+            ConnectMsg::Request {
+                target, payload, ..
+            } => write!(
                 f,
                 "ConnectRequest {{ target: {target}, desired: {}, ttl: {}, joiner: {} }}",
-                payload.desired_location,
-                payload.ttl,
-                payload.joiner
+                payload.desired_location, payload.ttl, payload.joiner
             ),
-            ConnectMsg::Response { sender, target, payload, .. } => write!(
+            ConnectMsg::Response {
+                sender,
+                target,
+                payload,
+                ..
+            } => write!(
                 f,
-                "ConnectResponse {{ sender: {sender}, target: {target}, acceptor: {}, transient: {} }}",
+                "ConnectResponse {{ sender: {sender}, target: {target}, acceptor: {} }}",
                 payload.acceptor,
-                payload.transient
             ),
-            ConnectMsg::ObservedAddress { target, address, .. } => {
-                write!(f, "ObservedAddress {{ target: {target}, address: {address} }}")
+            ConnectMsg::ObservedAddress {
+                target, address, ..
+            } => {
+                write!(
+                    f,
+                    "ObservedAddress {{ target: {target}, address: {address} }}"
+                )
             }
         }
     }
@@ -126,8 +135,6 @@ pub(crate) struct ConnectRequest {
 pub(crate) struct ConnectResponse {
     /// The peer that accepted the join request.
     pub acceptor: PeerKeyLocation,
-    /// Whether this acceptance is a short-lived transient link.
-    pub transient: bool,
 }
 
 /// New minimal state machine the joiner tracks.
@@ -154,7 +161,6 @@ pub(crate) struct RelayState {
     pub upstream: PeerKeyLocation,
     pub request: ConnectRequest,
     pub forwarded_to: Option<PeerKeyLocation>,
-    pub transient_hint: bool,
     pub observed_sent: bool,
     pub accepted_locally: bool,
 }
@@ -173,10 +179,8 @@ pub(crate) trait RelayContext {
         &self,
         desired_location: Location,
         visited: &[PeerKeyLocation],
+        recency: &HashMap<PeerId, Instant>,
     ) -> Option<PeerKeyLocation>;
-
-    /// Whether the acceptance should be treated as a short-lived transient link.
-    fn transient_hint(&self, acceptor: &PeerKeyLocation, joiner: &PeerKeyLocation) -> bool;
 }
 
 /// Result of processing a request at a relay.
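// Editorial sketch, not part of the patch: RelayState::handle_request (changed in
// the hunks below) is a pure decision step. It only mutates relay state and returns
// an actions bundle; the caller performs the actual sends. A minimal analogue of
// the pattern, with illustrative types (u16 stands in for a peer id):
#[derive(Default)]
struct Actions {
    accept: bool,            // answer the joiner ourselves
    forward_to: Option<u16>, // next hop to relay the request to, if any
}

fn decide(accept_locally: bool, ttl: u8, next_hop: Option<u16>) -> Actions {
    let mut actions = Actions::default();
    actions.accept = accept_locally;
    if ttl > 0 {
        actions.forward_to = next_hop; // accepting and forwarding are independent
    }
    actions
}
// The diff continues below.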
@@ -193,6 +197,7 @@ impl RelayState {
         &mut self,
         ctx: &C,
         observed_remote: &PeerKeyLocation,
+        recency: &HashMap<PeerId, Instant>,
     ) -> RelayActions {
         let mut actions = RelayActions::default();
         push_unique_peer(&mut self.request.visited, observed_remote.clone());
@@ -216,11 +221,8 @@
             self.accepted_locally = true;
             let acceptor = ctx.self_location().clone();
             let dist = ring_distance(acceptor.location, self.request.joiner.location);
-            let transient = ctx.transient_hint(&acceptor, &self.request.joiner);
-            self.transient_hint = transient;
             actions.accept_response = Some(ConnectResponse {
                 acceptor: acceptor.clone(),
-                transient,
             });
             actions.expect_connection_from = Some(self.request.joiner.clone());
             tracing::info!(
@@ -229,13 +231,16 @@
                 acceptor_loc = ?acceptor.location,
                 joiner_loc = ?self.request.joiner.location,
                 ring_distance = ?dist,
-                transient,
                 "connect: acceptance issued"
             );
         }
 
         if self.forwarded_to.is_none() && self.request.ttl > 0 {
-            match ctx.select_next_hop(self.request.desired_location, &self.request.visited) {
+            match ctx.select_next_hop(
+                self.request.desired_location,
+                &self.request.visited,
+                recency,
+            ) {
                 Some(next) => {
                     let dist = ring_distance(next.location, Some(self.request.desired_location));
                     tracing::info!(
@@ -303,27 +308,50 @@ impl RelayContext for RelayEnv<'_> {
         &self,
         desired_location: Location,
         visited: &[PeerKeyLocation],
+        recency: &HashMap<PeerId, Instant>,
     ) -> Option<PeerKeyLocation> {
         let skip = VisitedPeerIds { peers: visited };
         let router = self.op_manager.ring.router.read();
-        self.op_manager
-            .ring
-            .connection_manager
-            .routing(desired_location, None, skip, &router)
-    }
+        let candidates = self.op_manager.ring.connection_manager.routing_candidates(
+            desired_location,
+            None,
+            skip,
+        );
+
+        // Prefer least recently forwarded peers. Missing recency wins; otherwise pick the oldest
+        // recency bucket, then let the router choose among that bucket. This keeps routing bias
+        // toward the target while avoiding hammering a single neighbor.
+        let mut best_key: Option<Option<Instant>> = None;
+        let mut best: Vec<PeerKeyLocation> = Vec::new();
+        for cand in candidates {
+            let key = recency.get(&cand.peer).cloned();
+            match best_key {
+                None => {
+                    best_key = Some(key);
+                    best = vec![cand.clone()];
+                }
+                Some(k) => {
+                    if key < k {
+                        best_key = Some(key);
+                        best = vec![cand.clone()];
+                    } else if key == k {
+                        best.push(cand.clone());
+                    }
+                }
+            }
+        }
 
-    fn transient_hint(&self, _acceptor: &PeerKeyLocation, _joiner: &PeerKeyLocation) -> bool {
-        // Courtesy slots still piggyback on regular connections. Flag the first acceptance so the
-        // joiner can prioritise it, and keep the logic simple until dedicated transient tracking
-        // is wired in (see transient-connection-budget branch).
-        self.op_manager.ring.open_connections() == 0
+        if best.is_empty() {
+            None
+        } else {
+            router.select_peer(best.iter(), desired_location).cloned()
+        }
     }
 }
 
 #[derive(Debug)]
 pub struct AcceptedPeer {
     pub peer: PeerKeyLocation,
-    pub transient: bool,
 }
 
 #[derive(Debug, Default)]
@@ -344,7 +372,6 @@ impl JoinerState {
             self.last_progress = now;
             acceptance.new_acceptor = Some(AcceptedPeer {
                 peer: response.acceptor.clone(),
-                transient: response.transient,
             });
             acceptance.assigned_location = self.accepted.len() == 1;
         }
@@ -368,6 +395,10 @@ pub(crate) struct ConnectOp {
     pub(crate) gateway: Option<Box<PeerKeyLocation>>,
     pub(crate) backoff: Option<ExponentialBackoff>,
     pub(crate) desired_location: Option<Location>,
+    /// Tracks when we last forwarded this connect to a peer, to avoid hammering the same
+    /// neighbors when no acceptors are available. Peers without an entry are treated as
+    /// immediately eligible.
+    recency: HashMap<PeerId, Instant>,
 }
 
 impl ConnectOp {
@@ -392,6 +423,7 @@ impl ConnectOp {
             gateway: gateway.map(Box::new),
             backoff,
             desired_location: Some(desired_location),
+            recency: HashMap::new(),
         }
     }
 
@@ -404,7 +436,6 @@ impl ConnectOp {
             upstream,
             request,
             forwarded_to: None,
-            transient_hint: false,
             observed_sent: false,
             accepted_locally: false,
         })));
@@ -414,6 +445,7 @@
             gateway: None,
             backoff: None,
             desired_location: None,
+            recency: HashMap::new(),
         }
     }
 
@@ -493,7 +525,15 @@
     ) -> Option {
         match self.state.as_mut() {
             Some(ConnectState::WaitingForResponses(state)) => {
+                tracing::info!(
+                    acceptor = %response.acceptor.peer,
+                    acceptor_loc = ?response.acceptor.location,
+                    "connect: joiner received ConnectResponse"
+                );
                 let result = state.register_acceptance(response, now);
+                if let Some(new_acceptor) = &result.new_acceptor {
+                    self.recency.remove(&new_acceptor.peer.peer);
+                }
                 if result.satisfied {
                     self.state = Some(ConnectState::Completed);
                 }
@@ -520,7 +560,6 @@
                 upstream: upstream.clone(),
                 request: request.clone(),
                 forwarded_to: None,
-                transient_hint: false,
                 observed_sent: false,
                 accepted_locally: false,
             })));
@@ -531,7 +570,7 @@
                 state.upstream = upstream;
                 state.request = request;
                 let upstream_snapshot = state.upstream.clone();
-                state.handle_request(ctx, &upstream_snapshot)
+                state.handle_request(ctx, &upstream_snapshot, &self.recency)
             }
             _ => RelayActions::default(),
         }
@@ -616,6 +655,8 @@ impl Operation for ConnectOp {
         }
 
         if let Some((next, request)) = actions.forward {
+            // Record recency for this forward to avoid hammering the same neighbor.
+            self.recency.insert(next.peer.clone(), Instant::now());
             let forward_msg = ConnectMsg::Request {
                 id: self.id,
                 from: env.self_location().clone(),
@@ -679,7 +720,7 @@ impl Operation for ConnectOp {
                     peer: new_acceptor.peer.peer.clone(),
                     tx: self.id,
                     callback,
-                    is_gw: new_acceptor.transient,
+                    is_gw: false,
                 })
                 .await?;
 
@@ -776,6 +817,7 @@ fn store_operation_state_with_msg(op: &mut ConnectOp, msg: Option<ConnectMsg>) -
             gateway: op.gateway.clone(),
             backoff: op.backoff.clone(),
             desired_location: op.desired_location,
+            recency: op.recency.clone(),
         }))
     }),
 }
@@ -1004,7 +1046,6 @@ mod tests {
         self_loc: PeerKeyLocation,
         accept: bool,
         next_hop: Option<PeerKeyLocation>,
-        transient: bool,
     }
 
     impl TestRelayContext {
@@ -1013,7 +1054,6 @@ mod tests {
                 self_loc,
                 accept: true,
                 next_hop: None,
-                transient: false,
             }
         }
 
@@ -1026,11 +1066,6 @@ mod tests {
             self.next_hop = hop;
             self
         }
-
-        fn transient(mut self, transient: bool) -> Self {
-            self.transient = transient;
-            self
-        }
     }
 
     impl RelayContext for TestRelayContext {
@@ -1046,13 +1081,10 @@ mod tests {
             &self,
             _desired_location: Location,
             _visited: &[PeerKeyLocation],
+            _recency: &HashMap<PeerId, Instant>,
         ) -> Option<PeerKeyLocation> {
             self.next_hop.clone()
         }
-
-        fn transient_hint(&self, _acceptor: &PeerKeyLocation, _joiner: &PeerKeyLocation) -> bool {
-            self.transient
-        }
     }
 
     fn make_peer(port: u16) -> PeerKeyLocation {
@@ -1078,17 +1110,16 @@ mod tests {
                 observed_addr: Some(joiner.peer.addr),
             },
             forwarded_to: None,
-            transient_hint: false,
             observed_sent: false,
             accepted_locally: false,
         };
 
-        let ctx = TestRelayContext::new(self_loc.clone()).transient(true);
-        let actions = state.handle_request(&ctx, &joiner);
+        let ctx = TestRelayContext::new(self_loc.clone());
+        let recency = HashMap::new();
+        let actions = state.handle_request(&ctx, &joiner, &recency);
 
         let response = actions.accept_response.expect("expected acceptance");
         assert_eq!(response.acceptor.peer, self_loc.peer);
-        assert!(response.transient);
         assert_eq!(actions.expect_connection_from.unwrap().peer, joiner.peer);
         assert!(actions.forward.is_none());
     }
@@ -1108,7 +1139,6 @@ mod tests {
                 observed_addr: Some(joiner.peer.addr),
             },
             forwarded_to: None,
-            transient_hint: false,
             observed_sent: false,
             accepted_locally: false,
         };
 
         let ctx = TestRelayContext::new(self_loc)
             .accept(false)
             .next_hop(Some(next_hop.clone()));
-        let actions = state.handle_request(&ctx, &joiner);
+        let recency = HashMap::new();
+        let actions = state.handle_request(&ctx, &joiner, &recency);
 
         assert!(actions.accept_response.is_none());
         let (forward_to, request) = actions.forward.expect("expected forward");
@@ -1143,13 +1174,13 @@ mod tests {
                 observed_addr: Some(observed_addr),
             },
             forwarded_to: None,
-            transient_hint: false,
             observed_sent: false,
             accepted_locally: false,
         };
 
         let ctx = TestRelayContext::new(self_loc);
-        let actions = state.handle_request(&ctx, &joiner);
+        let recency = HashMap::new();
+        let actions = state.handle_request(&ctx, &joiner, &recency);
 
         let (target, addr) = actions
             .observed_address
@@ -1171,13 +1202,11 @@ mod tests {
 
         let response = ConnectResponse {
             acceptor: acceptor.clone(),
-            transient: false,
         };
         let result = state.register_acceptance(&response, Instant::now());
         assert!(result.satisfied);
         let new = result.new_acceptor.expect("expected new acceptor");
         assert_eq!(new.peer.peer, acceptor.peer);
-        assert!(!new.transient);
     }
 
     #[test]
diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs
index 897fd1871..2b4b718be 100644
--- a/crates/core/src/ring/connection_manager.rs
+++ b/crates/core/src/ring/connection_manager.rs
@@ -49,11 +49,15 @@ impl ConnectionManager {
             Ring::DEFAULT_MIN_CONNECTIONS
         };
 
-        let max_connections = if let Some(v) = config.max_number_conn {
+        let mut max_connections = if let Some(v) = config.max_number_conn {
             v
         } else {
             Ring::DEFAULT_MAX_CONNECTIONS
         };
+        // Gateways benefit from a wider neighbor set for forwarding; default to a higher cap when unset.
+        if config.is_gateway && config.max_number_conn.is_none() {
+            max_connections = 20;
+        }
 
         let max_upstream_bandwidth = if let Some(v) = config.max_upstream_bandwidth {
             v
@@ -455,7 +459,7 @@ impl ConnectionManager {
     }
 
     pub fn update_peer_identity(&self, old_peer: &PeerId, new_peer: PeerId) -> bool {
-        if old_peer == &new_peer {
+        if old_peer.addr == new_peer.addr && old_peer.pub_key == new_peer.pub_key {
             tracing::debug!(%old_peer, "update_peer_identity: identical peers; skipping");
             return false;
         }
@@ -573,29 +577,67 @@ impl ConnectionManager {
         skip_list: impl Contains<PeerId>,
         router: &Router,
     ) -> Option<PeerKeyLocation> {
+        let candidates = self.routing_candidates(target, requesting, skip_list);
+
+        if candidates.is_empty() {
+            return None;
+        }
+
+        router.select_peer(candidates.iter(), target).cloned()
+    }
+
+    /// Gather routing candidates after applying skip/transient filters.
+    pub fn routing_candidates(
+        &self,
+        target: Location,
+        requesting: Option<&PeerId>,
+        skip_list: impl Contains<PeerId>,
+    ) -> Vec<PeerKeyLocation> {
         let connections = self.connections_by_location.read();
-        tracing::debug!(
-            total_locations = connections.len(),
-            self_peer = self
-                .get_peer_key()
-                .as_ref()
-                .map(|id| id.to_string())
-                .unwrap_or_else(|| "unknown".into()),
-            "routing: considering connections"
-        );
-        let peers = connections.values().filter_map(|conns| {
-            let conn = conns.choose(&mut rand::rng())?;
-            if self.is_transient(&conn.location.peer) {
-                return None;
-            }
-            if let Some(requester) = requesting {
-                if requester == &conn.location.peer {
+        let candidates: Vec<PeerKeyLocation> = connections
+            .values()
+            .filter_map(|conns| {
+                let conn = conns.choose(&mut rand::rng())?;
+                if self.is_transient(&conn.location.peer) {
                     return None;
                 }
-            }
-            (!skip_list.has_element(conn.location.peer.clone())).then_some(&conn.location)
-        });
-        router.select_peer(peers, target).cloned()
+                if let Some(requester) = requesting {
+                    if requester == &conn.location.peer {
+                        return None;
+                    }
+                }
+                (!skip_list.has_element(conn.location.peer.clone()))
+                    .then_some(conn.location.clone())
+            })
+            .collect();
+
+        if candidates.is_empty() {
+            tracing::info!(
+                total_locations = connections.len(),
+                candidates = 0,
+                target = %target,
+                self_peer = self
+                    .get_peer_key()
+                    .as_ref()
+                    .map(|id| id.to_string())
+                    .unwrap_or_else(|| "unknown".into()),
+                "routing: no non-transient candidates"
+            );
+        } else {
+            tracing::info!(
+                total_locations = connections.len(),
+                candidates = candidates.len(),
+                target = %target,
+                self_peer = self
+                    .get_peer_key()
+                    .as_ref()
+                    .map(|id| id.to_string())
+                    .unwrap_or_else(|| "unknown".into()),
+                "routing: selecting next hop"
+            );
+        }
+
+        candidates
     }
 
     pub fn num_connections(&self) -> usize {

From 25777f6e708856605e28949d15bc0ebee950073f Mon Sep 17 00:00:00 2001
From: Ian Clarke
Date: Sun, 23 Nov 2025 09:45:39 -0600
Subject: [PATCH 25/25] fix(transient): tune defaults and logging per review

---
 crates/core/src/config/mod.rs              |  2 +-
 crates/core/src/ring/connection_manager.rs | 50 ++++++----------------
 crates/core/src/transport/crypto.rs        | 12 ++++--
 3 files changed, 24 insertions(+), 40 deletions(-)

diff --git a/crates/core/src/config/mod.rs b/crates/core/src/config/mod.rs
index 907a8fa75..050b526ad 100644
--- a/crates/core/src/config/mod.rs
+++ b/crates/core/src/config/mod.rs
@@ -45,7 +45,7 @@ pub(crate) const PCK_VERSION: &str = env!("CARGO_PKG_VERSION");
 // Initialize the executor once.
 static ASYNC_RT: LazyLock<Option<Runtime>> = LazyLock::new(GlobalExecutor::initialize_async_rt);
 
-const DEFAULT_TRANSIENT_BUDGET: usize = 32;
+const DEFAULT_TRANSIENT_BUDGET: usize = 2048;
 const DEFAULT_TRANSIENT_TTL_SECS: u64 = 30;
 
 const QUALIFIER: &str = "";
diff --git a/crates/core/src/ring/connection_manager.rs b/crates/core/src/ring/connection_manager.rs
index 2b4b718be..6300c9de7 100644
--- a/crates/core/src/ring/connection_manager.rs
+++ b/crates/core/src/ring/connection_manager.rs
@@ -14,8 +14,6 @@ use super::*;
 pub(crate) struct TransientEntry {
     /// Entry tracking a transient connection that hasn't been added to the ring topology yet.
     /// Transient connections are typically unsolicited inbound connections to gateways.
-    #[allow(dead_code)]
-    pub opened_at: Instant,
     /// Advertised location for the transient peer, if known at admission time.
     pub location: Option<Location>,
 }
 
@@ -312,7 +310,6 @@ impl ConnectionManager {
         self.peer_key.lock().clone()
     }
 
-    #[allow(dead_code)]
     pub fn is_gateway(&self) -> bool {
         self.is_gateway
     }
@@ -333,13 +330,8 @@ impl ConnectionManager {
         }
 
         let key = peer.clone();
-        self.transient_connections.insert(
-            peer,
-            TransientEntry {
-                opened_at: Instant::now(),
-                location,
-            },
-        );
+        self.transient_connections
+            .insert(peer, TransientEntry { location });
         let prev = self.transient_in_use.fetch_add(1, Ordering::SeqCst);
         if prev >= self.transient_budget {
             // Undo if we raced past the budget.
@@ -413,7 +405,7 @@ impl ConnectionManager {
         let previous_location = lop.insert(peer.clone(), loc);
         drop(lop);
 
-        // Enforce the global cap when adding a new peer (not a relocation).
+        // Enforce the global cap when adding a new peer (relocations reuse the existing slot).
         if previous_location.is_none() && self.connection_count() >= self.max_connections {
             tracing::warn!(
                 %peer,
@@ -611,31 +603,17 @@ impl ConnectionManager {
             })
             .collect();
 
-        if candidates.is_empty() {
-            tracing::info!(
-                total_locations = connections.len(),
-                candidates = 0,
-                target = %target,
-                self_peer = self
-                    .get_peer_key()
-                    .as_ref()
-                    .map(|id| id.to_string())
-                    .unwrap_or_else(|| "unknown".into()),
-                "routing: no non-transient candidates"
-            );
-        } else {
-            tracing::info!(
-                total_locations = connections.len(),
-                candidates = candidates.len(),
-                target = %target,
-                self_peer = self
-                    .get_peer_key()
-                    .as_ref()
-                    .map(|id| id.to_string())
-                    .unwrap_or_else(|| "unknown".into()),
-                "routing: selecting next hop"
-            );
-        }
+        tracing::debug!(
+            total_locations = connections.len(),
+            candidates = candidates.len(),
+            target = %target,
+            self_peer = self
+                .get_peer_key()
+                .as_ref()
+                .map(|id| id.to_string())
+                .unwrap_or_else(|| "unknown".into()),
+            "routing candidates for next hop (non-transient only)"
+        );
 
         candidates
     }
diff --git a/crates/core/src/transport/crypto.rs b/crates/core/src/transport/crypto.rs
index 342d3791e..79cb3673b 100644
--- a/crates/core/src/transport/crypto.rs
+++ b/crates/core/src/transport/crypto.rs
@@ -6,7 +6,6 @@ use rsa::{
     Pkcs1v15Encrypt, RsaPrivateKey, RsaPublicKey,
 };
 use serde::{Deserialize, Serialize};
-use sha2::{Digest, Sha256};
 
 #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
 pub struct TransportKeypair {
@@ -113,8 +112,15 @@ impl std::fmt::Display for TransportPublicKey {
         use pkcs8::EncodePublicKey;
 
         let encoded = self.0.to_public_key_der().map_err(|_| std::fmt::Error)?;
-        let digest = Sha256::digest(encoded.as_bytes());
-        write!(f, "{}", bs58::encode(digest).into_string())
+        if encoded.as_bytes().len() >= 16 {
+            let bytes = encoded.as_bytes();
+            let first_six = &bytes[..6];
+            let last_six = &bytes[bytes.len() - 6..];
+            let to_encode = [first_six, last_six].concat();
+            write!(f, "{}", bs58::encode(to_encode).into_string())
+        } else {
+            write!(f, "{}", bs58::encode(encoded.as_bytes()).into_string())
+        }
     }
 }
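// Editorial sketch appended after the series, not part of any patch: PATCH 24's
// select_next_hop prefers the least recently forwarded neighbor before letting the
// router break ties. Standalone distillation with u16 standing in for peer ids;
// a missing map entry (None) orders before any Some(instant), so never-forwarded
// peers win first, then the oldest forward time.
use std::collections::HashMap;
use std::time::Instant;

fn least_recent(cands: &[u16], recency: &HashMap<u16, Instant>) -> Vec<u16> {
    let mut best_key: Option<Option<Instant>> = None;
    let mut best: Vec<u16> = Vec::new();
    for &c in cands {
        let key = recency.get(&c).copied();
        match best_key {
            None => {
                best_key = Some(key);
                best = vec![c];
            }
            Some(k) => {
                if key < k {
                    best_key = Some(key);
                    best = vec![c];
                } else if key == k {
                    best.push(c);
                }
            }
        }
    }
    best
}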