/
bpf_metadata.cc
418 lines (371 loc) · 16.8 KB
/
bpf_metadata.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
#include "cilium/bpf_metadata.h"
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string>
#include "envoy/network/listen_socket.h"
#include "envoy/registry/registry.h"
#include "envoy/singleton/manager.h"
#include "source/common/common/assert.h"
#include "source/common/common/fmt.h"
#include "source/common/common/utility.h"
#include "source/common/network/address_impl.h"
#include "source/common/network/socket_option_factory.h"
#include "cilium/api/bpf_metadata.pb.validate.h"
#include "cilium/socket_option.h"
namespace Envoy {
namespace Server {
namespace Configuration {
/**
 * Config registration for the bpf metadata filter. @see
 * NamedNetworkFilterConfigFactory.
 */
class BpfMetadataConfigFactory : public NamedListenerFilterConfigFactory {
public:
  // NamedListenerFilterConfigFactory
  Network::ListenerFilterFactoryCb createListenerFilterFactoryFromProto(
      const Protobuf::Message& proto_config,
      const Network::ListenerFilterMatcherSharedPtr& listener_filter_matcher,
      Configuration::ListenerFactoryContext& context) override {
    // Validate and downcast the opaque proto to the cilium-specific config,
    // then build the shared filter config once per listener.
    const auto& typed_config = MessageUtil::downcastAndValidate<const ::cilium::BpfMetadata&>(
        proto_config, context.messageValidationVisitor());
    auto filter_config = std::make_shared<Cilium::BpfMetadata::Config>(typed_config, context);

    // The returned callback installs a new accept filter instance, sharing the
    // config, on each filter chain it is invoked for.
    return [listener_filter_matcher,
            filter_config](Network::ListenerFilterManager& manager) mutable -> void {
      manager.addAcceptFilter(listener_filter_matcher,
                              std::make_unique<Cilium::BpfMetadata::Instance>(filter_config));
    };
  }

  ProtobufTypes::MessagePtr createEmptyConfigProto() override {
    return std::make_unique<::cilium::BpfMetadata>();
  }

  std::string name() const override { return "cilium.bpf_metadata"; }
};
/**
 * Static registration for the bpf metadata filter. @see RegisterFactory.
 * Versioning started from 1.1.0 for Cilium version 1.12.0.
 * The trailing FACTORY_VERSION attaches version metadata to the registered
 * factory so it can be reported via Envoy's extension introspection.
 */
REGISTER_FACTORY(BpfMetadataConfigFactory,
                 NamedListenerFilterConfigFactory){FACTORY_VERSION(1, 1, 0, {{}})};
} // namespace Configuration
} // namespace Server
namespace Cilium {
namespace BpfMetadata {
// Singleton registration via macro defined in envoy/singleton/manager.h.
// These register the names used below with SINGLETON_MANAGER_REGISTERED_NAME()
// so that all filter instances in the process share one conntrack map, host
// map, ipcache, and network policy map.
SINGLETON_MANAGER_REGISTRATION(cilium_bpf_conntrack);
SINGLETON_MANAGER_REGISTRATION(cilium_host_map);
SINGLETON_MANAGER_REGISTRATION(cilium_ipcache);
SINGLETON_MANAGER_REGISTRATION(cilium_network_policy);
namespace {
// Returns the process-wide PolicyHostMap singleton, creating it (and starting
// its xDS subscription) on first use. Used as a fallback identity source when
// the bpf ipcache is unavailable.
std::shared_ptr<const Cilium::PolicyHostMap>
createHostMap(Server::Configuration::ListenerFactoryContext& context) {
  return context.singletonManager().getTyped<const Cilium::PolicyHostMap>(
      SINGLETON_MANAGER_REGISTERED_NAME(cilium_host_map), [&context] {
        // Factory lambda runs only if the singleton does not exist yet.
        auto map = std::make_shared<Cilium::PolicyHostMap>(
            context.localInfo(), context.clusterManager(), context.mainThreadDispatcher(),
            context.api().randomGenerator(), context.scope(), context.threadLocal());
        map->startSubscription();
        return map;
      });
}
// Returns the process-wide NetworkPolicyMap singleton, creating it (and
// starting its xDS subscription) on first use. 'ct' may hold the shared
// conntrack maps, or be null if no bpf root was configured.
std::shared_ptr<const Cilium::NetworkPolicyMap>
createPolicyMap(Server::Configuration::FactoryContext& context, Cilium::CtMapSharedPtr& ct) {
  return context.singletonManager().getTyped<const Cilium::NetworkPolicyMap>(
      SINGLETON_MANAGER_REGISTERED_NAME(cilium_network_policy), [&context, &ct] {
        // Factory lambda runs only if the singleton does not exist yet.
        auto map = std::make_shared<Cilium::NetworkPolicyMap>(context, ct);
        map->startSubscription();
        return map;
      });
}
} // namespace
// Construct the filter configuration from the validated protobuf config.
//
// Opens (or reuses, via the singleton manager) the shared bpf conntrack and
// ipcache maps when a bpf root is configured, and subscribes to the shared
// network policy map (and the host map, if the ipcache could not be opened).
//
// @throws EnvoyException on an invalid option combination, a malformed
// configured source address, or a bpf_root differing from the one in use.
Config::Config(const ::cilium::BpfMetadata& config,
               Server::Configuration::ListenerFactoryContext& context)
    : proxy_id_(config.proxy_id()), is_ingress_(config.is_ingress()),
      use_original_source_address_(config.use_original_source_address()),
      is_l7lb_(config.is_l7lb()),
      ipv4_source_address_(
          Network::Utility::parseInternetAddressNoThrow(config.ipv4_source_address())),
      ipv6_source_address_(
          Network::Utility::parseInternetAddressNoThrow(config.ipv6_source_address())) {
  // L7 LB is only supported on egress listeners; reject a contradictory config.
  if (is_l7lb_ && is_ingress_) {
    throw EnvoyException("cilium.bpf_metadata: is_l7lb may not be set with is_ingress");
  }
  // parseInternetAddressNoThrow() yields nullptr for unparseable input, so a
  // non-empty config string with a null result means the address was invalid;
  // a parsed address of the wrong IP version is equally rejected.
  if ((ipv4_source_address_ &&
       ipv4_source_address_->ip()->version() != Network::Address::IpVersion::v4) ||
      (!ipv4_source_address_ && config.ipv4_source_address().length() > 0)) {
    throw EnvoyException(
        fmt::format("cilium.bpf_metadata: ipv4_source_address is not an IPv4 address: {}",
                    config.ipv4_source_address()));
  }
  if ((ipv6_source_address_ &&
       ipv6_source_address_->ip()->version() != Network::Address::IpVersion::v6) ||
      (!ipv6_source_address_ && config.ipv6_source_address().length() > 0)) {
    throw EnvoyException(
        fmt::format("cilium.bpf_metadata: ipv6_source_address is not an IPv6 address: {}",
                    config.ipv6_source_address()));
  }
  // Note: all instances use the bpf root of the first filter with non-empty
  // bpf_root instantiated! Only try opening bpf maps if bpf root is explicitly
  // configured
  std::string bpf_root = config.bpf_root();
  if (bpf_root.length() > 0) {
    ct_maps_ = context.singletonManager().getTyped<Cilium::CtMap>(
        SINGLETON_MANAGER_REGISTERED_NAME(cilium_bpf_conntrack), [&bpf_root] {
          // Even if opening the global maps fail, local maps may still succeed
          // later.
          return std::make_shared<Cilium::CtMap>(bpf_root);
        });
    ipcache_ = context.singletonManager().getTyped<Cilium::IPCache>(
        SINGLETON_MANAGER_REGISTERED_NAME(cilium_ipcache), [&bpf_root] {
          // A null singleton is cached when the ipcache cannot be opened;
          // resolvePolicyId() then falls back to the host map.
          auto ipcache = std::make_shared<Cilium::IPCache>(bpf_root);
          if (!ipcache->Open()) {
            ipcache.reset();
          }
          return ipcache;
        });
    if (bpf_root != ct_maps_->bpfRoot()) {
      // bpf root may not change during runtime
      throw EnvoyException(fmt::format("cilium.bpf_metadata: Invalid bpf_root: {}", bpf_root));
    }
  }
  // Only create the hosts map if ipcache can't be opened
  if (ipcache_ == nullptr) {
    hosts_ = createHostMap(context);
  }
  // Get the shared policy provider, or create it if not already created.
  // Note that the API config source is assumed to be the same for all filter
  // instances!
  npmap_ = createPolicyMap(context, ct_maps_);
}
/**
 * Resolve the numeric security identity for the given IP address.
 *
 * Prefers the bpf ipcache when it was opened, otherwise falls back to the
 * xDS-fed host map. An unresolved address (id 0) defaults to WORLD.
 *
 * @param ip the address to resolve (must be non-null).
 * @return the resolved security identity, never 0.
 */
uint32_t Config::resolvePolicyId(const Network::Address::Ip* ip) const {
  uint32_t id = 0;
  if (ipcache_ != nullptr) {
    id = ipcache_->resolve(ip);
  } else if (hosts_ != nullptr) {
    id = hosts_->resolve(ip);
  }
  // default destination identity to the world if needed
  if (id == 0) {
    id = Cilium::ID::WORLD;
    // Fix: the format string lacked its '{}' placeholder, so the IP address
    // argument was silently dropped from the trace log (fmt ignores unused
    // trailing arguments).
    ENVOY_LOG(trace, "bpf_metadata: Identity for IP {} defaults to WORLD",
              ip->addressAsString());
  }
  return id;
}
// Look up the policy instance for the given pod IP. May return nullptr when
// no policy exists, except for L7 LB (egress-only) listeners, which fall back
// to an allow-all egress policy so that traffic forwarded by k8s Ingress
// (implemented as an egress listener!) is not dropped.
const PolicyInstanceConstSharedPtr Config::getPolicy(const std::string& pod_ip) const {
  const auto& instance = npmap_->GetPolicyInstance(pod_ip);
  if (instance != nullptr) {
    return instance;
  }
  // No policy found for this pod IP.
  if (is_l7lb_ && !is_ingress_) {
    return npmap_->AllowAllEgressPolicy;
  }
  return instance;
}
/**
 * Derive Cilium policy metadata for a freshly accepted connection socket and
 * attach it as a Cilium::SocketOption for downstream Cilium filters.
 *
 * Determines the pod IP and policy (ingress: destination side, egress: source
 * side), resolves source/destination security identities, decides which
 * source address (original or configured) upstream connections should use,
 * and computes the SO_MARK value for datapath policy enforcement.
 *
 * @return false if either endpoint address is non-IP or no policy was found;
 *         in that case no socket option is added.
 */
bool Config::getMetadata(Network::ConnectionSocket& socket) {
  Network::Address::InstanceConstSharedPtr src_address =
      socket.connectionInfoProvider().remoteAddress();
  const auto sip = src_address->ip();
  const auto dst_address = socket.ioHandle().localAddress();
  const auto dip = dst_address->ip();
  // ip() is null for non-IP (e.g., pipe) addresses; nothing to do then.
  if (!sip || !dip) {
    ENVOY_LOG_MISC(debug, "Non-IP addresses: src: {} dst: {}", src_address->asString(),
                   dst_address->asString());
    return false;
  }
  // We do this first as this likely restores the destination address and
  // lets the OriginalDstCluster know the destination address can be used.
  socket.connectionInfoProvider().restoreLocalAddress(dst_address); // mark as `restored`
  // The "pod IP" keys the policy lookup: the local endpoint's address, which
  // is the destination for ingress and the source for egress.
  std::string pod_ip, other_ip;
  if (is_ingress_) {
    pod_ip = dip->addressAsString();
    other_ip = sip->addressAsString();
    ENVOY_LOG_MISC(debug, "INGRESS POD IP: {}, source IP: {}", pod_ip, other_ip);
  } else {
    pod_ip = sip->addressAsString();
    other_ip = dip->addressAsString();
    ENVOY_LOG_MISC(debug, "EGRESS POD IP: {}, destination IP: {}", pod_ip, other_ip);
  }
  auto policy = getPolicy(pod_ip);
  if (policy == nullptr) {
    ENVOY_LOG(warn, "cilium.bpf_metadata ({}): No policy found for {}",
              is_ingress_ ? "ingress" : "egress", pod_ip);
    return false;
  }
  uint32_t source_identity = 0;
  // Resolve the source security ID from conntrack map, or from ip cache
  if (ct_maps_ != nullptr) {
    auto ct_name = policy->conntrackName();
    if (ct_name.length() > 0) {
      source_identity = ct_maps_->lookupSrcIdentity(ct_name, sip, dip, is_ingress_);
    }
  }
  // Conntrack miss (or no conntrack map): fall back to ipcache/host map.
  if (source_identity == 0) {
    source_identity = resolvePolicyId(sip);
  }
  // Resolve the destination security ID for egress
  uint32_t destination_identity = 0;
  if (!is_ingress_) {
    destination_identity = resolvePolicyId(dip);
  }
  // Start from the statically configured source addresses; may be overridden
  // below for east/west L7 LB.
  Network::Address::InstanceConstSharedPtr ipv4_source_address = ipv4_source_address_;
  Network::Address::InstanceConstSharedPtr ipv6_source_address = ipv6_source_address_;
  // Use original source address with L7 LB for local endpoint sources if requested, as policy
  // enforcement after the proxy depends on it (i.e., for "east/west" LB).
  //
  // NOTE: As L7 LB does not use the original destination, there is a possibility of a 5-tuple
  // collision if the same source pod is communicating with the same backends on same destination
  // port directly, maybe via some other, non-L7 LB service. We keep the original source port number
  // to not allocate random source ports for the source pod in the host networking namespace that
  // could then blackhole existing connections between the source pod and the backend. This means
  // that the L7 LB backend connection may fail in case of a 5-tuple collision that the host
  // networking namespace is aware of.
  //
  // NOTE: is_l7lb_ is only used for egress, so the local
  // endpoint is the source, and the other node is the destination.
  bool east_west_l7_lb = is_l7lb_ && use_original_source_address_ && policy->getEndpointID() != 0;
  if (east_west_l7_lb) {
    // Use source pod's IP address for east/west l7 LB
    const auto& ips = policy->getEndpointIPs();
    if (ips.ipv4_ && ips.ipv6_) {
      // Keep the original source address for the matching IP version, create a new source IP for
      // the other version (with the same source port number) in case an upstream of a different IP
      // version is chosen.
      switch (sip->version()) {
      case Network::Address::IpVersion::v4: {
        ipv4_source_address = src_address;
        sockaddr_in6 sa6 = *reinterpret_cast<const sockaddr_in6*>(ips.ipv6_->sockAddr());
        sa6.sin6_port = htons(sip->port());
        ipv6_source_address = std::make_shared<Network::Address::Ipv6Instance>(sa6);
      } break;
      case Network::Address::IpVersion::v6: {
        ipv6_source_address = src_address;
        sockaddr_in sa4 = *reinterpret_cast<const sockaddr_in*>(ips.ipv4_->sockAddr());
        sa4.sin_port = htons(sip->port());
        ipv4_source_address = std::make_shared<Network::Address::Ipv4Instance>(&sa4);
      } break;
      }
      // The per-version addresses above fully replace the original source.
      src_address = nullptr;
    }
  } else if (is_l7lb_) {
    // North/south L7 LB, assume the source security identity of the configured source addresses, if
    // any and policy for this identity exists.
    const Network::Address::Ip* ip = nullptr;
    if (ipv4_source_address && ipv4_source_address->ip()) {
      ip = ipv4_source_address->ip();
    } else if (ipv6_source_address && ipv6_source_address->ip()) {
      ip = ipv6_source_address->ip();
    }
    if (ip) {
      auto new_id = resolvePolicyId(ip);
      if (new_id != Cilium::ID::WORLD) {
        auto new_pod_ip = ip->addressAsString();
        // AllowAllEgressPolicy will be returned if no explicit Ingress policy exists
        const auto& new_policy = getPolicy(new_pod_ip);
        if (new_policy) {
          source_identity = new_id;
          pod_ip = new_pod_ip;
          policy = new_policy;
        }
      } // The configured IP is used, but the original source identity, pod IP and policy are kept
    }
    // Original source address is never used for north/south LB
    // This means that a local host IP is used if no IP is configured to be used instead of it
    // ('ip' above is null).
    src_address = nullptr;
    // Otherwise only use the original source address if permitted, destination identity is not a
    // locally allocated identity, is not classified as WORLD, and the destination is not in the
    // same node.
  } else if (!(use_original_source_address_ &&
               !(destination_identity & Cilium::ID::LocalIdentityFlag) &&
               destination_identity != Cilium::ID::WORLD && !npmap_->exists(other_ip))) {
    // Original source address is not used
    src_address = nullptr;
  }
  // Add transparent options if either original or explicitly set source address is used
  if (src_address || ipv4_source_address || ipv6_source_address) {
    socket.addOptions(Network::SocketOptionFactory::buildIpTransparentOptions());
    socket.addOptions(Network::SocketOptionFactory::buildReusePortOptions());
  }
  // Add metadata for policy based listener filter chain matching.
  // This requires the TLS inspector, if used, to run before us.
  // Note: This requires egress policy be known before upstream host selection,
  // so this feature only works with the original destination cluster.
  // This means that L7 LB does not work with the experimental Envoy Metadata
  // based policies (e.g., with MongoDB or MySQL filters).
  std::string l7proto;
  if (policy->useProxylib(is_ingress_, dip->port(),
                          is_ingress_ ? source_identity : destination_identity, l7proto)) {
    // Append the proxylib L7 protocol to any already-requested application
    // protocols (e.g., set by the TLS inspector).
    const auto& old_protocols = socket.requestedApplicationProtocols();
    std::vector<absl::string_view> protocols;
    for (const auto& old_protocol : old_protocols) {
      protocols.emplace_back(old_protocol);
    }
    protocols.emplace_back(l7proto);
    socket.setRequestedApplicationProtocols(protocols);
    ENVOY_LOG(info, "cilium.bpf_metadata: setRequestedApplicationProtocols(..., {})", l7proto);
  }
  // Pass the metadata to an Envoy socket option we can retrieve later in other
  // Cilium filters.
  uint32_t mark = 0;
  if (!npmap_->is_sidecar_) {
    // Mark with source endpoint ID for east/west l7 LB. This causes the upstream packets to be
    // processed by the the source endpoint's policy enforcement in the datapath.
    // NOTE(review): the magic values 0x0900/0x0A00/0x0B00 and the field layout
    // presumably mirror the Cilium datapath's MARK_MAGIC encoding — confirm
    // against the cilium datapath headers before changing.
    if (east_west_l7_lb) {
      mark = 0x0900 | policy->getEndpointID() << 16;
    } else {
      // Mark with source identity
      uint32_t cluster_id = (source_identity >> 16) & 0xFF;
      uint32_t identity_id = (source_identity & 0xFFFF) << 16;
      mark = ((is_ingress_) ? 0x0A00 : 0x0B00) | cluster_id | identity_id;
    }
  }
  socket.addOption(std::make_shared<Cilium::SocketOption>(
      policy, mark, source_identity, is_ingress_, is_l7lb_, dip->port(), std::move(pod_ip),
      std::move(src_address), std::move(ipv4_source_address), std::move(ipv6_source_address),
      shared_from_this(), proxy_id_));
  return true;
}
// Accept callback: attach Cilium metadata to the socket, then apply SO_LINGER
// and TCP keepalive (5 minute) options. Failures are logged but never block
// the connection; each keepalive option is only attempted if the previous one
// succeeded.
Network::FilterStatus Instance::onAccept(Network::ListenerFilterCallbacks& cb) {
  auto& sock = cb.socket();
  // Cilium socket option is not set if this fails, which causes 500 response from our l7policy
  // filter. Our integration tests depend on this.
  config_->getMetadata(sock);

  // Linger up to 10 seconds on close.
  struct ::linger linger_opt {
    true, 10
  };
  auto res = sock.setSocketOption(SOL_SOCKET, SO_LINGER, &linger_opt, sizeof(linger_opt));
  if (res.return_value_ < 0) {
    ENVOY_LOG(critical, "Socket option failure. Failed to set SO_LINGER: {}",
              Envoy::errorDetails(res.errno_));
  }

  int enable = true;
  res = sock.setSocketOption(SOL_SOCKET, SO_KEEPALIVE, &enable, sizeof(enable));
  if (res.return_value_ < 0) {
    ENVOY_LOG(critical, "Socket option failure. Failed to set SO_KEEPALIVE: {}",
              Envoy::errorDetails(res.errno_));
    return Network::FilterStatus::Continue;
  }

  int keepalive_secs = 5 * 60; // Five minutes
  res = sock.setSocketOption(IPPROTO_TCP, TCP_KEEPINTVL, &keepalive_secs, sizeof(keepalive_secs));
  if (res.return_value_ < 0) {
    ENVOY_LOG(critical, "Socket option failure. Failed to set TCP_KEEPINTVL: {}",
              Envoy::errorDetails(res.errno_));
    return Network::FilterStatus::Continue;
  }

  res = sock.setSocketOption(IPPROTO_TCP, TCP_KEEPIDLE, &keepalive_secs, sizeof(keepalive_secs));
  if (res.return_value_ < 0) {
    ENVOY_LOG(critical, "Socket option failure. Failed to set TCP_KEEPIDLE: {}",
              Envoy::errorDetails(res.errno_));
  }
  return Network::FilterStatus::Continue;
}
// No payload inspection is needed; all metadata is derived in onAccept().
Network::FilterStatus Instance::onData(Network::ListenerFilterBuffer&) {
  return Network::FilterStatus::Continue;
};
// Request no connection data before onData() is invoked.
size_t Instance::maxReadBytes() const { return 0; }
} // namespace BpfMetadata
} // namespace Cilium
} // namespace Envoy