Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ipcache: Fix deadlock when ipcache GC results in datapath reload
[ upstream commit 3285cbc ] The following deadlock can occur when the ipcache GC relies on map renames and datapath reloads to delete entries in combination with endpoint regenrations triggered by the FQDN proxy which perform ipcache upserts as part of regenerations: Routine 1: The following go routine holds the ipcache mutex while garbage collecting: ``` goroutine 779 [semacquire, 48 minutes]: sync.runtime_SemacquireMutex(0xc0008f4f68, 0xc007663600, 0x0) /usr/local/go/src/runtime/sema.go:71 +0x47 sync.(*RWMutex).Lock(0xc0008f4f60) /usr/local/go/src/sync/rwmutex.go:103 +0x88 github.com/cilium/cilium/pkg/datapath/loader.(*Loader).Reinitialize(0xc0003d70a0, 0x2759d20, 0xc000468bc0, 0x2771e60, 0xc000515680, 0x2329, 0x7f87768eac80, 0xc00022ff10, 0x26faf60, 0xc0004a18b0, ...) /go/src/github.com/cilium/cilium/pkg/datapath/loader/base.go:124 +0xcd main.(*Daemon).TriggerReloadWithoutCompile(0xc000515680, 0x2332316, 0x10, 0x12, 0xc008f75060, 0x16) /go/src/github.com/cilium/cilium/daemon/daemon.go:588 +0x202 github.com/cilium/cilium/pkg/datapath/ipcache.(*BPFListener).garbageCollect(0xc0006f8960, 0x2759d20, 0xc003748000, 0x0, 0x0, 0x0) /go/src/github.com/cilium/cilium/pkg/datapath/ipcache/listener.go:306 +0xadc github.com/cilium/cilium/pkg/datapath/ipcache.(*BPFListener).OnIPIdentityCacheGC.func1(0x2759d20, 0xc003748000, 0x3a5bf00, 0x2) /go/src/github.com/cilium/cilium/pkg/datapath/ipcache/listener.go:340 +0x3e github.com/cilium/cilium/pkg/controller.(*Controller).runController(0xc001e1e000) /go/src/github.com/cilium/cilium/pkg/controller/controller.go:205 +0xa2a created by github.com/cilium/cilium/pkg/controller.(*Manager).updateController /go/src/github.com/cilium/cilium/pkg/controller/manager.go:120 +0xb09 ``` As part of this, Reinitialize() is called which will require the compilation mutex to be acquired. Routine 2 The following ongoing endpoint regeneration is holding the compilation lock and thus blocks Routine 1 from completing. It is itself blocked on the FQDN NameManager mutex. ``` goroutine 7227352 [semacquire, 48 minutes]: sync.runtime_SemacquireMutex(0xc0005c3744, 0xc009bf4c00, 0x1) /usr/local/go/src/runtime/sema.go:71 +0x47 sync.(*Mutex).lockSlow(0xc0005c3740) /usr/local/go/src/sync/mutex.go:138 +0xfc sync.(*Mutex).Lock(...) /usr/local/go/src/sync/mutex.go:81 github.com/cilium/cilium/pkg/fqdn.(*NameManager).Lock(0xc0005c3740) /go/src/github.com/cilium/cilium/pkg/fqdn/name_manager.go:93 +0x47 github.com/cilium/cilium/pkg/policy.(*SelectorCache).AddFQDNSelector(0xc0004691c0, 0x26faf00, 0xc00acd6e70, 0x0, 0x0, 0xc004106590, 0xb, 0x0, 0x0, 0x40ca00) /go/src/github.com/cilium/cilium/pkg/policy/selectorcache.go:580 +0x13f github.com/cilium/cilium/pkg/policy.(*L4Filter).cacheFQDNSelector(...) /go/src/github.com/cilium/cilium/pkg/policy/l4.go:392 github.com/cilium/cilium/pkg/policy.(*L4Filter).cacheFQDNSelectors(0xc00acd6e70, 0xc008c7d100, 0x38, 0x38, 0xc0004691c0) /go/src/github.com/cilium/cilium/pkg/policy/l4.go:387 +0xad github.com/cilium/cilium/pkg/policy.createL4Filter(0x2739660, 0xc001c49680, 0xc006beab60, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, ...) /go/src/github.com/cilium/cilium/pkg/policy/l4.go:489 +0x7ce github.com/cilium/cilium/pkg/policy.createL4EgressFilter(...) /go/src/github.com/cilium/cilium/pkg/policy/l4.go:599 github.com/cilium/cilium/pkg/policy.mergeEgressPortProto(0x2739660, 0xc001c49680, 0xc0052087d0, 0xc006beab60, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, ...) /go/src/github.com/cilium/cilium/pkg/policy/rule.go:637 +0x187 github.com/cilium/cilium/pkg/policy.mergeEgress(0x2739660, 0xc001c49680, 0xc0052087d0, 0xc006beab60, 0x1, 0x1, 0x0, 0x0, 0x0, 0xc00b91f2c0, ...) /go/src/github.com/cilium/cilium/pkg/policy/rule.go:572 +0xdbd github.com/cilium/cilium/pkg/policy.(*rule).resolveEgressPolicy(0xc0079a0400, 0x2739660, 0xc001c49680, 0xc0052087d0, 0xc005208418, 0xc0090738f0, 0x0, 0x0, 0x0, 0xc0090738f0, ...) /go/src/github.com/cilium/cilium/pkg/policy/rule.go:675 +0x265 github.com/cilium/cilium/pkg/policy.ruleSlice.resolveL4EgressPolicy(0xc008d0a9c0, 0x5, 0x8, 0x2739660, 0xc001c49680, 0xc0052087d0, 0xc008d6c300, 0x0, 0x0) /go/src/github.com/cilium/cilium/pkg/policy/rules.go:106 +0x70b github.com/cilium/cilium/pkg/policy.(*Repository).resolvePolicyLocked(0xc000246460, 0xc0059fe820, 0xc0000066c3, 0xc0030b0960, 0xc006176201) /go/src/github.com/cilium/cilium/pkg/policy/repository.go:665 +0x3f9 github.com/cilium/cilium/pkg/policy.(*PolicyCache).updateSelectorPolicy(0xc000342b20, 0xc0059fe820, 0x3a5bf00, 0x0, 0x0) /go/src/github.com/cilium/cilium/pkg/policy/distillery.go:135 +0x148 github.com/cilium/cilium/pkg/policy.(*PolicyCache).UpdatePolicy(...) /go/src/github.com/cilium/cilium/pkg/policy/distillery.go:170 github.com/cilium/cilium/pkg/endpoint.(*Endpoint).regeneratePolicy(0xc001427080, 0x0, 0x0) /go/src/github.com/cilium/cilium/pkg/endpoint/policy.go:175 +0x255 github.com/cilium/cilium/pkg/endpoint.(*Endpoint).runPreCompilationSteps(0xc001427080, 0xc005d3a500, 0xc008d0a800, 0x0, 0x0) /go/src/github.com/cilium/cilium/pkg/endpoint/bpf.go:736 +0x96a github.com/cilium/cilium/pkg/endpoint.(*Endpoint).regenerateBPF(0xc001427080, 0xc005d3a500, 0x0, 0x0, 0x0, 0x0) /go/src/github.com/cilium/cilium/pkg/endpoint/bpf.go:508 +0x1e0 github.com/cilium/cilium/pkg/endpoint.(*Endpoint).regenerate(0xc001427080, 0xc005d3a500, 0x0, 0x0) /go/src/github.com/cilium/cilium/pkg/endpoint/policy.go:349 +0x841 github.com/cilium/cilium/pkg/endpoint.(*EndpointRegenerationEvent).Handle(0xc003530210, 0xc0068115c0) /go/src/github.com/cilium/cilium/pkg/endpoint/events.go:58 +0x459 github.com/cilium/cilium/pkg/eventqueue.(*EventQueue).Run.func1() /go/src/github.com/cilium/cilium/pkg/eventqueue/eventqueue.go:260 +0x187 sync.(*Once).doSlow(0xc005f60e58, 0xc009bbb330) /usr/local/go/src/sync/once.go:66 +0xe3 sync.(*Once).Do(0xc005f60e58, 0xc009bbb330) /usr/local/go/src/sync/once.go:57 +0x45 created by github.com/cilium/cilium/pkg/eventqueue.(*EventQueue).Run /go/src/github.com/cilium/cilium/pkg/eventqueue/eventqueue.go:248 +0xa9 ``` Routine 3 This third routine is holding the FQDN NameManager mutex while itself waiting on the ipcache mutex which is held by routine 1. ``` goroutine 7229843 [select, 48 minutes]: golang.org/x/sync/semaphore.(*Weighted).Acquire(0xc000464230, 0x2759d60, 0xc000058018, 0x40000000, 0xc0008f9740, 0x0) /go/src/github.com/cilium/cilium/vendor/golang.org/x/sync/semaphore/semaphore.go:60 +0x28e github.com/cilium/cilium/pkg/lock.(*SemaphoredMutex).Lock(...) /go/src/github.com/cilium/cilium/pkg/lock/semaphored_mutex.go:41 github.com/cilium/cilium/pkg/ipcache.(*IPCache).Upsert(0xc000464280, 0xc008858dd0, 0xd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1000596, 0x231acf1, ...) /go/src/github.com/cilium/cilium/pkg/ipcache/ipcache.go:184 +0x93 github.com/cilium/cilium/pkg/ipcache.allocateCIDRs(0xc004bb2a00, 0x98, 0x98, 0x0, 0x0, 0x0, 0x0, 0x0) /go/src/github.com/cilium/cilium/pkg/ipcache/cidr.go:97 +0x7aa github.com/cilium/cilium/pkg/ipcache.AllocateCIDRsForIPs(0xc0046a2000, 0x98, 0x200, 0x1, 0x1, 0x0, 0x1f4a440, 0xc00a768900) /go/src/github.com/cilium/cilium/pkg/ipcache/cidr.go:52 +0x65 main.identitiesForFQDNSelectorIPs(0xc00a7687e0, 0xc0008f9740, 0x0, 0x0, 0x0) /go/src/github.com/cilium/cilium/daemon/fqdn.go:87 +0x38e main.(*Daemon).updateSelectors(0xc000515680, 0x2759da0, 0xc000e09a40, 0xc00a7687e0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0) /go/src/github.com/cilium/cilium/daemon/fqdn.go:320 +0x40 github.com/cilium/cilium/pkg/fqdn.(*NameManager).UpdateGenerateDNS(0xc0005c3740, 0x2759da0, 0xc000e09a40, 0xbfae6eb0118dc6cd, 0x131b26c90d12, 0x3a5bf00, 0xc004cf9910, 0x0, 0x0, 0x0) /go/src/github.com/cilium/cilium/pkg/fqdn/name_manager.go:225 +0x48f main.(*Daemon).notifyOnDNSMsg(0xc000515680, 0xbfae6eb0118dc6cd, 0x131b26c90d12, 0x3a5bf00, 0xc001427080, 0xc00a2bdec0, 0x11, 0xc00ae52340, 0x10, 0xc003564f30, ...) /go/src/github.com/cilium/cilium/daemon/fqdn.go:584 +0x12cd github.com/cilium/cilium/pkg/fqdn/dnsproxy.(*DNSProxy).ServeDNS(0xc0008c8e80, 0x27818e0, 0xc00302e1e0, 0xc0072a4f30) /go/src/github.com/cilium/cilium/pkg/fqdn/dnsproxy/proxy.go:475 +0x238c github.com/miekg/dns.(*Server).serveDNS(0xc000563a40, 0xc001f8be00, 0x6b, 0x200, 0xc00302e1e0) /go/src/github.com/cilium/cilium/vendor/github.com/miekg/dns/server.go:597 +0x20b github.com/miekg/dns.(*Server).serveUDPPacket(0xc000563a40, 0xc00047d060, 0xc001f8be00, 0x6b, 0x200, 0xc000df0380, 0xc0092cf2b0) /go/src/github.com/cilium/cilium/vendor/github.com/miekg/dns/server.go:552 +0xb2 github.com/miekg/dns.(*Server).serveUDP.func2(0xc000563a40, 0xc00047d060, 0xc001f8be00, 0x6b, 0x200, 0xc000df0380, 0xc0092cf2b0) /go/src/github.com/cilium/cilium/vendor/github.com/miekg/dns/server.go:478 +0x67 created by github.com/miekg/dns.(*Server).serveUDP /go/src/github.com/cilium/cilium/vendor/github.com/miekg/dns/server.go:477 +0x272 ``` Fixes: #11946 Signed-off-by: Thomas Graf <thomas@cilium.io> Signed-off-by: Joe Stringer <joe@cilium.io>
- Loading branch information