From 7fecec92668db541b242f85753f55c1bfa04f233 Mon Sep 17 00:00:00 2001 From: Marek Majkowski Date: Fri, 30 Jan 2015 23:26:49 +0000 Subject: [PATCH] Release v0.1 --- .gitignore | 2 + LICENSE-BSD-CloudFlare | 27 ++++ Makefile | 60 +++++++- README.md | 40 +++++ RELEASE_NOTES | 2 + TODO | 5 - src/csiphash.c | 127 ++++++++++++++++ src/debug.h | 19 --- src/hashlimit.c | 92 ++++++++++++ src/hashlimit.h | 8 + src/main.c | 327 +++++++++++++++++++++++++---------------- src/{pcap.c => net.c} | 27 ++++ src/pmtud.h | 5 + src/uevent.c | 7 +- src/utils.c | 5 + 15 files changed, 600 insertions(+), 153 deletions(-) create mode 100644 LICENSE-BSD-CloudFlare create mode 100644 README.md create mode 100644 RELEASE_NOTES delete mode 100644 TODO create mode 100644 src/csiphash.c delete mode 100644 src/debug.h create mode 100644 src/hashlimit.c create mode 100644 src/hashlimit.h rename src/{pcap.c => net.c} (87%) diff --git a/.gitignore b/.gitignore index 1571b70..e873948 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ libpcap.a pmtud +core + diff --git a/LICENSE-BSD-CloudFlare b/LICENSE-BSD-CloudFlare new file mode 100644 index 0000000..876c4b6 --- /dev/null +++ b/LICENSE-BSD-CloudFlare @@ -0,0 +1,27 @@ +Copyright (c) 2015 CloudFlare, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of the CloudFlare, Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile index 012b3ac..9a6ea85 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,8 @@ all: pmtud pmtud: libpcap.a src/*.c src/*.h Makefile $(CC) $(COPTS) \ - src/main.c src/utils.c src/pcap.c src/uevent.c \ + src/main.c src/utils.c src/net.c src/uevent.c \ + src/hashlimit.c src/csiphash.c \ libpcap.a \ $(LDOPTS) \ -o pmtud @@ -51,3 +52,60 @@ distclean: clean format: clang-format-3.5 -i src/*.c src/*.h + + +# Release process +# --------------- +GITVER := $(shell git describe --tags --always --dirty=-dev) +VERSION := $(shell python -c 'print "$(GITVER)"[1:].partition("-")[0]') +ITERATION := $(shell python -c 'print ("$(GITVER)"[1:].partition("-")[2] or "0")') +NEXT_VERSION := v0.$(shell python -c 'print int("$(GITVER)"[1:].partition("-")[0][2:]) + 1') + +.PHONY: release + +release: + @echo "[*] Curr version: $(VERSION)-$(ITERATION)" + @echo "[*] Next version: $(NEXT_VERSION)" + echo "$(NEXT_VERSION) (`date '+%Y%m%d-%H%M'`)" > RELEASE_NOTES.tmp + git log --reverse --date=short --format="- %ad %s" tags/v$(VERSION)..HEAD >> RELEASE_NOTES.tmp + echo "" >> RELEASE_NOTES.tmp + cat RELEASE_NOTES >> RELEASE_NOTES.tmp + mv RELEASE_NOTES.tmp RELEASE_NOTES + git add RELEASE_NOTES + git commit -m 
"Release $(NEXT_VERSION)" + git tag $(NEXT_VERSION) + @echo "[*] To push the release run:" + @echo "git push origin master; git push origin $(NEXT_VERSION)" + +# Build process +# ------------- + +BIN_PREFIX ?= /usr/local/bin + +.PHONY: print-builddeps cf-package + +CFDEPENDENCIES = python flex bison valgrind gcc make + +print-builddeps: + @echo $(CFDEPENDENCIES) $(DEPENDENCIES) + + +cf-package: + @echo "[*] resetting submodules" + git submodule sync --quiet + git submodule update --init --recursive --quiet + @echo "[*] rebuilding" + -$(MAKE) clean + -$(MAKE) distclean + $(MAKE) pmtud BUILD=release CC=gcc + cp pmtud $(BIN_PREFIX) + + fakeroot fpm -C / \ + -s dir \ + -t deb \ + --deb-compression bzip2 \ + -v $(VERSION) \ + --iteration $(ITERATION) \ + -n pmtud \ + $(BIN_PREFIX)/pmtud + rm $(BIN_PREFIX)/pmtud diff --git a/README.md b/README.md new file mode 100644 index 0000000..b24bce5 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +Path MTU daemon +=============== + +With ECMP enabled, ICMP messages are mostly routed to the wrong +server. To fix that, let's broadcast the ICMP messages that we think +are worth it to every machine in the colo. Some reading: + + * https://tools.ietf.org/html/draft-jaeggli-v6ops-pmtud-ecmp-problem-00 + + +Path MTU daemon is a program that captures and broadcasts ICMP +messages related to MTU detection. It listens on an interface, +waiting for ICMP messages (IPv4 type 3 code 4 or IPv6 type 2 code 0), +and forwards them verbatim to the broadcast ethernet address. + +Once again, it listens waiting for packets matching: + + ((icmp and icmp[0] == 3 and icmp[1] == 4) or + (icmp6 and ip6[40+0] == 2 and ip6[40+1] == 0)) and + (ether dst not ff:ff:ff:ff:ff:ff) + +Packets that match and have an appropriate length are forwarded to the +ethernet broadcast address ff:ff:ff:ff:ff:ff. 
+ +To debug use tcpdump: + + sudo tcpdump -s0 -e -ni eth0 '((icmp and icmp[0] == 3 and icmp[1] == 4) or + (icmp6 and ip6[40+0] == 2 and ip6[40+1] == 0))' + + +To build type: + + git submodule update --init --recursive + make + + +To test run it in dry-run and verbose mode: + + sudo ./pmtud --iface=eth0 --dry-run -v -v -v + diff --git a/RELEASE_NOTES b/RELEASE_NOTES new file mode 100644 index 0000000..e62e921 --- /dev/null +++ b/RELEASE_NOTES @@ -0,0 +1,2 @@ +v0.1 (20150131-0037) + diff --git a/TODO b/TODO deleted file mode 100644 index 58781b3..0000000 --- a/TODO +++ /dev/null @@ -1,5 +0,0 @@ -1. code -2. netstat -6 -3. package -4. salt -5. 25m5 diff --git a/src/csiphash.c b/src/csiphash.c new file mode 100644 index 0000000..3899f0d --- /dev/null +++ b/src/csiphash.c @@ -0,0 +1,127 @@ +/* + Copyright (c) 2013 Marek Majkowski + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + + Original location: + https://github.com/majek/csiphash/ + + Solution inspired by code from: + Samuel Neves (supercop/crypto_auth/siphash24/little) + djb (supercop/crypto_auth/siphash24/little2) + Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) +*/ + +#include + +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \ + __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define _le64toh(x) ((uint64_t)(x)) +#elif defined(_WIN32) +/* Windows is always little endian, unless you're on xbox360 + http://msdn.microsoft.com/en-us/library/b0084kay(v=vs.80).aspx */ +#define _le64toh(x) ((uint64_t)(x)) +#elif defined(__APPLE__) +#include +#define _le64toh(x) OSSwapLittleToHostInt64(x) +#else + +/* See: http://sourceforge.net/p/predef/wiki/Endianness/ */ +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) +#include +#else +#include +#endif +#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \ + __BYTE_ORDER == __LITTLE_ENDIAN +#define _le64toh(x) ((uint64_t)(x)) +#else +#define _le64toh(x) le64toh(x) +#endif + +#endif + +#define ROTATE(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b)))) + +#define HALF_ROUND(a, b, c, d, s, t) \ + a += b; \ + c += d; \ + b = ROTATE(b, s) ^ a; \ + d = ROTATE(d, t) ^ c; \ + a = ROTATE(a, 32); + +#define DOUBLE_ROUND(v0, v1, v2, v3) \ + HALF_ROUND(v0, v1, v2, v3, 13, 16); \ + HALF_ROUND(v2, v1, v0, v3, 17, 21); \ + HALF_ROUND(v0, v1, v2, v3, 13, 16); \ + HALF_ROUND(v2, v1, v0, v3, 17, 21); + +uint64_t siphash24(const void *src, unsigned long src_sz, const char key[16]) +{ + const uint64_t *_key = (uint64_t *)key; + uint64_t k0 = _le64toh(_key[0]); + uint64_t k1 = _le64toh(_key[1]); + uint64_t b = (uint64_t)src_sz << 56; + const uint64_t *in = (uint64_t *)src; + + uint64_t v0 = k0 ^ 0x736f6d6570736575ULL; + uint64_t v1 = k1 ^ 0x646f72616e646f6dULL; + uint64_t v2 = k0 ^ 0x6c7967656e657261ULL; + uint64_t v3 = k1 ^ 0x7465646279746573ULL; + + while (src_sz >= 8) { + uint64_t mi = _le64toh(*in); + in 
+= 1; + src_sz -= 8; + v3 ^= mi; + DOUBLE_ROUND(v0, v1, v2, v3); + v0 ^= mi; + } + + uint64_t t = 0; + uint8_t *pt = (uint8_t *)&t; + uint8_t *m = (uint8_t *)in; + switch (src_sz) { + case 7: + pt[6] = m[6]; + case 6: + pt[5] = m[5]; + case 5: + pt[4] = m[4]; + case 4: + *((uint32_t *)&pt[0]) = *((uint32_t *)&m[0]); + break; + case 3: + pt[2] = m[2]; + case 2: + pt[1] = m[1]; + case 1: + pt[0] = m[0]; + } + b |= _le64toh(t); + + v3 ^= b; + DOUBLE_ROUND(v0, v1, v2, v3); + v0 ^= b; + v2 ^= 0xff; + DOUBLE_ROUND(v0, v1, v2, v3); + DOUBLE_ROUND(v0, v1, v2, v3); + return (v0 ^ v1) ^ (v2 ^ v3); +} diff --git a/src/debug.h b/src/debug.h deleted file mode 100644 index 3e0564c..0000000 --- a/src/debug.h +++ /dev/null @@ -1,19 +0,0 @@ -#define ERRORF(x...) fprintf(stderr, x) - -#define FATAL(x...) \ - do { \ - ERRORF("[-] PROGRAM ABORT : " x); \ - ERRORF("\n\tLocation : %s(), %s:%u\n\n", __FUNCTION__, \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } while (0) - -#define PFATAL(x...) \ - do { \ - ERRORF("[-] SYSTEM ERROR : " x); \ - ERRORF("\n\tLocation : %s(), %s:%u\n", __FUNCTION__, __FILE__, \ - __LINE__); \ - perror(" OS message "); \ - ERRORF("\n"); \ - exit(EXIT_FAILURE); \ - } while (0) diff --git a/src/hashlimit.c b/src/hashlimit.c new file mode 100644 index 0000000..ad74c90 --- /dev/null +++ b/src/hashlimit.c @@ -0,0 +1,92 @@ +// PMTUD +// +// Copyright (c) 2015 CloudFlare, Inc. +// +// Rate limiting algorithm inspired by linux iptables hashlimit module. 
+// http://lxr.free-electrons.com/source/net/netfilter/xt_hashlimit.c?v=3.17#L383 +// http://lxr.free-electrons.com/source/net/sched/sch_tbf.c?v=3.17#L26 + +#include +#include +#include +#include +#include +#include +#include + +uint64_t siphash24(const void *src, unsigned long src_sz, + const unsigned char key[16]); + +#define TIMESPEC_NSEC(ts) ((ts)->tv_sec * 1000000000ULL + (ts)->tv_nsec) +#define MSEC_NSEC(ms) ((ms)*1000000ULL) + +inline static uint64_t realtime_now() +{ + struct timespec now; + clock_gettime(CLOCK_REALTIME, &now); + return TIMESPEC_NSEC(&now); +} + +struct hl_item +{ + uint64_t credit; + uint64_t prev; +}; + +struct hashlimit +{ + unsigned size; + + uint64_t credit_max; + uint64_t touch_cost; + uint8_t key[16]; + + struct hl_item items[0]; +}; + +struct hashlimit *hashlimit_alloc(unsigned size, double rate_pps, double burst) +{ + struct hashlimit *hl = calloc(1, sizeof(struct hashlimit) + + size * sizeof(struct hl_item)); + + hl->size = size; + hl->touch_cost = (double)(MSEC_NSEC(1000ULL)) / rate_pps; + hl->credit_max = burst * hl->touch_cost; + + /* Random numbers for poor */ + uint64_t a = realtime_now() | getpid(); + memcpy(&hl->key[0], &a, 8); + a = realtime_now() | getppid(); + memcpy(&hl->key[8], &a, 8); + + return hl; +} + +void hashlimit_free(struct hashlimit *hl) { free(hl); } + +int hashlimit_touch(struct hashlimit *hl, unsigned idx) +{ + struct hl_item *item = &hl->items[idx]; + + uint64_t now = realtime_now(); + uint64_t delta = now - item->prev; + item->credit += delta; + item->prev = now; + + if (item->credit > hl->credit_max) { + item->credit = hl->credit_max; + } + + if (item->credit >= hl->touch_cost) { + item->credit -= hl->touch_cost; + return 1; + } + return 0; +} + +int hashlimit_touch_hash(struct hashlimit *hl, const uint8_t *h, int h_len) +{ + + uint64_t hash = siphash24(h, h_len, hl->key); + return hashlimit_touch(hl, hash % hl->size); +} diff --git a/src/hashlimit.h b/src/hashlimit.h new file mode 100644 index 
0000000..aab648e --- /dev/null +++ b/src/hashlimit.h @@ -0,0 +1,8 @@ +// PMTUD +// +// Copyright (c) 2015 CloudFlare, Inc. + +struct hashlimit *hashlimit_alloc(unsigned size, double rate_pps, double burst); +int hashlimit_touch(struct hashlimit *hl, unsigned idx); +int hashlimit_touch_hash(struct hashlimit *hl, const uint8_t *h, int h_len); +void hashlimit_free(struct hashlimit *hl); diff --git a/src/main.c b/src/main.c index 4d89fbc..f6872d6 100644 --- a/src/main.c +++ b/src/main.c @@ -1,3 +1,7 @@ +// PMTUD +// +// Copyright (c) 2015 CloudFlare, Inc. + #include #include #include @@ -9,9 +13,13 @@ #include #include +#include "hashlimit.h" #include "pmtud.h" #include "uevent.h" +#define IFACE_RATE_PPS 10.0 +#define SRC_RATE_PPS 1.0 + static void usage() { fprintf(stderr, @@ -19,28 +27,30 @@ static void usage() "\n" " pmtud [options] \n" "\n" - "Path MTU daemon is a program that captures and broadcasts " - "ICMP\n" - "messages related to MTU detection. It listens on an " - "interface,\n" - "waiting for ICMP messages (ip type 3 code 4 or ipv6 type 2 " - "code 0)\n" - "and it forwards them verbatim to broadcast ethernet address.\n" + "Path MTU daemon is a program that captures and broadcasts\n" + "ICMP messages related to MTU detection. 
It listens on an\n" + "interface, waiting for ICMP messages (IPv4 type 3 code 4 or\n" + "IPv6 type 2 code 0) and it forwards them verbatim to the\n" + "broadcast ethernet address.\n" "\n" "Options:\n" "\n" " --iface Network interface to listen on\n" " --src-rate Pps limit from single source " - "(default=10 pss)\n" + "(default=%.1f pps)\n" " --iface-rate Pps limit to send on a single " "interface " - "(default=100 pps)\n" + "(default=%.1f pps)\n" + " --verbose Print forwarded packets on screen\n" + " --dry-run Don't inject packets, just dry run\n" " --help Print this message\n" "\n" "Example:\n" "\n" - " pmtud --iface=eth2 --src-rate=1.0 --iface-rate=10.0\n" - "\n"); + " pmtud --iface=eth2 --src-rate=%.1f --iface-rate=%.1f\n" + "\n", + SRC_RATE_PPS, IFACE_RATE_PPS, SRC_RATE_PPS, IFACE_RATE_PPS); + exit(-1); } #define SNAPLEN 2048 @@ -51,7 +61,7 @@ static void usage() static int on_signal(struct uevent *uevent, int sfd, int mask, void *userdata) { - int *done = userdata; + volatile int *done = userdata; int buf[512]; /* Drain. 
Socket should be NONBLOCK */ int r = read(sfd, buf, sizeof(buf)); @@ -62,30 +72,167 @@ static int on_signal(struct uevent *uevent, int sfd, int mask, void *userdata) *done = 1; return 0; } -static int handle_pcap(struct uevent *uevent, int sfd, int mask, - void *userdata); struct state { pcap_t *pcap; int raw_sd; + struct hashlimit *sources; + struct hashlimit *ifaces; + int verbose; + int dry_run; }; -int main(int argc, char *argv[]) +static int handle_packet(struct state *state, const uint8_t *p, int data_len) { + /* assumming DLT_EN10MB */ + + /* 14 ethernet, 20 ipv4, 8 icmp, 8 IPv4 on payload */ + if (data_len < 14 + 20 + 8 + 8) { + return 0; + } + + if (p[0] == 0xff && p[1] == 0xff && p[2] == 0xff && p[3] == 0xff && + p[4] == 0xff && p[5] == 0xff) { + return -1; + } + + const uint8_t *hash = NULL; + int hash_len; + + int l3_offset = 14; + uint16_t eth_type = (((uint16_t)p[12]) << 8) | (uint16_t)p[13]; + if (eth_type == 0x8100) { + eth_type = (((uint16_t)p[16]) << 8) | (uint16_t)p[17]; + l3_offset = 18; + } + + int valid = 0; + if (eth_type == 0x0800 && (p[l3_offset] & 0xF0) == 0x40) { + uint8_t protocol = p[l3_offset + 9]; + /* header: 20 bytes of IPv4, 8 bytes of ICMP, + * payload: 20 bytes of IPv4, 8 bytes of TCP */ + if (protocol == 1 && data_len >= l3_offset + 20 + 8 + 20 + 8) { + valid = 1; + hash = &p[l3_offset + 12]; + hash_len = 4; + } + } + + if (eth_type == 0x86dd && (p[l3_offset] & 0xF0) == 0x60) { + uint8_t protocol = p[l3_offset + 6]; + /* header, 40 bytes of IPv6, 8 bytes of ICMP + * payload: 32 bytes of IPv6 payload */ + if (protocol == 58 && data_len >= l3_offset + 40 + 8 + 32) { + valid = 1; + hash = &p[l3_offset + 8]; + hash_len = 16; + } + } + + if (valid == 0 || hash == NULL || hash_len == 0) { + return -1; + } + + uint8_t dst_mac[6]; + memcpy(dst_mac, p, 6); + + /* alright, write there anyway */ + uint8_t *pp = (uint8_t *)p; + + int i; + for (i = 0; i < 6; i++) { + pp[i] = 0xff; + } + + for (i = 0; i < 6; i++) { + pp[6 + i] = dst_mac[i]; + 
} + int sources_ok = hashlimit_touch_hash(state->sources, hash, hash_len); + int ifaces_ok = 0; + if (sources_ok) { + ifaces_ok = hashlimit_touch(state->ifaces, 0); + } + + if (state->verbose > 2) { + printf("%s hashlimits{src=%i if=%i} %s\n", + ip_to_string(hash, hash_len), sources_ok, ifaces_ok, + to_hex(pp, data_len)); + } else if (state->verbose > 1) { + printf("%s hashlimits{src=%i if=%i}\n", + ip_to_string(hash, hash_len), sources_ok, ifaces_ok); + } + + if (!sources_ok || !ifaces_ok) { + return 0; + } + + if (state->verbose == 1) { + printf("%s hashlimits{src=%i if=%i}\n", + ip_to_string(hash, hash_len), sources_ok, ifaces_ok); + } + + if (state->dry_run == 0) { + int r = send(state->raw_sd, pp, data_len, 0); + if (r < 0) { + PFATAL("send()"); + } + } + return 1; +} + +static int handle_pcap(struct uevent *uevent, int sfd, int mask, void *userdata) +{ + struct state *state = userdata; + + while (1) { + struct pcap_pkthdr *hdr; + const uint8_t *data; + + int r = pcap_next_ex(state->pcap, &hdr, &data); + + switch (r) { + case 1: + if (hdr->len == hdr->caplen) { + handle_packet(state, data, hdr->caplen); + } else { + /* Partial caputre */ + } + break; + + case 0: + /* Timeout */ + return 0; + + case -1: + FATAL("pcap_next_ex(): %s", pcap_geterr(state->pcap)); + break; + + case -2: + return 0; + } + } +} + +int main(int argc, char *argv[]) +{ static struct option long_options[] = { {"iface", required_argument, 0, 'i'}, {"src-rate", required_argument, 0, 's'}, - {"iface-rate", no_argument, 0, 'r'}, + {"iface-rate", required_argument, 0, 'r'}, + {"verbose", no_argument, 0, 'v'}, + {"dry-run", no_argument, 0, 'd'}, {"help", no_argument, 0, 'h'}, {NULL, 0, 0, 0}}; const char *optstring = optstring_from_long_options(long_options); const char *iface = NULL; - double src_rate = 10; - double iface_rate = 100; + double src_rate = SRC_RATE_PPS; + double iface_rate = IFACE_RATE_PPS; + int verbose = 0; + int dry_run = 0; optind = 1; while (1) { @@ -101,9 +248,11 @@ int 
main(int argc, char *argv[]) case 0: FATAL("Unknown option: %s", argv[optind]); break; + case 'h': usage(); break; + case '?': exit(-1); break; @@ -111,11 +260,27 @@ int main(int argc, char *argv[]) case 'i': iface = optarg; break; + case 's': src_rate = atof(optarg); + if (src_rate <= 0.0) { + FATAL("Rates must be greater than zero"); + } break; + case 'r': iface_rate = atof(optarg); + if (iface_rate <= 0.0) { + FATAL("Rates must be greater than zero"); + } + break; + + case 'v': + verbose++; + break; + + case 'd': + dry_run = 1; break; default: @@ -140,23 +305,31 @@ int main(int argc, char *argv[]) struct state state; state.pcap = setup_pcap(iface, BPF_FILTER, SNAPLEN, &stats); state.raw_sd = setup_raw(iface); + state.sources = hashlimit_alloc(8191, src_rate, src_rate * 1.9); + state.ifaces = hashlimit_alloc(1, iface_rate, iface_rate * 1.9); + state.verbose = verbose; + state.dry_run = dry_run; int pcap_fd = pcap_get_selectable_fd(state.pcap); if (pcap_fd < 0) { PFATAL("pcap_get_selectable_fd()"); } - int done = 0; + volatile int done = 0; struct uevent uevent; uevent_new(&uevent); uevent_yield(&uevent, signal_desc(SIGINT), UEVENT_READ, on_signal, - &done); + (void *)&done); uevent_yield(&uevent, signal_desc(SIGTERM), UEVENT_READ, on_signal, - &done); + (void *)&done); uevent_yield(&uevent, pcap_fd, UEVENT_READ, handle_pcap, &state); - fprintf(stderr, "[*] #%i, Started pmtud iface=%s\n", getpid(), - str_quote(iface)); + fprintf(stderr, + "[*] #%i Started pmtud on %s rates={iface=%.1f pps source=%.1f " + "pps}, " + "verbose=%i, dry_run=%i\n", + getpid(), str_quote(iface), iface_rate, src_rate, verbose, + dry_run); while (done == 0) { struct timeval timeout = @@ -166,115 +339,15 @@ int main(int argc, char *argv[]) continue; } } - fprintf(stderr, "[*] #%i, Quitting\n", getpid()); + fprintf(stderr, "[*] #%i Quitting\n", getpid()); unsetup_pcap(state.pcap, iface, &stats); - fprintf(stderr, "[*] #%i, recv=%i drop=%i ifdrop=%i\n", getpid(), + fprintf(stderr, "[*] #%i 
recv=%i drop=%i ifdrop=%i\n", getpid(), stats.ps_recv, stats.ps_drop, stats.ps_ifdrop); close(state.raw_sd); + hashlimit_free(state.sources); + hashlimit_free(state.ifaces); return 0; } - -static int handle_packet(struct state *state, const uint8_t *p, int data_len); - -static int handle_pcap(struct uevent *uevent, int sfd, int mask, void *userdata) -{ - struct state *state = userdata; - - while (1) { - struct timespec ts = (struct timespec){0, 0}; - - struct pcap_pkthdr *hdr; - const uint8_t *data; - - int r = pcap_next_ex(state->pcap, &hdr, &data); - - switch (r) { - case 1: - ts = NSEC_TIMESPEC(TIMEVAL_NSEC(&hdr->ts)); - if (hdr->len == hdr->caplen) { - handle_packet(state, data, hdr->caplen); - } else { - /* Partial caputre */ - } - break; - - case 0: - /* Timeout */ - return 0; - - case -1: - FATAL("pcap_next_ex(): %s", pcap_geterr(state->pcap)); - break; - - case -2: - return 0; - } - } -} - -static int handle_packet(struct state *state, const uint8_t *p, int data_len) -{ - /* assumming DLT_EN10MB */ - - /* 14 ethernet, 20 ipv4, 8 icmp, 8 IPv4 on payload */ - if (data_len < 14 + 20 + 8 + 8) { - return 0; - } - - if (p[0] == 0xff && p[1] == 0xff && p[2] == 0xff && p[3] == 0xff && - p[4] == 0xff && p[5] == 0xff) { - return 0; - } - - int l3_offset = 14; - uint16_t eth_type = (((uint16_t)p[12]) << 8) | (uint16_t)p[13]; - if (eth_type == 0x8100) { - eth_type = (((uint16_t)p[16]) << 8) | (uint16_t)p[17]; - l3_offset = 18; - } - - int valid = 0; - if (eth_type == 0x0800 && (p[l3_offset] & 0xF0) == 0x40) { - uint8_t protocol = p[l3_offset + 9]; - if (protocol == 1 && data_len >= l3_offset + 20 + 8 + 8) { - valid = 1; - } - } - - if (eth_type == 0x86dd && (p[l3_offset] & 0xF0) == 0x60) { - uint8_t protocol = p[l3_offset + 6]; - if (protocol == 58 && data_len >= l3_offset + 40 + 8 + 32) { - valid = 1; - } - } - - if (valid == 0) { - return 0; - } - - uint8_t dst_mac[6]; - memcpy(dst_mac, p, 6); - - /* allright, write there anyway */ - uint8_t *pp = (uint8_t *)p; - - 
int i; - for (i = 0; i < 6; i++) { - pp[i] = 0xff; - } - - for (i = 0; i < 6; i++) { - pp[6 + i] = dst_mac[i]; - } - - /* printf("> %s\n", to_hex(pp, data_len)); */ - - int r = send(state->raw_sd, pp, data_len, 0); - if (r < 0) { - PFATAL("send()"); - } - return 1; -} diff --git a/src/pcap.c b/src/net.c similarity index 87% rename from src/pcap.c rename to src/net.c index 600fd02..692898b 100644 --- a/src/pcap.c +++ b/src/net.c @@ -1,3 +1,7 @@ +// PMTUD +// +// Copyright (c) 2015 CloudFlare, Inc. + #include #include #include @@ -147,3 +151,26 @@ int setup_raw(const char *iface) /* } */ return s; } + +const char *ip_to_string(const uint8_t *p, int p_len) +{ + static char dst[INET6_ADDRSTRLEN + 1]; + const char *r = NULL; + + if (p_len == 4) { + struct in_addr addr; + memcpy(&addr, p, 4); + r = inet_ntop(AF_INET, &addr, dst, INET6_ADDRSTRLEN); + } + if (p_len == 16) { + struct in6_addr addr; + memcpy(&addr, p, 16); + r = inet_ntop(AF_INET6, &addr, dst, INET6_ADDRSTRLEN); + } + + if (r == NULL) { + dst[0] = '?'; + dst[1] = 0x00; + } + return dst; +} diff --git a/src/pmtud.h b/src/pmtud.h index fe02ad6..e16fc2f 100644 --- a/src/pmtud.h +++ b/src/pmtud.h @@ -1,3 +1,7 @@ +// PMTUD +// +// Copyright (c) 2015 CloudFlare, Inc. + #define ERRORF(x...) fprintf(stderr, x) #define FATAL(x...) \ @@ -42,3 +46,4 @@ pcap_t *setup_pcap(const char *iface, const char *bpf_filter, int snap_len, struct pcap_stat *stats); void unsetup_pcap(pcap_t *pcap, const char *iface, struct pcap_stat *stats); int setup_raw(const char *iface); +const char *ip_to_string(const uint8_t *p, int p_len); diff --git a/src/uevent.c b/src/uevent.c index 8918f57..27bc3b7 100644 --- a/src/uevent.c +++ b/src/uevent.c @@ -1,3 +1,7 @@ +// PMTUD +// +// Copyright (c) 2015 CloudFlare, Inc. 
+ #include #include #include @@ -11,8 +15,9 @@ struct timespec uevent_now; struct uevent *uevent_new(struct uevent *uevent) { - if (!uevent) + if (!uevent) { uevent = malloc(sizeof(struct uevent)); + } memset(uevent, 0, sizeof(struct uevent)); uevent->used_slots = 0; uevent->max_fd = 0; diff --git a/src/utils.c b/src/utils.c index 16b5bee..ab77b49 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1,3 +1,7 @@ +// PMTUD +// +// Copyright (c) 2015 CloudFlare, Inc. + #include #include #include @@ -34,6 +38,7 @@ const char *optstring_from_long_options(const struct option *opt) if (osp - optstring >= (int)sizeof(optstring)) { abort(); } + return optstring; }