initial commit

fastly · May 6, 2014 · 02384d0 · 02384d0
commit 02384d0
Show file tree

Hide file tree

Showing 5 changed files with 221 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+*\~
+*.pyc
diff --git a/README.md b/README.md
@@ -0,0 +1,16 @@
+# Avalanche
+
+Avalanche is a script that injects random, repeatable network faults on specific ports. It is useful for testing distributed systems.
+
+## Running
+
+    sudo ./avalanche
+
+## Possible Faults:
+
+On each iteration, Avalanche injects a fault with probability `p_fault` (set in settings.py). When a fault is injected, one of the following is picked according to the per-fault probabilities configured in `faults` in the same file:
+
+- High latency
+- 100% packet loss
+- Smaller percentage of packet loss
+- Reorder packets
diff --git a/avalanche b/avalanche
@@ -0,0 +1,116 @@
+#! /usr/bin/env python
+
+import os
+import sys
+import json
+import time
+import atexit
+import random
+import logging
+import settings
+import subprocess
+from optparse import OptionParser
+
+# Version string printed by --version and included in the startup log line.
+VERSION = "0.1.0"
+
+# Logger used throughout the script; records are emitted through a
+# StreamHandler (which writes to stderr when no stream is given).
+log = logging.getLogger('avalanche')
+log.addHandler(logging.StreamHandler())
+
+def die(msg):
+    log.error(msg)
+    exit(1)
+
+def call(cmd, exit_on_fail=True):
+    log.debug(cmd)
+    res = subprocess.call(cmd, shell=True, stderr=subprocess.STDOUT)
+    if res != 0 and exit_on_fail:
+        die("error: subprocess returned %d (not 0)"%(res))
+        return res
+
+def tc(iface, args):
+    return "tc qdisc add dev %s parent 1:3 handle 30: %s"%(iface, args)
+
+def generate_fault():
+    """pick a fault from faults uniformly at random with probability p_fault"""
+
+    if isinstance(settings.faults, list):
+        faults = {a:1.0/len(settings.faults) for a in settings.faults}
+    elif isinstance(settings.faults, dict):
+        faults = settings.faults
+    else:
+        die("can't parse faults")
+
+    # iterate through each action, and check to see if the random number fell into a range
+    # corresponding to the probability of the action. if we don't pick an action,
+    # choose to undo all actions.
+    r = random.uniform(0,1)
+    s = 0
+    for a,p in faults.items():
+        # pick an action with uniform probability equal to P(fault)/(number of faults)
+        s += settings.p_fault*p
+        if s >= r:
+            return a()
+    return None
+
+def clear_faults():
+    """undo all currently enabled faults"""
+    if settings.debug: return
+    for iface in settings.interfaces:
+        call("tc qdisc del dev %s root"%(iface), exit_on_fail=False)
+
+def cleanup(active_faults):
+    """run on exit"""
+    if active_faults:
+        log.info("Cleaning up...")
+        clear_faults()
+    log.info("Exiting.")
+
+if __name__ == "__main__":
+    parser = OptionParser()
+    parser.add_option("-d", "--debug", dest="debug", default=False,
+                      action="store_true", help="log the faults, but do not inject them")
+    parser.add_option("-v", "--version", dest="version", default=False,
+                      action="store_true", help="print the avalanche version and exit")
+    (opts, args) = parser.parse_args()
+
+    if opts.version:
+        print VERSION
+        exit(0)
+
+    log.setLevel(settings.log_level)
+
+    if opts.debug:
+        settings.debug = opts.debug
+        log.setLevel(logging.DEBUG)
+
+    if sum([v for v in settings.faults.values()]) != 1:
+        die("fault probabilities don't sum to 1")
+
+    active_faults = []
+    atexit.register(lambda: cleanup(active_faults))
+
+    log.info("Starting Avalanche v%s"%(VERSION))
+    log.info("seed=%d,delay=%dms,ports=%s"%(settings.seed, settings.delay,str(settings.ports)))
+
+    random.seed(settings.seed)
+    while True:
+        if active_faults:
+            clear_faults()
+            active_faults = []
+
+        fault = generate_fault()
+        if fault:
+            active_faults.append(fault)
+            log.info("fault: %s"%(fault.desc()))
+
+            if not settings.debug:
+                for iface in settings.interfaces:
+                    call("tc qdisc add dev %s root handle 1: prio"%(iface))
+                    call(tc(iface, fault.action()))
+                    for port in settings.ports:
+                        call("tc filter add dev %s parent 1:0 protocol ip u32 match ip dport %d 0xffff flowid 1:3"%(iface, port))
+                        call("tc filter add dev %s parent 1:0 protocol ip u32 match ip sport %d 0xffff flowid 1:3"%(iface, port))
+        else:
+            log.info("fault: none")
+
+        time.sleep(settings.delay)
diff --git a/faults.py b/faults.py
@@ -0,0 +1,55 @@
+import random
+
+class Partition:
+    # Partition the current server from all other servers
+
+    def action(self):
+        return "netem loss 100%"
+
+    def desc(self):
+        return "network partition"
+
+class PacketLoss:
+    # Drop packets with some probability
+
+    def __init__(self):
+        # percentage probability of dropping a packet
+        self.loss = random.randint(5, 10)
+
+    def action(self):
+        return "netem loss %d%%"%(self.loss)
+
+    def desc(self):
+        return "drop packets with probability %d%%"%(self.loss)
+
+class Latency:
+    # Add latency to all packets
+
+    def __init__(self):
+        # per-packet delay in ms
+        self.latency = random.randint(100, 1000)
+
+    def action(self):
+        return "netem delay %dms"%(self.latency)
+
+    def desc(self):
+        return "delay of %dms"%(self.latency)
+
+class Reorder:
+    # Reorder packets
+
+    def __init__(self):
+        # probability of continuing the delay
+        self.correlation = 50
+
+        # initial packet delay
+        self.delay = 10
+
+        # probability of reordering a packet
+        self.reorder = random.randint(10, 75)
+
+    def action(self):
+        return "netem delay %sms reorder %d%% %d%%" % (self.delay, 100-self.reorder, self.correlation)
+
+    def desc(self):
+        return "reorder after delay of %dms with probability %d and correlation %d" % (self.delay, 100-self.reorder, self.correlation)
diff --git a/settings.py b/settings.py
@@ -0,0 +1,32 @@
+import logging
+from faults import *
+
+# Seed for the random number generator. A fixed seed makes test runs
+# repeatable.
+seed = 1
+
+# Time between faults, in seconds.
+delay = 1
+
+# Probability that a fault occurs on a given round.
+p_fault = 0.5
+
+# If debug is true, log which fault would be injected, but don't inject it.
+debug = False
+
+# Only inject faults on these interfaces.
+interfaces = ["eth0"]
+
+# Only inject faults on these ports.
+ports = [2001]
+
+# Logging level.
+log_level = logging.INFO
+
+# Faults to choose from. If this is a list, every fault is equally likely.
+# If it is a dict, each value is the probability of the fault that is its key.
+faults = {
+    Partition: 0.2,
+    PacketLoss: 0.2,
+    Latency: 0.3,
+    Reorder: 0.3,
+}