### E10S Experiment Aurora: Top hang stacks

In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import IPython

from __future__ import division
from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties
from montecarlino import grouped_permutation_test

# Render figures inline; %pylab also populates the interactive namespace with
# numpy and matplotlib names.
%pylab inline
# Default figure size for the notebook (presumably width, height in inches — confirm).
IPython.core.pylabtools.figsize(16, 7)

Unable to parse whitelist (/home/hadoop/anaconda/lib/python2.7/site-packages/moztelemetry/bucket-whitelist.json). Assuming all histograms are acceptable.
Populating the interactive namespace from numpy and matplotlib


In [2]:
# Show the Spark context's default parallelism. `sc` is not defined in this
# notebook; presumably it is injected by the Spark kernel — confirm.
sc.defaultParallelism

128

#### Get e10s and non-e10s partitions

In [3]:
def is_in_e10s_experiment(ping):
    """Return True when `ping` is enrolled in the e10s Aurora experiment.

    A ping qualifies when its active experiment id is
    "e10s-enabled-aurora-20151020@experiments.mozilla.org" and its branch is
    either "control" or "experiment".

    Pings that lack any of the expected fields, or whose intermediate values
    are not subscriptable (e.g. ``activeExperiment`` is None), are treated as
    not enrolled and yield False.
    """
    try:
        experiment = ping["environment"]["addons"]["activeExperiment"]
        return experiment["id"] == "e10s-enabled-aurora-20151020@experiments.mozilla.org" and \
               experiment["branch"] in ("control", "experiment")
    except (KeyError, TypeError):
        # Only swallow "field missing / wrong shape" errors. A bare `except:`
        # would also hide KeyboardInterrupt, SystemExit and genuine bugs.
        return False

In [4]:
# Aurora 43.0a2 pings submitted 2015-10-23 .. 2015-10-27, restricted to
# clients enrolled in the e10s experiment.
pings = get_pings(
    sc,
    app="Firefox",
    channel="aurora",
    version="43.0a2",
    submission_date=("20151023", "20151027")
).filter(is_in_e10s_experiment)

How many pings do we have in each branch?

In [5]:
# Count pings per experiment branch: emit (branch, 1) and sum by key.
(pings
    .map(lambda ping: (ping["environment"]["addons"]["activeExperiment"]["branch"], 1))
    .reduceByKey(lambda left, right: left + right)
    .collect())

[(u'experiment', 263044), (u'control', 258968)]

Get a single ping for each client, but make sure it is not the client's first ping: the experiment branch is not enforced until the next restart.

In [6]:
# Map each clientId to the documentId of that client's earliest ping
# (the one with the smallest meta.creationTimestamp).
first_ping_of_client = (
    pings
    .map(lambda ping: (ping["clientId"], ping))
    .reduceByKey(lambda a, b: a if a["meta"]["creationTimestamp"] < b["meta"]["creationTimestamp"] else b)
    .map(lambda pair: (pair[0], pair[1]["meta"]["documentId"]))
    .collectAsMap()
)

In [7]:
def not_first(ping):
    """Return True unless `ping` is the earliest ping of its client.

    `first_ping_of_client` maps each clientId to the documentId of that
    client's earliest ping; a ping is "first" when its documentId equals
    that entry.
    """
    # Compare for equality. The previous `not a in b` form is a substring test
    # when both ids are strings, which only coincidentally acted like `!=`.
    return ping["meta"]["documentId"] != first_ping_of_client[ping["clientId"]]

# Drop every client's first ping, then keep one ping per client. Which ping is
# kept is decided inside moztelemetry's get_one_ping_per_client (not visible here).
subset = get_one_ping_per_client(pings.filter(not_first))

How many clients do we have in each branch?

In [8]:
# Count clients per experiment branch (`subset` holds one ping per client).
(subset
    .map(lambda ping: (ping["environment"]["addons"]["activeExperiment"]["branch"], 1))
    .reduceByKey(lambda left, right: left + right)
    .collect())

[(u'experiment', 28310), (u'control', 28236)]

How many clients have a mismatching experiment state?

In [9]:
def e10s_status_mismatch(ping):
    """Pair a ping's experiment branch with an e10s mismatch flag.

    Returns ``(branch, mismatch)``. ``mismatch`` is True when the
    ``e10sEnabled`` setting reported by the ping disagrees with what the
    branch implies: enabled for "experiment", disabled for any other branch.
    """
    environment = ping["environment"]
    branch = environment["addons"]["activeExperiment"]["branch"]
    expected_e10s = (branch == "experiment")
    reported_e10s = environment["settings"]["e10sEnabled"]
    return (branch, expected_e10s != reported_e10s)

In [10]:
# Per-branch number of mismatching clients: the boolean mismatch flags are
# summed by reduceByKey.
subset.map(e10s_status_mismatch).reduceByKey(lambda x, y: x + y).collect()

[(u'experiment', 0), (u'control', 59)]

Exclude mismatching clients

In [11]:
# Keep only pings whose e10s setting agrees with their branch (element [1] of
# the helper's result is the mismatch flag). This rebinds `subset`, so all
# later cells operate on the filtered RDD.
subset = subset.filter(lambda p: not e10s_status_mismatch(p)[1])

In [12]:
# Sanity check: after filtering, every branch should report 0 mismatches.
subset.map(e10s_status_mismatch).reduceByKey(lambda x, y: x + y).collect()

[(u'experiment', 0), (u'control', 0)]

#### Get top stacks

In [13]:
# Split clients by the e10s setting reported in their ping:
# e_subset has e10s enabled, n_subset has it disabled.
e_subset = subset.filter(lambda p: p["environment"]["settings"]["e10sEnabled"])
n_subset = subset.filter(lambda p: not p["environment"]["settings"]["e10sEnabled"])

In [14]:
def get_stacks(ping):
    """Yield ``(stack, hits)`` for each non-empty hang stack of the Gecko thread.

    ``stack`` is the hang's stack converted to a tuple (so it can be used as a
    key), and ``hits`` is the sum of the values in that hang's histogram.
    Threads not named "Gecko" and hangs with an empty stack are skipped.
    A ping without ``threadHangStats`` yields nothing.
    """
    for thread_stats in ping["payload"].get("threadHangStats", {}):
        if thread_stats["name"] != "Gecko":
            continue
        for hang_entry in thread_stats["hangs"]:
            frames = hang_entry["stack"]
            if not frames:
                continue
            bucket_counts = hang_entry["histogram"]["values"]
            yield (tuple(frames), sum(bucket_counts.values()))

def get_top_frames(subset):
    """Group hang stacks by their last frame and total up their hits.

    Parameters:
        subset: RDD of pings; it is consumed via flatMap / reduceByKey /
            collectAsMap, so the full stack -> hits map ends up on the driver.

    Returns:
        ``(top_frames, total_hits)`` where ``top_frames`` maps each last frame
        to ``{"frame": frame, "stacks": [(stack, hits), ...], "hits": n}``.
        The ``stacks`` list is ordered by hits, highest first, and ``n`` is the
        sum of its hits. ``total_hits`` is the sum of hits over all stacks.
    """
    stacks = subset.flatMap(get_stacks).reduceByKey(lambda a, b: a + b).collectAsMap()
    total_hits = 0
    top_frames = {}
    # `.items()` works on both Python 2 and 3 (`iteritems` is Python 2 only).
    for stack, hits in stacks.items():
        # The last element of the stack is treated as its "top frame".
        stack_top_frame = stack[-1]
        if stack_top_frame not in top_frames:
            top_frames[stack_top_frame] = { "frame": stack_top_frame, "stacks": [], "hits": 0 }

        top_frame = top_frames[stack_top_frame]
        top_frame["stacks"].append((stack, hits))
        top_frame["hits"] += hits
        total_hits += hits

    # Sort each frame's stacks by hits once, after all stacks are collected.
    # Re-sorting after every append was quadratic; because the sort is stable,
    # a single final sort produces the same order.
    for top_frame in top_frames.values():
        top_frame["stacks"].sort(key=lambda d: d[1], reverse=True)

    return top_frames, total_hits

def get_top_stacks(subset):
    """Return the top-frame entries of `subset` ranked by hits, plus total hits.

    The first element is the list of per-frame entries produced by
    get_top_frames, ordered from most to fewest hits; the second is the
    total number of hits across all stacks.
    """
    frames_by_name, total_hits = get_top_frames(subset)
    ranked_frames = list(frames_by_name.values())
    ranked_frames.sort(key=lambda entry: entry["hits"], reverse=True)
    return ranked_frames, total_hits

def get_stack_hits(stacks, stack):
    """Look up the hit count recorded for `stack` in `stacks`.

    `stacks` is an iterable of ``(stack, hits)`` pairs. The hits of the first
    pair whose stack equals `stack` are returned, or 0 when there is no match.
    """
    return next((hits for candidate, hits in stacks if candidate == stack), 0)

In [15]:
# e10s side: top-frame entries ranked by hits (most first) and the total hit count.
e_top_stacks, e_total_hits = get_top_stacks(e_subset)

In [16]:
# Non-e10s side: keep the frame-keyed dict rather than a ranked list, because
# print_top_frames looks non-e10s frames up by name.
n_top_frames, n_total_hits = get_top_frames(n_subset)

In [17]:
def print_top_frames(frame_count, stack_count):
    """Print the leading e10s hang frames next to their non-e10s share.

    Reads the notebook-level results `e_top_stacks` / `e_total_hits` (e10s)
    and `n_top_frames` / `n_total_hits` (non-e10s).

    For each of the first `frame_count` e10s frames, one line is printed as
    ``e10s% (non-e10s%): frame (e10s hits)``. It is followed by up to
    `stack_count` of that frame's stacks, each with its own e10s and non-e10s
    percentages and one stack frame per line.
    """
    for e_entry in e_top_stacks[:frame_count]:
        frame_name = e_entry["frame"]
        # Frames never seen without e10s fall back to an empty entry (0 hits).
        n_entry = n_top_frames.get(frame_name, {})
        header = "{:.2f}% ({:.2f}%): {} ({})".format(
            100.0 * e_entry["hits"] / e_total_hits,
            100.0 * n_entry.get("hits", 0) / n_total_hits,
            frame_name,
            e_entry["hits"])
        print(header)

        n_stacks = n_entry.get("stacks", [])
        for e_stack, e_stack_hits in e_entry["stacks"][:stack_count]:
            e_percent = 100.0 * e_stack_hits / e_total_hits
            n_percent = 100.0 * get_stack_hits(n_stacks, e_stack) / n_total_hits
            print("  - {:.4f}% ({:.4f}%):".format(e_percent, n_percent))
            print("    {}\n".format("\n    ".join(e_stack)))

#### Top frames in e10s hangs

The results are in the form: `e10s% (non-e10s%): top frame (total e10s hits)`

In [18]:
# Top 25 e10s frames; stack_count=0 suppresses the per-stack breakdown.
print_top_frames(25, 0)

43.95% (41.52%): Timer::Fire (19843427)
32.70% (17.38%): Startup::XRE_Main (14762880)
7.11% (0.00%): IPDL::PCookieService::RecvGetCookieString (3209248)
2.00% (3.04%): nsCycleCollector::forgetSkippable (902254)
1.84% (0.00%): IPDL::PScreenManager::RecvScreenForBrowser (830302)
1.01% (1.66%): nsViewManager::DispatchEvent (457349)
0.94% (0.20%): nsHttpChannel::OnDataAvailable (424111)
0.93% (2.15%): ViewportFrame::BuildDisplayList (419515)
0.77% (0.26%): nsLayoutUtils::GetFramesForArea (348625)
0.59% (0.00%): IPDL::PJavaScript::SendGet (267344)
0.51% (4.48%): js::GCRuntime::collect (230967)
0.45% (0.59%): self-hosted:647 (203637)
0.40% (0.09%): nsHttpChannel::OnStartRequest (181215)
0.31% (0.00%): IPDL::PBrowser::RecvAsyncMessage (141679)
0.28% (1.70%): PresShell::DoReflow (125358)
0.27% (0.00%): IPDL::PNecko::RecvPHttpChannelConstructor (122972)
0.23% (0.03%): nsInputStreamPump::OnStateTransfer (103913)
0.22% (0.00%): IPDL::PCookieService::RecvSetCookieString (97497)
0.17% (0.00%): IPDL

#### Top stacks of each frame in e10s hangs

In [19]:
# Top 25 e10s frames, each followed by up to 3 of its highest-hit stacks.
print_top_frames(25, 3)

43.95% (41.52%): Timer::Fire (19843427)
  - 43.8862% (41.4334%):
    Startup::XRE_Main
    Timer::Fire

  - 0.0189% (0.0000%):
    Startup::XRE_Main
    nsViewManager::DispatchEvent
    Timer::Fire

  - 0.0122% (0.0024%):
    Startup::XRE_Main
    EventDispatcher::Dispatch
    browser/content/browser.xul:1
    nsBrowserGlue.js:265
    browser/content/tabbrowser.xml:1987
    gre/components/nsPrompter.js:78
    Timer::Fire

32.70% (17.38%): Startup::XRE_Main (14762880)
  - 32.6988% (17.3843%):
    Startup::XRE_Main

7.11% (0.00%): IPDL::PCookieService::RecvGetCookieString (3209248)
  - 7.0972% (0.0002%):
    Startup::XRE_Main
    IPDL::PCookieService::RecvGetCookieString

  - 0.0079% (0.0000%):
    Startup::XRE_Main
    nsViewManager::DispatchEvent
    IPDL::PCookieService::RecvGetCookieString

  - 0.0007% (0.0000%):
    Startup::XRE_Main
    gre/modules/services-sync/service.js:1263
    services-sync/util.js:76
    gre/modules/services-sync/service.js:1270
    services-sync/util.js:76
 