In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
import plotly.plotly as py
import IPython
import pyspark.sql.functions as fun
from pyspark.sql import Row
from datetime import date
import feather
from collections import defaultdict
import os

from __future__ import division
from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties
from montecarlino import grouped_permutation_test

## `%pylab inline` populates the interactive namespace from numpy and matplotlib
## (see the message printed below this cell).
%pylab inline
## Set the figure size used for inline plots.
IPython.core.pylabtools.figsize(16, 7)

Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
Populating the interactive namespace from numpy and matplotlib


In [2]:
## Display the default parallelism of the Spark context `sc`
## (presumably provided by the PySpark kernel -- it is not created in this notebook).
sc.defaultParallelism

320

In [3]:
## Display the Spark version this notebook was run against.
sc.version

u'2.0.0'

### Load nightly data to test

In [4]:
## Fetch Firefox nightly pings whose build ID falls in the range passed as `build_id`;
## these are the sessions treated as the "before" period.
dataset_before = get_pings(sc, app="Firefox", channel="nightly", build_id=("20161009000000", "20161016000000"))
## count() triggers the actual computation; %time reports how long it takes.
%time print("Num sessions before: {:,}\n".format(dataset_before.count()))

Num sessions before: 491,859

CPU times: user 64 ms, sys: 16 ms, total: 80 ms
Wall time: 1min 29s


In [6]:
## Fetch Firefox nightly pings whose build ID falls in the range passed as `build_id`;
## these are the sessions treated as the "after" period.
dataset_after = get_pings(sc, app="Firefox", channel="nightly", build_id=("20161024000000", "20161101000000"))
## count() triggers the actual computation; %time reports how long it takes.
%time print("Num sessions after: {:,}\n".format(dataset_after.count()))

Num sessions after: 446,832

CPU times: user 44 ms, sys: 28 ms, total: 72 ms
Wall time: 58.4 s


Combine data and restrict to data of interest.

In [7]:
def extract_data(ping):
    """Reduce a raw telemetry ping to the fields used in this analysis.

    The input ping is not modified.

    Args:
        ping: dict-like ping with (optional) "clientId", "application",
            "environment" and "payload" entries.

    Returns:
        A dict with keys "clientId", "buildId", "period", "architecture",
        "e10sEnabled", "system", "addons", "histograms", "keyedHistograms",
        "simpleMeasurements", "hangs", "processes" and "childPayloads",
        or None if the ping has no environment or no payload.

        "period" is "before" for build IDs up to and including
        "20161016000000", "after" for later ones, and None when the ping
        carries no build ID (such pings are rejected by good_payload()).
    """
    build_id = (ping.get("application") or {}).get("buildId")
    data = {
        "clientId": ping.get("clientId"),
        "buildId": build_id,
    }
    ## Only label the period when there is a build ID to compare: ordering
    ## None against a string silently yields "before" on Python 2 and raises
    ## on Python 3.
    if build_id is None:
        data["period"] = None
    else:
        data["period"] = "before" if build_id <= "20161016000000" else "after"

    env = ping.get("environment")
    if not env:
        return None
    data["architecture"] = (env.get("build") or {}).get("architecture")
    data["e10sEnabled"] = (env.get("settings") or {}).get("e10sEnabled")

    ## Some system data won't be relevant here. Build a filtered copy rather
    ## than deleting keys from the ping's own dict.
    system = env.get("system") or {}
    data["system"] = {k: v for k, v in system.items() if k not in ("hdd", "gfx")}

    ## Only need IDs (and versions) of installed add-ons, split according to
    ## whether or not they are system add-ons.
    addons = (env.get("addons") or {}).get("activeAddons") or {}
    sys_addons = []
    nonsys_addons = []
    for guid, meta in addons.items():
        listing = (guid, meta.get("version"))
        if meta.get("isSystem"):
            sys_addons.append(listing)
        else:
            nonsys_addons.append(listing)
    data["addons"] = {"system": sys_addons, "non_system": nonsys_addons}

    payload = ping.get("payload")
    if not payload:
        return None
    data["histograms"] = payload.get("histograms", {})
    data["keyedHistograms"] = payload.get("keyedHistograms", {})
    data["simpleMeasurements"] = payload.get("simpleMeasurements", {})
    data["hangs"] = payload.get("threadHangStats", [])
    data["processes"] = payload.get("processes", {})
    data["childPayloads"] = payload.get("childPayloads", [])

    return data

def good_payload(data):
    """Return True if `data` is an extracted record with a client ID, a
    build ID and an e10s setting, all non-None; False otherwise."""
    if data is None:
        return False
    required_fields = ("clientId", "buildId", "e10sEnabled")
    return all(data[field] is not None for field in required_fields)

In [8]:
## Pool the sessions from both periods, reduce each ping with extract_data()
## and drop the records rejected by good_payload().
full_data = dataset_before.union(dataset_after)
dataset = full_data.map(extract_data).filter(good_payload)
## Key by clientID.
## NOTE(review): StorageLevel is not imported in the first cell; presumably it is
## provided by the PySpark kernel -- confirm.
dataset = dataset.map(lambda d: (d["clientId"], d)).persist(StorageLevel.MEMORY_AND_DISK_SER)

In [9]:
## Count the sessions that survived extraction and filtering; %time reports the cost.
%time print("Overall num sessions: {:,}\n".format(dataset.count()))

Overall num sessions: 938,691

CPU times: user 116 ms, sys: 64 ms, total: 180 ms
Wall time: 7min 28s


In [4]:
## S3 destination for saving this dataset, stamped with today's date.
## This cell only builds the path; nothing is written here.
s3_path = "s3://mozilla-metrics/user/dzeber/tmp/addon-sdk-fix/nightly_{}/".format(date.today().isoformat())

In [None]:
#dataset.saveAsPickleFile(s3_path)

In [4]:
#s3_path = "s3://mozilla-metrics/user/dzeber/tmp/addon-sdk-fix/nightly_2016-11-01/"
#dataset = sc.pickleFile(s3_path)#.persist(StorageLevel.MEMORY_AND_DISK_SER)

### Create a dataset to analyze

#### Longitudinal properties

Ideally, we would compare metrics between builds before and after the changes, within each profile. For this we need profiles to have sessions both before and after the change, with other factors (add-ons and e10s setting) staying constant over the sessions we observe.

How many profiles have these properties?

In [11]:
def session_vals_for_check(session):
    """Pick out the per-session values needed for the longitudinal checks:
    the e10s setting, the non-system add-on list and the period label."""
    check_vals = {}
    check_vals["e10s"] = session["e10sEnabled"]
    check_vals["addons"] = session["addons"]["non_system"]
    check_vals["period"] = session["period"]
    return check_vals

def constant_e10s_setting(session_vals):
    """Return True if exactly one distinct e10s value occurs across the
    given session values."""
    distinct_settings = {vals["e10s"] for vals in session_vals}
    return len(distinct_settings) == 1

def constant_active_addons(session_vals):
    """Return True if every session lists the same add-ons.

    All add-on lists must have one common length, and the union of all
    listed add-ons must have that same size.
    """
    addon_lists = [vals["addons"] for vals in session_vals]
    ## The lists must share a single length...
    list_sizes = {len(addon_list) for addon_list in addon_lists}
    if len(list_sizes) != 1:
        return False
    ## ...and pooling them must not introduce any extra add-on.
    pooled = set()
    for addon_list in addon_lists:
        pooled.update(addon_list)
    return len(pooled) == len(addon_lists[0])

def both_periods(session_vals):
    """Return True if exactly two distinct period labels occur across the
    given session values."""
    distinct_periods = {vals["period"] for vals in session_vals}
    return len(distinct_periods) == 2

In [12]:
## Group the per-session check values by client ID: one entry per profile.
## NOTE(review): prof_data is not persisted, so each count() in the following cells
## presumably recomputes the grouping -- confirm whether caching is worthwhile.
prof_data = dataset.mapValues(session_vals_for_check).groupByKey()
n_prof = prof_data.count()
print("Num unique profiles represented in the dataset: {:,}".format(n_prof))

Num unique profiles represented in the dataset: 58,866


In [13]:
## Keep the profiles whose e10s setting is the same in all of their sessions,
## and report how many profiles of the full set fail this check.
prof_data_1 = prof_data.filter(lambda (cid, vals): constant_e10s_setting(vals))
n_prof_1 = prof_data_1.count()
n_prof_dropped = n_prof - n_prof_1
print("Num profiles with changing e10s: {:,} ({:.2f}%)"\
          .format(n_prof_dropped, n_prof_dropped / n_prof * 100))

Num profiles with changing e10s: 4,997 (8.49%)


In [14]:
## Keep the profiles whose non-system add-ons are the same in all of their sessions.
## The filter is applied to the full prof_data (not to prof_data_1), so this count
## is independent of the e10s check.
prof_data_2 = prof_data.filter(lambda (cid, vals): constant_active_addons(vals))
n_prof_2 = prof_data_2.count()
n_prof_dropped = n_prof - n_prof_2
print("Num profiles with changing (non-system) add-ons or add-on version: {:,} ({:.2f}%)"\
          .format(n_prof_dropped, n_prof_dropped / n_prof * 100))

Num profiles with changing (non-system) add-ons or add-on version: 19,183 (32.59%)


In [15]:
## Keep the profiles that have sessions in both periods.
## The filter is applied to the full prof_data (not to prof_data_1 or prof_data_2),
## so this count is independent of the e10s and add-on checks.
prof_data_3 = prof_data.filter(lambda (cid, vals): both_periods(vals))
n_prof_3 = prof_data_3.count()
n_prof_dropped = n_prof - n_prof_3
print("Num profiles without both periods: {:,} ({:.2f}%)"\
          .format(n_prof_dropped, n_prof_dropped / n_prof * 100))

Num profiles without both periods: 32,799 (55.72%)


In [None]:
#print("Num profiles remaining after filtering: {:,} ({:.2f}%)"\
#          .format(n_prof_3, n_prof_3 / n_prof * 100))

For now, keep all sessions.

In [27]:
#good_clients = prof_data_3.map(lambda (cid, d): cid).distinct().collect()
#dataset_longit = dataset.filter(lambda (cid, d): cid in good_clients)\
#    .persist(StorageLevel.MEMORY_AND_DISK_SER)
#dataset_longit = dataset

In [None]:
#print("Num sessions remaining: {:,}".format(dataset_longit.count()))

In [None]:
## Sanity check
#print("Num unique profiles: {:,}".format(dataset_longit.map(lambda (cid, d): cid).distinct().count()))

First create an RDD with all the measurements we will be working with.

In [5]:
def get_hist_values(hist):
    """Return the entries of the histogram's "values" mapping whose value is
    greater than zero (an empty dict if there is no "values" entry)."""
    nonzero = {}
    for bucket, count in hist.get("values", {}).iteritems():
        ## Keep only non-zero histogram values.
        if count > 0:
            nonzero[bucket] = count
    return nonzero

def addon_sdk_hang(hang):
    """Return True if any line of the hang's "stack" mentions the add-on SDK
    runner script; False otherwise, including when there is no stack."""
    ## Check the stack info for add-on sdk code.
    stack_lines = hang.get("stack", [])
    num_sdk_lines = sum(1 for line in stack_lines if "sdk/addon/runner.js" in line)
    return num_sdk_lines > 0

def get_addon_hang_data(data):
    """Collect the add-on SDK hangs recorded for the "Gecko" thread.

    Args:
        data: extracted session record; data["hangs"] is the list of
            per-thread hang stats (may be empty or None).

    Returns:
        A list with one dict per hang accepted by addon_sdk_hang(), each
        holding the hang's "stack" and the non-zero "values" of its
        histogram. The list is empty when there is no "Gecko" thread entry
        or no matching hang, so callers can always iterate the result or
        take its len().
    """
    ## Only need hangs from the main thread. A list comprehension (rather than
    ## filter()) guarantees a real list, so the emptiness check and the [0]
    ## lookup below behave the same on every Python version.
    gecko_threads = [t for t in (data["hangs"] or []) if t.get("name") == "Gecko"]
    if not gecko_threads:
        return []
    thread_hangs = gecko_threads[0].get("hangs") or []
    return [ {"stack": h.get("stack"), "values": get_hist_values(h.get("histogram", {}))}
               for h in thread_hangs if addon_sdk_hang(h) ]


def get_hist_data(data, hist_name):
    """Return the non-zero values of the named histogram in
    data["histograms"] (an empty histogram is used if the name is absent)."""
    ## Keep only non-zero histogram values.
    named_hist = data["histograms"].get(hist_name, {})
    return get_hist_values(named_hist)

def longit_row(data):
    """Flatten an extracted session record into one row of measurements.

    The row holds identifiers, add-on info, system covariates, startup and
    shutdown timings from simpleMeasurements (missing times recorded as -1),
    two histograms and the add-on SDK thread hangs.
    """
    measurements = data["simpleMeasurements"]
    system = data["system"]
    os_info = system.get("os", {})
    addons = data["addons"]

    row = {
        "client_id": data["clientId"],
        "build_id": data["buildId"],
        "period": data["period"],
        "e10s": data["e10sEnabled"],
        "addon_nonsys": addons["non_system"],
        ## Keep only the count of system add-ons
        "addons_sys_num": len(addons["system"]),
    }

    ## Some system covariates
    row["sys_arch"] = data["architecture"]
    row["sys_mem"] = system.get("memoryMB")
    row["sys_cpu_count"] = system.get("cpu", {}).get("count")
    row["sys_os"] = os_info.get("name")
    row["sys_os_version"] = os_info.get("version")

    ## Startup info, plus startup and shutdown times (missing times recorded as -1)
    row["was_startup_interrupted"] = bool(measurements.get("startupInterrupted", 0))
    timing_fields = (
        ("startup_main", "main"),
        ("startup_AMIstart", "AMI_startup_begin"),
        ("startup_XPIstart", "XPI_bootstrap_addons_begin"),
        ("startup_AMIend", "AMI_startup_end"),
        ("startup_toplevelwindow", "createTopLevelWindow"),
        ("startup_firstpaint", "firstPaint"),
        ("startup_sessionrestored", "sessionRestored"),
        ("shutdown", "shutdownDuration"),
    )
    for column, measurement in timing_fields:
        row[column] = measurements.get(measurement, -1)

    ## Some histograms
    row["hist_compartments"] = get_hist_data(data, "MEMORY_JS_COMPARTMENTS_SYSTEM")
    row["hist_gc"] = get_hist_data(data, "GC_MS")

    ## Thread hangs.
    row["hangs"] = get_addon_hang_data(data)
    return row

Shorten client IDs for convenience.

In [6]:
## Pair each distinct client ID with an integer index from zipWithIndex().
client_ids = dataset.map(lambda (cid, d): cid).distinct().zipWithIndex()

In [7]:
def replace_client_id(d_with_new_cid):
    """Given a (row, new client ID) pair, overwrite the row's "client_id"
    in place and return the same row."""
    (row, short_id) = d_with_new_cid
    row.update(client_id=short_id)
    return row

## Build one measurement row per session, join on the original client ID to look up
## its integer index, substitute that index into the row, and drop the key.
dataset_rows = dataset.mapValues(longit_row)\
    .leftOuterJoin(client_ids)\
    .mapValues(replace_client_id)\
    .map(lambda (cid, d): d)

In [8]:
## Add a session ID.
def add_sess_id(d_with_i):
    """Given a (row, index) pair, store the index in the row as "session_id"
    (in place) and return the same row."""
    (row, index) = d_with_i
    row.update(session_id=index)
    return row
## Use each row's zipWithIndex() position as its session ID.
dataset_rows = dataset_rows.zipWithIndex().map(add_sess_id)

In [9]:
## Need to cache at this point to ensure stability of indices added with zipWithIndex().
## Otherwise, they keep getting recomputed.
dataset_rows.cache()
## count() forces the RDD to be evaluated; the result should match the overall
## number of sessions reported earlier.
dataset_rows.count()

938691

In [10]:
## Inspect a few rows to check the structure.
dataset_rows.take(5)

[{'addon_nonsys': [(u'wrc@avast.com', u'10.3.3.44'),
   (u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', u'2.7.3')],
  'addons_sys_num': 5,
  'build_id': u'20161010030204',
  'client_id': 38,
  'e10s': True,
  'hangs': [{'stack': [u'Startup::XRE_Main',
     u'gre/modules/Promise-backend.js:750',
     u'self-hosted:903',
     u'gre/modules/commonjs/sdk/addon/runner.js:87',
     u'(chrome script)'],
    'values': {u'1023': 1}}],
  'hist_compartments': {u'492': 15},
  'hist_gc': {u'114': 2,
   u'135': 4,
   u'160': 12,
   u'190': 4,
   u'226': 4,
   u'268': 1,
   u'318': 1,
   u'68': 5,
   u'81': 8,
   u'96': 5},
  'period': 'before',
  'session_id': 0,
  'shutdown': 914,
  'startup_AMIend': 2182,
  'startup_AMIstart': 1933,
  'startup_XPIstart': 1982,
  'startup_firstpaint': 3839,
  'startup_main': 1823,
  'startup_sessionrestored': 4960,
  'startup_toplevelwindow': 2254,
  'sys_arch': u'x86-64',
  'sys_cpu_count': 8,
  'sys_mem': 8140,
  'sys_os': u'Windows_NT',
  'sys_os_version': u'6.1',
 

Split the complex (nested) fields out into their own DFs for easier handling and summarization.

#### Add-ons

In [12]:
def addon_rows(r):
    """Expand a session row into one Row per non-system add-on, carrying the
    client ID, session ID, add-on GUID and add-on version."""
    rows = []
    for guid, ver in r["addon_nonsys"]:
        rows.append(Row(client_id = r["client_id"],
                        session_id = r["session_id"],
                        guid = guid,
                        version = ver))
    return rows
    
## One record per (session, non-system add-on); sessions without non-system add-ons
## contribute no records.
rdd_addons = dataset_rows.flatMap(addon_rows)
## NOTE(review): `spark` is not created in this notebook; presumably the SparkSession
## provided by the PySpark kernel -- confirm.
DF_addons = spark.createDataFrame(rdd_addons)
DF_addons.printSchema()

root
 |-- client_id: long (nullable = true)
 |-- guid: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- version: string (nullable = true)



In [13]:
## Total number of (session, add-on) records.
DF_addons.count()

2516163

In [26]:
## Double-check that the session IDs correspond.
## s1: session IDs present in the add-ons DF.
s1 = DF_addons.select("session_id").distinct().rdd.map(lambda r: r.session_id)
## s2: session IDs of the rows that have a non-empty non-system add-on list.
s2 = dataset_rows.filter(lambda r: r["addon_nonsys"]).map(lambda r: r["session_id"]).distinct()
## If the union is no larger than either set and both sets have the same size,
## the two sets of IDs are identical.
ss = s1.union(s2).distinct()
ss.count() == s1.count() and s1.count() == s2.count()

True

#### Hangs

In [27]:
## Look at the hang data of a few sessions that have any.
dataset_rows.map(lambda r: r["hangs"]).filter(lambda r: r).take(3)

[[{'stack': [u'Startup::XRE_Main',
    u'gre/modules/Promise-backend.js:750',
    u'self-hosted:903',
    u'gre/modules/commonjs/sdk/addon/runner.js:87',
    u'(chrome script)'],
   'values': {u'1023': 1}}],
 [{'stack': [u'Startup::XRE_Main',
    u'gre/modules/Promise-backend.js:750',
    u'self-hosted:903',
    u'gre/modules/commonjs/sdk/addon/runner.js:87',
    u'gre/modules/commonjs/toolkit/loader.js:617',
    u'gre/modules/commonjs/sdk/l10n/html.js:1',
    u'gre/modules/commonjs/toolkit/loader.js:617',
    u'(chrome script)'],
   'values': {u'1023': 1}}],
 [{'stack': [u'Startup::XRE_Main',
    u'gre/modules/Promise-backend.js:750',
    u'self-hosted:903',
    u'self-hosted:946',
    u'self-hosted:1126',
    u'self-hosted:1227',
    u'gre/modules/Promise-backend.js:750',
    u'self-hosted:903',
    u'gre/modules/commonjs/sdk/addon/runner.js:87',
    u'gre/modules/commonjs/toolkit/loader.js:617',
    u'gre/modules/commonjs/sdk/l10n/prefs.js:1',
    u'gre/modules/commonjs/toolkit/load

What are the unique combinations of `runner.js` lines mentioned in the hang stack traces?

In [29]:
## For each hang stack: keep the lines mentioning runner.js, reduce each to the text
## after its last ":" (the line number), and join the sorted unique values with ",".
## Then list the distinct combinations.
## NOTE(review): assumes r["hangs"] is always a list (never None) -- verify against
## get_addon_hang_data().
sorted(dataset_rows.flatMap(lambda r: [h["stack"] for h in r["hangs"]])\
    .map(lambda s: filter(lambda sl: "runner.js" in sl, s))\
    .map(lambda s: ",".join(sorted(set(map(lambda sl: sl.split(":")[-1], s)))))\
    .distinct().collect())

[u'1', u'1,87', u'41,87', u'66', u'68', u'78', u'84', u'87']

For now, lump all `runner.js` hangs for the session together, and convert the hang stats to a DF.

In [30]:
def hang_rows(r):
    """Pool the hang histograms of a session and emit one Row per hang time.

    The counts of all hangs in r["hangs"] are summed per histogram key. Each
    resulting Row carries the client ID, session ID, the number of hang
    entries in the session, the hang time (key converted to int) and the
    summed count. Sessions without hangs produce no rows.
    """
    session_hangs = r["hangs"]
    if not session_hangs:
        return []

    pooled_counts = defaultdict(int)
    for hang in session_hangs:
        for bucket, count in hang["values"].iteritems():
            pooled_counts[bucket] += count

    rows = []
    for bucket, total in pooled_counts.iteritems():
        rows.append(Row(client_id = r["client_id"],
                        session_id = r["session_id"],
                        num_hang_stats = len(session_hangs),
                        hang_time = int(bucket),
                        count = total))
    return rows

## One record per (session, hang time); sessions without hangs contribute no records.
rdd_hangs = dataset_rows.flatMap(hang_rows)
DF_hangs = spark.createDataFrame(rdd_hangs)
DF_hangs.printSchema()

root
 |-- client_id: long (nullable = true)
 |-- count: long (nullable = true)
 |-- hang_time: long (nullable = true)
 |-- num_hang_stats: long (nullable = true)
 |-- session_id: long (nullable = true)



In [31]:
## Total number of (session, hang time) records.
DF_hangs.count()

464237

In [32]:
## Double-check that the session IDs correspond.
## s1: session IDs present in the hangs DF.
s1 = DF_hangs.select("session_id").distinct().rdd.map(lambda r: r.session_id)
## s2: session IDs of the rows that have non-empty hang data.
s2 = dataset_rows.filter(lambda r: r["hangs"]).map(lambda r: r["session_id"]).distinct()
## If the union is no larger than either set and both sets have the same size,
## the two sets of IDs are identical.
ss = s1.union(s2).distinct()
ss.count() == s1.count() and s1.count() == s2.count()

True

#### Histograms

TODO (maybe later): collect all histogram info in a separate DF.

#### Scalar data

Create a main DF for the remaining scalar measures.

In [33]:
## Take the field names from the first row and drop the non-scalar fields, which are
## handled in their own DFs or left out.
## Relies on keys() returning a list (Python 2) so that remove() is available.
scalar_flds = dataset_rows.first().keys()
for fld in ["addon_nonsys", "hangs", "hist_compartments", "hist_gc"]:
    scalar_flds.remove(fld)

def main_row(r):
    """Build the main-DF Row for a session.

    Copies the scalar fields listed in `scalar_flds` and adds two derived
    columns: "has_hangs" (whether the session has any hang data) and
    "num_addons_nonsys" (the number of non-system add-ons).
    """
    row_entries = {fld: r[fld] for fld in scalar_flds}
    ## bool() gives the same answer as len(...) > 0 for a list, and also copes
    ## with hang data that is None instead of raising TypeError.
    row_entries["has_hangs"] = bool(r["hangs"])
    row_entries["num_addons_nonsys"] = len(r["addon_nonsys"])
    return Row(**row_entries)

## One record per session with the scalar measures and the two derived columns.
rdd_main = dataset_rows.map(main_row)
DF_main = spark.createDataFrame(rdd_main)
DF_main.printSchema()

root
 |-- addons_sys_num: long (nullable = true)
 |-- build_id: string (nullable = true)
 |-- client_id: long (nullable = true)
 |-- e10s: boolean (nullable = true)
 |-- has_hangs: boolean (nullable = true)
 |-- num_addons_nonsys: long (nullable = true)
 |-- period: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- shutdown: long (nullable = true)
 |-- startup_AMIend: long (nullable = true)
 |-- startup_AMIstart: long (nullable = true)
 |-- startup_XPIstart: long (nullable = true)
 |-- startup_firstpaint: long (nullable = true)
 |-- startup_main: long (nullable = true)
 |-- startup_sessionrestored: long (nullable = true)
 |-- startup_toplevelwindow: long (nullable = true)
 |-- sys_arch: string (nullable = true)
 |-- sys_cpu_count: long (nullable = true)
 |-- sys_mem: long (nullable = true)
 |-- sys_os: string (nullable = true)
 |-- sys_os_version: string (nullable = true)
 |-- was_startup_interrupted: boolean (nullable = true)



In [34]:
## Should equal the number of sessions in dataset_rows.
DF_main.count()

938691

Add some client-level summary stats to the main DF.

- Was the e10s setting consistent across all client sessions?
- Does the client have sessions both before and after the change?

In [35]:
## Per client: exactly one distinct e10s value, and exactly two distinct periods.
client_stats = DF_main.groupBy("client_id").agg(
    (fun.countDistinct("e10s") == 1).alias("constant_e10s"),
    (fun.countDistinct("period") == 2).alias("both_periods")
)

## NOTE: DF_main is rebound to the joined frame, so re-running this cell without
## first re-running the cell that creates DF_main joins the stats a second time.
DF_main = DF_main.join(client_stats, "client_id")

- Were add-ons consistent across client sessions?

In [36]:
def consistent_addons(rows_for_client, include_versions=True):
    """Check whether all of a client's sessions list the same add-ons.

    Args:
        rows_for_client: iterable of add-on rows for a single client, each
            with `session_id`, `guid` and `version` attributes.
        include_versions: if True (default), an add-on is identified by
            (guid, version); if False, by guid only.

    Returns:
        True if every session represented in the rows has exactly the same
        set of add-ons (also True for empty input), False otherwise.

    Only sessions that appear in `rows_for_client` are compared.
    NOTE(review): sessions with no add-on rows at all are presumably absent
    from the input and so cannot be detected here -- confirm with the caller.
    """
    addon_info = (lambda r: (r.guid, r.version)) if include_versions else (lambda r: r.guid)
    ## Build the add-on set of each session, then require that all sessions
    ## share one and the same set. Comparing whole sets (rather than the size
    ## of the union against the size of one session) also catches sessions
    ## holding only a subset of the add-ons, and does not depend on row order.
    addons_by_session = defaultdict(set)
    for r in rows_for_client:
        addons_by_session[r.session_id].add(addon_info(r))
    distinct_addon_sets = set(frozenset(s) for s in addons_by_session.values())
    return len(distinct_addon_sets) <= 1

## Group the add-on records by client and compute the two consistency flags:
## with versions (constant_addons) and by GUID only (constant_addons_guid).
addon_stats = DF_addons.rdd.groupBy(lambda r: r.client_id)\
    .map(lambda (cid, gp): Row(client_id = cid,
                               constant_addons = consistent_addons(gp),
                               constant_addons_guid = consistent_addons(gp, False)))
addon_stats = spark.createDataFrame(addon_stats)

## Outer join: presumably clients with no records in DF_addons get nulls in the two
## new columns -- confirm.
DF_main = DF_main.join(addon_stats, "client_id", "outer")

In [37]:
## Double-check the count after joining: it should still equal the number of sessions.
print(DF_main.count())
DF_main.printSchema()

938691
root
 |-- client_id: long (nullable = true)
 |-- addons_sys_num: long (nullable = true)
 |-- build_id: string (nullable = true)
 |-- e10s: boolean (nullable = true)
 |-- has_hangs: boolean (nullable = true)
 |-- num_addons_nonsys: long (nullable = true)
 |-- period: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- shutdown: long (nullable = true)
 |-- startup_AMIend: long (nullable = true)
 |-- startup_AMIstart: long (nullable = true)
 |-- startup_XPIstart: long (nullable = true)
 |-- startup_firstpaint: long (nullable = true)
 |-- startup_main: long (nullable = true)
 |-- startup_sessionrestored: long (nullable = true)
 |-- startup_toplevelwindow: long (nullable = true)
 |-- sys_arch: string (nullable = true)
 |-- sys_cpu_count: long (nullable = true)
 |-- sys_mem: long (nullable = true)
 |-- sys_os: string (nullable = true)
 |-- sys_os_version: string (nullable = true)
 |-- was_startup_interrupted: boolean (nullable = true)
 |-- constant_e10s: boolean (nul

#### Write datasets to file

In [38]:
## Output directory for the exported datasets, stamped with today's date.
datasets_path = "addon-sdk-fix-data_{}".format(date.today().isoformat())
## os.mkdir() raises OSError if the directory already exists, which would break
## re-running this cell on the same day; only create the directory when needed.
if not os.path.isdir(datasets_path):
    os.mkdir(datasets_path)

In [39]:
## Convert each Spark DF to a pandas DataFrame and write it out as a feather file.
feather.write_dataframe(DF_main.toPandas(), datasets_path + "/main.feather")
feather.write_dataframe(DF_addons.toPandas(), datasets_path + "/addons.feather")
feather.write_dataframe(DF_hangs.toPandas(), datasets_path + "/hangs.feather")

In [40]:
## Bundle the feather files into a gzipped tarball via the shell.
## The value displayed below is the status returned by os.system().
os.system("tar cfz {}.tar.gz {}/*.feather".format(datasets_path, datasets_path))

0

In [82]:
#dataset_coll = dataset_df.map(lambda r: r.asDict()).collect()

In [86]:
#with open("addon-sdk-data.json", "w") as f:
 #   for row in dataset_coll:
  #      f.write(json.dumps(row) + "\n")        