In [None]:
## Housekeeping: cancel all jobs on the Spark context before starting over.
## NOTE(review): `sc` is never defined in this notebook; presumably it is the
## SparkContext injected by the kernel -- confirm.
sc.cancelAllJobs()

In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
import plotly.plotly as py
import IPython
import pyspark.sql.functions as fun
from pyspark.sql import Row
from datetime import date
import feather

from __future__ import division
from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties
from montecarlino import grouped_permutation_test

## Enable inline plotting. This also populates the interactive namespace with
## numpy and matplotlib names (see the message printed below).
%pylab inline
## Set the figure size used for inline plots.
IPython.core.pylabtools.figsize(16, 7)

Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/histogram-whitelists.json). Assuming all histograms are acceptable.
Populating the interactive namespace from numpy and matplotlib


In [2]:
## Show the default parallelism of the Spark context.
sc.defaultParallelism

320

In [3]:
## Show the Spark version in use.
sc.version

u'2.0.0'

### Load nightly data to test

In [4]:
## Load Firefox nightly pings for the "before" period, selected by build ID range.
dataset_before = get_pings(sc, app="Firefox", channel="nightly", build_id=("20161009000000", "20161016000000"))
## Count the loaded sessions, timing the Spark job.
%time print("Num sessions before: {:,}\n".format(dataset_before.count()))

Num sessions before: 483,495

CPU times: user 44 ms, sys: 32 ms, total: 76 ms
Wall time: 1min 14s


In [6]:
## Load Firefox nightly pings for the "after" period, selected by build ID range.
dataset_after = get_pings(sc, app="Firefox", channel="nightly", build_id=("20161024000000", "20161028000000"))
## Count the loaded sessions, timing the Spark job.
%time print("Num sessions after: {:,}\n".format(dataset_after.count()))

Num sessions after: 214,077

CPU times: user 56 ms, sys: 8 ms, total: 64 ms
Wall time: 21.6 s


Combine data and restrict to data of interest.

In [7]:
def extract_data(ping):
    """Reduce a raw ping to the fields needed for this analysis.

    The input ping is left unmodified: the "system" section is copied before
    the irrelevant subfields are dropped.

    Args:
        ping: dict-like ping with (optionally) "clientId", "application",
            "environment" and "payload" entries.

    Returns:
        A dict with keys "clientId", "buildId", "period", "architecture",
        "e10sEnabled", "system", "addons", "histograms", "keyedHistograms",
        "simpleMeasurements", "hangs", "processes" and "childPayloads",
        or None if the ping has no environment or no payload.
        "period" is "before"/"after" based on the build ID, or None when the
        build ID is missing.
    """
    build_id = (ping.get("application") or {}).get("buildId")
    data = {
        "clientId": ping.get("clientId"),
        "buildId": build_id,
    }
    ## Build IDs are compared as strings against the last "before" build.
    ## A missing build ID gets no period instead of relying on how None
    ## compares with a string (silently "before" on Python 2, an error on 3).
    if build_id is None:
        data["period"] = None
    else:
        data["period"] = "before" if build_id <= "20161016000000" else "after"

    env = ping.get("environment")
    if not env:
        return None
    ## Sections may be present but null, hence "or {}" rather than a default.
    data["architecture"] = (env.get("build") or {}).get("architecture")
    data["e10sEnabled"] = (env.get("settings") or {}).get("e10sEnabled")

    ## Some system data won't be relevant here. Build a filtered copy rather
    ## than deleting keys from the ping's own dict.
    system_info = env.get("system") or {}
    data["system"] = dict((fld, val) for fld, val in system_info.items()
                          if fld not in ("hdd", "gfx"))

    ## Only need IDs of installed add-ons, split according to whether or not
    ## they are system add-ons.
    addons = (env.get("addons") or {}).get("activeAddons") or {}
    sys_addons = []
    nonsys_addons = []
    ## items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    for guid, meta in addons.items():
        listing = (guid, meta.get("version"))
        if meta.get("isSystem"):
            sys_addons.append(listing)
        else:
            nonsys_addons.append(listing)
    data["addons"] = {"system": sys_addons, "non_system": nonsys_addons}

    payload = ping.get("payload")
    if not payload:
        return None
    data["histograms"] = payload.get("histograms", {})
    data["keyedHistograms"] = payload.get("keyedHistograms", {})
    data["simpleMeasurements"] = payload.get("simpleMeasurements", {})
    data["hangs"] = payload.get("threadHangStats", [])
    data["processes"] = payload.get("processes", {})
    data["childPayloads"] = payload.get("childPayloads", [])

    return data

def good_payload(data):
    """Tell whether an extracted record is usable.

    A record is usable when it is not None and its client ID, build ID and
    e10s setting are all present (not None).
    """
    if data is None:
        return False
    required = ("clientId", "buildId", "e10sEnabled")
    return all(data[field] is not None for field in required)

In [8]:
## Pool the sessions from both periods, reduce each ping to the fields of
## interest, and drop records that fail the good_payload check.
full_data = dataset_before.union(dataset_after)
dataset = full_data.map(extract_data).filter(good_payload)
## Key by clientID.
## NOTE(review): StorageLevel is not imported in this notebook's import cell;
## presumably it is supplied by the PySpark kernel namespace -- confirm.
dataset = dataset.map(lambda d: (d["clientId"], d)).persist(StorageLevel.MEMORY_AND_DISK_SER)

In [10]:
## Count the sessions kept after extraction and filtering, timing the job.
%time print("Overall num sessions: {:,}\n".format(dataset.count()))

Overall num sessions: 697,572

CPU times: user 64 ms, sys: 56 ms, total: 120 ms
Wall time: 58.1 s


In [13]:
## Save this dataset to S3.
s3_path = "s3://mozilla-metrics/user/dzeber/tmp/addon-sdk-fix/nightly_{}/".format(date.today().isoformat())
dataset.saveAsPickleFile(s3_path)

### Create a longitudinal dataset

We plan to compare metrics between builds before and after the changes, within each profile. Group sessions by profile, and make sure that add-ons and the e10s setting stay constant over the sessions we observe. Also, retain only profiles which have both before and after measurements.

In [14]:
def session_vals_for_check(session):
    """Pick out the per-session values used by the profile consistency checks.

    Returns a dict with the session's e10s setting ("e10s"), its non-system
    add-on list ("addons") and its period ("period").
    """
    vals = dict(e10s=session["e10sEnabled"])
    vals["addons"] = session["addons"]["non_system"]
    vals["period"] = session["period"]
    return vals

def constant_e10s_setting(session_vals):
    """Return True when exactly one distinct e10s setting occurs across the sessions."""
    distinct_settings = set(vals["e10s"] for vals in session_vals)
    return len(distinct_settings) == 1

def constant_active_addons(session_vals):
    """Return True when every session lists the same add-ons.

    The check has two parts: all add-on lists must have the same length, and
    pooling their entries must not yield more distinct entries than the
    first list holds. With no sessions at all the result is False.
    """
    addon_lists = [vals["addons"] for vals in session_vals]
    ## All add-on lists must have the same length...
    distinct_sizes = set(len(lst) for lst in addon_lists)
    if len(distinct_sizes) != 1:
        return False
    ## ...and they must contain the same add-ons.
    pooled = set()
    for lst in addon_lists:
        pooled.update(lst)
    return len(pooled) == len(addon_lists[0])

def both_periods(session_vals):
    """Return True when the sessions cover exactly two distinct periods."""
    periods_seen = set(vals["period"] for vals in session_vals)
    return len(periods_seen) == 2

In [15]:
## For each client ID, gather the per-session values needed for the
## consistency checks, then count the distinct profiles.
prof_data = dataset.mapValues(session_vals_for_check).groupByKey()
n_prof = prof_data.count()
print("Num profiles represented in the dataset: {:,}".format(n_prof))

Num profiles represented in the dataset: 52,618


#### Edit:
Don't do the filtering here, as this cuts out a lot of profiles. Leave this to be handled during the analysis.

In [16]:
#prof_data_1 = prof_data.filter(lambda (cid, vals): constant_e10s_setting(vals))
#n_prof_1 = prof_data_1.count()
#n_prof_dropped = n_prof - n_prof_1
#print("Num profiles with changing e10s: {:,} ({:.2f}%)"\
#          .format(n_prof_dropped, n_prof_dropped / n_prof * 100))

Num profiles with changing e10s: 4,273 (8.12%)


In [17]:
#prof_data_2 = prof_data_1.filter(lambda (cid, vals): constant_active_addons(vals))
#n_prof_2 = prof_data_2.count()
#n_prof_dropped = n_prof_1 - n_prof_2
#print("Num remaining profiles with changing (non-system) add-ons: {:,} ({:.2f}%)"\
#          .format(n_prof_dropped, n_prof_dropped / n_prof * 100))

Num remaining profiles with changing (non-system) add-ons: 13,504 (25.66%)


In [None]:
#prof_data_3 = prof_data_2.filter(lambda (cid, vals): both_periods(vals))
#n_prof_3 = prof_data_3.count()
#n_prof_dropped = n_prof_2 - n_prof_3
#print("Num remaining profiles without both periods: {:,} ({:.2f}%)"\
#          .format(n_prof_dropped, n_prof_dropped / n_prof * 100))

In [None]:
#print("Num profiles remaining after filtering: {:,} ({:.2f}%)"\
#          .format(n_prof_3, n_prof_3 / n_prof * 100))

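Restrict the overall dataset to include only these profiles. (Since the filtering above is disabled, this restriction is currently skipped and the full dataset is used.)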

In [32]:
#good_clients = prof_data_3.map(lambda (cid, d): cid).distinct().collect()
#dataset_longit = dataset.filter(lambda (cid, d): cid in good_clients)\
#    .persist(StorageLevel.MEMORY_AND_DISK_SER)
## The profile filtering is disabled (commented out above), so the
## longitudinal dataset is simply the full keyed dataset.
dataset_longit = dataset

In [None]:
#print("Num sessions remaining: {:,}".format(dataset_longit.count()))

In [None]:
## Sanity check
#print("Num unique profiles: {:,}".format(dataset_longit.map(lambda (cid, d): cid).distinct().count()))

Create a DataFrame to work with.

In [30]:
def longit_row(data):
    """Flatten one extracted session record into a Row for the DataFrame.

    Args:
        data: dict as produced by extract_data (must contain
            "simpleMeasurements", "system", "addons", "architecture",
            "clientId", "buildId", "period" and "e10sEnabled").

    Returns:
        A Row with identifying fields, a stringified non-system add-on list,
        the number of system add-ons, system covariates and startup timings.
        Covariates and timings that are absent come through as None, except
        "startup_interrupted", which defaults to 0.
    """
    ## Sections may be present but null; treat them like empty sections.
    sm = data["simpleMeasurements"] or {}
    system_info = data["system"] or {}
    cpu_info = system_info.get("cpu") or {}
    os_info = system_info.get("os") or {}

    def addon_label(addon):
        ## An add-on may lack a version (None); render missing parts as empty
        ## strings so that the join cannot fail on a non-string.
        return ":".join("" if part is None else part for part in addon)

    return Row(
        client_id = data["clientId"],
        build_id = data["buildId"],
        period = data["period"],
        e10s = data["e10sEnabled"],
        ## Stringify the list of add-ons for serialization to feather.
        addons_nonsys = ",".join([addon_label(a) for a in data["addons"]["non_system"]]),
        addons_sys_num = len(data["addons"]["system"]),

        ## Some system covariates
        sys_arch = data["architecture"],
        sys_mem = system_info.get("memoryMB"),
        sys_cpu_count = cpu_info.get("count"),
        sys_os = os_info.get("name"),
        sys_os_version = os_info.get("version"),

        ## Startup times and info
        startup_interrupted = sm.get("startupInterrupted", 0),
        startup_main = sm.get("main"),
        startup_AMIstart = sm.get("AMI_startup_begin"),
        startup_AMIend = sm.get("AMI_startup_end"),
        startup_toplevelwindow = sm.get("createTopLevelWindow"),
        startup_firstpaint = sm.get("firstPaint"),
        startup_sessionrestored = sm.get("sessionRestored")
    )

In [33]:
## Drop the client ID key and convert each session record to a Row.
## RDD.values() replaces the former tuple-unpacking lambda, which is
## Python 2-only syntax; the resulting RDD of Rows is the same.
dataset_df = dataset_longit.values().map(longit_row)
## Build a Spark DataFrame from the Rows and show the resulting schema.
DF = spark.createDataFrame(dataset_df)
DF.printSchema()

root
 |-- addons_nonsys: string (nullable = true)
 |-- addons_sys_num: long (nullable = true)
 |-- build_id: string (nullable = true)
 |-- client_id: string (nullable = true)
 |-- e10s: boolean (nullable = true)
 |-- period: string (nullable = true)
 |-- startup_AMIend: long (nullable = true)
 |-- startup_AMIstart: long (nullable = true)
 |-- startup_firstpaint: long (nullable = true)
 |-- startup_interrupted: long (nullable = true)
 |-- startup_main: long (nullable = true)
 |-- startup_sessionrestored: long (nullable = true)
 |-- startup_toplevelwindow: long (nullable = true)
 |-- sys_arch: string (nullable = true)
 |-- sys_cpu_count: long (nullable = true)
 |-- sys_mem: long (nullable = true)
 |-- sys_os: string (nullable = true)
 |-- sys_os_version: string (nullable = true)



In [35]:
## Convert the Spark DataFrame to a pandas DataFrame.
DFp = DF.toPandas()

In [37]:
## Write the pandas DataFrame to a feather file whose name embeds today's date.
feather.write_dataframe(DFp, "addon-sdk-startup-data_{}.feather".format(date.today().isoformat()))