### SHIELD Variations, Data Pull and Analysis

In [10]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history

from pprint import pprint as pp

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
sc.defaultParallelism

16

In [4]:
# helpers and utils

import datetime as DT
DAYS=7
today = DT.date.today()
week_ago = today - DT.timedelta(days=DAYS)
week_ago_fmt = week_ago.strftime("%Y%m%d")
today_fmt = today.strftime("%Y%m%d")

PINGNAME = 'x-shield-trials'


### Extract, Transform, Analyze

0.  [docs](https://github.com/mozilla/python_moztelemetry/blob/master/moztelemetry/spark.py)
1.  Get all heartbeat pings


In [5]:
kwargs = dict(
    doc_type="OTHER", 
    submission_date=(week_ago_fmt,today_fmt),
    app="Firefox"
)

pings = get_pings(sc, channel="release", **kwargs).union(
        get_pings(sc, channel="aurora",  **kwargs)).union(
        get_pings(sc, channel="beta",    **kwargs)).union(
        get_pings(sc, channel="nightly", **kwargs))

pings = pings.filter(lambda p: p["meta"]["docType"] == PINGNAME)


In [6]:
pings.count()
pings.first()


{u'application': {u'architecture': u'x86-64',
  u'buildId': u'20160210153822',
  u'channel': u'release',
  u'name': u'Firefox',
  u'platformVersion': u'44.0.2',
  u'vendor': u'Mozilla',
  u'version': u'44.0.2',
  u'xpcomAbi': u'x86_64-gcc3'},
 u'creationDate': u'2016-02-28T17:08:11.942Z',
 u'id': u'5bec0b39-2c20-f445-9071-7b6af4083f4d',
 'meta': {u'DNT': u'1',
  u'Host': u'incoming.telemetry.mozilla.org',
  'Hostname': u'ip-172-31-38-72',
  u'Size': 431.0,
  'Timestamp': 1456679350433673984L,
  'Type': u'telemetry',
  u'appBuildId': u'20160210153822',
  u'appName': u'Firefox',
  u'appUpdateChannel': u'release',
  u'appVendor': u'Mozilla',
  u'appVersion': u'44.0.2',
  u'creationTimestamp': 1.4566792919419999e+18,
  u'docType': u'x-shield-trials',
  u'documentId': u'5bec0b39-2c20-f445-9071-7b6af4083f4d',
  u'geoCity': u'Portland',
  u'geoCountry': u'US',
  u'normalizedChannel': u'release',
  u'sourceName': u'telemetry',
  u'sourceVersion': u'4',
  u'submissionDate': u'20160228'},
 u'pay

In [7]:
## here is the final report.
def daysSinceLaunch(jsnow, jslaunch):
    # this can be affected by clockSkew
    
    if (jsnow < jslaunch): return -1  # problem.
    return int(divmod(jsnow - jslaunch, 86400)[0])  # n days.
    
def getFields(ping):
    payload = ping['payload']
    return (
        payload['who'],
        payload['name'],
        payload['branch'],
        
        ## this is buggy / wrong
        daysSinceLaunch(ping['meta']['Timestamp']/(10e5),int(payload['firstrun']))
    )
    return 

def reducedFields (ping_tuple):
    return ping_tuple[1:]  # drop who            
              
data = pings.map(getFields).distinct()
print data.collect()

data.map(reducedFields).countByValue()

# make a much much nicer report here, for each experiment, for each branch...  N ever seen, % alive on day 3, total hours, etc.


[(u'/Users/mgrimes/Downloads', u'gregg experiment 1', u'a', 0), (u'C:\\Users\\rjweiss\\Downloads', u'gregg experiment 1', u'b', 280), (u'/Users/mgrimes/Downloads', u'gregg experiment 1', u'a', 1983), (u'/Users/glind/Downloads', u'gregg experiment 1', u'a', 0), (u'C:\\Users\\rjweiss\\Downloads', u'gregg experiment 1', u'b', -1), (u'/Users/glind/Downloads', u'gregg experiment 1', u'b', 0), (u'C:\\Users\\rjweiss\\Downloads', u'gregg experiment 1', u'b', 931), (u'/var/folders/0z/4g3t_26s3gv835xswsslbn400000gq/T/f8683aca-209c-4e6b-8e08-5d2081219781', u'gregg experiment 1', u'a', 0)]


defaultdict(int,
            {(u'gregg experiment 1', u'a', 0): 3,
             (u'gregg experiment 1', u'a', 1983): 1,
             (u'gregg experiment 1', u'b', -1): 1,
             (u'gregg experiment 1', u'b', 0): 1,
             (u'gregg experiment 1', u'b', 280): 1,
             (u'gregg experiment 1', u'b', 931): 1})

In [12]:
from pprint import pprint as pp

df = pd.DataFrame(data.collect(),columns=['who','experiment','branch','days'])

pd.pivot_table(df, columns=('experiment','branch','days'),aggfunc=lambda x: len(set(x)))

     experiment          branch  days 
who  gregg experiment 1  a        0       3
                                  1983    1
                         b       -1       1
                                  0       1
                                  280     1
                                  931     1
dtype: int64

In [13]:
data.collect()

[(u'/Users/mgrimes/Downloads', u'gregg experiment 1', u'a', 0),
 (u'C:\\Users\\rjweiss\\Downloads', u'gregg experiment 1', u'b', 280),
 (u'/Users/mgrimes/Downloads', u'gregg experiment 1', u'a', 1983),
 (u'/Users/glind/Downloads', u'gregg experiment 1', u'a', 0),
 (u'C:\\Users\\rjweiss\\Downloads', u'gregg experiment 1', u'b', -1),
 (u'/Users/glind/Downloads', u'gregg experiment 1', u'b', 0),
 (u'C:\\Users\\rjweiss\\Downloads', u'gregg experiment 1', u'b', 931),
 (u'/var/folders/0z/4g3t_26s3gv835xswsslbn400000gq/T/f8683aca-209c-4e6b-8e08-5d2081219781',
  u'gregg experiment 1',
  u'a',
  0)]