In [74]:
import numpy
import scipy
import matplotlib
import pandas as pd
import csv
from matplotlib_venn import venn2
#import os

In [2]:
#experiment_date='2020-02-17T164323'
#folder_results='results'
#file_prefix='results-'
file_name='traces.csv'

In [3]:
#file=folder_results+'/'+file_prefix+experiment_date+'/'+file_name
cassandra_traces = pd.read_csv(file_name)
tracedata = pd.DataFrame(cassandra_traces)
#tracedata.head(5)

1. Identify root traces
1. For each root trace, find children
1. Merge transparency tags as "Sets"
1. Map tags to vocabulary

In [6]:
#tracedata.describe()
printable = tracedata.sort_values(by = ['trace_id']).reset_index(drop=True)
#print(printable.dtypes)
#conv = printable.astype({'span_id': 'object'})
printable['span_id'] = printable['span_id'].apply(str)
print(printable.head(9).to_latex(buf = 'traces-table.tex', columns = ['trace_id', 'span_id', 'duration', 'operation_name', 'refs', 'start_time', 'tags'], formatters = {'trace_id': lambda x: x[-16:], 'span_id': lambda x: x[-5:]}))
relevant = tracedata.filter(items=['trace_id','span_id','operation_name', 'refs', 'tags'])
relevant['is_root'] = relevant['refs'].isna()
relevant = relevant.sort_values(by = ['trace_id', 'is_root']).reset_index(drop=True)
relevant.head(30)
#print(relevant.head(7).to_latex())

None


Unnamed: 0,trace_id,span_id,operation_name,refs,tags,is_root
0,0x00000000000000000026ae8f865f614f,739911884519597765,routes-parent,"[{ref_type: 'child-of', trace_id: 0x0000000000...","[{key: 'legal-basis', value_type: 'string', va...",False
1,0x00000000000000000026ae8f865f614f,4314644364506468629,calories-call-0,"[{ref_type: 'child-of', trace_id: 0x0000000000...","[{key: 'internal.span.format', value_type: 'st...",False
2,0x00000000000000000026ae8f865f614f,6265394339847271427,distances-parent,"[{ref_type: 'child-of', trace_id: 0x0000000000...","[{key: 'legal-basis', value_type: 'string', va...",False
3,0x00000000000000000026ae8f865f614f,6877021559702812560,calories-call-1,"[{ref_type: 'child-of', trace_id: 0x0000000000...","[{key: 'internal.span.format', value_type: 'st...",False
4,0x00000000000000000026ae8f865f614f,7755013386926157903,userdata-parent,"[{ref_type: 'child-of', trace_id: 0x0000000000...","[{key: 'purposes', value_type: 'string', value...",False
5,0x00000000000000000026ae8f865f614f,8297995326111613259,distances-call-0,"[{ref_type: 'child-of', trace_id: 0x0000000000...","[{key: 'internal.span.format', value_type: 'st...",False
6,0x00000000000000000026ae8f865f614f,10887980572959055,calories-parent,,"[{key: 'sampler.type', value_type: 'string', v...",True
7,0x0000000000000000004f3bb9c94c6ebc,80198442607341457,routes-parent,"[{ref_type: 'child-of', trace_id: 0x0000000000...","[{key: 'legitimate-interest', value_type: 'str...",False
8,0x0000000000000000004f3bb9c94c6ebc,473209590189840552,routes-parent,"[{ref_type: 'child-of', trace_id: 0x0000000000...","[{key: 'storage-ttl', value_type: 'string', va...",False
9,0x0000000000000000004f3bb9c94c6ebc,1092097697981834539,distances-parent,"[{ref_type: 'child-of', trace_id: 0x0000000000...","[{key: 'purposes', value_type: 'string', value...",False


### Exporting from cassandra with JSON column
As it turns out, cassandra has its own internal model for JSON-formatted columns, which on export to CSV, are not properly formatted JSON (missing double quotes for keys). For SELECT queries in CQL, there is a JSON modifier, that fixes the output for columns, but there is none for CQL's COPY.

- [x] TODO: find a way to parse the column of the CSV or export manually by running SELECT and writing results..?
    - Solution: use regexp parsing
- [x] TODO: re-generate output and CSV, so purposes are separated by something else than comma, so we can regexp-parse the malformed JSON output easier.
    - used semicolon

In [9]:
import re
for i, row in relevant.iterrows():
    #preprocessing...
    relevant.at[i, 'refs'] = '{"refs": '+re.sub("(\w+):\s'?([\w*\-.;:\w*]*)'?", r'"\1": "\2"', str(row.refs))+'}'
    relevant.at[i, 'tags'] = '{"tags": '+re.sub("(\w+):\s'?([\w*\-.;:\w*]*)'?", r'"\1": "\2"', str(row.tags))+'}'
relevant['tags'].head(5)[1]

'{"tags": [{"key": "internal.span.format", "value_type": "string", "value_string": "proto", "value_bool": "False", "value_long": "0", "value_double": "0", "value_binary": ""}]}'

In [10]:
import json
## traverse dict a and merge dict b
def load_dict(val):
    #TODO: later remove artificially added key in preprocessing, so json.loads gets us a list of objects directly
    dict_raw = json.loads(val)
    #flatten dict
    listOfObjects = dict_raw['tags']
    new = {}
    for obj in listOfObjects:
        if len(obj['value_string'].split(';')) > 1:
            new[obj['key']] = []
            for item in obj['value_string'].split(';'):
                new[obj['key']].append(item)
        else:
            new[obj['key']] = obj['value_string']
    return new

#based on: https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries/7205107#7205107
def deep_merge(a, b, raw=True, path=None):
    "merges b into a"
    if raw:
        b = load_dict(b)
    if path is None: path = []
    for key in b:
        if key in a:
            if type(a[key]) is not list:
                a[key] = [a[key]]
            if type(b[key]) is list:
                for val in b[key]:
                    a[key].append(val)
            else:
                a[key].append(b[key])
        else:
            a[key] = b[key]
    return a

In [42]:
#for i, row in relevant.head(10).iterrows():
 #   objects = load_dict(row.tags)
 #   print(objects)

def split_colons_to_dict(items):
    all_items_as_dict = {}
    for item in items:
        keyval = item.split(':')
        tmp_dict = {keyval[0]: keyval[1]}
        deep_merge(all_items_as_dict, tmp_dict, raw=False)
    return all_items_as_dict

### Merge all tag data for each trace

In [44]:
trace_id_old = ''
tagset = {}
tagsets = {}
for i, row in relevant.iterrows():
    if 'operations' not in tagset.keys():
        tagset['operations'] = []
    tagset['operations'].append(row.operation_name)
    if row.trace_id == trace_id_old:
        tagset = deep_merge(tagset, row.tags)
    else:
        #remove duplicates from list
        for key, val in tagset.items():
            if type(val) is list:
                tagset[key] = list(set(val))
        tagsets[trace_id_old] = tagset
        tagset = load_dict(row.tags)
        trace_id_old = row.trace_id
    

### If there is a colon in the list of values, split these and merge them into a dict

In [45]:
for tagset in tagsets.values():
    for key, values in tagset.items():
        if type(values) is list:
            if ':' in values[0]:
                tagset[key] = split_colons_to_dict(values)
tagsets

{'': {'operations': ['routes-parent']},
 '0x00000000000000000026ae8f865f614f': {'legal-basis': ['01', '02', '00'],
  'legitimate-interest': ['', '01', '02'],
  'data-categories': ['01', '02', '03', '00'],
  'storage-ttl': {'01': ['01', '00'],
   '00': ['00', '02'],
   '03': '01',
   '02': '00'},
  'purposes': ['01', '02', '00'],
  'internal.span.format': ['proto'],
  'operations': ['userdata-parent',
   'calories-call-0',
   'calories-parent',
   'distances-call-0',
   'routes-parent',
   'calories-call-1',
   'distances-parent'],
  'sampler.type': 'probabilistic',
  'sampler.param': '',
  'automation': '01'},
 '0x0000000000000000004f3bb9c94c6ebc': {'legitimate-interest': ['',
   '01',
   '02'],
  'data-categories': ['01', '04', '02', '00', '03'],
  'storage-ttl': {'01': ['01', '00'],
   '00': ['00', '02'],
   '03': '01',
   '04': ['00', '01'],
   '02': '00'},
  'purposes': ['01', '02', '00'],
  'legal-basis': ['01', '02', '00'],
  'internal.span.format': ['proto'],
  'operations': ['u

### Apply 'max' filtering to storage-ttl values (because the policy should reflect worst-case storage durations)

In [46]:
for tagset in tagsets.values():
    for key, values in tagset.items():
        if key == "storage-ttl":
            for cat, ttls in values.items():
                tagset["storage-ttl"][cat] = max(ttls).zfill(2)
#tagsets

In [47]:
with open('vocab.json', 'r') as vocab_file:
    vocab = json.load(vocab_file)
#vocab

## From Traces, Lookup Values in the Vocab
* iterate over traces
    * iterate over transparency keys
        * for each key, lookup mapping in translation
        * for each key, lookup key in vocab
        * in category of vocab, lookup "id" for each item in list of values, add "name" field to policy.
        * if we get an error (e.g., a missing key), this hints at an issue in our policy description
    


In [48]:
lookup_translation = {
    "recipients": "third-parties",
    "sources": "third-parties",
    "legal-basis": "legal-bases",
    "data-categories": "categories",
    "storage-ttl": {"categories": "ttl"},
    "legitimate-interest": "legitimate-interests"
}

In [51]:
request_policies = {}
for traceid in tagsets.keys():
    tmp_policy = {}
    for cat_key, cat_value in tagsets[traceid].items():
        category = cat_key
        dict_mapping = False
        if cat_key in lookup_translation:
            category = lookup_translation[cat_key]
        if type(category) is dict:
            #if the looked up mapping is a dict, we assume policy items to be in dict format, too
            for key, val in category.items():
                tmp_policy[cat_key] = []
                for id_key, id_val in cat_value.items():
                    if id_val != '' and id_key != '':
                        tmp_policy[cat_key].append({vocab[key][id_key]['name']: vocab[val][id_val]['name']})
        if type(cat_value) is list:
            tmp_policy[cat_key] = cat_value
            if category in vocab.keys():
                tmp_policy[cat_key] = []
                for id_val in cat_value:
                    if id_val != "":
                        tmp_policy[cat_key].append(vocab[category][id_val]['name'])
        if type(cat_value) is str:
            if (dict_mapping == False and category in vocab.keys()):
                tmp_policy[cat_key] = vocab[category][cat_value]['name']
    #print(tmp_policy)
    request_policies[traceid] = tmp_policy

### Transform data to arrive at "categories by operation"
Note: we only take a simplified look at the end-to-end traces here, assuming we know how many features/methods exist

In [84]:
count = 0
by_operation = {}
for key, value in request_policies.items():
    if "leaderboards-parent" in value['operations'] and count < 2:
        by_operation['leaderboards-parent'] = value['data-categories']
        count = count+1
    if "calories-parent" in value['operations'] and count < 2:
        by_operation['calories-parent'] = value['data-categories']
        count = count+1     
#print(by_operation)
json.dumps(by_operation)

'{"calories-parent": ["Location Information", "Size Information", "Activity Information", "Identity Information"], "leaderboards-parent": ["Location Information", "Social Media Information", "Size Information", "Identity Information", "Activity Information"]}'