In [1]:
import numpy
import pandas as pd
import csv
#import matplotlib.pyplot as plt
#import os

In [2]:
#experiment_date='2020-02-17T164323'
#folder_results='results'
#file_prefix='results-'
file_name='traces.csv'

In [3]:
#file=folder_results+'/'+file_prefix+experiment_date+'/'+file_name
cassandra_traces = pd.read_csv(file_name)
tracedata = pd.DataFrame(cassandra_traces)
#tracedata.head(5)

1. Identify root traces
1. For each root trace, find children
1. Merge transparency tags as "Sets"
1. Map tags to vocabulary

In [4]:
#tracedata.describe()
printable = tracedata.sort_values(by = ['trace_id']).reset_index(drop=True)
#print(printable.dtypes)
#conv = printable.astype({'span_id': 'object'})
printable['span_id'] = printable['span_id'].apply(str)
print(printable.head(9).to_latex(buf = 'tex-table.txt', columns = ['trace_id', 'span_id', 'duration', 'operation_name', 'tags'], formatters = {'trace_id': lambda x: x[-14:], 'span_id': lambda x: x[-5:]}))
relevant = tracedata.filter(items=['trace_id','span_id','operation_name', 'refs', 'tags'])
relevant['is_root'] = relevant['refs'].isna()
relevant = relevant.sort_values(by = ['trace_id', 'is_root']).reset_index(drop=True)
#relevant.head()
#print(relevant.head(5).to_latex())

None


### Exporting from cassandra with JSON column
As it turns out, cassandra has its own internal model for JSON-formatted columns, which on export to CSV, are not properly formatted JSON (missing double quotes for keys). For SELECT queries in CQL, there is a JSON modifier, that fixes the output for columns, but there is none for CQL's COPY.

- [x] TODO: find a way to parse the column of the CSV or export manually by running SELECT and writing results..?
    - Solution: use regexp parsing (lol)
- [x] TODO: re-generate output and CSV, so purposes are separated by something else than comma, so we can regexp-parse the malformed JSON output easier.
    - used semicolon

In [5]:
import re
for i, row in relevant.iterrows():
    #preprocessing...
    relevant.at[i, 'refs'] = '{"refs": '+re.sub("(\w+):\s'?([\w*\-.;:\w*]*)'?", r'"\1": "\2"', str(row.refs))+'}'
    relevant.at[i, 'tags'] = '{"tags": '+re.sub("(\w+):\s'?([\w*\-.;:\w*]*)'?", r'"\1": "\2"', str(row.tags))+'}'
#relevant['tags'].head(5)[1]

In [6]:
import json
## traverse dict a and merge dict b
def load_dict(val):
    #TODO: later remove artificially added key in preprocessing, so json.loads gets us a list of objects directly
    dict_raw = json.loads(val)
    #flatten dict
    listOfObjects = dict_raw['tags']
    new = {}
    for obj in listOfObjects:
        if len(obj['value_string'].split(';')) > 1:
            new[obj['key']] = []
            for item in obj['value_string'].split(';'):
                new[obj['key']].append(item)
        else:
            new[obj['key']] = obj['value_string']
    return new

#based on: https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries/7205107#7205107
def deep_merge(a, b_raw, path=None):
    "merges b into a"
    b = load_dict(b_raw)
    if path is None: path = []
    for key in b:
        if key in a:
            if type(a[key]) is not list:
                a[key] = [a[key]]
            if type(b[key]) is list:
                for val in b[key]:
                    a[key].append(val)
            else:
                a[key].append(b[key])
        else:
            a[key] = b[key]
    return a

In [7]:
#for i, row in relevant.head(10).iterrows():
 #   objects = load_dict(row.tags)
 #   print(objects)

In [8]:
trace_id_old = ''
tagset = {}
tagsets = {}
for i, row in relevant.iterrows():
    if row.trace_id == trace_id_old:
        tagset = deep_merge(tagset, row.tags)
    else:
        #remove duplicates from list
        for key, val in tagset.items():
            if type(val) is list:
                tagset[key] = list(set(val))
        tagsets[trace_id_old] = tagset
        tagset = load_dict(row.tags)
        trace_id_old = row.trace_id

In [9]:
tagsets

{'': {},
 '0x00000000000000000015dbd83c348af1': {'internal.span.format': ['proto'],
  'purposes': ['02', '01', '00'],
  'legal-basis': ['02', '01', '00'],
  'categories-processing': ['', '01', '00', '03', '04'],
  'categories-storage': ['04', '03', '01', '00'],
  'storage-ttl': ['03:00', '01:00', '00:01', '04:00'],
  'recipients': '000',
  'sources': '000',
  'legitimate-interest': ['', '01', '02'],
  'sampler.type': 'probabilistic',
  'sampler.param': ''},
 '0x0000000000000000001752ca9c2f18d8': {'internal.span.format': ['proto'],
  'purposes': ['02', '01', '00'],
  'legal-basis': ['02', '01', '00'],
  'legitimate-interest': ['', '01', '02'],
  'categories-processing': ['', '01', '00', '03', '04'],
  'categories-storage': ['04', '03', '01', '00'],
  'storage-ttl': ['03:00', '01:00', '00:01', '04:00'],
  'recipients': '000',
  'sources': '000',
  'sampler.type': 'probabilistic',
  'sampler.param': ''},
 '0x0000000000000000002abb5c9debdc63': {'legal-basis': ['00', '01', '02'],
  'legitim

## TODO
* There might be something wrong with the combined data, e.g. traces which only contain recipients tags should not exist as this would imply that the facebook-service is called on its own
  * This fixed itself after a restart??????

In [10]:
with open('vocab.json', 'r') as vocab_file:
    vocab = json.load(vocab_file)
vocab

{'categories': {'00': {'name': 'Identity Information',
   'description': 'First and Last Name(s), date of birth, unique account identifier, gender, social media identifiers'},
  '01': {'name': 'Location Information', 'description': ''},
  '02': {'name': 'Size Information', 'description': ''},
  '03': {'name': 'Activity Information', 'description': ''},
  '04': {'name': 'Social Media Information', 'description': ''}},
 'purposes': {'00': {'name': 'Prodividing of Service', 'description': ''},
  '01': {'name': 'Business'},
  '02': {'name': 'Legal Compliance'}},
 'ttl': {'00': {'name': 'Account Lifetime', 'description': ''},
  '01': {'name': 'Unlimited', 'description': ''},
  '02': {'name': 'Processing Only', 'description': ''}},
 'third-parties': {'000': {'name': 'facebook Europe',
   'address': {'office': '',
    'country': '',
    'zip': '',
    'street': '',
    'number': '',
    'additional': ''}}},
 'legal-bases': {'00': {'name': 'Consent',
   'description': 'Where explicit consent b

## From Traces, Lookup Values in the Vocab
* iterate over traces
    * iterate over transparency keys
        * for each key, lookup mapping in translation
        * for each key, lookup key in vocab
        * in category of vocab, lookup "id" for each item in list of values, add "name" field to policy.
        * if we get an error (e.g., a missing key), this hints at an issue in our policy description
    


In [11]:
lookup_translation = {
    "recipients": "third-parties",
    "sources": "third-parties",
    "legal-basis": "legal-bases",
    "categories-processing": "categories",
    "categories-storage": "categories",
    "storage-ttl": {"categories": "ttl"},
    "legitimate-interest": "legitimate-interests"
}

In [21]:
request_policies = {}
for traceid in tagsets.keys():
    tmp_policy = {}
    for cat_key, cat_value in tagsets[traceid].items():
        category = cat_key
        dict_mapping = False
        if cat_key in lookup_translation:
            category = lookup_translation[cat_key]
        if type(category) is dict:
            dict_mapping = True
        if type(cat_value) is list:
            tmp_policy[cat_key] = []
            if dict_mapping:
                for key, val in category.items():
                    for id_val in cat_value:
                        if id_val != '':
                            map_keys = id_val.split(':')
                            tmp_policy[cat_key].append({vocab[key][map_keys[0]]['name']: vocab[val][map_keys[1]]['name']})
            else:
                if category in vocab.keys():
                    for id_val in cat_value:
                        if id_val != "":
                            tmp_policy[cat_key].append(vocab[category][id_val]['name'])
        else:
            if category in vocab.keys():
                tmp_policy[cat_key] = vocab[category][cat_value]['name']
    #print(tmp_policy)
    request_policies[traceid] = tmp_policy

In [22]:
request_policies

{'': {},
 '0x00000000000000000015dbd83c348af1': {'internal.span.format': [],
  'purposes': ['Legal Compliance', 'Business', 'Prodividing of Service'],
  'legal-basis': ['Legitimate Interest', 'Contractual', 'Consent'],
  'categories-processing': ['Location Information',
   'Identity Information',
   'Activity Information',
   'Social Media Information'],
  'categories-storage': ['Social Media Information',
   'Activity Information',
   'Location Information',
   'Identity Information'],
  'storage-ttl': [{'Activity Information': 'Account Lifetime'},
   {'Location Information': 'Account Lifetime'},
   {'Identity Information': 'Unlimited'},
   {'Social Media Information': 'Account Lifetime'}],
  'recipients': 'facebook Europe',
  'sources': 'facebook Europe',
  'legitimate-interest': ['Compliance', 'Business Improvement']},
 '0x0000000000000000001752ca9c2f18d8': {'internal.span.format': [],
  'purposes': ['Legal Compliance', 'Business', 'Prodividing of Service'],
  'legal-basis': ['Legit