In [1]:
import numpy
import pandas as pd
import csv
#import matplotlib.pyplot as plt
#import os

In [2]:
#experiment_date='2020-02-17T164323'
#folder_results='results'
#file_prefix='results-'
file_name='traces.csv'

In [3]:
#file=folder_results+'/'+file_prefix+experiment_date+'/'+file_name
cassandra_traces = pd.read_csv(file_name)
tracedata = pd.DataFrame(cassandra_traces)
#tracedata.head(5)

1. Identify root traces
1. For each root trace, find children
1. Merge transparency tags as "Sets"
1. Map tags to vocabulary

In [4]:
#tracedata.describe()
relevant = tracedata.filter(items=['trace_id','span_id','operation_name', 'refs', 'tags'])
relevant['is_root'] = relevant['refs'].isna()
relevant = relevant.sort_values(by = ['trace_id', 'is_root']).reset_index(drop=True)
#relevant.head()
#relevant['tags'].head(5)[1]

### Exporting from cassandra with JSON column
As it turns out, cassandra has its own internal model for JSON-formatted columns, which on export to CSV, are not properly formatted JSON (missing double quotes for keys). For SELECT queries in CQL, there is a JSON modifier, that fixes the output for columns, but there is none for CQL's COPY.

- [x] TODO: find a way to parse the column of the CSV or export manually by running SELECT and writing results..?
    - Solution: use regexp parsing (lol)
- [x] TODO: re-generate output and CSV, so purposes are separated by something else than comma, so we can regexp-parse the malformed JSON output easier.
    - used semicolon

In [5]:
import re
for i, row in relevant.iterrows():
    #preprocessing...
    relevant.at[i, 'refs'] = '{"refs": '+re.sub("(\w+):\s'?([\w*\-.;:\w*]*)'?", r'"\1": "\2"', str(row.refs))+'}'
    relevant.at[i, 'tags'] = '{"tags": '+re.sub("(\w+):\s'?([\w*\-.;:\w*]*)'?", r'"\1": "\2"', str(row.tags))+'}'
#relevant['tags'].head(5)[1]

In [6]:
import json
## traverse dict a and merge dict b
def load_dict(val):
    #TODO: later remove artificially added key in preprocessing, so json.loads gets us a list of objects directly
    dict_raw = json.loads(val)
    #flatten dict
    listOfObjects = dict_raw['tags']
    new = {}
    for obj in listOfObjects:
        if len(obj['value_string'].split(';')) > 1:
            new[obj['key']] = []
            for item in obj['value_string'].split(';'):
                new[obj['key']].append(item)
        else:
            new[obj['key']] = obj['value_string']
    return new

#based on: https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries/7205107#7205107
def deep_merge(a, b_raw, path=None):
    "merges b into a"
    b = load_dict(b_raw)
    if path is None: path = []
    for key in b:
        if key in a:
            if type(a[key]) is not list:
                a[key] = [a[key]]
            if type(b[key]) is list:
                for val in b[key]:
                    a[key].append(val)
            else:
                a[key].append(b[key])
        else:
            a[key] = b[key]
    return a

In [7]:
#for i, row in relevant.head(10).iterrows():
 #   objects = load_dict(row.tags)
 #   print(objects)

In [10]:
trace_id_old = ''
tagset = {}
tagsets = {}
for i, row in relevant.iterrows():
    if row.trace_id == trace_id_old:
        tagset = deep_merge(tagset, row.tags)
    else:
        #TODO: remove duplicates from list; this doesn't work?
        for key, val in tagset.items():
            if type(val) is list:
                val = list(set(val))
        tagsets[trace_id_old] = tagset
        tagset = load_dict(row.tags)
        trace_id_old = row.trace_id

In [11]:
tagsets

{'': {},
 '0x0000000000000000002c91df57b25d13': {'categories-processing': ['',
   '01',
   '03',
   '00',
   '00',
   '03',
   '04'],
  'categories-storage': ['01', '03', '00', '03', '04'],
  'storage-ttl': ['00', '00', '01', '00', '00'],
  'recipients': ['', '', '001', '', '000'],
  'automation': ['', '', '', ''],
  'purposes': ['00', '00', '01', '00', '01', '02', '00'],
  'legal-basis': ['00', '00', '01', '01', '02', '00'],
  'legitimate-interest': ['', '01', '02'],
  'internal.span.format': ['proto',
   'proto',
   'proto',
   'proto',
   'proto',
   'proto',
   'proto',
   'proto',
   'proto'],
  'sources': ['', '000'],
  'sampler.type': 'probabilistic',
  'sampler.param': ''},
 '0x00000000000000000042db74fbb127d9': {'internal.span.format': ['proto',
   'proto',
   'proto',
   'proto',
   'proto',
   'proto',
   'proto',
   'proto',
   'proto'],
  'purposes': ['00', '01', '00', '00', '01', '02', '00'],
  'legal-basis': ['00', '00', '01', '01', '02', '00'],
  'categories-processing'

## TODO
* There might be something wrong with the combined data, e.g. traces which only contain recipients tags should not exist as this would imply that the facebook-service is called on its own
  * This fixed itself after a restart??????

In [12]:
with open('vocab.json', 'r') as vocab_file:
    vocab = json.load(vocab_file)

In [13]:
vocab

{'categories': [{'id': '00',
   'name': 'Identity Information',
   'description': 'First and Last Name(s), date of birth, unique account identifier, gender, social media identifiers'},
  {'id': '01', 'name': 'Location Information', 'description': ''},
  {'id': '02', 'name': 'Size Information', 'description': ''},
  {'id': '03', 'name': 'Activity Information', 'description': ''},
  {'id': '04', 'name': 'Social Media Information', 'description': ''}],
 'purposes': [{'id': '00',
   'name': 'Prodividing of Service',
   'description': ''},
  {'id': '01', 'name': 'Business'},
  {'id': '02', 'name': 'Legal Compliance'}],
 'ttl': [{'id': '00', 'name': 'account-lifetime', 'description': ''},
  {'id': '01', 'name': 'unlimited', 'description': ''},
  {'id': '02', 'name': 'processing-only', 'description': ''}],
 'third-parties': [{'id': '000',
   'name': 'facebook Europe',
   'address': {'office': '',
    'country': '',
    'zip': '',
    'street': '',
    'number': '',
    'additional': ''}}],
 '