In [1]:
import base64
import requests

# Base URL of the service; every request path in this notebook is appended to it.
PREFIX = 'http://127.0.0.1:8080'

# Define a schema and the keys

The schema has two roles. Firstly, it defines the format of the data, such as the type of each column, minimum and maximum lengths, and permitted values. Secondly, it contains all the hashing settings, from the hash size, through global salts, to the tokenisation settings of each field.

The schema is designed to be portable between Clkhash and Anonlink. Two runs of Clkhash with the same schema and the same keys on the same data will produce the exact same hashes, even if they use different versions of Clkhash.

The keys can be thought of as a secret salt. They must be shared between organisations that are hashing their data, but must not be given to the authority performing the linkage.

In [2]:
# Linkage schema sent as the JSON body when the project is created.
# `clkConfig` holds the global hashing settings; `features` lists, per column,
# the expected data format and that field's hashing settings.
SCHEMA = {
    'version': 1,
    'clkConfig': {
        'l': 1024,
        'k': 20,
        'hash': {'type': 'doubleHash'},
        'kdf': {
            'type': 'HKDF',
            'hash': 'SHA256',
            'salt': 'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==',
            'info': 'c2NoZW1hX2V4YW1wbGU=',  # base64 of b'schema_example'
            'keySize': 64,
        },
    },
    # The identifiers below are the same strings used as the CSV header row
    # in the upload cell.
    'features': [
        {
            'identifier': 'NAME freetext',
            'format': {
                'type': 'string',
                'encoding': 'utf-8',
                'case': 'mixed',
                'minLength': 3,
            },
            'hashing': {'ngram': 2, 'weight': 0.5},
        },
        {
            'identifier': 'DOB YYYY/MM/DD',
            'format': {
                'type': 'string',
                'encoding': 'ascii',
                'description': 'Numbers separated by slashes, in the year, month, day order',
                'pattern': r'\d\d\d\d/\d\d/\d\d',
            },
            'hashing': {'ngram': 1, 'positional': True},
        },
        {
            'identifier': 'GENDER M or F',
            'format': {'type': 'enum', 'values': ['M', 'F']},
            'hashing': {'ngram': 1, 'weight': 2},
        },
    ],
}

In [3]:
# Secret keys used for hashing; each one is base64-encoded before being sent
# when the project is created.
KEYS = (b'correct', b'horse')

# Make new project

In [4]:
# Create the project: the schema is the JSON body, while the project id and
# the comma-separated, individually base64-encoded keys are query parameters.
r = requests.post(
    PREFIX + '/projects/',
    params={
        'project_id': 'demo-data',
        'keys': b','.join(base64.b64encode(key) for key in KEYS),
    },
    json=SCHEMA)

print('Status code:', r.status_code)

Status code: 201


# See all projects

In [5]:
# Fetch the list of projects and show the raw JSON payload.
r = requests.get(PREFIX + '/projects')

print('Status code:', r.status_code)
print('Data:', r.json())

Status code: 200
Data: {'projects': ['demo-data']}


# Upload private data

In [6]:
# Two sample records; the header row uses the feature identifiers from SCHEMA.
csv_data = (
    'NAME freetext,DOB YYYY/MM/DD,GENDER M or F\n'
    'Jane Doe,1968/05/19,F\n'
    'Peter Griffin,1998/12/20,M\n')

# NOTE(review): `header` is sent as the string 'true' while `validate` is a
# Python bool, which ends up in the query string as 'True' -- confirm the
# server accepts both spellings.
r = requests.post(
    PREFIX + '/projects/{project_id}/pii/'.format(
        project_id='demo-data'),
    params=dict(
        header='true',
        validate=True),
    data=csv_data)

# Parse the body once, under a name that does not shadow the `json` module
# name, and keep the returned id range for the following cells.
upload_response = r.json()
clk_range_start = upload_response['dataIds']['rangeStart']
clk_range_end = upload_response['dataIds']['rangeEnd']

print('Status code:', r.status_code)
print('Data:', upload_response)

Status code: 202
Data: {'dataIds': {'rangeEnd': 2, 'rangeStart': 0}}


# Get clks

In [7]:
# Request the clks for the uploaded index range. The range has to be sent as
# query parameters (the same names the delete-clks request uses): the URL
# template only has a {project_id} placeholder, so extra keyword arguments
# given to str.format() would be silently ignored and never reach the server.
r = requests.get(
    PREFIX + '/projects/{project_id}/clks/'.format(
        project_id='demo-data'),
    params=dict(
        index_range_start=clk_range_start,
        index_range_end=clk_range_end))

print('Status code:', r.status_code)
print('Data:', r.json())

Status code: 200
Data: {'clks': [{'index': 0, 'status': 'queued', 'errMsg': None, 'hash': None}, {'index': 1, 'status': 'queued', 'errMsg': None, 'hash': None}], 'responseMetadata': {'nextCursor': None}}


# Upload lots of private data!

In [8]:
# Read the CSV as raw bytes so the request body is exactly the file's
# contents, independent of the platform's default text encoding.
with open('fake-data.csv', 'rb') as f:
    csv_data = f.read()

r = requests.post(
    PREFIX + '/projects/{project_id}/pii/'.format(
        project_id='demo-data'),
    params=dict(
        header='true'),
    data=csv_data)
print(r.text)

# Parse the body once, under a name that does not shadow the `json` module
# name; the id range is used by the delete-clks cell further down.
upload_response = r.json()
clk_range_start = upload_response['dataIds']['rangeStart']
clk_range_end = upload_response['dataIds']['rangeEnd']

print('Status code:', r.status_code)
print('Data:', upload_response)

{
  "dataIds": {
    "rangeEnd": 100002,
    "rangeStart": 2
  }
}

Status code: 202
Data: {'dataIds': {'rangeEnd': 100002, 'rangeStart': 2}}


# Check job status

In [9]:
# Query the clks status endpoint of the project and show the raw JSON payload.
r = requests.get(
    PREFIX + '/projects/{project_id}/clks/status'.format(project_id='demo-data'))

print('Status code:', r.status_code)
print('Data:', r.json())

Status code: 200
Data: {'clksStatus': [{'status': 'done', 'rangeStart': 0, 'rangeEnd': 2}, {'status': 'queued', 'rangeStart': 2, 'rangeEnd': 100002}]}


# Delete clks or abort jobs

This applies to every clk in the requested index range. If a clk has already been computed, it is simply deleted from the database. Otherwise, the scheduled hashing job is cancelled and the corresponding private data is deleted.

In [10]:
# Delete the clks in the index range returned by the most recent upload.
r = requests.delete(
    PREFIX + '/projects/{project_id}/clks/'.format(project_id='demo-data'),
    params={
        'index_range_start': clk_range_start,
        'index_range_end': clk_range_end,
    })

print('Status code:', r.status_code)

Status code: 204


# Delete project

This will also delete all clks associated with the project and abort all hashing jobs.

In [11]:
# Remove the whole project.
r = requests.delete(
    PREFIX + '/projects/{project_id}'.format(project_id='demo-data'))

print('Status code:', r.status_code)

Status code: 204
