# Lecture 39: Privacy

In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline
np.set_printoptions(legacy='1.13')

import datetime
import re

## Automated license plate readers in Oakland, CA

In [None]:
# Data source: https://data.oaklandnet.com/browse?q=alpr
# Read the license plate reader records and give the plate and timestamp
# columns shorter labels.
alpr = (
    Table.read_table('alpr_all.csv.gz', sep=',')
    .relabeled('red_VRM', 'plate')
    .relabeled('red_Timestamp', 'time')
)
alpr

In [None]:
# Matches text of the form '(<first>,<second>)' at the start of a string.
# Written as a raw string so the backslashes reach the regex engine as-is
# instead of relying on Python tolerating the invalid '\(' escape sequence,
# which emits a warning on newer Python versions.
p = re.compile(r'\((.*),(.*)\)')

def _parse_coordinate(s, group):
    """Return capture group `group` of `p` matched against `s`, as a float.

    Raises:
        ValueError: if `s` does not match `p`, or the captured text is not
            a valid float.
    """
    m = p.match(s)
    if m is None:
        # Fail with the offending value instead of an AttributeError on None.
        raise ValueError('cannot parse location: {!r}'.format(s))
    return float(m.group(group))

def parse_latitude(s):
    """Return the first number of a '(lat,lon)' string as a float.

    Raises:
        ValueError: if `s` is not of the form '(<number>,<number>)'.
    """
    return _parse_coordinate(s, 1)

def parse_longitude(s):
    """Return the second number of a '(lat,lon)' string as a float.

    Raises:
        ValueError: if `s` is not of the form '(<number>,<number>)'.
    """
    return _parse_coordinate(s, 2)

# Replace the textual 'Location 1' column with float 'latitude' and
# 'longitude' columns parsed from it.
alpr_latlon = (
    alpr
    .with_column('latitude', alpr.apply(parse_latitude, 'Location 1'))
    .with_column('longitude', alpr.apply(parse_longitude, 'Location 1'))
    .drop('Location 1')
)

alpr_latlon

In [None]:
def color_for_time(time):
    """Return the marker color for a sighting timestamp.

    Parameters:
        time: timestamp string in the format '%m/%d/%Y %I:%M:%S %p',
            e.g. '01/06/2018 10:15:00 AM'.

    Returns:
        'red' for Saturday or Sunday, 'lightblue' for a weekday with an
        hour from 6 through 17, and 'darkblue' for any other weekday time.

    Raises:
        ValueError: if `time` does not match the expected format.
    """
    t = datetime.datetime.strptime(time, '%m/%d/%Y %I:%M:%S %p')
    # weekday() counts Monday as 0, so Saturday is 5 and Sunday is 6.
    if t.weekday() >= 5:
        return 'red' #weekend
    elif 6 <= t.hour <= 17:
        return 'lightblue' #weekday daytime
    else:
        return 'darkblue' #weekday evening, night or early morning
    
# Tag every sighting with a marker color derived from its timestamp.
# The colors are computed from alpr_latlon itself (the table being extended)
# so the new column is row-aligned with it by construction.
alpr_latlon_color = alpr_latlon.with_column(
    'color', alpr_latlon.apply(color_for_time, 'time')
)
alpr_latlon_color

In [None]:
def map_plate(plate):
    """Map every recorded sighting of one license plate.

    Parameters:
        plate: value to match against the 'plate' column of
            alpr_latlon_color.

    Returns:
        The result of Marker.map_table applied to the matching rows'
        latitude, longitude, time and color columns.
    """
    marker_columns = ('latitude', 'longitude', 'time', 'color')
    return Marker.map_table(
        alpr_latlon_color.where('plate', plate).select(*marker_columns)
    )
    
# Count sightings per plate and pick the plate with the largest count.
most_seen = (
    alpr_latlon_color
    .group('plate')
    .sort('count', descending=True)
    .column('plate')
    .item(0)
)
map_plate(most_seen)

In [None]:
# Map all sightings of the plate stored as `the_mayor`.
the_mayor = '6FCH845'
map_plate(plate=the_mayor)

In [None]:
# Map all sightings of the plate stored as `the_fire_chief`.
the_fire_chief = '1328354'
map_plate(plate=the_fire_chief)

In [None]:
# A plate belonging to an unknown person.
unknown_plate = '6UZA652'
map_plate(unknown_plate)

Is that kind of exploration ethical?

* For purposes of education?
* For purposes of curiosity?
* ...

## Voter Registration in Washington, DC

In [None]:
# Data source: https://github.com/ajschumacher/dc_voter_reg
dcvoters = Table.read_table(
    'dcvoters.csv.zip',
    sep=',',
    na_filter=False,
    low_memory=False,
)
dcvoters

In [None]:
# All voter records whose last name is exactly 'CLARKSON'.
clarksons = dcvoters.where('LASTNAME', are.equal_to('CLARKSON'))
clarksons

We just learned an address.  How about age?

In [None]:
# Population table keyed by zip code; zip codes are read as strings, the
# 'population' column is relabeled to 'count', and 'geo_id' is dropped.
pop = (
    Table.read_table(
        'population_by_zip_2010.csv.zip',
        sep=',',
        dtype={'zipcode': str},
    )
    .relabeled('population', 'count')
    .drop('geo_id')
)
pop

In [None]:
# Male population counts for zip code 20036, keeping only rows whose
# minimum_age is above 0.
ages = (
    pop
    .where('zipcode', '20036')
    .where('gender', 'male')
    .where('minimum_age', are.above(0))
    .select('minimum_age', 'count')
)

# Horizontal bar chart of the counts, ordered by minimum_age.
ages.sort('minimum_age').barh('minimum_age')

In [None]:
# Every voter record registered in residential zip code 20052.
dcvoters.where('RES_ZIP', are.equal_to('20052'))

Steven Knapp (Cornell PhD 1981):  President of GWU 2007-2017

<hr/>

Does political party ever uniquely identify someone?

In [None]:
# Number of voter records for every (zip code, party) combination,
# smallest groups first.
zip_party = (
    dcvoters
    .group(['RES_ZIP', 'PARTY'])
    .sort('count')
)
# A count of 1 means the combination matches exactly one voter record.
zip_party.where('count', are.equal_to(1)).show()

## Linking

Some synthetic data.

In [None]:
# Voter table; 'Zip' is read as a string.
voters = Table.read_table(
    'kanon_voter.csv.zip',
    sep=',',
    dtype={'Zip': str},
)
voters

In [None]:
# Hospital table; 'Zip' is read as a string.
hospital = Table.read_table(
    'kanon_hospital.csv.zip',
    sep=',',
    dtype={'Zip': str},
)
hospital

In [None]:
# (Zip, DOB, Sex) combinations that occur exactly once in the voter table.
unique_voters = (
    voters
    .group(['Zip', 'DOB', 'Sex'])
    .where('count', are.equal_to(1))
)
unique_voters

In [None]:
def de_anonymize_row(row):
    """Look up the one voter sharing a row's Zip, DOB and Sex.

    Parameters:
        row: a table row providing 'Zip', 'DOB' and 'Sex' via row.item().

    Returns:
        The 'Name' of the matching record in `voters` when exactly one
        record matches all three attributes; otherwise 'Not unique'.
    """
    candidates = voters
    # Narrow the voter table one attribute at a time.
    for attribute in ('Zip', 'DOB', 'Sex'):
        candidates = candidates.where(attribute, row.item(attribute))
    if candidates.num_rows != 1:
        return 'Not unique'
    return candidates.column('Name').item(0)
    
def de_anonymize_table(table):
    """Attach voter names to the rows of `table` that can be re-identified.

    Parameters:
        table: a table with 'Zip', 'DOB' and 'Sex' columns.

    Returns:
        A copy of `table` with an added 'Name' column, restricted to the
        rows for which de_anonymize_row found a single matching voter.
    """
    names = table.apply(de_anonymize_row)
    named = table.with_column('Name', names)
    return named.where('Name', are.not_equal_to('Not unique'))

In [None]:
# Try to re-identify the first ten hospital rows.
first_10 = hospital.take(np.arange(10))
de_anonymize_table(first_10)

In [None]:
# Rows 9990 through 9999 of the hospital table.
# NOTE(review): presumably the final ten rows of a 10,000-row table — confirm.
last_10 = hospital.take(np.arange(10) + 9990)
de_anonymize_table(last_10)

## k-anonymity

In [None]:
def anonymity_size(table, attributes):
    """Count how many rows share each combination of `attributes`.

    Parameters:
        table: a table supporting group() and sort().
        attributes: the column label(s) to group by.

    Returns:
        The grouped table sorted by its 'count' column.
    """
    return table.group(attributes).sort('count')

# Group sizes when Zip, DOB and Sex are all available.
anonymity_size(hospital, attributes=['Zip', 'DOB', 'Sex'])

In [None]:
# Group sizes when only DOB and Sex are available.
anonymity_size(hospital, attributes=['DOB', 'Sex'])

In [None]:
def extract_year(s):
    """Return the first four characters of `s`.

    NOTE(review): presumably `s` is a date string that begins with a
    four-digit year — confirm against the DOB column format.
    """
    return s[:4]

# Coarsen the hospital 'DOB' column to the value produced by extract_year.
hospital_years = hospital.apply(extract_year, 'DOB')
hospital_birthyear = hospital.with_column('DOB', hospital_years)

anonymity_size(hospital_birthyear, attributes=['DOB', 'Sex'])

In [None]:
# Remove the zip code column from the coarsened hospital table.
anonymized_hospital = hospital_birthyear.drop(['Zip'])
anonymized_hospital

In [None]:
# Coarsen the voter 'DOB' column the same way, using extract_year.
voter_years = voters.apply(extract_year, 'DOB')
voters_birthyear = voters.with_column('DOB', voter_years)

def de_anonymize_row_nozip(row):
    """Look up the one voter sharing a row's DOB and Sex.

    Parameters:
        row: a table row providing 'DOB' and 'Sex' via row.item().

    Returns:
        The 'Name' of the matching record in `voters_birthyear` when
        exactly one record matches both attributes; otherwise 'Not unique'.
    """
    candidates = voters_birthyear
    # Narrow the voter table one attribute at a time.
    for attribute in ('DOB', 'Sex'):
        candidates = candidates.where(attribute, row.item(attribute))
    if candidates.num_rows != 1:
        return 'Not unique'
    return candidates.column('Name').item(0)
    
def de_anonymize_table_nozip(table):
    """Attach voter names to the rows of `table` identifiable without Zip.

    Parameters:
        table: a table with 'DOB' and 'Sex' columns.

    Returns:
        A copy of `table` with an added 'Name' column, restricted to the
        rows for which de_anonymize_row_nozip found a single matching voter.
    """
    names = table.apply(de_anonymize_row_nozip)
    named = table.with_column('Name', names)
    return named.where('Name', are.not_equal_to('Not unique'))

# Rows 9990 through 9999 of the anonymized hospital table.
# NOTE(review): presumably the final ten rows of a 10,000-row table — confirm.
last_10_nozip = anonymized_hospital.take(np.arange(10) + 9990)
de_anonymize_table_nozip(last_10_nozip)