Skip to content

Commit

Permalink
Merge pull request #39 from itamargal/master
Browse files Browse the repository at this point in the history
Updated push scripts
  • Loading branch information
johnclary committed Mar 10, 2017
2 parents 00e917f + 3ec59d2 commit c42043a
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 32 deletions.
143 changes: 117 additions & 26 deletions bt_anonymize.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
Individual Traffic Match File (ITMF) by replacing actual MAC addresses with
randomly generated MAC addresses.
Example usage:
python bt_anonymize.py -i /srv/awamdata/ -o /srv/anonymized/ --mdy=01-01-2017
IAF filename format:
[AWAM Host Instance Name]_bt_[month-day-year].txt
Expand Down Expand Up @@ -43,18 +47,19 @@
import os
from datetime import datetime
import pytz
import argparse
import hashlib


def get_timestamp(local_time_string):
def get_epoch_time(awam_time_string):
"""
Convert time string to UTC timestamp.
Convert AWAM time string to Unix time.
"""

# Create timezone object
local_tz = pytz.timezone("US/Central")

# Parse time string into naive/timezone-unaware datetime object
naive_dt = datetime.strptime(local_time_string, '%m/%d/%Y %I:%M:%S %p')
naive_dt = datetime.strptime(awam_time_string, '%m/%d/%Y %I:%M:%S %p')

# Convert naive datetime object to timezone aware datetime object
local_dt = local_tz.localize(naive_dt)
Expand All @@ -68,22 +73,66 @@ def get_timestamp(local_time_string):
# Return epoch time
return(str(epoch))

def get_iso_time(awam_time_string):
    """
    Return the ISO-formatted local time for an AWAM time string.

    The input is parsed with the format ``%m/%d/%Y %I:%M:%S %p`` and is
    localized to the "US/Central" timezone before being formatted.
    """
    awam_format = '%m/%d/%Y %I:%M:%S %p'
    central = pytz.timezone("US/Central")

    # strptime() yields a naive datetime; localize() attaches the timezone
    parsed = datetime.strptime(awam_time_string, awam_format)
    return central.localize(parsed).isoformat()

def get_datetime(awam_time_string):
    """
    Build a timezone-aware datetime from an AWAM time string.

    The string is parsed with the format ``%m/%d/%Y %I:%M:%S %p`` and the
    resulting naive datetime is localized to the "US/Central" timezone.
    """
    central = pytz.timezone("US/Central")

    # Parse first (naive result), then attach the Central timezone
    parsed = datetime.strptime(awam_time_string, '%m/%d/%Y %I:%M:%S %p')
    aware = central.localize(parsed)
    return aware

def random_mac_address():
    """
    Generate a random MAC address.

    Returns:
        A string of six colon-separated, zero-padded, lowercase hexadecimal
        octets, e.g. ``"3f:0a:c2:7b:91:e4"``.
    """
    # A MAC address is 48 bits wide, i.e. six octets; generating only five
    # yields a string that is not a well-formed MAC address.
    octets = [random.randint(0x00, 0xff) for _ in range(6)]
    return ':'.join('%02x' % octet for octet in octets)

def randomize(input_dir, month_day_year, output_dir, awam_host_instance_name="Austin"):
def randomize(input_dir, month_day_year, output_dir, awam_host_instance_name="Austin", header=False):
"""
Removes personally identifying information from AWAM individual address
files (IAF) and individual traffic match files (ITMF).
"""

# Set IAF path parameters
iaf_filename = "%s_bt_%s.txt" % (awam_host_instance_name, month_day_year)
iaf_input_path = os.path.join(input_dir, iaf_filename)
iaf_output_path = os.path.join(output_dir, iaf_filename)
iaf_input_filename = "%s_bt_%s.txt" % (awam_host_instance_name, month_day_year)
iaf_input_path = os.path.join(input_dir, iaf_input_filename)
iaf_output_filename = "%s_bt_%s.csv" % (awam_host_instance_name, month_day_year)
iaf_output_path = os.path.join(output_dir, iaf_output_filename)

# Set ITMF path parameters
itmf_filename = "%s_btmatch_%s.txt" % (awam_host_instance_name, month_day_year)
itmf_input_path = os.path.join(input_dir, itmf_filename)
itmf_output_path = os.path.join(output_dir, itmf_filename)
itmf_input_filename = "%s_btmatch_%s.txt" % (awam_host_instance_name, month_day_year)
itmf_input_path = os.path.join(input_dir, itmf_input_filename)
itmf_output_filename = "%s_btmatch_%s.csv" % (awam_host_instance_name, month_day_year)
itmf_output_path = os.path.join(output_dir, itmf_output_filename)

# Create dictionary that maps MAC addresses to their replacements
newmacs = {}
Expand All @@ -102,21 +151,26 @@ def randomize(input_dir, month_day_year, output_dir, awam_host_instance_name="Au
newmacs[mac] = random_mac_address()
row[4] = newmacs[mac]

# Replace time strings with Unix times
row[0] = get_timestamp(row[0])
row[2] = get_timestamp(row[2])

# Add a unique row identifier
row.insert(0, row[0] + row[4])
# Replace AWAM time strings with Unix epoch time strings
row[0] = get_epoch_time(row[0])
row[2] = get_epoch_time(row[2])

# Remove the IP address
del row[2]
del row[1]

# Add a unique row identifier
row.insert(0, hashlib.md5(str(row)).hexdigest())

# Add the modified row
rows.append(row)

# Write the anonymized IAF data
with open(iaf_output_path, 'wb') as iaf_output_file:

# Optionally prepend a header row to the output
if header:
headers = ["record_id", "host_read_time", "field_device_read_time", "reader_identifier", "device_address"]
rows.insert(0, headers)
writer = csv.writer(iaf_output_file)
writer.writerows(rows)

Expand All @@ -135,20 +189,57 @@ def randomize(input_dir, month_day_year, output_dir, awam_host_instance_name="Au
row[0] = newmacs[mac]
rows.append(row)

# Replace time strings with Unix times
row[3] = get_timestamp(row[3])
row[4] = get_timestamp(row[4])
# Add a day-of-week string (NOTE: Python weekdays are indexed to start on Monday)
weekdays = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
weekday = weekdays[get_datetime(row[3]).weekday()]
row.append(weekday)

# Replace AWAM time strings with ISO formatted time strings
row[3] = get_iso_time(row[3])
row[4] = get_iso_time(row[4])

# Add a unique row identifier
row.insert(0, row[3] + row[0])
row.insert(0, hashlib.md5(str(row)).hexdigest())

# Write the anonymized ITMF data
with open(itmf_output_path, 'wb') as itmf_output_file:

# Optionally prepend a header row to the output
if header:
original_headers = [
'device_address','origin_reader_identifier','destination_reader_identifier','start_time',
'end_time','travel_time_seconds','speed_miles_per_hour','match_validity','filter_identifier'
]
headers = ['record_id'] + original_headers + ['day_of_week']
rows.insert(0, headers)
writer = csv.writer(itmf_output_file)
writer.writerows(rows)

def cli_args():
    """
    Define the command-line interface and return the parsed arguments.

    Three options are required: ``--mdy``, ``-i/--input-dir`` and
    ``-o/--output-dir``. Their values are available on the returned
    namespace as ``mdy``, ``inputdir`` and ``outputdir`` respectively.
    """
    # (option flags, destination attribute, help text) for each required option
    option_specs = (
        (
            ('--mdy',),
            'mdy',
            "Specify a Month/Day/Year (mm-dd-yyyy) to anonymize an IAF and corresponding ITMF file",
        ),
        (
            ('-i', '--input-dir'),
            'inputdir',
            "Path to input directory containing IAF and ITMF files",
        ),
        (
            ('-o', '--output-dir'),
            'outputdir',
            "Path to output directory for anonymized IAF and ITMF file",
        ),
    )

    cli = argparse.ArgumentParser(
        prog='bt_anonymize.py',
        description='Anonymize AWAM data files.',
    )
    for flags, destination, help_text in option_specs:
        cli.add_argument(*flags, dest=destination, required=True, help=help_text)

    return cli.parse_args()

if __name__ == '__main__':
input_dir = sys.argv[1]
month_day_year = sys.argv[2]
output_dir = sys.argv[3]
randomize(input_dir, month_day_year, output_dir)
args = cli_args()
input_dir = args.inputdir
month_day_year = args.mdy
output_dir = args.outputdir
randomize(input_dir, month_day_year, output_dir, header=True)
3 changes: 2 additions & 1 deletion socrata_push_iaf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
def upsert(input_dir, month_day_year, awam_host_instance_name="Austin"):

# Set IAF path parameters
iaf_filename = "%s_bt_%s.txt" % (awam_host_instance_name, month_day_year)
# iaf_filename = "%s_bt_%s.txt" % (awam_host_instance_name, month_day_year)
iaf_filename = "%s_bt_%s.csv" % (awam_host_instance_name, month_day_year)
iaf_input_path = os.path.join(input_dir, iaf_filename)

# Upload the data
Expand Down
10 changes: 5 additions & 5 deletions socrata_push_itmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,24 @@
def upsert(input_dir, month_day_year, awam_host_instance_name="Austin"):

# Set ITMF path parameters
itmf_filename = "%s_btmatch_%s.txt" % (awam_host_instance_name, month_day_year)
# itmf_filename = "%s_btmatch_%s.txt" % (awam_host_instance_name, month_day_year)
itmf_filename = "%s_btmatch_%s.csv" % (awam_host_instance_name, month_day_year)
itmf_input_path = os.path.join(input_dir, itmf_filename)

# Upload the data
with open(itmf_input_path, 'r') as itmf_input_file:
reader = csv.reader(itmf_input_file)
fieldnames = [
'record_id','device_address','origin_reader_identifier','destination_reader_identifier','start_time',
'end_time','travel_time_seconds','speed_miles_per_hour','match_validity','filter_identifier'
'end_time','travel_time_seconds','speed_miles_per_hour','match_validity','filter_identifier', 'day_of_week'
]

# Skip header row
# print(reader.next())
reader.next()
data = [dict(zip(fieldnames, record)) for record in reader]
# for row in data:
# print(row)
upsert_data(SOCRATA_CREDENTIALS, data, "x44q-icha")


if __name__ == '__main__':
input_dir = sys.argv[1]
month_day_year = sys.argv[2]
Expand Down

0 comments on commit c42043a

Please sign in to comment.