In [1]:
import pandas as pd

In [2]:
# Small hand-written sample frame; every column lists its values in row order.
data = pd.DataFrame(dict(
    # names contain apostrophes, spaces and hyphens
    first_name=["Diana", "La'Toya", "Anne Marie", "Billy-Jean"],
    last_name=["Milbank-Stapleton", "O'Shaughnessy", "Brown", "Queen"],
    # free-text place label; mapped to a time zone later in the notebook
    location=['NYC', 'NYC', 'Chicago', 'Chicago'],
    # integer epoch timestamps (treated as UTC seconds further down)
    med_timestamp=[1495397700, 1495721700, 1496341020, 1497286980],
    # numeric distance plus the unit it was recorded in ('mi' or 'km')
    distance_walked=[1.4, 3.7, 11.3, 5.1],
    distance_units=['mi', 'mi', 'km', 'mi'],
))

## Normalize case and strip non-alphanumeric characters

In [3]:
# Upper-case both name columns to remove case differences.
# The vectorized .str accessor replaces a per-element lambda and leaves
# missing values (NaN) as NaN instead of raising AttributeError.
data['first_name'] = data['first_name'].str.upper()
data['last_name'] = data['last_name'].str.upper()

In [4]:
# instead of making an exhaustive list of characters to remove, require alphanumeric
def alphanum(s):
    """Return ``s`` with every non-alphanumeric character removed.

    A character is kept when its ``isalnum()`` method is true, so letters and
    digits survive while spaces, hyphens, apostrophes and other punctuation
    are dropped. The relative order of the kept characters is unchanged.

    Parameters
    ----------
    s : str
        Text to filter.

    Returns
    -------
    str
        The filtered text (empty if nothing in ``s`` is alphanumeric).
    """
    kept = []
    for ch in s:
        if ch.isalnum():
            kept.append(ch)
    return ''.join(kept)

In [5]:
# Normalize both name columns the same way: upper-case each value, then
# drop every character that is not a letter or digit.
for name_col in ('first_name', 'last_name'):
    data[name_col] = data[name_col].apply(lambda raw: alphanum(raw.upper()))

In [6]:
# Preview the frame after name normalization (head() shows at most five rows).
data.head()

Unnamed: 0,first_name,last_name,location,med_timestamp,distance_walked,distance_units
0,DIANA,MILBANKSTAPLETON,NYC,1495397700,1.4,mi
1,LATOYA,OSHAUGHNESSY,NYC,1495721700,3.7,mi
2,ANNEMARIE,BROWN,Chicago,1496341020,11.3,km
3,BILLYJEAN,QUEEN,Chicago,1497286980,5.1,mi


## Convert timestamps to local time

In [7]:
# Time-zone name to use for each value that appears in the `location` column.
location_tzs = dict(
    Chicago='America/Chicago',
    NYC='America/New_York',
)

In [8]:
# Show the frame again before the local-time column is added.
data.head()

Unnamed: 0,first_name,last_name,location,med_timestamp,distance_walked,distance_units
0,DIANA,MILBANKSTAPLETON,NYC,1495397700,1.4,mi
1,LATOYA,OSHAUGHNESSY,NYC,1495721700,3.7,mi
2,ANNEMARIE,BROWN,Chicago,1496341020,11.3,km
3,BILLYJEAN,QUEEN,Chicago,1497286980,5.1,mi


In [9]:
import datetime

def process_row(r, tzs=None):
    """Convert a row's epoch timestamp to a tz-aware local ``pd.Timestamp``.

    Parameters
    ----------
    r : mapping or pd.Series
        Must provide ``'med_timestamp'`` (seconds since the Unix epoch,
        interpreted as UTC) and ``'location'`` (a key of the zone table).
    tzs : dict, optional
        Mapping of location -> time-zone name. When omitted, the
        notebook-level ``location_tzs`` is used, so
        ``data.apply(process_row, axis=1)`` keeps working unchanged.

    Returns
    -------
    pd.Timestamp
        The same instant expressed in the location's time zone.

    Raises
    ------
    KeyError
        If the row's location has no entry in the zone table.
    """
    zone_by_location = location_tzs if tzs is None else tzs
    # Build the UTC instant directly. This avoids
    # datetime.datetime.utcfromtimestamp, which is deprecated since
    # Python 3.12 and returns a naive datetime that then has to be localized.
    utc_time = pd.Timestamp(r['med_timestamp'], unit='s', tz='UTC')
    return utc_time.tz_convert(zone_by_location[r['location']])

In [10]:
# Call process_row once per row (axis=1) and store the result as a new column.
# NOTE: process_row is defined a second time later in this notebook (for unit
# conversion). After that later cell has run, re-run the timestamp definition
# before re-running this line, or this column will receive distances instead.
data['actual_timestamp'] = data.apply(process_row, axis=1)

In [11]:
# Preview the frame with the new actual_timestamp column.
data.head()

Unnamed: 0,first_name,last_name,location,med_timestamp,distance_walked,distance_units,actual_timestamp
0,DIANA,MILBANKSTAPLETON,NYC,1495397700,1.4,mi,2017-05-21 16:15:00-04:00
1,LATOYA,OSHAUGHNESSY,NYC,1495721700,3.7,mi,2017-05-25 10:15:00-04:00
2,ANNEMARIE,BROWN,Chicago,1496341020,11.3,km,2017-06-01 13:17:00-05:00
3,BILLYJEAN,QUEEN,Chicago,1497286980,5.1,mi,2017-06-12 12:03:00-05:00


## Unit conversions

In [12]:
# say you want all distances in miles, but you have some holdouts
# again we'll process the entire row

In [13]:
# Miles per kilometre, truncated to four decimal places (the exact factor is
# about 0.621371). Better to have this as a constant somewhere your entire
# code base can share it.
CONVERSION_FACTOR_KM_TO_MI = .6213


# NOTE: this reuses the name process_row from the timestamp section above and
# replaces that definition once this cell has run.
def process_row(r):
    """Return the row's walked distance expressed in miles.

    Parameters
    ----------
    r : mapping or pd.Series
        Must provide ``'distance_walked'`` (a number) and ``'distance_units'``
        (``'mi'`` or ``'km'``; case and surrounding whitespace are ignored).

    Returns
    -------
    number
        ``distance_walked`` unchanged for miles, or multiplied by
        ``CONVERSION_FACTOR_KM_TO_MI`` for kilometres.

    Raises
    ------
    ValueError
        If the unit is neither miles nor kilometres. Previously such rows
        were returned unchanged and silently treated as miles.
    """
    units = r['distance_units'].strip().lower()
    value = r['distance_walked']
    if units == 'mi':
        return value
    if units == 'km':
        return value * CONVERSION_FACTOR_KM_TO_MI
    raise ValueError(
        "unsupported distance unit %r; expected 'mi' or 'km'" % (r['distance_units'],)
    )


# Run the unit conversion once per row (axis=1); the original
# distance_walked and distance_units columns are left in place.
data['miles_walked'] = data.apply(process_row, axis=1)

In [14]:
# Preview the frame with the new miles_walked column.
data.head()

Unnamed: 0,first_name,last_name,location,med_timestamp,distance_walked,distance_units,actual_timestamp,miles_walked
0,DIANA,MILBANKSTAPLETON,NYC,1495397700,1.4,mi,2017-05-21 16:15:00-04:00,1.4
1,LATOYA,OSHAUGHNESSY,NYC,1495721700,3.7,mi,2017-05-25 10:15:00-04:00,3.7
2,ANNEMARIE,BROWN,Chicago,1496341020,11.3,km,2017-06-01 13:17:00-05:00,7.02069
3,BILLYJEAN,QUEEN,Chicago,1497286980,5.1,mi,2017-06-12 12:03:00-05:00,5.1


## Look at your data

In [15]:
# think about how you got your data and
# what mistakes people are likely to make

In [16]:
# hint: a bimodal distribution in a plot is a good tip-off (e.g. two units mixed in one column),
# as are values that are too small or too large by an order of magnitude

In [17]:
# also, look again at your user interface for mistakes that users are likely to make
# talk to support folks or clinicians and ask what mistakes they notice