In [None]:
%pylab inline

import seaborn as sns
import pickle

# NOTE: pickle.load can execute arbitrary code -- only load pickle files you trust.
# The context manager closes the file handle as soon as loading is done.
with open('ssoc_df_1.p','rb') as pickle_file:
    df = pickle.load(pickle_file)

In [None]:
# preview the first rows of the frame that was just loaded
df.head()

In [None]:
from datetime import datetime

# unixtime into datetime
def parse_timestamp(ts):
    """Convert a Unix timestamp expressed in milliseconds into a ``datetime``.

    ``ts`` may be anything ``int()`` accepts (an int or a numeric string).
    ``datetime.fromtimestamp`` is called without a ``tz`` argument, so the
    result is a naive datetime in the local timezone of the machine.

    Note: ``/`` truncates to whole seconds on Python 2 but keeps the
    fractional part on Python 3.
    """
    # milliseconds -> seconds
    return datetime.fromtimestamp(int(ts) / 1000)

# truncate to the start of the hour
def parse_hourly(dt):
    """Return ``dt`` floored to the start of its hour.

    Minutes, seconds and microseconds are all zeroed, so every timestamp that
    falls within the same hour maps to the same value and can be used as a
    groupby key. This truncates (13:59 -> 13:00); it does not round to the
    nearest hour.
    """
    return dt.replace(minute=0, second=0, microsecond=0)

# hour of day
def extract_hour(dt):
    """Return the hour component of ``dt`` as a plain ``int``."""
    hour_of_day = dt.hour
    return int(hour_of_day)

# Monday == 0, Sunday == 6
def extract_weekday(dt):
    """Return the day of the week of ``dt`` as a plain ``int`` (``dt.weekday()``)."""
    day_index = dt.weekday()
    return int(day_index)

# date: truncate to the start of the day
def parse_date(dt):
    """Return ``dt`` floored to midnight of the same day.

    Hours, minutes, seconds and microseconds are all zeroed, so every
    timestamp on the same calendar day maps to the same value and can be used
    as a groupby key. This truncates (23:59 -> 00:00 of the same day); it does
    not round to the nearest day.
    """
    return dt.replace(hour=0, minute=0, second=0, microsecond=0)

In [None]:
# Convert the millisecond Unix timestamps into datetimes.
# Series.apply hands each value straight to parse_timestamp, which avoids the
# slower row-wise DataFrame.apply(axis=1) for a single column.
df['dt'] = df['timestampMs'].apply(parse_timestamp)

In [None]:
# Day bucket for each row (parse_date applied value-by-value to the dt column).
df['d'] = df['dt'].apply(parse_date)

In [None]:
# Hour bucket for each row (parse_hourly applied value-by-value to the dt column).
df['hr'] = df['dt'].apply(parse_hourly)

In [None]:
# Hour of day for each row (extract_hour applied value-by-value to the dt column).
df['h'] = df['dt'].apply(extract_hour)

In [None]:
# Day of week for each row, Monday == 0 (extract_weekday applied to the dt column).
df['w'] = df['dt'].apply(extract_weekday)

In [None]:
# check the derived dt / d / hr / h / w columns added above
df.head()

In [None]:
# number of logged rows per day, across the whole dataset

rows_per_day = df.groupby(df.d).size()
ax = rows_per_day.plot(figsize=(14,4))
ax.set_title('logged items over time (daily)')
ax.set_xlabel('day')
ax.set_ylabel('number of rows in data')

In [None]:
# zoom in on a window in late October 2016

sm_df = df.set_index(df.dt).loc['2016-10-17':'2016-10-27']
rows_per_day_zoomed = sm_df.groupby(sm_df.d).size()
rows_per_day_zoomed.plot(figsize=(14,4))

In [None]:
# Select every row logged on 2016-10-23. Row selection by date string must go
# through .loc: on a DataFrame, plain [] with a string is a column lookup
# (string-based row indexing via [] was deprecated and later removed in pandas).
sm_df.loc['2016-10-23']

In [None]:
# number of logged rows per hour bucket

rows_per_hour = df.groupby(df.hr).size()
ax = rows_per_hour.plot(figsize=(14,4))
ax.set_title('logged items over time (hourly)')

In [None]:
# total number of rows for each hour of the day

rows_by_hour_of_day = df.groupby(df.h).size()
ax = rows_by_hour_of_day.plot(figsize=(14,4))
ax.set_title('logged items by hour of day')
ax.set_xlabel('hour')
ax.set_ylabel('total rows')

In [None]:
# mean velocity for each day

daily_mean_velocity = df.groupby(df.d)['velocity'].mean()
ax = daily_mean_velocity.plot(figsize=(14,4))
ax.set_title('avg. velocity per day')
ax.set_xlabel('day')
ax.set_ylabel('avg velocity')

In [None]:
# with the datetime column as the index, time ranges can be selected by date string:

sm_vel = df.set_index(df.dt).loc['2016-05':'2016-06', 'velocity']
# average the readings that share the same timestamp, then plot
sm_vel.groupby(level=0).mean().plot(figsize=(14,4))

In [None]:
# zoom further, into a single week

sm_vel = df.set_index(df.dt).loc['2016-05-8':'2016-05-14', 'velocity']
# average the readings that share the same timestamp, then plot
sm_vel.groupby(level=0).mean().plot(figsize=(14,4))

In [None]:
# total number of rows for each day of the week
# Monday == 0, Sunday == 6

rows_by_weekday = df.groupby(df.w).size()
ax = rows_by_weekday.plot(figsize=(14,4))
ax.set_title('aggregate activity by day of the week')
ax.set_xlabel('day of week')
ax.set_ylabel('total rows')

In [None]:
# weekday index (Monday == 0) -> display name, used for legend labels
DAY_OF_WEEK = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}

# plot total hourly activity, one line per day of the week
for k, v in df.groupby(df.w):
    # formatted print emits the same "<weekday> <row count>" line on Python 2 and 3
    print('%s %s' % (k, len(v)))
    # group the per-weekday subset by its own hour column rather than relying on
    # index alignment against the full frame's df.h
    v.groupby(v.h).size().plot(figsize=(14,6), label=DAY_OF_WEEK[k])

legend()
title('hourly data by day of week')

In [None]:
# histograms of accuracy / altitude / velocity for one day of the week
weekday = 4  # index into DAY_OF_WEEK (Monday == 0)
grouped = df[['accuracy','altitude','velocity','w']].groupby('w')
grouped.get_group(weekday)[['accuracy','altitude','velocity']].hist(bins=30,figsize=(14,8))
# call form with a single argument prints identically on Python 2 and 3
print(DAY_OF_WEEK[weekday])

In [None]:
# altitude by hour
# Group by the 'hr' column of the selected frame itself: pandas then treats it
# as the key and leaves only 'altitude' to be averaged and plotted. Grouping by
# the external df.hr Series kept the 'hr' datetime column among the aggregated data.

df[['hr','altitude']].dropna().groupby('hr').mean().plot()
title('altitude (hourly mean)')

In [None]:
# heading by hour
# Group by the 'hr' column of the selected frame itself so that only 'heading'
# is averaged and plotted (grouping by the external df.hr Series kept the 'hr'
# datetime column among the aggregated data).
df[['hr','heading']].dropna().groupby('hr').mean().plot()
title('heading (hourly mean)')

In [None]:
# velocity - hourly mean
# Group by the 'hr' column of the selected frame itself so that only 'velocity'
# is averaged and plotted (grouping by the external df.hr Series kept the 'hr'
# datetime column among the aggregated data).

df[['hr','velocity']].dropna().groupby('hr').mean().plot()
title('velocity (hourly mean) over time')

In [None]:
# velocity - median per hour of day
# NOTE: this cell aggregates with median(), not mean(); the title now says so.
# Switch to .mean() if a mean is what you want to compare with the other plots.
df[['velocity']].dropna().groupby(df.h).median().plot()
title('median velocity by hour of day')

In [None]:
# mean velocity for each day of the week
# axis is passed by keyword: newer pandas no longer accepts it positionally
df[['velocity']].dropna(axis=0).groupby(df.w).mean().plot()
title('avg. velocity by day of week')

# Monday = 0

In [None]:
# mean velocity for each day of the week (this cell repeats the previous one)
# axis is passed by keyword: newer pandas no longer accepts it positionally
df[['velocity']].dropna(axis=0).groupby(df.w).mean().plot()
title('avg. velocity by day of week')

# Monday = 0

In [None]:
# what does this plot tell us?
# one box per hour of day (x) showing the distribution of velocity (y);
# seaborn's old `groupby=` argument was removed in favour of x= / y=

sns.boxplot(x=df.h, y=df.velocity)

In [None]:
# one box per day of week (x) showing the distribution of altitude (y);
# seaborn's old `groupby=` argument was removed in favour of x= / y=
sns.boxplot(x=df.w, y=df.altitude)

In [None]:
# same plot restricted to rows with altitude below 2000, to drop extreme values;
# the vectorized comparison replaces the Python-level list comprehension mask
# (missing altitudes compare False either way, so they are still excluded)
low_altitude = df[df.altitude < 2000]
sns.boxplot(x=low_altitude.w, y=low_altitude.altitude)

In [None]:
#from pandas.tools.plotting import scatter_matrix

#scatter_matrix(df[['velocity','verticalAccuracy','heading','altitude','accuracy']], alpha=0.2, figsize=(16, 16), diagonal='kde')

## Top Locations

In [None]:
# coarsen lat/lon to two decimal places so that nearby points share a key

df['lat_r2'] = df['lat'].map(lambda value: round(value, 2))
df['lon_r2'] = df['lon'].map(lambda value: round(value, 2))

In [None]:
# confirm the rounded lat_r2 / lon_r2 columns were added
df.head()

In [None]:
# now let's group by lat/lon pairs and list the 30 most frequent ones
# (Series.order was removed from pandas; sort_values is its replacement)

df.groupby(['lat_r2','lon_r2']).size().sort_values(ascending=False)[:30]


In [None]:
# why is the data strange here? what can we do to fix this problem?

In [None]:
# save our data
# the context manager closes the file, guaranteeing the pickle is fully flushed to disk

with open('ssoc_df_2.p','wb') as pickle_file:
    pickle.dump(df, pickle_file)

## Questions

1. Describe temporal patterns that you see in your target's behavior.
    - Are they more active on certain days of the week?
    - When do they tend to wake up? Go to sleep?
    - Any other daily/weekly/monthly patterns?

2. Identify your target's top 10 locations using lat/lon pair groupings. How granular do you need to go (i.e., how many decimal places should you round the coordinates to)?

3. Describe the demographics of your target. Do you have a good sense of their race, ethnicity, gender, age? 
    - Try to use census data to help answer this question.
    - The provided file (nyc_census.csv) was taken from here - http://www1.nyc.gov/site/planning/data-maps/nyc-population/census-2010.page - and includes counts per race for every NTA (Neighborhood Tabulation Area).
    - Feel free to use other data.