# Processing mobility data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from datetime import datetime 
from dateutil import tz
import gzip
import time
import pickle

In [2]:
# read the raw data
with open("../../data/01_raw/boston_stays_from_esteban.pickle", 'rb') as f:
    df_raw = pickle.load(f)


In [3]:
df_raw

Unnamed: 0,user,duration,ini_dat,lon_medoid,lat_medoid,GEOID,home_lon_med,home_lat_med,GEOID_home,quant,fsq_id,cat,distPOI,disthome
0,0000ff45f7f170db960e4e601167975f7559c5be147d69...,388,1.481736e+09,-71.258500,42.359965,250173684002,-71.284748,42.558568,250173164003,3,5390bb9c498e21833a58c208,Office,0.001016,22.188364
1,000277100d5593fec35a151e228f6a485210a3fa87cda7...,600,1.478995e+09,-71.149170,42.332710,250214011003,-71.175020,42.300842,250173739004,4,4e5e3cb31838f7255272d5e6,Arts & Entertainment,0.001723,4.132097
2,000277100d5593fec35a151e228f6a485210a3fa87cda7...,4086,1.481411e+09,-71.151375,42.336340,250250005022,-71.175020,42.300842,250173739004,4,4e0b572db61cf2466a0f3c6e,Pool,0.000166,4.399966
3,000277100d5593fec35a151e228f6a485210a3fa87cda7...,1211,1.481563e+09,-71.207125,42.349600,250173734002,-71.175020,42.300842,250173739004,4,51e1752f498ea376f1942680,Jewelry,0.000149,6.029946
4,000277100d5593fec35a151e228f6a485210a3fa87cda7...,621,1.482095e+09,-71.175305,42.318550,250173739001,-71.175020,42.300842,250173739004,4,52edbf4f498e284de9956efb,Supermarket,0.000012,1.969160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12106757,fffe58b2fb58c5940b85ba55b07eb4d1d11b36185a4afb...,10297,1.486467e+09,-70.779474,43.073728,330150693002,-71.120296,42.977653,330150540002,2,4bd5f42ecfa7b713a8bf26da,Diner,0.000526,29.693304
12106758,fffe58b2fb58c5940b85ba55b07eb4d1d11b36185a4afb...,2946,1.486508e+09,-71.164300,43.039123,330150550021,-71.120296,42.977653,330150540002,2,4f9ffdd4e4b03a4a3f829164,Deli / Bodega,0.000105,7.715061
12106759,fffe58b2fb58c5940b85ba55b07eb4d1d11b36185a4afb...,797,1.486661e+09,-71.074862,43.024131,330150590001,-71.120296,42.977653,330150540002,2,4d642a848018548105694768,Pharmacy,0.000487,6.353051
12106760,fffe58b2fb58c5940b85ba55b07eb4d1d11b36185a4afb...,2215,1.486828e+09,-70.883073,42.891006,330150630011,-71.120296,42.977653,330150540002,2,4b7866c0f964a52075cb2ee3,Warehouse Store,0.000262,21.582175


In [4]:
# 
print(df_raw.shape)
print(df_raw.columns)
print(len(np.unique(df_raw.GEOID)))
print(len(np.unique(df_raw.GEOID_home)))

(12053805, 14)
Index(['user', 'duration', 'ini_dat', 'lon_medoid', 'lat_medoid', 'GEOID',
       'home_lon_med', 'home_lat_med', 'GEOID_home', 'quant', 'fsq_id', 'cat',
       'distPOI', 'disthome'],
      dtype='object')
3115
3196


# Process stay data

In [5]:
# Need to process the ini_dat to get: hour of day, day of week, weekday vs. weekend.
print(np.min(df_raw['ini_dat']), np.max(df_raw['ini_dat']))
print(datetime.fromtimestamp(int(np.min(df_raw['ini_dat']))).strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.fromtimestamp(int(np.max(df_raw['ini_dat']))).strftime('%Y-%m-%d %H:%M:%S'))

# Use a baseline unix time to start 
baseline_unix_time = 1474862400 # Mon Sep 26 2016 00:00:00 GMT-0400 (Eastern Daylight Time) 
# one unit in unix time is one second. 
# 60 seconds, 60 min, 24 hours, 7 days. 

# get time info
minutes = (df_raw['ini_dat'] - baseline_unix_time)/60
hours = (df_raw['ini_dat'] - baseline_unix_time)/3600
days = (df_raw['ini_dat'] - baseline_unix_time)/(3600*24)

# hour of day, day of week, etc.
hour_of_day = (hours.astype(int) + 1)%24 # 0~23 hours
day_of_week = (days.astype(int) + 1)%7 # 0~6 days; 0: Sunday. 6: Saturday
weekday = day_of_week.isin([1,2,3,4,5]) # true/false
weekend = day_of_week.isin([6,0]) # true/false

# augment time info
df = pd.concat([df_raw, hour_of_day, day_of_week, weekday, weekend], axis = 1)
df.columns = list(df_raw.columns) + ['hour_of_day', 'day_of_week', 'weekday', 'weekend']


1475210333.0 1491091786.0
2016-09-30 00:38:53
2017-04-01 20:09:46


In [6]:
# change to string
df.GEOID_home = df.GEOID_home.astype('str')
df.GEOID = df.GEOID.astype('str')

In [7]:
df

Unnamed: 0,user,duration,ini_dat,lon_medoid,lat_medoid,GEOID,home_lon_med,home_lat_med,GEOID_home,quant,fsq_id,cat,distPOI,disthome,hour_of_day,day_of_week,weekday,weekend
0,0000ff45f7f170db960e4e601167975f7559c5be147d69...,388,1.481736e+09,-71.258500,42.359965,250173684002,-71.284748,42.558568,250173164003,3,5390bb9c498e21833a58c208,Office,0.001016,22.188364,14,3,True,False
1,000277100d5593fec35a151e228f6a485210a3fa87cda7...,600,1.478995e+09,-71.149170,42.332710,250214011003,-71.175020,42.300842,250173739004,4,4e5e3cb31838f7255272d5e6,Arts & Entertainment,0.001723,4.132097,20,6,False,True
2,000277100d5593fec35a151e228f6a485210a3fa87cda7...,4086,1.481411e+09,-71.151375,42.336340,250250005022,-71.175020,42.300842,250173739004,4,4e0b572db61cf2466a0f3c6e,Pool,0.000166,4.399966,20,6,False,True
3,000277100d5593fec35a151e228f6a485210a3fa87cda7...,1211,1.481563e+09,-71.207125,42.349600,250173734002,-71.175020,42.300842,250173739004,4,51e1752f498ea376f1942680,Jewelry,0.000149,6.029946,14,1,True,False
4,000277100d5593fec35a151e228f6a485210a3fa87cda7...,621,1.482095e+09,-71.175305,42.318550,250173739001,-71.175020,42.300842,250173739004,4,52edbf4f498e284de9956efb,Supermarket,0.000012,1.969160,17,0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12106757,fffe58b2fb58c5940b85ba55b07eb4d1d11b36185a4afb...,10297,1.486467e+09,-70.779474,43.073728,330150693002,-71.120296,42.977653,330150540002,2,4bd5f42ecfa7b713a8bf26da,Diner,0.000526,29.693304,8,2,True,False
12106758,fffe58b2fb58c5940b85ba55b07eb4d1d11b36185a4afb...,2946,1.486508e+09,-71.164300,43.039123,330150550021,-71.120296,42.977653,330150540002,2,4f9ffdd4e4b03a4a3f829164,Deli / Bodega,0.000105,7.715061,19,2,True,False
12106759,fffe58b2fb58c5940b85ba55b07eb4d1d11b36185a4afb...,797,1.486661e+09,-71.074862,43.024131,330150590001,-71.120296,42.977653,330150540002,2,4d642a848018548105694768,Pharmacy,0.000487,6.353051,14,4,True,False
12106760,fffe58b2fb58c5940b85ba55b07eb4d1d11b36185a4afb...,2215,1.486828e+09,-70.883073,42.891006,330150630011,-71.120296,42.977653,330150540002,2,4b7866c0f964a52075cb2ee3,Warehouse Store,0.000262,21.582175,12,6,False,True


# Save

In [8]:
# save
with open("../../data/02_intermediate/boston_stays.pickle", 'wb') as f:
    pickle.dump(df, f)
