In [159]:
import pandas as pd 
import numpy as np
from datetime import datetime
import glob, os

In [160]:
files = glob.glob('./data/sf/*.txt')

In [161]:
sample_file = files[0]

In [162]:
data = pd.read_csv(sample_file, sep=" ", header=None)
data.columns = ["lat", "long", "occupied", "unixtime"]

In [163]:
data.head()

Unnamed: 0,lat,long,occupied,unixtime
0,37.61549,-122.38821,0,1213037028
1,37.61562,-122.38849,0,1213036968
2,37.61518,-122.39029,0,1213036903
3,37.61393,-122.39508,0,1213036843
4,37.60493,-122.38362,0,1213036783


In [164]:
file_name = os.path.basename(sample_file).split('.')[0]
file_name

'new_adkavy'

In [165]:
data['cab_name'] = pd.Series(file_name, index=data.index)

In [166]:
data.head()

Unnamed: 0,lat,long,occupied,unixtime,cab_name
0,37.61549,-122.38821,0,1213037028,new_adkavy
1,37.61562,-122.38849,0,1213036968,new_adkavy
2,37.61518,-122.39029,0,1213036903,new_adkavy
3,37.61393,-122.39508,0,1213036843,new_adkavy
4,37.60493,-122.38362,0,1213036783,new_adkavy


In [167]:
data_sort = data.sort_values(by=['cab_name', 'unixtime'])

In [168]:
data_sort['dt_ts'] = data_sort.apply(
lambda row: datetime.utcfromtimestamp(int(row['unixtime'])).strftime('%Y-%m-%d %H:%M:%S'),
    axis = 1
)
# data_sort = data_sort.drop(['unixtime'], axis=1)

In [169]:
data_sort = data_sort.reset_index(drop=True)
data_sort.head(10)

Unnamed: 0,lat,long,occupied,unixtime,cab_name,dt_ts
0,37.79826,-122.26613,0,1211034337,new_adkavy,2008-05-17 14:25:37
1,37.79596,-122.26824,0,1211034398,new_adkavy,2008-05-17 14:26:38
2,37.80031,-122.2792,0,1211034467,new_adkavy,2008-05-17 14:27:47
3,37.80219,-122.29588,0,1211034515,new_adkavy,2008-05-17 14:28:35
4,37.81305,-122.30235,0,1211034575,new_adkavy,2008-05-17 14:29:35
5,37.82369,-122.30235,0,1211034637,new_adkavy,2008-05-17 14:30:37
6,37.8252,-122.31218,0,1211034697,new_adkavy,2008-05-17 14:31:37
7,37.82521,-122.31258,0,1211034757,new_adkavy,2008-05-17 14:32:37
8,37.8252,-122.31322,0,1211034817,new_adkavy,2008-05-17 14:33:37
9,37.82517,-122.31362,0,1211034843,new_adkavy,2008-05-17 14:34:03


In [170]:
# # data_sort
# for i in range(1, len(data_sort)):
#     data_sort.loc[i, 'gap'] = data_sort.loc[i, 'unixtime'] - data_sort.loc[i-1, 'unixtime']
    

### Indicator of PickUp and DropOff location

In [171]:
for i in range(1, len(data_sort)):
    data_sort.loc[i, 'diff_occupied'] = data_sort.loc[i, 'occupied'] - data_sort.loc[i-1, 'occupied']

In [172]:
data_sort['is_pickup'] = np.where(data_sort['diff_occupied']==1, True, False)

In [173]:
data_sort['is_dropoff'] = np.where(data_sort['diff_occupied']==-1, True, False)

In [176]:
data_sort.tail(5)

Unnamed: 0,lat,long,occupied,unixtime,cab_name,dt_ts,diff_occupied,is_pickup,is_dropoff
19537,37.60493,-122.38362,0,1213036783,new_adkavy,2008-06-09 18:39:43,0.0,False,False
19538,37.61393,-122.39508,0,1213036843,new_adkavy,2008-06-09 18:40:43,0.0,False,False
19539,37.61518,-122.39029,0,1213036903,new_adkavy,2008-06-09 18:41:43,0.0,False,False
19540,37.61562,-122.38849,0,1213036968,new_adkavy,2008-06-09 18:42:48,0.0,False,False
19541,37.61549,-122.38821,0,1213037028,new_adkavy,2008-06-09 18:43:48,0.0,False,False


In [177]:
data_sliced = data_sort[(data_sort['is_pickup']) | (data_sort['is_dropoff'])]

In [178]:
data_sliced = data_sliced.reset_index(drop=True)
data_sliced = data_sliced.drop(['diff_occupied'], axis=1)
data_sliced.shape

(1274, 8)

In [179]:
data_sliced.head(5)

Unnamed: 0,lat,long,occupied,unixtime,cab_name,dt_ts,is_pickup,is_dropoff
0,37.78726,-122.4111,1,1211039593,new_adkavy,2008-05-17 15:53:13,True,False
1,37.61467,-122.38975,0,1211040739,new_adkavy,2008-05-17 16:12:19,False,True
2,37.61464,-122.3935,1,1211046267,new_adkavy,2008-05-17 17:44:27,True,False
3,37.78583,-122.42816,0,1211047542,new_adkavy,2008-05-17 18:05:42,False,True
4,37.78648,-122.43799,1,1211048016,new_adkavy,2008-05-17 18:13:36,True,False


### map cell id

In [180]:
from s2sphere import CellId, LatLng
cell_level = 13
data_sliced['cell_id'] = data_sliced.apply(
    lambda row: CellId.from_lat_lng(LatLng.from_degrees(float(row['lat']), float(row['long'])))\
    .parent(cell_level).to_token(),
    axis=1
    # lambda row: row['long']
)

In [182]:
data_sliced = data_sliced.drop(['occupied'], axis=1)
data_sliced.head(4)

Unnamed: 0,lat,long,unixtime,cab_name,dt_ts,is_pickup,is_dropoff,cell_id
0,37.78726,-122.4111,1211039593,new_adkavy,2008-05-17 15:53:13,True,False,8085808c
1,37.61467,-122.38975,1211040739,new_adkavy,2008-05-17 16:12:19,False,True,808f7794
2,37.61464,-122.3935,1211046267,new_adkavy,2008-05-17 17:44:27,True,False,808f779c
3,37.78583,-122.42816,1211047542,new_adkavy,2008-05-17 18:05:42,False,True,808580bc
