In [1]:
import datetime
import json
import pickle

import numpy as np
import pandas as pd

# Run these cells, in order, down to the "To here" heading

In [480]:
#load dictionary which maps a road segment ID to its bounding box
#(each value is indexed below as two corner points: v[0] and v[1], each [lon, lat])
#NOTE: pickle.load can execute arbitrary code - only load this file from a trusted source
#the with-block closes the file handle, which the bare open() call never did
with open("ID_to_bound_box_R2", "rb") as bnd_box_file:
    objID_to_bnd_box = pickle.load(bnd_box_file)

In [599]:
#read in the CSV of truck position records
#'date_fixed' is parsed as datetimes; 'truck_name' goes through a str converter
#because the raw truck names have mixed types
df = pd.read_csv(
    "20180104.csv",
    parse_dates=['date_fixed'],
    converters={'truck_name': str},
)

In [600]:
#non-null count per column right after loading (address has fewer entries than the other columns)
df.count()

truck_name    49382
date_fixed    49382
address       48449
longitude     49382
latitude      49382
dtype: int64

In [601]:
#strip leading/trailing white space from the truck names and the addresses
df[['truck_name', 'address']] = df[['truck_name', 'address']].apply(lambda col: col.str.strip())

In [602]:
#shift every timestamp back by 5 hours (presumably UTC -> local standard time; TODO confirm)
#NOTE: not idempotent - re-running this cell subtracts another 5 hours
df['date_fixed'] -= pd.Timedelta(hours=5)

In [603]:
#sort chronologically by date/time only; rows with a missing timestamp (if any) go first
#NOTE(review): the previous comment said "sorted by truck then date/time", but truck_name
#is not one of the sort keys - confirm which ordering is intended
df = df.sort_values(by='date_fixed', na_position='first')

In [604]:
#renumber the index 0..n-1 after sorting; drop=True discards the old index
#instead of keeping it as a new column
df = df.reset_index(drop=True)

In [605]:
#get rid of data points from DPW site, matched by address text.
#For this file it removes 1,572 rows (49,382 -> 47,810 in the counts shown).
#na=False keeps rows whose address is missing.
at_dpw_address = df['address'].str.contains('1200 CANAL', na=False)
df = df[~at_dpw_address]

In [606]:
#non-null count per column after dropping the '1200 CANAL' addresses
df.count()

truck_name    47810
date_fixed    47810
address       46877
longitude     47810
latitude      47810
dtype: int64

In [607]:
#filter out points at DPW site by coordinates: a row is dropped only when its
#longitude AND its latitude both fall inside the site's box (bounds inclusive)
in_dpw_lon = df['longitude'].between(-76.1136, -76.1059)
in_dpw_lat = df['latitude'].between(43.0539, 43.0565)
df = df[~(in_dpw_lon & in_dpw_lat)]

In [608]:
#non-null count per column after dropping points inside the DPW site coordinate box
df.count()

truck_name    42058
date_fixed    42058
address       41817
longitude     42058
latitude      42058
dtype: int64

In [609]:
#keep only the timestamp, lon and lat columns (truck_name and address are dropped)
df = df.loc[:, ['date_fixed', 'longitude', 'latitude']]

In [610]:
#convert to numpy array; each row is [date_fixed, longitude, latitude]
arr = df.values

In [611]:
#append a fourth column (index 3), initialized to None, to hold the road segment ID
arr = np.insert(arr, 3, None, axis=1)

In [612]:
#Add a column which maps Long & Lat to a road segment (i.e 'ID')
#The lon/lat limits of every bounding box are computed once up front instead of
#being recomputed with min()/max() for every data point.
segment_bounds = [
    (seg_id,
     min(box[0][0], box[1][0]), max(box[0][0], box[1][0]),
     min(box[0][1], box[1][1]), max(box[0][1], box[1][1]))
    for seg_id, box in objID_to_bnd_box.items()
]
for row_idx, row in enumerate(arr):
    lon = row[1]
    lat = row[2]
    #Strict inequalities: a point exactly on a box edge does not match.
    #There is deliberately no break: when a point lies inside several boxes the
    #last matching segment (in dictionary order) wins, same as before.
    for seg_id, lon_min, lon_max, lat_min, lat_max in segment_bounds:
        if lon_min < lon < lon_max and lat_min < lat < lat_max:
            arr[row_idx][3] = seg_id
           

In [613]:
#convert Numpy array to DataFrame to remove nulls
#(no column names are given, so the columns are labelled 0-3)
df = pd.DataFrame(arr)

In [614]:
#remove rows with nulls in column 3, i.e. points that matched no bounding box
df = df.loc[df[3].notnull()]

In [615]:
#total count of data points that were mapped to a road segment
df.count()

0    37792
1    37792
2    37792
3    37792
dtype: int64

In [616]:
#keep only column 0 (timestamp) and column 3 (road segment ID);
#the columns are labelled 0-3, so selecting by label picks the same two columns
df = df[[0, 3]]

In [617]:
#convert back to Numpy array to be stacked with data from other days in that date cluster
#(two columns: timestamp, road segment ID)
j4 = df.values

In [621]:
#show the (rows, columns) shape of each day's array before stacking
#NOTE(review): j1, j2 and j3 are not assigned anywhere in the cells shown here -
#presumably they come from running the cells above on other days' files; confirm,
#because this cell fails on a fresh kernel without them
for day_arr in (j1, j2, j3, j4):
    print (day_arr.shape)

(31045, 2)
(31792, 2)
(9298, 2)
(37792, 2)


In [622]:
#stack the per-day arrays row-wise, in date order, into one array
day_arrays = [j1, j2, j3, j4]
j1_4 = np.vstack(day_arrays)

## Dump j1_4 to pickle

In [642]:
#write the stacked array to the file "j1_4"; run only after the np.vstack cell has built j1_4
#the with-block flushes and closes the file, which the bare open() call never did
with open("j1_4", "wb") as j1_4_file:
    pickle.dump(j1_4, j1_4_file)

In [623]:
#(rows, columns) of the stacked array
j1_4.shape

(109927, 2)

In [632]:
#spot check: hour of the timestamp held in column 0 of one stacked record
j1_4[90000][0].hour

14

In [633]:
#count the stacked records whose timestamp hour is earlier than 8
cnt = sum(1 for rec in j1_4 if rec[0].hour < 8)
print (cnt)

26357


In [639]:
#create the template for the main data file: a list holding one record per ID
#("1" through "5650"), each with its own placeholder list ["null"] under "LT"
lapse_time = []
for seg_num in range(1, 5651):
    #d stays bound to the most recent record so it can be inspected afterwards
    d = dict(ID=str(seg_num), LT=["null"])
    lapse_time.append(d)
    

In [641]:
#inspect the last template record built by the loop
d

{'ID': '5650', 'LT': ['null']}

In [None]:
#dump final data file: write the lapse_time template list to LT_J1_4.json
#(relies on the json module being imported in the notebook's import cell)
with open("LT_J1_4.json", "w") as text_file:
    json.dump(lapse_time, text_file)

# To here

In [None]:
# get the manhattan distance between consecutive data points
#df['manh_dist'] = abs(df.longitude - df.longitude.shift()) + abs(df.latitude - df.latitude.shift())

In [None]:
#filter out close data points
#df = df[df.manh_dist > .0005]

In [None]:
#filter out unreasonably far data points
#df = df[df.manh_dist < .01]

In [None]:
#How to count how many rows fall on each different day of the month
#(there is only 1 day per file; the original note said Jan 1, but the file loaded above
#is 20180104.csv - TODO confirm which day was meant)
#NOTE(review): needs df as loaded from the CSV. Once df has been rebuilt from the Numpy
#array its columns are labelled 0..3 and 'date_fixed' no longer exists.
df['date_fixed'].dt.day.value_counts()

In [None]:
#test iterrows; delete eventually
#collects each (index, row) pair yielded by iterrows as a two-item list
tst2 = [[index, row] for index, row in df.iterrows()]
    

In [None]:
# one collected entry: a two-item list [index label, row]
tst2[10]

In [None]:
# the second item of an entry is the row itself (as a Series)
tst2[10][1]

In [None]:
# read one field of a collected row by attribute access
# (only works while the frame still has a 'latitude' column)
tst2[1000][1].latitude

In [None]:
# look up the bounding box stored under key 10
objID_to_bnd_box[10]

# Below here is not used

In [None]:
#NOT USED; generate a 20x3 test dataframe of standard-normal draws, columns A, B, C
#(no random seed is set, so the values differ on every run)
test_row_iter = pd.DataFrame(np.random.randn(20, 3), columns=['A', 'B', 'C'])

In [None]:
#NOT USED
#for testing: replace column C with its absolute values
test_row_iter['C'] = test_row_iter['C'].abs()

In [None]:
#NOT USED
#how to iterate over rows and set a boolean value based on a conditional
#the loop starts at label 1, so row 0 is never examined; rows failing the test get no value in D
for row_label in range(1, len(test_row_iter)):
    is_positive = test_row_iter.loc[row_label, 'C'] > 0
    if is_positive:
        test_row_iter.loc[row_label, 'D'] = True

In [None]:
#NOT USED
#the first row is always kept, so its 'Keep' flag is set to True up front
test_row_iter.loc[0,'Keep'] = True
#build a boolean 'Keep' column from a running total of column C: a row is kept when its
#C value plus the total carried so far exceeds the threshold; keeping a row resets the
#total, otherwise the row's C value is added to it
threshhold = 2.
cum_dist = 0.
for i in range(1,len(test_row_iter)):
    step = test_row_iter.loc[i, 'C']
    keep_row = bool(step + cum_dist > threshhold)
    test_row_iter.loc[i,'Keep'] = keep_row
    cum_dist = 0 if keep_row else cum_dist + step
            

In [None]:
#NOT USED
#retain only the rows flagged Keep == True
test_row_iter = test_row_iter.loc[test_row_iter['Keep'] == True]

In [None]:
#scratch 4x3 frame of standard-normal draws, columns A, B, C (unseeded, so values vary per run)
dft = pd.DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])

In [None]:
#display the scratch frame
dft

In [None]:
#sort the rows by column A (ascending); the original index labels travel with their rows
dft = dft.sort_values(by='A')

In [None]:
#display the scratch frame again
dft

In [None]:
#write each row's index label into a new column D, one row at a time
for idx_label in dft.index:
    dft.loc[idx_label, 'D'] = idx_label

In [None]:
#display the scratch frame, now including column D
dft

In [None]:
#display a renumbered (0..n-1) version; the result is not assigned, so dft keeps its index
dft.reset_index(drop=True)

In [None]:
#start from an empty list
ls = list()

In [None]:
#NOTE: when ls is still the empty list this item assignment raises IndexError
#(a list cannot be grown by assigning to an index)
ls[3] = 3