In [801]:
import numpy as np
import pandas as pd
import pickle
import datetime
import json

# RUN THESE

In [837]:
#load dictionary which maps a road segment to a bounding box
objID_to_bnd_box = pickle.load( open( "ID_to_bound_box_R2", "rb" ) )

In [922]:
#read in; had to use converters b/c the truck names had mixed types
df = pd.read_csv("20180109.csv", parse_dates=['date_fixed'], converters={'truck_name': str})

In [923]:
df.count()

truck_name    37285
date_fixed    37285
address       36561
longitude     37285
latitude      37285
dtype: int64

In [924]:
#stripping white space from the truck names
df['truck_name'] = df['truck_name'].str.strip()
df['address'] =df['address'].str.strip()

In [925]:
#adjust time offset by 5 hours
df['date_fixed'] = df['date_fixed'] - pd.Timedelta(hours=5)

In [926]:
#sorted by truck then date/time
df = df.sort_values(by=['date_fixed'],na_position='first')

In [927]:
#reorder the index after sorting
df = df.reset_index(drop=True)

In [928]:
#get rid of data points from DPW site. Gets rid of ~1300 points
df = df[~df.address.str.contains('1200 CANAL',na=False)]

In [929]:
df.count()

truck_name    35734
date_fixed    35734
address       35010
longitude     35734
latitude      35734
dtype: int64

In [930]:
#filter out points at DPW site
df = df[ (~df.longitude.between(-76.1136, -76.1059)) | (~df.latitude.between(43.0539, 43.0565)) ]

In [931]:
df.count()

truck_name    31172
date_fixed    31172
address       31063
longitude     31172
latitude      31172
dtype: int64

In [932]:
#keep only lon,lat
df = df[['date_fixed', 'longitude','latitude']]

In [933]:
#convert to numpy array
arr = df.values

In [934]:
#add a third column initialized to 'NaN'
arr = np.insert(arr, 3, None, axis=1)

In [935]:
#Add a column which maps Long & Lat to a road segment (i.e 'ID') 
cnt = 0
for row in arr:
    lon = row[1]
    lat = row[2]
    for k,v in objID_to_bnd_box.items():
        if lon > min(v[0][0],v[1][0]) and lon < max(v[0][0],v[1][0]) and lat > min(v[0][1],v[1][1]) and lat < max(v[0][1],v[1][1]):
            arr[cnt][3] = k
    cnt += 1
           

In [936]:
#convert Numpy array to DataFrame to remove nulls
df = pd.DataFrame(arr)

In [937]:
#remove rows with nulls
df = df[df[3].notnull()]

In [938]:
#total count of mapped road segments
df.count()

0    28375
1    28375
2    28375
3    28375
dtype: int64

In [939]:
df = df.iloc[:,[0,3]]

In [940]:
#convert back to Numpy array to be stacked with data from other days in that date cluster
j9 = df.values

In [941]:
print (j6.shape)
print (j7.shape)
print (j8.shape)
print (j9.shape)

(47902, 2)
(39074, 2)
(39334, 2)
(28375, 2)


In [942]:
#stack the arrays by date
j6_9 = np.vstack((j6,j7,j8,j9))

In [943]:
#convert to DataFrame to sort by date
j = pd.DataFrame(j6_9,columns=['time','ID'])

In [944]:
# sort by date
j = j.sort_values(by=['time'])

## Dump time period final DF to pickle

In [945]:
pickle.dump( j, open( "jan6_9_final_df", "wb" ) )

## Load j file (stacked and sorted DF)

In [970]:
j = pickle.load( open( "jan1_4_final_df", "rb" ) )

## Create main data file template 

In [973]:
#USE THIS
#create dictionary to reference for lapse times
lapse_time = {}
for i in range(1,5651):
    lapse_time[i] = []

In [974]:
j.count()

time    109927
ID      109927
dtype: int64

# Function to insert new lapse times into main data file

In [975]:
def insert_LT(ls,cnt):
    
    if cnt == 0:
        for k,v in lapse_time.items():
            if k in ls:
                lapse_time[k].append(0)
            else:
                lapse_time[k].append('null')
    else:
        for k,v in lapse_time.items():
            if k in ls:
                lapse_time[k].append(0)
            elif lapse_time[k][cnt - 1] == 'null':
                lapse_time[k].append('null')
            else:
                lapse_time[k].append(lapse_time[k][cnt - 1] + 2)
    
                
        
    
    

## Create temp dataframes by time period and call function to add to main data file

In [976]:
#NOTE!: If we want to put a null value at the first index, we will need to have 49 values in the list of elapsed times
# (1 for each period and the first one being null). Would need to initialize cnt to 1
#ANOTHER NOTE: the range for 'day' needs to be adjusted for each dataset
cnt = 0
for day in range(1,5):
    for hour in range(2,26,2):
        temp_df = j[ (j.time.dt.day == day) & (j.time.dt.hour < hour) & (j.time.dt.hour >= hour - 2)]
        tlist = temp_df.ID.unique()
        insert_LT(tlist,cnt)
        cnt += 1
        

In [977]:
#convert lapse_times dict into a json array
LT = []
for i in range(1,5651):
    d = {"ID":str(i), "LT": lapse_time[i]}
    LT.append(d)

In [979]:
print (LT[478])

{'ID': '479', 'LT': ['null', 'null', 'null', 'null', 'null', 'null', 'null', 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60]}


# GeoJson File for comparing lapse times to a given road

In [802]:
with open('City_Streets_2011.geojson') as f_in:
    roads = json.load(f_in)

In [809]:
print (lapse_time[3045])

[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 0, 0, 0, 0, 0, 1, 2, 0]


# Dump final data file

In [980]:
#dump final data file
with open("LT_J1_4.json", "w") as text_file:
    json.dump(LT, text_file)

In [663]:
j1_4[90000][0].hour

14

In [712]:
#convert to DataFrame to sort by date
j = pd.DataFrame(j1_4,columns=['time','ID'])

In [713]:
j = j.sort_values(by=['time'])

In [714]:
j1_4 = j.values

In [748]:
j.loc[1000,][0]

Timestamp('2018-01-01 01:13:28')

In [751]:
cnt =0
for row in j1_4:
    if row[0].day == 1 & row[0].hour < 12:
        cnt+=1 
print (cnt)

15101


In [752]:
ts = j[j.time.dt.hour == 3]

In [753]:
ts.count()

time    3332
ID      3332
dtype: int64

In [785]:
lapse_time[1][1-0]

'null'

# To here

In [None]:
# get the manhattan distance between consecutive data points
#df['manh_dist'] = abs(df.longitude - df.longitude.shift()) + abs(df.latitude - df.latitude.shift())

In [None]:
#filter out close data points
#df = df[df.manh_dist > .0005]

In [None]:
#filter out unreasonably far data points
#df = df[df.manh_dist < .01]

In [None]:
#How to get counts of unique values for how many differnt days (there is only 1 day here, i.e. Jan 1)
df['date_fixed'].dt.day.value_counts()

In [None]:
#test iterrows; delete eventually
tst2 = []
for index, row in df.iterrows():
    tst2.append([index,row])
    

In [None]:
# an index and row
tst2[10]

In [None]:
# a row (as a Series)
tst2[10][1]

In [None]:
tst2[1000][1].latitude

In [None]:
objID_to_bnd_box[10]

# Below here is not used

In [None]:
#NOT USED; generate test dataframe
test_row_iter = pd.DataFrame(np.random.randn(20,3),columns=list('ABC'))

In [None]:
#NOT USED
#for testing: set column C to absolute values
test_row_iter['C'] = abs(test_row_iter['C'])

In [None]:
#NOT USED
#how to iterate over rows and set a boolean value based on a conditional
for i in range(1,len(test_row_iter)):
    if test_row_iter.loc[i, 'C'] > 0:
        test_row_iter.loc[i,'D'] = True

In [None]:
#NOT USED
#set first boolean of the 'Keep' column to 'True', since we always want to keep the first data point
test_row_iter.loc[0,'Keep'] = True
#create a boolean column based on an the accumulated value of a column
threshhold = 2.
cum_dist = 0.
for i in range(1,len(test_row_iter)):
    if test_row_iter.loc[i, 'C'] + cum_dist > threshhold:
        test_row_iter.loc[i,'Keep'] = True
        cum_dist = 0
    else:
        test_row_iter.loc[i,'Keep'] = False
        cum_dist += test_row_iter.loc[i, 'C']
            

In [None]:
#NOT USED
test_row_iter = test_row_iter[test_row_iter.Keep == True]

In [758]:
dft = pd.DataFrame(np.random.randn(4,3),columns = list("ABC"))

In [759]:
dft

Unnamed: 0,A,B,C
0,-0.763806,-0.46005,1.95719
1,-0.144945,-0.009399,0.15378
2,-0.526902,-1.139085,0.038688
3,-0.128784,0.836668,-0.537935


In [None]:
dft = dft.sort_values(by = ['A'])

In [762]:
dft.B.unique()

array([-0.46004974, -0.00939874, -1.13908498,  0.83666806])

In [None]:
for i, r in dft.iterrows():
    #print (i)
    dft.loc[i,'D'] = i

In [None]:
dft

In [None]:
dft.reset_index(drop=True)

In [None]:
ls = []

In [None]:
ls[3] = 3