In [1]:
import pandas as pd
import os
import time
import numpy as np
import arcgis
from datetime import datetime, timedelta

# 1. Load/Condition Data from GDB


With the prepwork done in Pro or Arcpy, we can use the ArcGIS Python API to easily load this data into a SpatialDataFrame, which will allow us to do any geospatial manipulations we need to do. In this example, we'll mostly use the ``pandas`` functionality

In [2]:
project_gdb = r'utah.gdb'
collisions_path = os.path.join(project_gdb,'collisions_joined')
road_features_path = os.path.join(project_gdb,'static_features')

## 1.1 Load Data

In [3]:
df = arcgis.SpatialDataFrame().from_featureclass(collisions_path)

In [7]:
rdf = arcgis.SpatialDataFrame().from_featureclass(road_features_path)

## 1.2 Remove some bad data

In [9]:
# We don't care about accidents that happened far from known major/minor roads
df = df.dropna(how='any',subset=['segment_id'])

# Convert some types
df['segment_id'] = df.segment_id.astype('int64')
rdf['segment_id'] = rdf.segment_id.astype('int64') 
rdf['station_id'] = rdf.station_id.astype('int64') # Weather statiion ID

## 1.3 Parse Dates

In [10]:
df['CRASHDATE'] = df['CRASHDATE'].astype('str')
df['CRASHTIME'] = df['CRASHTIME'].astype('str')

df['timestamp'] = pd.to_datetime(df['CRASHDATE']+' '+df['CRASHTIME'])

time_index = pd.DatetimeIndex(df.timestamp).floor('1h')
df['hour'] = time_index.hour
df['weekday'] = time_index.weekday
df['month'] = time_index.month

## 1.4 Remove/Replace missing data

In [11]:
rdf.loc[pd.isna(rdf.speed_limit),'speed_limit'] = np.median(rdf.speed_limit)
rdf.loc[pd.isna(rdf.aadt),'aadt'] = 0.0 # AKA, unknown, zero, etc. This will help differentiate major/minor roadways

# 2. Expand Feature Set
Once the data is in a DataFrame, it's very easy to manipulate to add certain features. Here we will add a accident count field by doing some joins and aggregations

In [12]:
accident_counts_per_segment = df.groupby('segment_id').size().reset_index(name='accident_counts').set_index('segment_id')

In [13]:
rdf = rdf.set_index('segment_id').join(accident_counts_per_segment)

In [14]:
# If there was no accident, set value to 0, not NaN
rdf.loc[pd.isna(rdf.accident_counts),'accident_counts'] = 0.0

# 3. Select Features

## 3.1 Feature Names
We can do most of our exploratory data analysis in ArcGIS Pro to determine a set of candidate features. Once we build our training set, we can do further EDA to determine the best feature set. Here we select the candidate features that we found in Pro 

In [15]:
print(rdf.columns.tolist())

['OBJECTID_1', 'Join_Count', 'TARGET_FID', 'Join_Count_1', 'TARGET_FID_1', 'OBJECTID', 'pre_dir', 'street_type', 'suf_dir', 'one_way', 'speed_limit', 'surface_type', 'surface_width', 'aadt', 'sinuosity', 'euclidean_length', 'segment_length', 'at_intersection', 'near_billboard', 'road_orient_approx', 'near_major_road', 'station_id', 'SHAPE', 'accident_counts']


In [18]:
static_feature_names = [
    'pre_dir', 
    'street_type',
    'suf_dir', 
    'one_way', 
    'speed_limit', 
    'surface_type', 
    'surface_width', 
    'aadt', 
    'sinuosity',          # \
    'euclidean_length',   # |  These together define road curvature
    'segment_length',     # /
    'near_major_road', 
    'road_orient_approx', 
    'at_intersection', 
    'near_billboard',
    'accident_counts'
]
temporal_feature_names = [
    'hour',
    'weekday',
    'month'
]

## 3.2 Select static features, join to accidents records

In [19]:
features = df\
    .xs(['segment_id']+temporal_feature_names,axis=1)\
    .set_index('segment_id')\
    .join(
        rdf.xs(static_feature_names+['station_id'],axis=1),
        how='left'
    ).reset_index()

## 3.3 Index by timestamp/road

In [20]:
feature_timestamp_series = time_index.to_series()
segment_id = features.segment_id.astype('str')

# Index for fast lookup
features['fid'] = feature_timestamp_series.values.astype('str') + segment_id.values
features.set_index('fid',inplace=True)

# Floor to nearest hour
features['timestamp'] = feature_timestamp_series.values


# Set target value
features['target'] = 1

In [21]:
features.head(15)

Unnamed: 0_level_0,segment_id,hour,weekday,month,pre_dir,street_type,suf_dir,one_way,speed_limit,surface_type,...,euclidean_length,segment_length,near_major_road,road_orient_approx,at_intersection,near_billboard,accident_counts,station_id,timestamp,target
fid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-10-15T19:00:00.0000000001,1,10,5,11,N,,,0,40.0,,...,110.198977,110.198977,0,1.022803,1,0,4.0,72572024127,2010-10-15 19:00:00,1
2010-08-20T19:00:00.0000000001,1,16,0,10,N,,,0,40.0,,...,110.198977,110.198977,0,1.022803,1,0,4.0,72572024127,2010-08-20 19:00:00,1
2010-03-27T21:00:00.0000000001,1,15,5,1,N,,,0,40.0,,...,110.198977,110.198977,0,1.022803,1,0,4.0,72572024127,2010-03-27 21:00:00,1
2010-04-16T13:00:00.0000000001,1,15,0,12,N,,,0,40.0,,...,110.198977,110.198977,0,1.022803,1,0,4.0,72572024127,2010-04-16 13:00:00,1
2010-03-05T09:00:00.0000000002,2,21,4,9,N,,,0,40.0,,...,87.835598,87.835598,0,1.187714,1,0,7.0,72572024127,2010-03-05 09:00:00,1
2010-10-17T01:00:00.0000000002,2,6,3,10,N,,,0,40.0,,...,87.835598,87.835598,0,1.187714,1,0,7.0,72572024127,2010-10-17 01:00:00,1
2010-01-08T22:00:00.0000000002,2,7,2,9,N,,,0,40.0,,...,87.835598,87.835598,0,1.187714,1,0,7.0,72572024127,2010-01-08 22:00:00,1
2010-05-13T00:00:00.0000000002,2,20,3,3,N,,,0,40.0,,...,87.835598,87.835598,0,1.187714,1,0,7.0,72572024127,2010-05-13 00:00:00,1
2010-07-16T09:00:00.0000000002,2,14,3,4,N,,,0,40.0,,...,87.835598,87.835598,0,1.187714,1,0,7.0,72572024127,2010-07-16 09:00:00,1
2010-10-13T17:00:00.0000000002,2,21,5,5,N,,,0,40.0,,...,87.835598,87.835598,0,1.187714,1,0,7.0,72572024127,2010-10-13 17:00:00,1


# 4 Negative Sampling
We have a large set of positive examples, that is, when accidents occured. Any time/segment when an accident didn't occur is a valid negative sample. We sample possible negtive examples by sampling from and mutating the positive examples until they aren't positive. We do this by changing either the hour, day or segment ID.

In [22]:
class NegativeSampler:
    """
    Given features (positive examples), roads, times and the number of samples to draw, draws a sample
    """
    def __init__(self,num_samples,
                 accidents,roads,
                 static_feature_names,
                 seed=None):

        self.roads = roads
        self.accidents = accidents
        self.num_samples = num_samples
        
        self.static_feature_names = static_feature_names
        
        if seed:
            np.random.seed(seed)
    
    def sample(self):
        segment_ids = self.roads.index.to_series()
        
        altered = pd.DataFrame()
        num_to_sample = self.num_samples
        
        while num_to_sample > 0:
            samples = self.accidents.sample(n=num_to_sample,replace=True).reset_index()
            alt = self._mod(samples,segment_ids).copy()
            # Create an index
         
            ats = pd.DatetimeIndex(alt.timestamp).floor('1h').to_series().astype('str')
            asegid = alt.segment_id.astype('str')
            
            # Set index on altered samples
            alt['fid'] = ats.values + asegid.values
            alt.set_index('fid',inplace=True)
            
            altered = altered.append(alt)

            # Which happened before? They shouldn't get negative samples
            intersection = altered.index.intersection(self.accidents.index)
            idxer = intersection.get_values()
            
            # Drop samples where accidents occurred.
            altered = altered.drop(idxer)
            
            num_to_sample = self.num_samples - altered.shape[0]
        
        altered['target'] = 0
        
        ts = pd.DatetimeIndex(altered.timestamp)
        segment_ids = altered.segment_id
        
        altered = altered.reset_index()
        
        station_ids = self.roads.loc[segment_ids].station_id
        static_features = self.roads.loc[segment_ids][self.static_feature_names]
        
        altered[self.static_feature_names] = static_features.values
        altered['station_id'] = station_ids.values
        altered['weekday'] = ts.weekday
        altered['month'] = ts.month
        altered = altered.drop(columns=['fid'])
        return altered
    
    def _mod(self,samples,segment_ids):
        # Get the current timestamps
        ts = pd.DatetimeIndex(samples.timestamp)
        
        # Hour, Day, Year
        hour = samples.hour.copy()
        day = ts.dayofyear.to_series()
        year = ts.year.to_series()

        # Road ID
        segment_id = samples.segment_id.copy()
        
        # Index of samples to mutate
        feat_i = np.random.randint(0,3,size=samples.shape[0])
        
        #samples['ole_timestamp'] = samples.timestamp.values
        #samples['ole_hour'] = hour.values
        #samples['ole_year'] = year.values
        #samples['ole_SegID'] = road_id.values
        #samples['feat_i'] = feat_i

        ##########################
        # i == 0
        # Change hour of day
        idx = feat_i == 0
        samp_i = samples.loc[idx]
        N = samp_i.shape[0]

        # Sample until we have all different hours of the day
        num_same = N
        new_hours = hour[idx].values.copy()
        dif_idx = np.ones(N,dtype='bool')
        while num_same != 0:
            new_hours[dif_idx] = np.random.choice(24,size=num_same)
            dif_idx = new_hours == hour[idx]
            num_same = dif_idx.sum()

        # Create new timestamps
        new_timestamps = year[idx].apply(pd.Timestamp,args=(1,1))
        new_timestamps += pd.TimedeltaIndex(day[idx]-1,unit='D') # same day
        new_timestamps += pd.TimedeltaIndex(new_hours,unit='H') # new hour
        
        
        samples.loc[idx,'hour'] = new_hours
        samples.loc[idx,'segment_id'] = segment_id[idx]
        samples.loc[idx,'timestamp'] = new_timestamps.values
        ##########################

        ##########################
        # i == 1
        # Change day of year
        idx = feat_i == 1
        samp_i = samples.loc[idx]
        N = samp_i.shape[0]

        is_leap_yr = ts[idx].is_leap_year

        # Sample until we have all different days of the year.
        num_same = N
        new_days = day[idx].values.copy()
        dif_idx = np.ones(N,dtype='bool')
        while num_same != 0:
            # Pay attention to leap years
            dif_leap_yr = (dif_idx&is_leap_yr)
            dif_no_leap_yr = dif_idx&(~is_leap_yr)
            
            new_days[dif_leap_yr] = np.random.choice(np.arange(1,367,dtype='int'),size=dif_leap_yr.sum())
            new_days[dif_no_leap_yr] = np.random.choice(np.arange(1,366,dtype='int'),size=dif_no_leap_yr.sum())

            dif_idx = new_days == day[idx]
            num_same = dif_idx.sum()
        
        # Create new timestamps
        timestamps = year[idx].apply(pd.Timestamp,args=(1,1))
        new_timestamps = pd.DatetimeIndex(timestamps) 
        new_timestamps += pd.TimedeltaIndex(new_days,unit='D')  # new day
        new_timestamps += pd.TimedeltaIndex(hour[idx],unit='H') # same hour

        samples.loc[idx,'hour'] = new_timestamps.hour
        samples.loc[idx,'segment_id'] = segment_id[idx]
        samples.loc[idx,'timestamp'] = new_timestamps.values
        ##########################

        ##########################
        # i == 2
        # Change road
        idx = feat_i == 2
        samp_i = samples.loc[idx]
        N = samp_i.shape[0]

        num_same = N
        new_roads = segment_id[idx].values.copy()
        dif_idx = np.ones(N,dtype='bool')

        while num_same != 0:
            new_roads[dif_idx] = segment_ids.sample(n=num_same,replace=True).values
            dif_idx = new_roads == segment_id[idx]
            num_same = dif_idx.sum()

        samples.loc[idx,'segment_id'] = new_roads
        samples.loc[idx,'hour'] = hour[idx]
        samples.loc[idx,'timestamp'] = ts[idx]
        
        samples['segment_id'] = samples['segment_id'].astype('int64')     
        return samples

In [30]:
t = time.time()
N = features.shape[0] * 10
ns = NegativeSampler(N,features,rdf,static_feature_names,seed=12345)
negative_examples = ns.sample().set_index(['timestamp','segment_id','station_id'])

print((time.time() - t),'seconds')

82.72027134895325 seconds


In [31]:
if not os.path.exists('training_data'):
    os.mkdir('training_data')

In [32]:
negative_examples.to_csv('training_data/utah_negative_examples.csv')

In [33]:
negative_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,hour,weekday,month,pre_dir,street_type,suf_dir,one_way,speed_limit,surface_type,surface_width,aadt,sinuosity,euclidean_length,segment_length,near_major_road,road_orient_approx,at_intersection,near_billboard,accident_counts,target
timestamp,segment_id,station_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014-12-22 19:00:00,385152,9999993081,17,0,12,,,,0,25.0,,0,0.0,1.02015,639.724684,652.615291,0,1.681712,0,0,0.0,0
2017-12-06 09:00:00,5753,72572024127,9,2,12,S,,E,0,35.0,,0,11763.0,1.0,74.051639,74.051639,0,1.562353,0,0,3.0,0
2013-07-01 12:00:00,324415,72472393025,8,0,7,,,,0,25.0,,0,0.0,1.003237,96.842309,97.1558,0,2.961512,0,0,0.0,0
2013-07-06 08:00:00,153026,72572424174,8,5,7,W,,S,0,25.0,,0,0.0,1.0,197.361685,197.361685,0,0.018032,1,0,28.0,0
2010-03-01 09:00:00,269209,74003024103,16,0,3,,RD,,0,25.0,DIRT,10,0.0,1.128169,1965.866308,2217.828527,0,0.701536,0,0,0.0,0


In [34]:
# Reselect Features
positive_examples = df\
    .xs(['segment_id']+temporal_feature_names,axis=1)\
    .set_index('segment_id')\
    .join(
        rdf.xs(static_feature_names+['station_id'],axis=1),
        how='left'
    ).reset_index()
    
positive_examples = positive_examples.set_index([time_index.floor('1h'),'segment_id','station_id'])
positive_examples['target'] = 1
positive_examples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,hour,weekday,month,pre_dir,street_type,suf_dir,one_way,speed_limit,surface_type,surface_width,aadt,sinuosity,euclidean_length,segment_length,near_major_road,road_orient_approx,at_intersection,near_billboard,accident_counts,target
timestamp,segment_id,station_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010-10-15 19:00:00,1,72572024127,10,5,11,N,,,0,40.0,,0,16689.0,1.0,110.198977,110.198977,0,1.022803,1,0,4.0,1
2010-08-20 19:00:00,1,72572024127,16,0,10,N,,,0,40.0,,0,16689.0,1.0,110.198977,110.198977,0,1.022803,1,0,4.0,1
2010-03-27 21:00:00,1,72572024127,15,5,1,N,,,0,40.0,,0,16689.0,1.0,110.198977,110.198977,0,1.022803,1,0,4.0,1
2010-04-16 13:00:00,1,72572024127,15,0,12,N,,,0,40.0,,0,16689.0,1.0,110.198977,110.198977,0,1.022803,1,0,4.0,1
2010-03-05 09:00:00,2,72572024127,21,4,9,N,,,0,40.0,,0,16689.0,1.0,87.835598,87.835598,0,1.187714,1,0,7.0,1


In [35]:
positive_examples.to_csv('training_data/utah_positive_examples.csv')

In [36]:
rdf.to_csv('training_data/road_features.csv')