# Features

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Folder (relative to this notebook) that holds the input CSV read below.
data_path = "../../data/"

In [3]:
# Read 'accidents_with_vehicles.csv' from data_path into the working frame.
accidents_csv = os.path.join(data_path, 'accidents_with_vehicles.csv')
df = pd.read_csv(accidents_csv)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,accident_id,location_easting_osgr,location_northing_osgr,longitude,latitude,number_of_vehicles,number_of_casualties,date,time,1st_road_class,...,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area,lsoa_of_accident_location,police_force,local_authority_district,local_authority_highway,target,all_vehicles
0,359433,519310.0,188730.0,-0.279323,51.584754,2,1,2016-01-11,02:30,A,...,Dry,,,Urban,E01000543,Metropolitan Police,Brent,Brent,0,1 Taxi/Private hire car No tow/articulation U...
1,446617,551920.0,174560.0,0.184928,51.449595,1,1,2016-01-11,00:37,A,...,Dry,,,Urban,E01000375,Metropolitan Police,Bexley,Bexley,0,1 Car No tow/articulation Going ahead other O...
2,342312,505930.0,183850.0,-0.473837,51.543563,1,1,2016-01-11,01:25,A,...,Dry,,,Urban,E01033725,Metropolitan Police,Hillingdon,Hillingdon,0,1 Car No tow/articulation Going ahead other O...
3,495118,527770.0,168930.0,-0.164442,51.404958,1,1,2016-01-11,09:15,A,...,Dry,,,Urban,E01003379,Metropolitan Police,Merton,Merton,0,1 Bus or coach (17 or more pass seats) No tow...
4,595500,510740.0,177230.0,-0.40658,51.483139,2,1,2016-01-11,07:53,A,...,Dry,,,Urban,E01002583,Metropolitan Police,Hounslow,Hounslow,0,1 Car No tow/articulation Changing lane to ri...


In [5]:
# Full list of column names (the head() preview elides the middle ones).
df.columns.tolist()

['accident_id',
 'location_easting_osgr',
 'location_northing_osgr',
 'longitude',
 'latitude',
 'number_of_vehicles',
 'number_of_casualties',
 'date',
 'time',
 '1st_road_class',
 '1st_road_number',
 'road_type',
 'speed_limit',
 'junction_detail',
 'junction_control',
 '2nd_road_class',
 '2nd_road_number',
 'pedestrian_crossing-human_control',
 'pedestrian_crossing-physical_facilities',
 'light_conditions',
 'weather_conditions',
 'road_surface_conditions',
 'special_conditions_at_site',
 'carriageway_hazards',
 'urban_or_rural_area',
 'lsoa_of_accident_location',
 'police_force',
 'local_authority_district',
 'local_authority_highway',
 'target',
 'all_vehicles']

In [6]:
# Show the dtype pandas inferred for every column; in the recorded output
# 'date' and 'time' are still plain object columns at this point.
df.dtypes

accident_id                                  int64
location_easting_osgr                      float64
location_northing_osgr                     float64
longitude                                  float64
latitude                                   float64
number_of_vehicles                           int64
number_of_casualties                         int64
date                                        object
time                                        object
1st_road_class                              object
1st_road_number                              int64
road_type                                    int64
speed_limit                                float64
junction_detail                             object
junction_control                            object
2nd_road_class                              object
2nd_road_number                            float64
pedestrian_crossing-human_control           object
pedestrian_crossing-physical_facilities     object
light_conditions               

In [7]:
# Convert the 'date' strings to datetime values so the .dt accessor works below.
parsed_date = pd.to_datetime(df['date'])
df['date'] = parsed_date

In [8]:
# Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
df['day_of_week'] = df['date'].dt.weekday

In [9]:
# Calendar month (1-12) taken from the parsed 'date' column.
date_parts = df['date'].dt
df['month'] = date_parts.month

In [10]:
# Parse the 'HH:MM' strings in 'time' into datetime values (overwrites the
# original string column in place).
parsed_time = pd.to_datetime(df['time'], format='%H:%M')
df['time'] = parsed_time

In [11]:
# Hour of day taken from the parsed 'time' column.
time_parts = df['time'].dt
df['hour'] = time_parts.hour

In [12]:
# Columns copied into the feature frame as-is (no encoding).
numeric_columns = ['number_of_vehicles', 'number_of_casualties', 'speed_limit']

In [13]:
# Columns that get one-hot encoded, grouped by theme. The concatenation order
# below is the order in which the encoded columns appear in the feature frame.
category_columns = (
    # road layout
    ['road_type', '1st_road_class', '2nd_road_class', 'urban_or_rural_area']
    # junctions and pedestrian crossings
    + [
        'junction_detail',
        'junction_control',
        'pedestrian_crossing-human_control',
        'pedestrian_crossing-physical_facilities',
    ]
    # conditions at the time of the accident
    + [
        'light_conditions',
        'weather_conditions',
        'road_surface_conditions',
        'special_conditions_at_site',
        'carriageway_hazards',
    ]
    # own pre-computed features ('hour' is currently not included)
    + ['day_of_week', 'month']
)

In [14]:
# Free-text columns carried over to the feature frame unchanged.
text_columns = ['all_vehicles']

In [15]:
# Label column(s) appended last to the feature frame.
target_columns = ['target']

In [16]:
# Start from an empty frame; the cells below add the feature columns to it.
df_features = pd.DataFrame(data=None)

In [17]:
# Numeric: copy each numeric column into the feature frame without changes.
for numeric_name in numeric_columns:
    print('Number: %s' % numeric_name)
    df_features[numeric_name] = df[numeric_name]

Number: number_of_vehicles
Number: number_of_casualties
Number: speed_limit


In [18]:
# Categories: one-hot encode every categorical column.
#
# pd.get_dummies expands a column into indicator columns named
# '<column>_<value>'. With the default dummy_na=False a missing value sets no
# indicator, so such rows are all-zero for that column.
#
# NOTE: this cell appends to df_features; re-running it without re-running the
# cells that rebuild df_features adds the indicator columns a second time.
category_frames = []
for col_name in category_columns:

    print('Category: {}'.format(col_name))

    category_frames.append(pd.get_dummies(df[col_name], prefix=col_name))

# Append all indicator blocks with a single concat instead of re-copying the
# growing feature frame once per column; resulting columns and order are the same.
df_features = pd.concat([df_features] + category_frames, axis=1)

Category: road_type
Category: 1st_road_class
Category: 2nd_road_class
Category: urban_or_rural_area
Category: junction_detail
Category: junction_control
Category: pedestrian_crossing-human_control
Category: pedestrian_crossing-physical_facilities
Category: light_conditions
Category: weather_conditions
Category: road_surface_conditions
Category: special_conditions_at_site
Category: carriageway_hazards
Category: day_of_week
Category: month


In [19]:
# Text: carry the raw text columns over unchanged.
for text_name in text_columns:
    print('Text: %s' % text_name)
    df_features[text_name] = df[text_name]

Text: all_vehicles


In [20]:
# Target: append the label column(s) last.
for target_name in target_columns:
    print('Target: %s' % target_name)
    df_features[target_name] = df[target_name]

Target: target


In [21]:
# Number of columns in the assembled feature frame.
df_features.shape[1]

110

In [22]:
# Names of all feature columns, in frame order.
df_features.columns.tolist()

['number_of_vehicles',
 'number_of_casualties',
 'speed_limit',
 'road_type_1',
 'road_type_2',
 'road_type_3',
 'road_type_4',
 'road_type_5',
 'road_type_6',
 '1st_road_class_A',
 '1st_road_class_A(M)',
 '1st_road_class_B',
 '1st_road_class_C',
 '1st_road_class_Motorway',
 '1st_road_class_Unclassified',
 '2nd_road_class_-1',
 '2nd_road_class_A',
 '2nd_road_class_A(M)',
 '2nd_road_class_B',
 '2nd_road_class_C',
 '2nd_road_class_Motorway',
 '2nd_road_class_Unclassified',
 'urban_or_rural_area_Rural',
 'urban_or_rural_area_Urban',
 'junction_detail_-1',
 'junction_detail_Crossroads',
 'junction_detail_Mini-roundabout',
 'junction_detail_More than 4 arms (not roundabout)',
 'junction_detail_Not at junction or within 20 metres',
 'junction_detail_Other junction',
 'junction_detail_Private drive or entrance',
 'junction_detail_Roundabout',
 'junction_detail_Slip road',
 'junction_detail_T or staggered junction',
 'junction_control_-1',
 'junction_control_Authorised person',
 'junction_cont

In [23]:
# Preview the first five rows of the feature frame.
df_features.head(n=5)

Unnamed: 0,number_of_vehicles,number_of_casualties,speed_limit,road_type_1,road_type_2,road_type_3,road_type_4,road_type_5,road_type_6,1st_road_class_A,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,all_vehicles,target
0,2,1,30.0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1 Taxi/Private hire car No tow/articulation U...,0
1,1,1,30.0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1 Car No tow/articulation Going ahead other O...,0
2,1,1,30.0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1 Car No tow/articulation Going ahead other O...,0
3,1,1,30.0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1 Bus or coach (17 or more pass seats) No tow...,0
4,2,1,40.0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1 Car No tow/articulation Changing lane to ri...,0


In [24]:
# Show every feature value of the first row (as a Series, one line per column).
df_features.iloc[0]

number_of_vehicles                                                                                                     2
number_of_casualties                                                                                                   1
speed_limit                                                                                                           30
road_type_1                                                                                                            0
road_type_2                                                                                                            0
road_type_3                                                                                                            0
road_type_4                                                                                                            1
road_type_5                                                                                                            0
road_type_6                     

In [25]:
# Re-check the column count right before exporting.
len(df_features.columns)

110

In [26]:
# Export the feature frame (without the index) as 'features.csv' in data_path.
# Letting to_csv write to the path directly avoids building the whole CSV as one
# in-memory string, and pandas controls the encoding (UTF-8 by default) and line
# endings instead of a text-mode handle opened with the platform locale encoding.
features_csv = os.path.join(data_path, 'features.csv')
df_features.to_csv(features_csv, index=False)

In [27]:
# Shell escape: count the lines of the exported CSV as a quick sanity check.
!wc -l '../../data/features.csv'

394303 ../../data/features.csv


In [28]:
# Shell escape: print the first two lines of the exported CSV (header and first record).
!head -2 '../../data/features.csv'

number_of_vehicles,number_of_casualties,speed_limit,road_type_1,road_type_2,road_type_3,road_type_4,road_type_5,road_type_6,1st_road_class_A,1st_road_class_A(M),1st_road_class_B,1st_road_class_C,1st_road_class_Motorway,1st_road_class_Unclassified,2nd_road_class_-1,2nd_road_class_A,2nd_road_class_A(M),2nd_road_class_B,2nd_road_class_C,2nd_road_class_Motorway,2nd_road_class_Unclassified,urban_or_rural_area_Rural,urban_or_rural_area_Urban,junction_detail_-1,junction_detail_Crossroads,junction_detail_Mini-roundabout,junction_detail_More than 4 arms (not roundabout),junction_detail_Not at junction or within 20 metres,junction_detail_Other junction,junction_detail_Private drive or entrance,junction_detail_Roundabout,junction_detail_Slip road,junction_detail_T or staggered junction,junction_control_-1,junction_control_Authorised person,junction_control_Auto traffic signal,junction_control_Give way or uncontrolled,junction_control_Not at junction or within 20 metres,junction_control_Stop sign,