# Test final model

In [1]:
import os
import pandas as pd
import numpy as np
import pickle

In [2]:
# Relative path of the directory that holds the input CSV files read below.
data_path = '../../data/'

In [3]:
# Read the test-set accidents from the data directory.
test_csv_path = os.path.join(data_path, 'test.csv')
df_test = pd.read_csv(test_csv_path)

In [4]:
# Preview the first rows of the test set.
df_test.head()

Unnamed: 0,accident_id,location_easting_osgr,location_northing_osgr,longitude,latitude,number_of_vehicles,number_of_casualties,date,time,1st_road_class,...,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area,lsoa_of_accident_location,police_force,local_authority_district,local_authority_highway
0,372234,532920.0,196330.0,-0.080107,51.650061,2,3,2017-05-08,03:12,A,...,Darkness - lights lit,1,Dry,,,Urban,E01001450,Metropolitan Police,Enfield,Enfield
1,332946,526790.0,181970.0,-0.173845,51.522425,2,1,2017-01-01,01:30,A,...,Darkness - lights lit,1,Wet or damp,,,Urban,E01004702,Metropolitan Police,Westminster,Westminster
2,596385,535200.0,181260.0,-0.052969,51.514096,3,1,2017-01-01,00:30,A,...,Darkness - lights lit,1,Dry,,,Urban,E01004298,Metropolitan Police,Tower Hamlets,Tower Hamlets
3,493689,534340.0,193560.0,-0.060658,51.624832,2,1,2017-01-01,01:11,A,...,Darkness - lights lit,2,Wet or damp,,,Urban,E01001429,Metropolitan Police,Enfield,Enfield
4,420550,533680.0,187820.0,-0.072372,51.573408,1,1,2017-01-01,01:42,A,...,Darkness - lights lit,1,Wet or damp,,,Urban,E01001808,Metropolitan Police,Hackney,Hackney


In [5]:
# (rows, columns) of the test set.
df_test.shape

(129950, 29)

In [6]:
# Human-readable row count (thousands separator) of the test set.
'Number of accidents in the test set: {:,}'.format(len(df_test))

'Number of accidents in the test set: 129,950'

In [7]:
# Read the vehicle table; it is joined to the accidents on `accident_id` below.
vehicles_csv_path = os.path.join(data_path, 'grouped_vehicles.csv')
df_vehicles = pd.read_csv(vehicles_csv_path)

In [8]:
# (rows, columns) of the vehicle table.
df_vehicles.shape

(244091, 14)

In [9]:
# Preview the first rows of the vehicle table.
df_vehicles.head()

Unnamed: 0,accident_id,Junction_Location,Age_of_Vehicle,Vehicle_Type,Vehicle_Location-Restricted_Lane,Skidding_and_Overturning,Hit_Object_in_Carriageway,Vehicle_Reference,1st_Point_of_Impact,Towing_and_Articulation,Vehicle_Leaving_Carriageway,Age_of_Driver,Hit_Object_off_Carriageway,Vehicle_Manoeuvre
0,300000,,23.0,,,,,2,,,,19.0,,
1,300001,,-2.0,,,,,2,,,,40.0,,
2,300002,,21.0,,,,,2,,,,111.0,,
3,300003,,9.0,,,,,1,,,,17.0,,
4,300004,,1.0,,,,,2,,,,68.0,,


In [10]:
# List the vehicle columns that become available after the merge.
df_vehicles.columns

Index(['accident_id', 'Junction_Location', 'Age_of_Vehicle', 'Vehicle_Type',
       'Vehicle_Location-Restricted_Lane', 'Skidding_and_Overturning',
       'Hit_Object_in_Carriageway', 'Vehicle_Reference', '1st_Point_of_Impact',
       'Towing_and_Articulation', 'Vehicle_Leaving_Carriageway',
       'Age_of_Driver', 'Hit_Object_off_Carriageway', 'Vehicle_Manoeuvre'],
      dtype='object')

In [11]:
# Attach the vehicle columns to every test accident. A left join keeps each
# test row even when no vehicle record matches (its vehicle columns are NaN).
df_merged = pd.merge(df_test, df_vehicles, on='accident_id', how='left')

# A left join duplicates test rows when `accident_id` repeats in df_vehicles.
# That would misalign the predictions with the accident ids written to the
# submission, so fail loudly instead of continuing with a longer frame.
assert len(df_merged) == len(df_test), 'merge changed the number of test rows'

In [12]:
# Preview the merged accident + vehicle rows.
df_merged.head()

Unnamed: 0,accident_id,location_easting_osgr,location_northing_osgr,longitude,latitude,number_of_vehicles,number_of_casualties,date,time,1st_road_class,...,Vehicle_Location-Restricted_Lane,Skidding_and_Overturning,Hit_Object_in_Carriageway,Vehicle_Reference,1st_Point_of_Impact,Towing_and_Articulation,Vehicle_Leaving_Carriageway,Age_of_Driver,Hit_Object_off_Carriageway,Vehicle_Manoeuvre
0,372234,532920.0,196330.0,-0.080107,51.650061,2,3,2017-05-08,03:12,A,...,,,,2.0,,,,43.0,,
1,332946,526790.0,181970.0,-0.173845,51.522425,2,1,2017-01-01,01:30,A,...,,,,2.0,,,,73.0,,
2,596385,535200.0,181260.0,-0.052969,51.514096,3,1,2017-01-01,00:30,A,...,,,,3.0,,,,65.0,,
3,493689,534340.0,193560.0,-0.060658,51.624832,2,1,2017-01-01,01:11,A,...,,,,2.0,,,,66.0,,
4,420550,533680.0,187820.0,-0.072372,51.573408,1,1,2017-01-01,01:42,A,...,,,,1.0,,,,78.0,,


In [13]:
# (rows, columns) after the merge; the row count should equal that of df_test.
df_merged.shape

(129950, 42)

In [14]:
# Parse the accident date strings into datetimes so the .dt accessors work.
parsed_dates = pd.to_datetime(df_merged['date'])
df_merged['date'] = parsed_dates

In [15]:
# Derived feature: day of the week taken from the parsed accident date.
accident_dates = df_merged['date']
df_merged['day_of_week'] = accident_dates.dt.dayofweek

In [16]:
# Derived feature: calendar month taken from the parsed accident date.
accident_dates = df_merged['date']
df_merged['month'] = accident_dates.dt.month

In [17]:
# Parse the 'HH:MM' time strings into datetimes; only the hour is used later.
parsed_times = pd.to_datetime(df_merged['time'], format='%H:%M')
df_merged['time'] = parsed_times

In [18]:
# Derived feature: hour of the day taken from the parsed accident time.
accident_times = df_merged['time']
df_merged['hour'] = accident_times.dt.hour

In [19]:
# Columns copied into the feature matrix as-is (no encoding applied).
numeric_columns = [
    'number_of_vehicles',
    'number_of_casualties',
    'speed_limit',
    # from the vehicles table ('Vehicle_Reference' is deliberately left out)
    # 'Vehicle_Reference',
    'Age_of_Driver',
    'Age_of_Vehicle'
]

In [20]:
# Columns that are one-hot encoded (one indicator column per distinct value).
category_columns = [
    'road_type',
    '1st_road_class',
    '2nd_road_class',
    'urban_or_rural_area',
    'junction_detail',
    'junction_control',
    'pedestrian_crossing-human_control',
    'pedestrian_crossing-physical_facilities',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'special_conditions_at_site',
    'carriageway_hazards',
    # own pre-computed features (derived from the date / time columns above)
    'day_of_week',
    'month',
    'hour',
    # from vehicles
    'Vehicle_Type',
    'Towing_and_Articulation',
    'Vehicle_Manoeuvre',
    'Vehicle_Location-Restricted_Lane',
    'Junction_Location',
    'Skidding_and_Overturning',
    'Hit_Object_in_Carriageway',
    'Vehicle_Leaving_Carriageway',
    'Hit_Object_off_Carriageway',
    '1st_Point_of_Impact',
]

In [21]:
# Start from an empty frame; the numeric and one-hot columns are added below.
df_features = pd.DataFrame()

In [22]:
# Keep the accident ids aside; they are paired with the predictions at the end.
accident_ids = df_merged.loc[:, 'accident_id']

In [23]:
# Numeric features: copied over unchanged, one column at a time.
for numeric_name in numeric_columns:
    print('Number: {}'.format(numeric_name))
    numeric_values = df_merged[numeric_name]
    df_features[numeric_name] = numeric_values

Number: number_of_vehicles
Number: number_of_casualties
Number: speed_limit
Number: Age_of_Driver
Number: Age_of_Vehicle


In [24]:
# Categories: one-hot encode every categorical column.
# get_dummies is called with its defaults, so missing values get no indicator
# column of their own (such rows are all-zero for that feature).
dummy_frames = []
for col_name in category_columns:

    print('Category: {}'.format(col_name))

    dummy_frames.append(pd.get_dummies(df_merged[col_name], prefix=col_name))

# Concatenate once instead of inside the loop: pd.concat copies the whole frame
# on every call, so growing it column group by column group repeats that copy.
# Starting from the numeric columns only also keeps the cell re-runnable; a
# second run no longer appends a duplicate set of indicator columns.
df_features = pd.concat([df_features[numeric_columns]] + dummy_frames, axis=1)

Category: road_type
Category: 1st_road_class
Category: 2nd_road_class
Category: urban_or_rural_area
Category: junction_detail
Category: junction_control
Category: pedestrian_crossing-human_control
Category: pedestrian_crossing-physical_facilities
Category: light_conditions
Category: weather_conditions
Category: road_surface_conditions
Category: special_conditions_at_site
Category: carriageway_hazards
Category: day_of_week
Category: month
Category: hour
Category: Vehicle_Type
Category: Towing_and_Articulation
Category: Vehicle_Manoeuvre
Category: Vehicle_Location-Restricted_Lane
Category: Junction_Location
Category: Skidding_and_Overturning
Category: Hit_Object_in_Carriageway
Category: Vehicle_Leaving_Carriageway
Category: Hit_Object_off_Carriageway
Category: 1st_Point_of_Impact


In [25]:
# Preview the assembled feature matrix (numeric columns followed by indicators).
df_features.head()

Unnamed: 0,number_of_vehicles,number_of_casualties,speed_limit,Age_of_Driver,Age_of_Vehicle,road_type_1,road_type_2,road_type_3,road_type_4,road_type_5,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,2,3,30.0,43.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,30.0,73.0,20.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,30.0,65.0,22.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2,1,30.0,66.0,15.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,20.0,78.0,14.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Inspect every feature value of the first accident.
df_features.iloc[0]

number_of_vehicles                  2.0
number_of_casualties                3.0
speed_limit                        30.0
Age_of_Driver                      43.0
Age_of_Vehicle                      0.0
road_type_1                         0.0
road_type_2                         0.0
road_type_3                         0.0
road_type_4                         1.0
road_type_5                         0.0
road_type_6                         0.0
1st_road_class_A                    1.0
1st_road_class_A(M)                 0.0
1st_road_class_B                    0.0
1st_road_class_C                    0.0
1st_road_class_Motorway             0.0
1st_road_class_Unclassified         0.0
2nd_road_class_-1                   1.0
2nd_road_class_A                    0.0
2nd_road_class_A(M)                 0.0
2nd_road_class_B                    0.0
2nd_road_class_C                    0.0
2nd_road_class_Motorway             0.0
2nd_road_class_Unclassified         0.0
urban_or_rural_area_Rural           0.0


In [27]:
# Number of feature columns built from the test data.
len(df_features.columns)

134

### Load training columns

In [28]:
# Load the ordered, ';'-separated list of column names.
# NOTE(review): presumably written by the training notebook — confirm.
# The file sits in the same directory as the CSV inputs, so reuse data_path
# rather than repeating the literal path. strip() keeps a trailing newline
# from ending up inside the last column name.
with open(os.path.join(data_path, 'training_columns.txt')) as f:
    training_columns = f.read().strip().split(';')

In [29]:
# Number of names read from the file; this still includes the 'target' label.
len(training_columns)

135

In [30]:
# Show the column names in the order they were read from the file.
training_columns

['number_of_vehicles',
 'number_of_casualties',
 'speed_limit',
 'Age_of_Driver',
 'Age_of_Vehicle',
 'road_type_1',
 'road_type_2',
 'road_type_3',
 'road_type_4',
 'road_type_5',
 'road_type_6',
 '1st_road_class_A',
 '1st_road_class_A(M)',
 '1st_road_class_B',
 '1st_road_class_C',
 '1st_road_class_Motorway',
 '1st_road_class_Unclassified',
 '2nd_road_class_-1',
 '2nd_road_class_A',
 '2nd_road_class_A(M)',
 '2nd_road_class_B',
 '2nd_road_class_C',
 '2nd_road_class_Motorway',
 '2nd_road_class_Unclassified',
 'urban_or_rural_area_Rural',
 'urban_or_rural_area_Urban',
 'junction_detail_-1',
 'junction_detail_Crossroads',
 'junction_detail_Mini-roundabout',
 'junction_detail_More than 4 arms (not roundabout)',
 'junction_detail_Not at junction or within 20 metres',
 'junction_detail_Other junction',
 'junction_detail_Private drive or entrance',
 'junction_detail_Roundabout',
 'junction_detail_Slip road',
 'junction_detail_T or staggered junction',
 'junction_control_-1',
 'junction_contro

In [31]:
# Drop the label so only feature names remain. Rebuilding the list (instead of
# list.remove) keeps the cell re-runnable: a second run is a no-op rather than
# a ValueError.
training_columns = [name for name in training_columns if name != 'target']

In [32]:
# Feature count after dropping 'target'.
len(training_columns)

134

In [35]:
# Align the test features with the exact column set and order in
# training_columns. `reindex` adds any training column that is absent from the
# test features (e.g. a category level that never occurs in the test set) as an
# all-zero column, and drops columns that are not in the training list.
# Plain `df_features[training_columns]` breaks on a top-to-bottom run whenever
# such a column is missing: depending on the pandas version it raises a
# KeyError or inserts NaN.
df_features = df_features.reindex(columns=training_columns, fill_value=0)

In [34]:
# Add an all-zero indicator column for the 'Not at junction or within 20 metres'
# junction_control level.
# NOTE(review): presumably this level is in the training columns but never
# occurs in the test data — confirm against training_columns.
df_features['junction_control_Not at junction or within 20 metres'] = 0

In [36]:
# Replace every remaining missing value with 0.
# NOTE(review): this also turns missing Age_of_Driver / Age_of_Vehicle values
# into 0 — verify the same imputation was applied when the model was trained.
df_features = df_features.fillna(value=0)

In [37]:
# Plain NumPy copy of the feature matrix to hand to the model.
feature_values = df_features.values
X_test = np.array(feature_values)

### Load model

In [38]:
# Load the persisted model.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# model files that come from a trusted source.
filename = '../../models/finalized_model.sav'
# Use a context manager so the file handle is closed even if unpickling fails.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [39]:
# Predict one value per test row, in the same row order as X_test.
# NOTE(review): assumes the unpickled object exposes a scikit-learn style
# predict() — confirm against the training notebook.
results = loaded_model.predict(X_test)

In [40]:
# Pair each accident id with its prediction; both are in test-row order.
df_results = pd.DataFrame({
    'accident_id': accident_ids,
    'result': results,
})

In [43]:
# Mean of the predictions (the share of positive predictions when `results`
# holds 0/1 labels). np.mean is vectorised; the builtin sum() iterates over the
# array one Python object at a time.
float(np.mean(results))

0.020115429011158138

In [44]:
# Write the predictions as the submission file. Handing the path to to_csv lets
# pandas open, write and close the file itself. This avoids building the whole
# CSV text in memory first, and avoids the doubled line endings that writing
# that text through a text-mode handle can produce on Windows.
df_results.to_csv('../../models/submission.csv', index=False)