In [59]:
import pandas as pd
import numpy as np

# Load transformed_data.csv into the working frame; row 0 of the file supplies the column names.
df = pd.read_csv(
    filepath_or_buffer="transformed_data.csv",
    header=0,
)

In [60]:
# Inspect the distinct raw values of engine_capacity_cc before any type conversion
pd.unique(df['engine_capacity_cc'])

array(['Data missing or out of range', '1197', '1395', '1991', '1796',
       '1390', '999', '2993', '1798', '1499', '1596', '1998', '125',
       '1399', '1598', '2143', '1477', '2198', '2487', '1364', '2261',
       '1797', '1595', '1332', '1950', '2488', '998', '3199', '1198',
       '1498', '2148', '1580', '1995', '1328', '1591', '1400', '2521',
       '996', '1339', '1968', '1329', '2499', '647', '1299', '1397',
       '1500', '1229', '1560', '2967', '12777', '1360', '1799', '1685',
       '1388', '1248', '2299', '698', '1999', '1769', '10837', '5132',
       '2987', '1368', '1149', '1086', '1790', '1482', '1398', '1910',
       '2000', '1199', '1994', '1584', '2494', '1389', '4500', '1497',
       '6700', '1461', '1997', '2400', '1956', '2998', '1984', '1868',
       '2996', '4951', '1386', '1200', '2298', '108', '3498', '2995',
       '649', '1333', '1587', '1242', '2461', '2300', '2295', '2200',
       '2463', '1586', '2184', '4367', '1496', '1296', '745', '2393',
       '6693'

In [61]:
# Per-column overview: one row per column giving its name, its dtype and
# how many distinct values it holds
pd.DataFrame(
    list(zip(df.columns, df.dtypes, df.nunique())),
    columns=['column', 'dtype', 'nunique'],
)

Unnamed: 0,column,dtype,nunique
0,longitude.x,float64,6353
1,latitude.x,float64,6309
2,date.x,object,365
3,day_of_week,object,7
4,time,object,1055
5,first_road_class,object,4
6,road_type,object,6
7,speed_limit,int64,6
8,junction_detail,object,10
9,junction_control,object,6


In [62]:
# drop lsoa variables
# keep longitude, latitude, date as they may be used in future to join to extra data, but do not use them in the model
# remove age variables and keep age_band variables
# errors='ignore' keeps this cell re-runnable: once the columns are gone, executing
# the cell again is a no-op instead of raising KeyError
df = df.drop(
    columns=['lsoa_of_casualty', 'lsoa_of_driver', 'age_of_casualty', 'age_of_driver'],
    errors='ignore',
)

# categorise speed_limit: store it as object dtype so it is no longer an integer column
df['speed_limit'] = df['speed_limit'].astype('object')

# convert engine_capacity_cc to continuous: the missing-value sentinel string
# becomes NaN, then the remaining values are cast to float
df['engine_capacity_cc'] = (
    df['engine_capacity_cc']
    .replace('Data missing or out of range', np.nan)
    .astype('float')
)

In [63]:
# drop variables that are unlikely to be predictive (using own judgement)
# pedestrian crossings should not be relevant to cyclists, assuming cyclists are cycling on the road
# age of vehicle not important - aggressive or careless drivers could equally be driving a new car or an old car
# errors='ignore' keeps this cell re-runnable: once the columns are gone, executing
# the cell again is a no-op instead of raising KeyError
df = df.drop(
    columns=[
        'pedestrian_crossing_human_control',
        'pedestrian_crossing_physical_facilities',
        'age_of_vehicle',
    ],
    errors='ignore',
)

In [64]:
# check there are no duplicates: mark every row that repeats an earlier row
# across all columns, then report whether any row was marked
dups = df.duplicated(subset=None, keep='first')
dups.any()

False

Next steps
- check % missing by variable
- consider imputing values for, or dropping, variables with a high % missing
- create new date and time variables (bin time into 4-hour periods)
- look at variable distributions - apply a log transform if necessary
- look at correlation/multicollinearity between variables
- consider PCA (principal component analysis)?
- use chi-square tests to pick useful variables
