# Exploratory data analysis of Dutch road accidents
_Timespan of data: 2004-2015_

<img src='./img/accident.jpeg'>

Author: Erfan Nariman  
Date: 20-09-2020

In [1]:
# standard library
import os
import re

# third party
import pandas as pd

# project helpers
from src.get_files import GetFiles

# show every column when a wide frame is displayed
pd.set_option("display.max_columns", None)

### Check if the CSVs are in the data folder; otherwise download them from the Dutch government [website](https://data.overheid.nl/dataset/verkeersongevallen)

In [2]:
get_files = GetFiles()

# Make sure the data folder exists so that listing it cannot raise on a fresh checkout.
os.makedirs("./data", exist_ok=True)

# Look for actual CSV files instead of "any entry": placeholder or hidden files
# (e.g. .gitkeep, .DS_Store) must not make the folder look populated and skip the download.
has_csvs = any(name.endswith(".csv") for name in os.listdir("./data"))

if not has_csvs:
    get_files.download_csvs()
else:
    print("CSV's of accidents are already download and in the data folder")

CSV's of accidents are already download and in the data folder


### Get all files with the `.csv` suffix

In [3]:
# names of the CSV files in the data folder, in alphabetical order
files = sorted(filter(lambda entry: entry.endswith(".csv"), os.listdir("./data")))
files

['ongevallen-2004.csv',
 'ongevallen-2005.csv',
 'ongevallen-2006.csv',
 'ongevallen-2007.csv',
 'ongevallen-2008.csv',
 'ongevallen-2009.csv',
 'ongevallen-2010.csv',
 'ongevallen-2011.csv',
 'ongevallen-2012.csv',
 'ongevallen-2013.csv',
 'ongevallen-2014.csv',
 'ongevallen-2015.csv']

### Read in the data

While reading in the CSVs, we add a new column called `Year`, which holds the number extracted from the filename.

In [4]:
# Load every yearly CSV, tag its rows with the year taken from the file name,
# and stack everything into one frame with a fresh 0..n-1 index.
# The pattern is a raw string: "\d" inside a plain string literal is an invalid
# escape sequence that newer Python versions warn about.
# `Year` holds the matched text (a string); it is not converted to a number here.
df = pd.concat(
    [
        pd.read_csv(f"./data/{file}", low_memory=False).assign(
            Year=re.search(r"\d+", file).group(0)
        )
        for file in files
    ],
    ignore_index=True,
)
df.shape

(1326104, 120)

### Remove columns with more than `80%` missing values

1. get sum of `NaN` per column and sort them descending
2. get the column names which have more than 80% missing values
3. drop these columns

In [5]:
# 1: number of missing values in each column, largest first
nan = df.isna().sum().sort_values(ascending=False)
# 2: columns whose share of missing rows is above 80%
missing_share = nan / len(df)
to_remove = missing_share[missing_share > 0.8].index
# 3: drop those columns
df = df.drop(columns=to_remove)
df.shape

(1326104, 73)

### Convert object columns to categorical

1. get column names with object dtype
2. per column, count the amount of unique values
3. create a series and sort them descending

In [6]:
# 1: names of all columns stored with the object dtype
object_cols = df.select_dtypes(include=object).columns
# 2: number of distinct values found in each of those columns
unique_vals = {name: len(df[name].unique()) for name in object_cols}
# 3: as a Series, highest count first
s = pd.Series(unique_vals)
s = s.sort_values(ascending=False)
s.head(10)

Communicatie_Ref       1144887
StraatNaam               66479
WoonplaatsNaam            2670
WegNummer                  628
GemeenteNaam               466
WijkteamNaam               241
RijbewijsCategry           219
BasiseenheidNaam           143
Inrichting                  82
PolitieDistrictNaam         36
dtype: int64

As we can see, `Communicatie_Ref` and `StraatNaam` have a very large number of unique values, so those will stay as object dtype; the rest will be converted to the `category` dtype to save memory.

In [7]:
# total size of the frame before the conversion, in units of one million bytes
# (deep=True also measures the contents of object columns)
df.memory_usage(deep=True).sum() / 1e6

3480.77939

In [8]:
# object columns that are left untouched
exclude = ["Communicatie_Ref", "StraatNaam"]
# every other object column is switched to the category dtype
to_categorical = [name for name in object_cols if name not in exclude]
category_dtypes = dict.fromkeys(to_categorical, "category")
df[to_categorical] = df[to_categorical].astype(category_dtypes)

In [9]:
# total size of the frame after the categorical conversion, in units of one million bytes
df.memory_usage(deep=True).sum() / 1e6

508.265174