## Imports

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium import FeatureGroup, LayerControl, Map, Marker
import pandas as pd
import random
from scipy import stats
%matplotlib inline

## Consts

In [3]:
PROVIDER_AND_ID = "provider_and_id"
YEAR = "accident_year"
LAT = "latitude"
LONG = "longitude"
X = "x"
Y = "y"
SEVERITY = "injury_severity_hebrew"
ROAD_SEGMENT_NAME = 'road_segment_name'
ROAD_SEGMENT = 'road_segment_id'
ROAD_SEGMENT_LENGTH = 'road_segment_length_km'
SEVERITY_DEAD = 'הרוג'
SEVERITY_HARD = 'פצוע קשה'
RELEVANT_KEYS_ANALYSIS = [PROVIDER_AND_ID, YEAR, ROAD_SEGMENT_NAME, ROAD_SEGMENT, ROAD_SEGMENT_LENGTH, LAT, LONG, X, Y, SEVERITY]

DEFAULT_ZOOM = 9
FROM_YEAR = 2014
KM_FROM_MEDIAN = 'km_from_median'
DEFAULT_COORD = (32.079184, 34.824768)

## Load data

In [4]:
csv_path = r"views2020/involved_markers_hebrew.csv"
data = pd.read_csv(csv_path, na_values='')

In [5]:
# get only relevant data
filtered = data[RELEVANT_KEYS_ANALYSIS].drop_duplicates(subset=[PROVIDER_AND_ID, LAT, LONG])
# from this year the data was corrected
filtered = filtered[filtered[YEAR] >= FROM_YEAR]
filtered

Unnamed: 0,provider_and_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,latitude,longitude,x,y,injury_severity_hebrew
726,12015000826,2015,,,,32.043049,34.770967,178508.0,661167.0,פצוע קל
727,12015002360,2015,צומת מחסיה - צומת נס הרים,38660010.0,9.0,31.745920,35.014625,201464.0,628153.0,
728,12015003578,2015,,,,32.525085,35.161378,215404.0,714540.0,פצוע קל
729,12015006896,2015,כניסה למנהרה - כניסה לדרך האלוף עוזי נרקיס,10090.0,9.0,31.801289,35.184305,217543.0,634277.0,פצוע קל
733,12015006964,2015,,,,32.320981,34.863568,187352.0,691956.0,
...,...,...,...,...,...,...,...,...,...,...
1881289,32020025415,2020,מחלף גהה - מחלף מורשה,40140.0,3.7,32.092494,34.845892,185603.0,666624.0,פצוע קל
1881292,32020032761,2020,,,,32.081712,34.796332,180920.0,665445.0,פצוע קל
1881293,32020033896,2020,,,,32.954099,35.163235,215597.0,762118.0,פצוע קל
1881299,32020049734,2020,,,,32.059339,34.766696,178112.0,662975.0,פצוע קל


## Plot funcs

In [6]:
def create_map(coord):
    
    folium_map = folium.Map(location=coord, zoom_start=DEFAULT_ZOOM)
    
    return folium_map

In [7]:
def plot_coord(folium_map, coord, count, color, icon):
   
    tooltip = 'Click to see accident counts'
    folium.Marker(coord, icon=folium.Icon(color=color, icon=icon), 
                  popup=f'<i>{count}</i>', tooltip=tooltip).add_to(folium_map)

In [8]:
def plot_all_coords(coords, folium_map, color='green', icon='ok-sign'):
    """
    Plot all coords according to location and count
    """

    coord_to_count = coords.groupby([LAT, LONG]).size()
    
    for coord, count in coord_to_count.items():
        plot_coord(folium_map, coord, count, color=color, icon=icon)
    
    return coord_to_count

In [9]:
def get_median_coord(coords, decimals=4):
    
    return tuple(np.around(np.median(coords[[LAT, LONG]], axis=0), decimals))

In [10]:
def plot_median_coord(median_coord, coord_to_count, folium_map, color, icon):
    
    count = coord_to_count[median_coord] if median_coord in coord_to_count else 1
    plot_coord(folium_map, median_coord, count, color=color, icon=icon)

In [11]:
def plot_all_segment_coords(segment_rows, outlier_rows, segment_map_layer, decimals=4):
    """
    Plot all coords in segment, rounded to decimals, according to their location and count
    Median is blue, outliers are red, all other coords are green
    
    Keyword arguments:
    segment_rows -- segment rows without outliers
    outlier_rows -- outlier segment rows
    segment_map_layer -- points are added to this layer
    decimals -- round to decimals, very close points are merged 
                to the same point with a larger count
    """
    
    segment_coords = segment_rows[[LAT, LONG]].dropna().round(decimals=decimals)
    outlier_coords = outlier_rows[[LAT, LONG]].dropna().round(decimals=decimals)
    
    coord_to_count = plot_all_coords(segment_coords, segment_map_layer, color='green', icon='ok-sign')
    
    median_coord = get_median_coord(segment_coords, decimals=decimals)
    plot_median_coord(median_coord, coord_to_count, segment_map_layer, color='blue', icon='screenshot')
    
    plot_all_coords(outlier_coords, segment_map_layer, color='red', icon='exclamation-sign')


In [12]:
def get_segment_rows(data, road_segment_id, max_coords=None):
    
    segment_rows = data[data[ROAD_SEGMENT]==road_segment_id]
    
    return segment_rows[:max_coords]

In [86]:
def plot_outliers(data, outliers, max_coords=500):
    
    outlier_map = create_map(DEFAULT_COORD)
    
    for road_segment_id, outlier_rows in outliers.groupby(ROAD_SEGMENT):
        
        first_outlier = outlier_rows.iloc[0]
        segment_name = first_outlier[ROAD_SEGMENT_NAME]
        segment_length = round(first_outlier[ROAD_SEGMENT_LENGTH], 1)
        road_segment_id = int(road_segment_id)
        
        segment_rows = get_segment_rows(data, road_segment_id, max_coords)
        segment_rows = segment_rows[~segment_rows[PROVIDER_AND_ID].isin(outlier_rows[PROVIDER_AND_ID])]
        
        segment_map_layer = FeatureGroup(name=f'{road_segment_id} {segment_length} km {segment_name}', show=False)
        plot_all_segment_coords(segment_rows, outlier_rows, segment_map_layer)
        
        segment_map_layer.add_to(outlier_map)
        
    LayerControl().add_to(outlier_map)
    
    return outlier_map

## Find outliers by km from median

In [49]:
def spherical_distance(lat1, long1, lat2, long2):
    """
    Calculate the spherical distance in km between two coordinates in WGS-84 using Vincenty's formulae
    
    credit: https://www.johndcook.com/blog/2018/11/24/spheroid-distance/
    """

    lat1, long1, lat2, long2 = np.deg2rad(lat1), np.deg2rad(long1), np.deg2rad(lat2), np.deg2rad(long2)
    
    phi1 = 0.5*np.pi - lat1
    phi2 = 0.5*np.pi - lat2
    r = 0.5*(6378137 + 6356752) # mean radius in meters
    t = np.sin(phi1)*np.sin(phi2)*np.cos(long1-long2) + np.cos(phi1)*np.cos(phi2)
    
    # -1<=t<=1 for the arcos func
    t = np.minimum(t, 1)
    t = np.maximum(t, -1)
    
    return round(r * np.arccos(t) / 1000, 2)

In [15]:
def is_far_from_median(data, row, outliers, segment_length, padding):
    """
    Check if the distance of row from the median_coord is > segment_length + padding*segment_length
    The median is computed according to all coords, after removing the outliers and the current coord
    """
    
    curr_coord = [row[LAT], row[LONG]]

    data_without_outlier = data[~data[PROVIDER_AND_ID].isin(outliers) & data[PROVIDER_AND_ID] != row[PROVIDER_AND_ID]]

    median_coord = np.median(data_without_outlier[[LAT, LONG]], axis=0)

    dist = spherical_distance(*curr_coord, *median_coord)

    return dist > segment_length + padding*segment_length

In [16]:
def get_segment_outliers_far_from_median(data, segment_length, padding):
    
    outliers = []
    
    for i, row in data.iterrows():
        if is_far_from_median(data, row, outliers, segment_length, padding):
            outliers.append(row[PROVIDER_AND_ID])
            
    return outliers

In [17]:
def add_km_from_median_col(other_data, outliers_data):
    
    median_coord = np.median(other_data[[LAT, LONG]], axis=0)

    outliers_data[KM_FROM_MEDIAN] = outliers_data.apply(lambda row: spherical_distance(row[LAT], row[LONG], *median_coord), axis=1)
    outliers_data = outliers_data.sort_values(by=KM_FROM_MEDIAN)
    
    return outliers_data

In [78]:
def calc_outliers_for_segment_km(data, min_sample_size, padding=0.25):
     
    data = data.dropna(subset=[LAT, LONG])
    
    if len(data) <= min_sample_size:
        return []
    
    segment_length = data.iloc[0][ROAD_SEGMENT_LENGTH]
    
    outliers = get_segment_outliers_far_from_median(data, segment_length, padding)

    outliers_data = data[data[PROVIDER_AND_ID].isin(outliers)].copy(deep=True)
    other_data = data[~data[PROVIDER_AND_ID].isin(outliers)].copy(deep=True)
    
    if len(outliers) > 0:
        outliers_data = add_km_from_median_col(other_data, outliers_data)
        
    return outliers_data

In [19]:
def get_outliers_by_km(data, min_sample_size=2):
    
    res = []
    for seg, rows in data.groupby(ROAD_SEGMENT):
        outliers = calc_outliers_for_segment_km(rows, min_sample_size)
        if len(outliers) > 0:
            res.append(outliers)
            
    return pd.concat(res)

### Old method:

In [20]:
def calc_zscores(data, zscore_thresh, min_sample_size):

    coords = data[[X, Y]].dropna()
    
    # check if sample size is big enough for the statistic calculation
    if len(coords.drop_duplicates()) <= min_sample_size:
        return []
    
    # calculate zscores for each column
    zscores = stats.zscore(coords, axis = 0)
    zscores = np.nan_to_num(zscores)
    zscores = np.abs(zscores)
    # get indices to outliers
    outliers = np.argwhere(zscores > zscore_thresh)
    outlier_rows = np.unique(outliers[:, 0])
    outlier_rows = data.iloc[outlier_rows]
    
    return outlier_rows

def calc_outliers_for_segment_zscores(data, zscore_thresh, min_sample_size, check_by_km):
    
    outlier_rows = calc_zscores(data, zscore_thresh, min_sample_size)
    
    if len(outlier_rows) > 0:
        
        if check_by_km:
            median_coord = np.median(data[[LAT, LONG]].dropna(), axis=0)
            segment_length = outlier_rows.iloc[0][ROAD_SEGMENT_LENGTH]

            outlier_rows = outlier_rows[spherical_distance(outlier_rows[LAT], outlier_rows[LONG], *median_coord) 
                                        > segment_length]
        
        return outlier_rows
    
    return []

def get_outliers_by_z_scores(data, zscore_thresh, min_sample_size=0, check_by_km=True):
    res = []
    for seg, rows in data.groupby(ROAD_SEGMENT):
        outliers = calc_outliers_for_segment_zscores(rows, zscore_thresh, min_sample_size, check_by_km)
        if len(outliers) > 0:
            res.append(outliers)
    return pd.concat(res)

## Pick threshold and minimum sample size

In [21]:
ZSCORE_THRESH = 5
MIN_SAMPLE_SIZE = 10

## Test

In [87]:
segmend_id = 10010
segment_rows = filtered[filtered[ROAD_SEGMENT]==segmend_id]
outliers_test = calc_outliers_for_segment_km(segment_rows, MIN_SAMPLE_SIZE)

outliers_test

Unnamed: 0,provider_and_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,latitude,longitude,x,y,injury_severity_hebrew,km_from_median
262132,32014096076,2014,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.995741,34.876132,188425.0,655886.0,פצוע קל,7.23
79852,32016097605,2016,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.985041,34.892274,189947.0,654695.0,פצוע קל,9.14
630376,32014086342,2014,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.985041,34.892274,189947.0,654695.0,פצוע קל,9.14
1290954,32015074726,2015,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.980103,34.901051,190775.0,654145.0,פצוע קל,10.14
193073,32015032765,2015,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.980103,34.901051,190775.0,654145.0,פצוע קל,10.14
1096313,32016024246,2016,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.960313,34.935421,194018.0,651942.0,פצוע קל,14.05
293439,32014038567,2014,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.925197,34.927096,193221.0,648050.0,פצוע קל,15.9
298283,32014002960,2014,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.900807,34.937782,194225.0,645343.0,פצוע קל,18.57
33514,32015070182,2015,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.818555,35.021194,202100.0,636206.0,,30.64
171836,32015057939,2015,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.804907,35.045774,204425.0,634689.0,,33.31


In [88]:
plot_outliers(filtered, outliers_test)

In [80]:
segmend_id = 650020
segment_rows = filtered[filtered[ROAD_SEGMENT]==segmend_id]

outliers_test = calc_outliers_for_segment_km(segment_rows, MIN_SAMPLE_SIZE)

plot_outliers(filtered, outliers_test)

The algorithm can't detect outliers that are very close to the start or the end of the segment.    
Note that there is one coord at צומת חדרה מזרח   
Should it be in this segment or the next segment? 

### Test segment padding

In [75]:
segmend_id = 650030
segment_rows = filtered[filtered[ROAD_SEGMENT]==segmend_id]

In [76]:
outliers_test = calc_outliers_for_segment_km(segment_rows, MIN_SAMPLE_SIZE, padding=0)
len(outliers_test)

27

In [27]:
outliers_test.head()

Unnamed: 0,provider_and_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,latitude,longitude,x,y,injury_severity_hebrew,km_from_median
4635,32020034458,2020,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,32.447831,34.943457,194909.0,706002.0,,1.5
824257,32014091699,2014,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,32.447831,34.943457,194909.0,706002.0,,1.5
785725,12016080255,2016,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,32.447831,34.943457,194909.0,706002.0,פצוע קל,1.5
784117,12016027568,2016,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,32.447831,34.943457,194909.0,706002.0,,1.5
760624,12015083066,2015,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,32.447831,34.943457,194909.0,706002.0,פצוע קל,1.5


road_segment_length_km should be 1.9 according to google maps

In [28]:
plot_outliers(filtered, outliers_test)

In [29]:
outliers_test = calc_outliers_for_segment_km(segment_rows, MIN_SAMPLE_SIZE, padding=0.2)
len(outliers_test)

27

In [77]:
outliers_test = calc_outliers_for_segment_km(segment_rows, MIN_SAMPLE_SIZE, padding=0.25)
len(outliers_test)

0

In [74]:
segmend_id = 440010
segment_rows = filtered[filtered[ROAD_SEGMENT]==segmend_id]
outliers_test = calc_outliers_for_segment_km(segment_rows, MIN_SAMPLE_SIZE, padding=0.25)

outliers_test

Unnamed: 0,provider_and_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,latitude,longitude,x,y,injury_severity_hebrew,km_from_median
267975,32018070927,2018,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,31.89574,34.887068,189426.0,644794.0,,13.06
287602,32019031467,2019,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,31.89574,34.887068,189426.0,644794.0,,13.06
661535,32019052527,2019,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,31.89574,34.887068,189426.0,644794.0,,13.06
931886,32019030852,2019,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,31.89574,34.887068,189426.0,644794.0,,13.06
1404219,32019092402,2019,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,31.939954,34.865469,187398.0,649703.0,פצוע קל,18.29


In [69]:
plot_outliers(filtered, outliers_test)

## Calculate all outliers

In [79]:
outliers_by_km = get_outliers_by_km(filtered, MIN_SAMPLE_SIZE)
len(outliers_by_km)

534

### Segments with outliers

In [82]:
len(outliers_by_km[ROAD_SEGMENT].unique())

56

In [33]:
outliers_by_km.head(10)

Unnamed: 0,provider_and_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,latitude,longitude,x,y,injury_severity_hebrew,km_from_median
262132,32014096076,2014,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.995741,34.876132,188425.0,655886.0,פצוע קל,7.23
79852,32016097605,2016,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.985041,34.892274,189947.0,654695.0,פצוע קל,9.14
630376,32014086342,2014,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.985041,34.892274,189947.0,654695.0,פצוע קל,9.14
1290954,32015074726,2015,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.980103,34.901051,190775.0,654145.0,פצוע קל,10.14
193073,32015032765,2015,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.980103,34.901051,190775.0,654145.0,פצוע קל,10.14
1096313,32016024246,2016,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.960313,34.935421,194018.0,651942.0,פצוע קל,14.05
293439,32014038567,2014,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.925197,34.927096,193221.0,648050.0,פצוע קל,15.9
298283,32014002960,2014,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.900807,34.937782,194225.0,645343.0,פצוע קל,18.57
33514,32015070182,2015,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.818555,35.021194,202100.0,636206.0,,30.64
171836,32015057939,2015,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.804907,35.045774,204425.0,634689.0,,33.31


## Unreliable segments
**Most of the coords are wrong, therefore it is not possible to find outliers - the median is in the wrong coord**


צומת מעבר רפיח - צומת חבל עזה 40006

70040 מחלף בני דרום - צומת בית רבן

70050 צומת בית רבן - מחלף גדרה

4120010 צומת ביל"ו - כניסה לרחובות דרך הפרדסים

6520003 צומת שפיה - כניסה לזכרון יעקב (ישן)

440030 צומת גזר - מחלף רמלוד

600005 יישוב באר שבע (לערד) - צומת חטיבת הנגב

4120010 צומת ביל"ו - כניסה לרחובות דרך הפרדסים

4120020 מפגש עם מסילת ברזל - צומת נס ציונה

**Wrong segment length**

Should be 4.5 km, segment_length=0.8   
550030 צומת כפר סבא (מזרח) - צומת לאלפי מנשה

In [34]:
unreliable_segment_ids = [40006, 70040, 70050, 4120010, 6520003, 440030, 600005, 4120010, 4120020]

segment_rows = filtered[filtered[ROAD_SEGMENT].isin(unreliable_segment_ids)]
unreliable_segments_outliers = get_outliers_by_km(segment_rows, MIN_SAMPLE_SIZE)

In [35]:
unreliable_segments_outliers

Unnamed: 0,provider_and_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,latitude,longitude,x,y,injury_severity_hebrew,km_from_median
540613,32015062919,2015,צומת מעבר רפיח - צומת חבל עזה,40006.0,46.8,32.300595,34.899988,190775.0,689685.0,,62.35
830628,32015035754,2015,צומת מעבר רפיח - צומת חבל עזה,40006.0,46.8,32.202832,34.884511,189285.0,678848.0,פצוע קל,73.21
304738,12015020261,2015,צומת מעבר רפיח - צומת חבל עזה,40006.0,46.8,32.186298,34.889977,189795.0,677013.0,פצוע קל,74.85
4003,32014067024,2014,צומת מעבר רפיח - צומת חבל עזה,40006.0,46.8,32.177709,34.888469,189650.0,676061.0,,75.81
700652,32014093457,2014,צומת מעבר רפיח - צומת חבל עזה,40006.0,46.8,32.177709,34.888469,189650.0,676061.0,,75.81
...,...,...,...,...,...,...,...,...,...,...,...
738371,32020008535,2020,"צומת ביל""ו - כניסה לרחובות דרך הפרדסים",4120010.0,1.6,31.965546,34.802908,181493.0,652561.0,,7.24
739387,32015028670,2015,"צומת ביל""ו - כניסה לרחובות דרך הפרדסים",4120010.0,1.6,31.992308,34.813363,182492.0,655525.0,פצוע קל,10.19
659000,12017017197,2017,מפגש עם מסילת ברזל - צומת נס ציונה,4120020.0,1.5,31.908342,34.805335,181699.0,646217.0,פצוע קל,2.51
1509886,12018034544,2018,מפגש עם מסילת ברזל - צומת נס ציונה,4120020.0,1.5,31.908342,34.805335,181699.0,646217.0,פצוע קל,2.51


In [91]:
outliers_by_km_filtered = outliers_by_km[~outliers_by_km[ROAD_SEGMENT].isin(unreliable_segment_ids)]
outlier_map = plot_outliers(filtered, outliers_by_km_filtered)

outlier_map

In [93]:
outliers_by_km_unreliable = outliers_by_km[outliers_by_km[ROAD_SEGMENT].isin(unreliable_segment_ids)]
outlier_map_unreliable = plot_outliers(filtered, outliers_by_km_unreliable)

outlier_map_unreliable

### Severe outliers

In [39]:
severe_outliers = outliers_by_km.loc[(outliers_by_km[SEVERITY] == SEVERITY_HARD) | (outliers_by_km[SEVERITY] == SEVERITY_DEAD)]
print(len(severe_outliers))
severe_outliers

11


Unnamed: 0,provider_and_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,latitude,longitude,x,y,injury_severity_hebrew,km_from_median
618466,12019063641,2019,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.799843,35.114516,210934.0,634120.0,פצוע קשה,38.33
149832,12017054749,2017,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,31.799843,35.114516,210934.0,634120.0,פצוע קשה,38.33
356736,12016068274,2016,מחלף אשדוד - מחלף בני דרום,70030.0,0.6,31.797965,34.798619,181018.0,633980.0,פצוע קשה,9.83
1573338,12015055829,2015,מחלף חולות - מחלף משה דיין,200005.0,4.4,32.081591,34.7979,181068.0,665431.0,פצוע קשה,13.06
1512650,12019031576,2019,מחלף חולות - מחלף משה דיין,200005.0,4.4,32.081591,34.7979,181068.0,665431.0,הרוג,13.06
508948,12017016314,2017,צומת להבים - מחלף להבים,310010.0,3.2,31.255024,35.139688,213282.0,573709.0,פצוע קשה,35.33
57526,12016058694,2016,כניסה לעומר - צומת שוקת,600012.0,7.2,31.627132,35.132833,212656.0,614968.0,הרוג,45.26
128068,12019053708,2019,צומת עין אפק - מחלף גלעם,790001.0,5.0,32.780162,35.20377,219387.0,742827.0,פצוע קשה,11.88
211176,12018031336,2018,צומת תל עכו - צומת יסיף,850002.0,7.1,32.927923,35.319304,230193.0,759220.0,פצוע קשה,18.33
185573,12019032255,2019,צומת האלה - צומת עציונה,3750010.0,5.4,31.711315,35.054352,205223.0,624310.0,פצוע קשה,9.45


In [40]:
plot_outliers(filtered, severe_outliers)

### Old method - Z scores

In [41]:
outliers_zscore = get_outliers_by_z_scores(filtered, ZSCORE_THRESH, MIN_SAMPLE_SIZE, check_by_km=False)
len(outliers_zscore)

47

After removing false positives by checking distance from median:

In [50]:
outliers_zscore = get_outliers_by_z_scores(filtered, ZSCORE_THRESH, MIN_SAMPLE_SIZE)
len(outliers_zscore)

44

In [55]:
severe_outliers_zscore = outliers_zscore.loc[(outliers_zscore[SEVERITY] == SEVERITY_HARD) | (outliers_zscore[SEVERITY] == SEVERITY_DEAD)]
severe_outliers_zscore

Unnamed: 0,provider_and_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,latitude,longitude,x,y,injury_severity_hebrew
57526,12016058694,2016,כניסה לעומר - צומת שוקת,600012.0,7.2,31.627132,35.132833,212656.0,614968.0,הרוג
211176,12018031336,2018,צומת תל עכו - צומת יסיף,850002.0,7.1,32.927923,35.319304,230193.0,759220.0,פצוע קשה


By using z-scores, only 44 outliers are detected (versus 534)

## Save output

In [56]:
len(filtered), len(outliers_by_km)

(373239, 508)

In [83]:
outliers_by_km.to_csv("outliers.csv", header=True, index=False, encoding='utf-8')

In [84]:
unreliable_segment_ids = [40006, 70040, 70050, 4120010, 6520003, 440030, 600005, 4120010, 4120020]

unreliable_segments = filtered[filtered[ROAD_SEGMENT].isin(unreliable_segment_ids)]

unreliable_segments.to_csv("unreliable_segments.csv", header=True, index=False, encoding='utf-8')

In [94]:
outlier_map.save('outliers.html')
outlier_map_unreliable.save('unreliable_segments.html')