## Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium import FeatureGroup, LayerControl, Map, Marker
import pandas as pd
import random
from scipy import stats
%matplotlib inline

## Consts

In [2]:
PROVIDER_AND_ID = "provider_and_id"
YEAR = "accident_year"
LAT = "latitude"
LONG = "longitude"
X = "x"
Y = "y"
SEVERITY = "injury_severity_hebrew"
ROAD_SEGMENT_NAME = 'road_segment_name'
ROAD_SEGMENT = 'road_segment_id'
ROAD_SEGMENT_LENGTH = 'road_segment_length_km'
SEVERITY_DEAD = 'הרוג'
SEVERITY_HARD = 'פצוע קשה'
ID = 'accident_id'
PROVIDER_CODE = 'provider_code'
KM_LOCATION = 'km'
IS_OUTLIER = 'is_outlier'

RELEVANT_KEYS_ANALYSIS = [PROVIDER_AND_ID, PROVIDER_CODE, ID, YEAR, ROAD_SEGMENT_NAME, ROAD_SEGMENT, ROAD_SEGMENT_LENGTH, KM_LOCATION,
                          LAT, LONG, X, Y, SEVERITY]

DEFAULT_ZOOM = 9
FROM_YEAR = 2014
KM_FROM_MEDIAN = 'km_from_median'
DEFAULT_COORD = (32.079184, 34.824768)

## Load data

In [3]:
csv_path = r"views2020/involved_markers_hebrew.csv"
data = pd.read_csv(csv_path, na_values='')

In [4]:
data.head()

Unnamed: 0,accident_id,provider_and_id,provider_code,file_type_police,involved_type,involved_type_hebrew,license_acquiring_date,age_group,age_group_hebrew,sex,...,vehicle_status_hebrew,vehicle_attribution,vehicle_attribution_hebrew,seats,total_weight,total_weight_hebrew,vehicle_vehicle_type,vehicle_vehicle_type_hebrew,vehicle_damage,vehicle_damage_hebrew
0,2008042695,12008042695,1,,3,נפגע,0,15,70-74,2.0,...,,,,,,,,,,
1,2008077932,12008077932,1,,3,נפגע,0,7,30-34,1.0,...,,,,,,,,,,
2,2008061813,12008061813,1,,3,נפגע,0,16,75-79,2.0,...,,,,,,,,,,
3,2008053506,12008053506,1,,3,נפגע,0,2,05-09,1.0,...,,,,,,,,,,
4,2008009914,12008009914,1,,3,נפגע,0,15,70-74,2.0,...,,,,,,,,,,


In [5]:
# get only relevant data
filtered = data[RELEVANT_KEYS_ANALYSIS].drop_duplicates(subset=[PROVIDER_AND_ID]).dropna(subset=[LAT, LONG])
# from this year the data was corrected
filtered = filtered[filtered[YEAR] >= FROM_YEAR]
filtered

Unnamed: 0,provider_and_id,provider_code,accident_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,km,latitude,longitude,x,y,injury_severity_hebrew
726,12015000826,1,2015000826,2015,,,,,32.043049,34.770967,178508.0,661167.0,פצוע קל
727,12015002360,1,2015002360,2015,צומת מחסיה - צומת נס הרים,38660010.0,9.0,25.0,31.745920,35.014625,201464.0,628153.0,
728,12015003578,1,2015003578,2015,,,,,32.525085,35.161378,215404.0,714540.0,פצוע קל
729,12015006896,1,2015006896,2015,כניסה למנהרה - כניסה לדרך האלוף עוזי נרקיס,10090.0,9.0,558.0,31.801289,35.184305,217543.0,634277.0,פצוע קל
733,12015006964,1,2015006964,2015,,,,,32.320981,34.863568,187352.0,691956.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1881289,32020025415,3,2020025415,2020,מחלף גהה - מחלף מורשה,40140.0,3.7,1191.0,32.092494,34.845892,185603.0,666624.0,פצוע קל
1881292,32020032761,3,2020032761,2020,,,,,32.081712,34.796332,180920.0,665445.0,פצוע קל
1881293,32020033896,3,2020033896,2020,,,,,32.954099,35.163235,215597.0,762118.0,פצוע קל
1881299,32020049734,3,2020049734,2020,,,,,32.059339,34.766696,178112.0,662975.0,פצוע קל


## Plot funcs

In [6]:
def create_map(coord):
    
    folium_map = folium.Map(location=coord, zoom_start=DEFAULT_ZOOM)
    
    return folium_map

In [7]:
def plot_coord(folium_map, coord, count, color, icon):
   
    tooltip = 'Click to see accident counts'
    folium.Marker(coord, icon=folium.Icon(color=color, icon=icon), 
                  popup=f'<i>{count}</i>', tooltip=tooltip).add_to(folium_map)

In [8]:
def plot_all_coords(coords, folium_map, color='green', icon='ok-sign'):
    """
    Plot all coords according to location and count
    """

    coord_to_count = coords.groupby([LAT, LONG]).size()
    
    for coord, count in coord_to_count.items():
        plot_coord(folium_map, coord, count, color=color, icon=icon)
    
    return coord_to_count

In [9]:
def plot_median_coord(median_coord, coord_to_count, folium_map, color, icon):
    
    count = coord_to_count[median_coord] if median_coord in coord_to_count else 1
    plot_coord(folium_map, median_coord, count, color=color, icon=icon)

In [10]:
def plot_all_segment_coords(segment_rows, outlier_rows, median_coord, segment_map_layer, decimals=4):
    """
    Plot all coords in segment, rounded to decimals, according to their location and count
    Median is blue, outliers are red, all other coords are green
    
    Keyword arguments:
    segment_rows -- segment rows without outliers
    outlier_rows -- outlier segment rows
    segment_map_layer -- points are added to this layer
    decimals -- round to decimals, very close points are merged 
                to the same point with a larger count
    """
    
    segment_coords = segment_rows[[LAT, LONG]].dropna().round(decimals=decimals)
    outlier_coords = outlier_rows[[LAT, LONG]].dropna().round(decimals=decimals)
    
    coord_to_count = plot_all_coords(segment_coords, segment_map_layer, color='green', icon='ok-sign')
    
    median_coord = tuple(np.around(median_coord, decimals))
    plot_median_coord(median_coord, coord_to_count, segment_map_layer, color='blue', icon='screenshot')
    
    plot_all_coords(outlier_coords, segment_map_layer, color='red', icon='exclamation-sign')


In [11]:
def get_segment_rows(data, road_segment_id, max_coords=None):
    
    segment_rows = data[data[ROAD_SEGMENT]==road_segment_id]
    
    return segment_rows[:max_coords]

In [12]:
def plot_outliers(outlier_segments, max_coords=500):
    
    outlier_map = create_map(DEFAULT_COORD)
    
    for road_segment_id, segment_rows in outlier_segments.groupby(ROAD_SEGMENT):
        
        first_row = segment_rows.iloc[0]
        segment_name = first_row[ROAD_SEGMENT_NAME]
        segment_length = round(first_row[ROAD_SEGMENT_LENGTH], 1)
        road_segment_id = int(road_segment_id)
        
        outliers = segment_rows[segment_rows[IS_OUTLIER] == True]
        not_outliers = segment_rows[segment_rows[IS_OUTLIER] == False]
        median_row = segment_rows[segment_rows[IS_OUTLIER] == 'MEDIAN'].iloc[0]
        median_coord = (median_row[LAT], median_row[LONG])
        
        segment_map_layer = FeatureGroup(name=f'{road_segment_id} {segment_length} km {segment_name}', show=False)
        plot_all_segment_coords(not_outliers, outliers, median_coord, segment_map_layer)
        
        segment_map_layer.add_to(outlier_map)
        
    LayerControl().add_to(outlier_map)
    
    return outlier_map

## Find outliers by km from median

In [13]:
def spherical_distance(lat1, long1, lat2, long2):
    """
    Calculate the spherical distance in km between two coordinates in WGS-84 using Vincenty's formulae
    
    credit: https://www.johndcook.com/blog/2018/11/24/spheroid-distance/
    """

    lat1, long1, lat2, long2 = np.deg2rad(lat1), np.deg2rad(long1), np.deg2rad(lat2), np.deg2rad(long2)
    
    phi1 = 0.5*np.pi - lat1
    phi2 = 0.5*np.pi - lat2
    r = 0.5*(6378137 + 6356752) # mean radius in meters
    t = np.sin(phi1)*np.sin(phi2)*np.cos(long1-long2) + np.cos(phi1)*np.cos(phi2)
    
    # -1<=t<=1 for the arcos func
    t = np.minimum(t, 1)
    t = np.maximum(t, -1)
    
    return round(r * np.arccos(t) / 1000, 2)

In [14]:
def is_far_from_median(data, row, outliers, segment_length, padding):
    """
    Check if the distance of row from the median_coord is > segment_length + padding*segment_length
    The median is computed according to all coords, after removing the outliers and the current coord
    """
    
    curr_coord = [row[LAT], row[LONG]]

    data_without_outlier = data[~data[PROVIDER_AND_ID].isin(outliers) & data[PROVIDER_AND_ID] != row[PROVIDER_AND_ID]]

    median_coord = np.median(data_without_outlier[[LAT, LONG]], axis=0)

    dist = spherical_distance(*curr_coord, *median_coord)

    return dist > segment_length + padding*segment_length

In [15]:
def get_segment_outliers_far_from_median(data, segment_length, padding):
    
    outliers = []
    
    for i, row in data.iterrows():
        if is_far_from_median(data, row, outliers, segment_length, padding):
            outliers.append(row[PROVIDER_AND_ID])
            
    return outliers

In [16]:
def km_from_median_col(segment_rows, median_coord):
    
     return segment_rows.apply(lambda row: spherical_distance(row[LAT], row[LONG], *median_coord), 
                                                                axis=1)

In [17]:
def median_row(seg, segment_length, segment_name, median_coord):
    return {ROAD_SEGMENT: seg, ROAD_SEGMENT_LENGTH: segment_length, ROAD_SEGMENT_NAME: segment_name, 
            LAT: median_coord[0], LONG: median_coord[1], IS_OUTLIER: 'MEDIAN', KM_FROM_MEDIAN: 0}

In [18]:
def updated_segment_rows(seg, segment_length, segment_name, segment_rows, outliers):
    
    segment_rows = segment_rows.copy(deep=True)
    
    segment_rows[IS_OUTLIER] = np.where(segment_rows[PROVIDER_AND_ID].isin(outliers), True, False)

    not_outlier_rows = segment_rows[~segment_rows[IS_OUTLIER]]
    median_coord = np.median(not_outlier_rows[[LAT, LONG]], axis=0)
    
    segment_rows[KM_FROM_MEDIAN] = km_from_median_col(segment_rows, median_coord)
    
    segment_rows = segment_rows.append(median_row(seg, segment_length, segment_name, median_coord), ignore_index=True);
    
    return segment_rows

In [19]:
def get_outliers_by_km(data, min_sample_size=2, padding=0.25):
    
    res = []
    
    for seg, segment_rows in data.groupby(ROAD_SEGMENT):
        
        if len(segment_rows) <= min_sample_size:
            continue
            
        segment_length = segment_rows.iloc[0][ROAD_SEGMENT_LENGTH]
        segment_name = segment_rows.iloc[0][ROAD_SEGMENT_NAME]
        outliers = get_segment_outliers_far_from_median(segment_rows, segment_length, padding)
        
        if len(outliers) > 0:
            
            segment_rows = updated_segment_rows(seg, segment_length, segment_name, segment_rows, outliers)
            
            res.append(segment_rows)
            
    return pd.concat(res) if res else pd.DataFrame()

### Old method:

In [21]:
def calc_zscores(data, zscore_thresh, min_sample_size):

    coords = data[[X, Y]].dropna()
    
    # check if sample size is big enough for the statistic calculation
    if len(coords.drop_duplicates()) <= min_sample_size:
        return []
    
    # calculate zscores for each column
    zscores = stats.zscore(coords, axis = 0)
    zscores = np.nan_to_num(zscores)
    zscores = np.abs(zscores)
    # get indices to outliers
    outliers = np.argwhere(zscores > zscore_thresh)
    outlier_rows = np.unique(outliers[:, 0])
    outlier_rows = data.iloc[outlier_rows]
    
    return outlier_rows

def calc_outliers_for_segment_zscores(data, zscore_thresh, min_sample_size, check_by_km):
    
    outlier_rows = calc_zscores(data, zscore_thresh, min_sample_size)
    
    if len(outlier_rows) > 0:
        
        if check_by_km:
            median_coord = np.median(data[[LAT, LONG]].dropna(), axis=0)
            segment_length = outlier_rows.iloc[0][ROAD_SEGMENT_LENGTH]

            outlier_rows = outlier_rows[spherical_distance(outlier_rows[LAT], outlier_rows[LONG], *median_coord) 
                                        > segment_length]
        
        return outlier_rows
    
    return []

def get_outliers_by_z_scores(data, zscore_thresh, min_sample_size=0, check_by_km=True):
    res = []
    for seg, rows in data.groupby(ROAD_SEGMENT):
        outliers = calc_outliers_for_segment_zscores(rows, zscore_thresh, min_sample_size, check_by_km)
        if len(outliers) > 0:
            res.append(outliers)
    return pd.concat(res)

## Pick threshold and minimum sample size

In [22]:
ZSCORE_THRESH = 5
MIN_SAMPLE_SIZE = 10

## Test

In [23]:
segmend_id = 10010
segment_rows = filtered[filtered[ROAD_SEGMENT]==segmend_id]
outliers_test = get_outliers_by_km(segment_rows, MIN_SAMPLE_SIZE)

print(outliers_test.groupby(IS_OUTLIER).size())
outliers_test.head()

is_outlier
False     461
True       55
MEDIAN      1
dtype: int64


Unnamed: 0,provider_and_id,provider_code,accident_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,km,latitude,longitude,x,y,injury_severity_hebrew,is_outlier,km_from_median
0,32015030000.0,3.0,2015028000.0,2015.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,30.0,32.027302,34.809022,182096.0,659407.0,,False,0.0
1,32016040000.0,3.0,2016044000.0,2016.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,30.0,32.027302,34.809022,182096.0,659407.0,פצוע קל,False,0.0
2,12014020000.0,1.0,2014015000.0,2014.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,40.0,32.021411,34.817029,182850.0,658751.0,,False,1.0
3,32016100000.0,3.0,2016096000.0,2016.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,30.0,32.027302,34.809022,182096.0,659407.0,פצוע קל,False,0.0
4,32019040000.0,3.0,2019043000.0,2019.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,40.0,32.021411,34.817029,182850.0,658751.0,,False,1.0


In [24]:
plot_outliers(outliers_test)

In [25]:
segmend_id = 650020
segment_rows = filtered[filtered[ROAD_SEGMENT]==segmend_id]

outliers_test = get_outliers_by_km(segment_rows, MIN_SAMPLE_SIZE)

plot_outliers(outliers_test)

The algorithm can't detect outliers that are very close to the start or the end of the segment.    
Note that there is one coord at צומת חדרה מזרח   
Should it be in this segment or the next segment? 

### Test segment padding

In [26]:
segmend_id = 650030
segment_rows = filtered[filtered[ROAD_SEGMENT]==segmend_id]

In [27]:
outliers_test = get_outliers_by_km(segment_rows, MIN_SAMPLE_SIZE, padding=0)
outliers_test.groupby(IS_OUTLIER).size()

is_outlier
False     60
True      27
MEDIAN     1
dtype: int64

In [28]:
outliers_test[outliers_test[IS_OUTLIER]==True].head()

Unnamed: 0,provider_and_id,provider_code,accident_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,km,latitude,longitude,x,y,injury_severity_hebrew,is_outlier,km_from_median
0,32020030000.0,3.0,2020034000.0,2020.0,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,61.0,32.447831,34.943457,194909.0,706002.0,,True,1.5
4,32019050000.0,3.0,2019052000.0,2019.0,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,61.0,32.447831,34.943457,194909.0,706002.0,,True,1.5
5,12017080000.0,1.0,2017076000.0,2017.0,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,61.0,32.447831,34.943457,194909.0,706002.0,,True,1.5
7,12016070000.0,1.0,2016071000.0,2016.0,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,61.0,32.447831,34.943457,194909.0,706002.0,פצוע קל,True,1.5
11,12016010000.0,1.0,2016010000.0,2016.0,צומת חדרה (מזרח) - צומת אלון (שמורת אלון),650030.0,1.2,61.0,32.447831,34.943457,194909.0,706002.0,פצוע קל,True,1.5


road_segment_length_km should be 1.9 according to google maps

In [29]:
plot_outliers(outliers_test)

In [30]:
outliers_test = get_outliers_by_km(segment_rows, MIN_SAMPLE_SIZE, padding=0.2)
if not outliers_test.empty:
    print(outliers_test.groupby(IS_OUTLIER).size())

is_outlier
False     60
True      27
MEDIAN     1
dtype: int64


In [31]:
outliers_test = get_outliers_by_km(segment_rows, MIN_SAMPLE_SIZE, padding=0.25)
if not outliers_test.empty:
    print(outliers_test.groupby(IS_OUTLIER).size())

In [32]:
segmend_id = 440010
segment_rows = filtered[filtered[ROAD_SEGMENT]==segmend_id]
outliers_test = get_outliers_by_km(segment_rows, MIN_SAMPLE_SIZE, padding=0.25)

outliers_test

Unnamed: 0,provider_and_id,provider_code,accident_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,km,latitude,longitude,x,y,injury_severity_hebrew,is_outlier,km_from_median
0,3.201807e+10,3.0,2.018075e+09,2018.0,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,100.0,31.814834,34.920192,192536.0,635814.0,פצוע קל,False,4.68
1,3.201808e+10,3.0,2.018084e+09,2018.0,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,80.0,31.809998,34.940124,194422.0,635273.0,פצוע קל,False,2.73
2,3.201903e+10,3.0,2.019026e+09,2019.0,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,10.0,31.782566,34.999917,200078.0,632219.0,,False,3.69
3,1.201805e+10,1.0,2.018047e+09,2018.0,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,30.0,31.792779,34.985830,198746.0,633354.0,פצוע קשה,False,1.99
4,3.201404e+10,3.0,2.014044e+09,2014.0,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,10.0,31.782566,34.999917,200078.0,632219.0,פצוע קל,False,3.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,3.201702e+10,3.0,2.017024e+09,2017.0,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,60.0,31.801266,34.956918,196010.0,634301.0,פצוע קל,False,0.91
99,3.201808e+10,3.0,2.018079e+09,2018.0,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,100.0,31.814834,34.920192,192536.0,635814.0,פצוע קל,False,4.68
100,3.201803e+10,3.0,2.018030e+09,2018.0,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,10.0,31.782566,34.999917,200078.0,632219.0,פצוע קל,False,3.69
101,3.201604e+10,3.0,2.016037e+09,2016.0,צומת שמשון - צומת נחשון (הגבורה),440010.0,10.3,28.0,31.791428,34.987185,198874.0,633204.0,פצוע קל,False,2.17


In [33]:
plot_outliers(outliers_test)

## Calculate all outliers

In [34]:
all_segment_outliers = get_outliers_by_km(filtered, MIN_SAMPLE_SIZE)
len(all_segment_outliers)

6001

### Segments with outliers

In [35]:
len(all_segment_outliers[ROAD_SEGMENT].unique())

56

In [36]:
all_segment_outliers.head(10)

Unnamed: 0,provider_and_id,provider_code,accident_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,km,latitude,longitude,x,y,injury_severity_hebrew,is_outlier,km_from_median
0,32015030000.0,3.0,2015028000.0,2015.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,30.0,32.027302,34.809022,182096.0,659407.0,,False,0.0
1,32016040000.0,3.0,2016044000.0,2016.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,30.0,32.027302,34.809022,182096.0,659407.0,פצוע קל,False,0.0
2,12014020000.0,1.0,2014015000.0,2014.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,40.0,32.021411,34.817029,182850.0,658751.0,,False,1.0
3,32016100000.0,3.0,2016096000.0,2016.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,30.0,32.027302,34.809022,182096.0,659407.0,פצוע קל,False,0.0
4,32019040000.0,3.0,2019043000.0,2019.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,40.0,32.021411,34.817029,182850.0,658751.0,,False,1.0
5,12018070000.0,1.0,2018068000.0,2018.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,20.0,32.033102,34.800909,181332.0,660053.0,,False,1.0
6,32018060000.0,3.0,2018057000.0,2018.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,10.0,32.038875,34.792774,180566.0,660696.0,פצוע קל,False,2.0
7,32019070000.0,3.0,2019069000.0,2019.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,0.0,32.047064,34.787581,180079.0,661606.0,פצוע קל,False,2.98
8,12019060000.0,1.0,2019065000.0,2019.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,0.0,32.047064,34.787581,180079.0,661606.0,פצוע קל,False,2.98
9,32016080000.0,3.0,2016082000.0,2016.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,20.0,32.033102,34.800909,181332.0,660053.0,,False,1.0


In [37]:
all_segment_outliers.groupby(PROVIDER_CODE).size()

provider_code
1.0    1487
3.0    4458
dtype: int64

## Unreliable segments
**Most of the coords are wrong, therefore it is not possible to find outliers - the median is in the wrong coord**


צומת מעבר רפיח - צומת חבל עזה 40006

70040 מחלף בני דרום - צומת בית רבן

70050 צומת בית רבן - מחלף גדרה

4120010 צומת ביל"ו - כניסה לרחובות דרך הפרדסים

6520003 צומת שפיה - כניסה לזכרון יעקב (ישן)

440030 צומת גזר - מחלף רמלוד

600005 יישוב באר שבע (לערד) - צומת חטיבת הנגב

4120010 צומת ביל"ו - כניסה לרחובות דרך הפרדסים

4120020 מפגש עם מסילת ברזל - צומת נס ציונה

**Wrong segment length**

Should be 4.5 km, segment_length=0.8   
550030 צומת כפר סבא (מזרח) - צומת לאלפי מנשה

Should be 1.9 km, segment_length=1.2   
צומת חדרה (מזרח) - צומת אלון (שמורת אלון) 650030  	

In [38]:
unreliable_segment_ids = [40006, 70040, 70050, 4120010, 6520003, 440030, 600005, 4120010, 4120020]

segment_rows = filtered[filtered[ROAD_SEGMENT].isin(unreliable_segment_ids)]
unreliable_segments_outliers = get_outliers_by_km(segment_rows, MIN_SAMPLE_SIZE)

In [39]:
outlier_map_unreliable_segments = plot_outliers(unreliable_segments_outliers)

outlier_map_unreliable_segments

In [40]:
outlier_map = plot_outliers(all_segment_outliers)

outlier_map

### Severe outliers

In [41]:
severe_outliers = all_segment_outliers.loc[(all_segment_outliers[SEVERITY] == SEVERITY_HARD) | 
                                           (all_segment_outliers[SEVERITY] == SEVERITY_DEAD)]
outlier_rows = severe_outliers[severe_outliers[IS_OUTLIER] == True]
print(len(outlier_rows))
outlier_rows

16


Unnamed: 0,provider_and_id,provider_code,accident_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,km,latitude,longitude,x,y,injury_severity_hebrew,is_outlier,km_from_median
91,12017050000.0,1.0,2017055000.0,2017.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,0.0,31.799843,35.114516,210934.0,634120.0,פצוע קשה,True,38.33
321,12019060000.0,1.0,2019064000.0,2019.0,מחלף קיבוץ גלויות - מחלף גנות,10010.0,4.6,0.0,31.799843,35.114516,210934.0,634120.0,פצוע קשה,True,38.33
19,12016070000.0,1.0,2016068000.0,2016.0,מחלף אשדוד - מחלף בני דרום,70030.0,0.6,50.0,31.797965,34.798619,181018.0,633980.0,פצוע קשה,True,9.83
388,12019030000.0,1.0,2019032000.0,2019.0,מחלף חולות - מחלף משה דיין,200005.0,4.4,0.0,32.081591,34.7979,181068.0,665431.0,הרוג,True,13.06
391,12015060000.0,1.0,2015056000.0,2015.0,מחלף חולות - מחלף משה דיין,200005.0,4.4,0.0,32.081591,34.7979,181068.0,665431.0,פצוע קשה,True,13.06
46,12017020000.0,1.0,2017016000.0,2017.0,צומת להבים - מחלף להבים,310010.0,3.2,30.0,31.255024,35.139688,213282.0,573709.0,פצוע קשה,True,35.33
11,12019020000.0,1.0,2019017000.0,2019.0,צומת גזר - מחלף רמלוד,440030.0,1.0,230.0,31.921338,34.885299,189267.0,647633.0,פצוע קשה,True,2.19
16,12019030000.0,1.0,2019033000.0,2019.0,צומת גזר - מחלף רמלוד,440030.0,1.0,230.0,31.921338,34.885299,189267.0,647633.0,פצוע קשה,True,2.19
20,12018040000.0,1.0,2018039000.0,2018.0,צומת גזר - מחלף רמלוד,440030.0,1.0,230.0,31.921338,34.885299,189267.0,647633.0,פצוע קשה,True,2.19
7,12016060000.0,1.0,2016059000.0,2016.0,כניסה לעומר - צומת שוקת,600012.0,7.2,68.0,31.627132,35.132833,212656.0,614968.0,הרוג,True,45.26


In [42]:
plot_outliers(all_segment_outliers[all_segment_outliers[ROAD_SEGMENT].isin(outlier_rows[ROAD_SEGMENT])])

### Old method - Z scores

In [43]:
outliers_zscore = get_outliers_by_z_scores(filtered, ZSCORE_THRESH, MIN_SAMPLE_SIZE, check_by_km=False)
len(outliers_zscore)

47

After removing false positives by checking distance from median:

In [44]:
outliers_zscore = get_outliers_by_z_scores(filtered, ZSCORE_THRESH, MIN_SAMPLE_SIZE)
len(outliers_zscore)

44

In [45]:
severe_outliers_zscore = outliers_zscore.loc[(outliers_zscore[SEVERITY] == SEVERITY_HARD) | (outliers_zscore[SEVERITY] == SEVERITY_DEAD)]
severe_outliers_zscore

Unnamed: 0,provider_and_id,provider_code,accident_id,accident_year,road_segment_name,road_segment_id,road_segment_length_km,km,latitude,longitude,x,y,injury_severity_hebrew
57526,12016058694,1,2016058694,2016,כניסה לעומר - צומת שוקת,600012.0,7.2,68.0,31.627132,35.132833,212656.0,614968.0,הרוג
211176,12018031336,1,2018031336,2018,צומת תל עכו - צומת יסיף,850002.0,7.1,0.0,32.927923,35.319304,230193.0,759220.0,פצוע קשה


By using z-scores, only 44 outliers are detected (versus 534)

## Save output

In [46]:
len(filtered), len(all_segment_outliers[all_segment_outliers[IS_OUTLIER]==True])

(370374, 534)

In [47]:
print(len(all_segment_outliers))

all_segment_outliers['unreliable_segment'] = np.where(all_segment_outliers[ROAD_SEGMENT].isin(unreliable_segment_ids), True, False)
all_segment_outliers.groupby('unreliable_segment').size()

6001


unreliable_segment
False    5680
True      321
dtype: int64

In [49]:
all_segment_outliers.to_csv("outliers.csv", header=True, index=False, encoding='utf-8')

In [50]:
outlier_map.save('outliers.html')