#### High level:
This notebook checks every field of the `involved_hebrew` table that was produced with a dictionary (i.e. has a matching `*_hebrew` column) and lists the inconsistencies between the Hebrew labels and their respective numeric values. Only rows with `accident_year` == 2019 are analysed.

The specific analysis below is based on data from `2020-01-13_views_and_main_tables` folder from Jan 12, 2020 that can be found here: https://drive.google.com/drive/folders/1zrwfdkedy7jtRRsGi2dEn8zboZAG8_U2?usp=sharing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Seaborn default plot theme; raise pandas' display limits so that long
# value-count listings and wide describe() tables are shown in full.
sns.set()
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [2]:
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk. The stored run of this cell emitted a warning
# (presumably the mixed-types DtypeWarning, which recommends exactly this option).
involved_raw = pd.read_csv(
    '../../views_and_main_tables_2020_01/involved_hebrew.csv',
    low_memory=False,
)
# The rest of the notebook only looks at accidents from 2019.
i_all = involved_raw[involved_raw['accident_year'] == 2019]

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# `null_counts` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts` is its replacement (requires pandas >= 1.2).
i_all.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119953 entries, 417812 to 1823549
Data columns (total 49 columns):
accident_id                     119953 non-null int64
provider_and_id                 119953 non-null int64
provider_code                   119953 non-null int64
file_type_police                119953 non-null float64
involved_type                   119953 non-null int64
involved_type_hebrew            119953 non-null object
license_acquiring_date          119953 non-null int64
age_group                       119953 non-null int64
age_group_hebrew                119953 non-null object
sex                             119953 non-null float64
sex_hebrew                      93624 non-null object
vehicle_type                    113288 non-null float64
vehicle_type_hebrew             113288 non-null object
safety_measures                 119953 non-null int64
safety_measures_hebrew          119949 non-null object
involve_yishuv_symbol           119953 non-null int64
involve_y

### Helper functions

In [5]:
def calc_diff_counts_hebrew(data, feat_name):
    """Compare the value-count profiles of a numeric field and its Hebrew twin.

    Rows in which either ``feat_name`` or ``feat_name + '_hebrew'`` is null
    are dropped first and the shape of the remaining frame is printed. The
    value counts of the two columns (in the order ``value_counts`` returns
    them) are then subtracted position by position, ignoring the labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``feat_name`` and ``feat_name + '_hebrew'``.
    feat_name : str
        Name of the numeric column.

    Returns
    -------
    pandas.Series
        Position-wise difference between the numeric counts and the Hebrew
        counts. All zeros means the two count profiles are identical; it
        does not by itself prove that each code maps to a single label.
    """
    hebrew_name = feat_name + '_hebrew'
    both_present = data[feat_name].notnull() & data[hebrew_name].notnull()
    data = data[both_present]
    print(f'Shape of data: {data.shape}')

    # Drop the category labels so the subtraction aligns by rank, not by value.
    code_counts = data[feat_name].value_counts().reset_index(drop=True)
    label_counts = data[hebrew_name].value_counts().reset_index(drop=True)
    return code_counts - label_counts

def merge_with_hebrew(data, feat_name):
    """Pair numeric codes with Hebrew labels that occur equally often.

    Value counts are computed for ``feat_name`` and for
    ``feat_name + '_hebrew'`` and the two tables are outer-joined on the
    count itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``feat_name`` and ``feat_name + '_hebrew'``.
    feat_name : str
        Name of the numeric column.

    Returns
    -------
    pandas.DataFrame
        Columns ``index_<feat_name>``, ``count`` and
        ``index_<feat_name>_hebrew``. A null on either side marks a value
        whose frequency has no counterpart in the other column.
    """
    count_tables = []
    for column in (feat_name, feat_name + '_hebrew'):
        counts = data[column].value_counts().reset_index()
        counts.columns = ['index_' + column, 'count']
        count_tables.append(counts)

    numeric_counts, hebrew_counts = count_tables
    return numeric_counts.merge(hebrew_counts, how='outer', on='count')

def merge_with_hebrew_print_split_years(data, feat_name):
    """Print a breakdown for numeric codes whose count matches no Hebrew label.

    Using the table from ``merge_with_hebrew``, every numeric value without a
    Hebrew counterpart of equal frequency is reported: the Hebrew labels found
    on its rows with their counts, the total, and for each such label the
    distinct ``accident_year`` values of all rows in ``data`` carrying it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``feat_name``, ``feat_name + '_hebrew'`` and
        ``accident_year``.
    feat_name : str
        Name of the numeric column.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    hebrew_name = feat_name + '_hebrew'
    merged = merge_with_hebrew(data, feat_name)

    no_hebrew_match = merged['index_' + hebrew_name].isnull()
    unmatched_codes = merged.loc[no_hebrew_match, 'index_' + feat_name]

    for code in unmatched_codes:
        print(f'{feat_name} {code}:')
        label_counts = data.loc[data[feat_name] == code, hebrew_name].value_counts()
        print(label_counts)
        print(f'Total: {label_counts.sum()}')
        for label in label_counts.index:
            # Years are gathered over every row with this label, not only the rows of `code`.
            years = data.loc[data[hebrew_name] == label, 'accident_year'].unique()
            print(f"Years {label}: {years}")
        print('')

## involved_type and involved_type_hebrew

In [6]:
# involved_type                   119953 non-null int64
# involved_type_hebrew            119953 non-null object

In [7]:
# Rank-wise count difference between `involved_type` and `involved_type_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='involved_type')

Shape of data: (119953, 49)


0    0
1    0
2    0
dtype: int64

## age_group and age_group_hebrew

In [8]:
# age_group                       119953 non-null int64
# age_group_hebrew                119953 non-null object

In [11]:
# Rank-wise count difference between `age_group` and `age_group_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='age_group')

Shape of data: (119953, 49)


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
dtype: int64

## sex and sex_hebrew

In [12]:
# sex                             119953 non-null float64
# sex_hebrew                      93624 non-null object

In [13]:
# Rows that carry a numeric `sex` code but no `sex_hebrew` label.
i_all[i_all['sex'].notnull() & i_all['sex_hebrew'].isnull()].shape

(26329, 49)

In [14]:
# Frequency of each numeric `sex` code.
i_all.loc[:, 'sex'].value_counts()

1.0    57001
2.0    36623
0.0    26329
Name: sex, dtype: int64

**Null conclusion:** all 26,329 rows with `sex` == 0 are missing a `sex_hebrew` label.

In [15]:
# Rank-wise count difference between `sex` and `sex_hebrew` (rows with both present).
calc_diff_counts_hebrew(data=i_all, feat_name='sex')

Shape of data: (93624, 49)


0    0
1    0
dtype: int64

## vehicle_type and vehicle_type_hebrew

In [17]:
# vehicle_type                    113288 non-null float64
# vehicle_type_hebrew             113288 non-null object

In [19]:
# Rank-wise count difference between `vehicle_type` and `vehicle_type_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='vehicle_type')
# For a per-value table instead, run: merge_with_hebrew(i_all, 'vehicle_type')

Shape of data: (113288, 49)


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
dtype: int64

## safety_measures and safety_measures_hebrew

In [20]:
# safety_measures                 119953 non-null int64
# safety_measures_hebrew          119949 non-null object
# Rows that carry a `safety_measures` code but no `safety_measures_hebrew` label.
i_all[i_all['safety_measures'].notnull() & i_all['safety_measures_hebrew'].isnull()].shape

(4, 49)

In [21]:
# Summary statistics of the rows whose `safety_measures` code has no Hebrew label.
i_all[i_all['safety_measures'].notnull() & i_all['safety_measures_hebrew'].isnull()].describe()

Unnamed: 0,accident_id,provider_and_id,provider_code,file_type_police,involved_type,license_acquiring_date,age_group,sex,vehicle_type,safety_measures,involve_yishuv_symbol,injury_severity,injured_type,injured_position,population_type,home_region,home_district,home_natural_area,home_municipal_status,home_yishuv_shape,hospital_time,medical_type,release_dest,safety_measures_use,late_deceased,car_id,involve_id,accident_year,accident_month
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0
mean,2019057000.0,27019060000.0,2.5,2.5,2.25,0.0,99.0,0.0,7.0,0.0,0.0,3.0,3.25,2.75,1.0,,99.0,,,,,,,,,1.25,1.75,2019.0,8.25
std,29299.06,9999978000.0,1.0,1.0,0.5,0.0,0.0,0.0,7.118052,0.0,0.0,0.0,1.892969,3.5,0.0,,0.0,,,,,,,,,0.5,0.5,0.0,2.872281
min,2019020000.0,12019090000.0,1.0,1.0,2.0,0.0,99.0,0.0,1.0,0.0,0.0,3.0,2.0,1.0,1.0,,99.0,,,,,,,,,1.0,1.0,2019.0,4.0
25%,2019045000.0,27019040000.0,2.5,2.5,2.0,0.0,99.0,0.0,1.0,0.0,0.0,3.0,2.0,1.0,1.0,,99.0,,,,,,,,,1.0,1.75,2019.0,7.75
50%,2019060000.0,32019040000.0,3.0,3.0,2.0,0.0,99.0,0.0,6.0,0.0,0.0,3.0,2.5,1.0,1.0,,99.0,,,,,,,,,1.0,2.0,2019.0,9.5
75%,2019072000.0,32019060000.0,3.0,3.0,2.25,0.0,99.0,0.0,12.0,0.0,0.0,3.0,3.75,2.75,1.0,,99.0,,,,,,,,,1.25,2.0,2019.0,10.0
max,2019091000.0,32019070000.0,3.0,3.0,3.0,0.0,99.0,0.0,15.0,0.0,0.0,3.0,6.0,8.0,1.0,,99.0,,,,,,,,,2.0,2.0,2019.0,10.0


In [22]:
# Frequency of each numeric `safety_measures` code.
i_all.loc[:, 'safety_measures'].value_counts()

5    93190
1    22821
2     2430
4     1159
3      349
0        4
Name: safety_measures, dtype: int64

**Null conclusion:** all 4 rows with `safety_measures` == 0 are missing a `safety_measures_hebrew` label.

In [23]:
# Rank-wise count difference between `safety_measures` and `safety_measures_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='safety_measures')

Shape of data: (119949, 49)


0    0
1    0
2    0
3    0
4    0
dtype: int64

## involve_yishuv_symbol and involve_yishuv_name (not related to hebrew!!)

In [25]:
# involve_yishuv_symbol           119953 non-null int64
# involve_yishuv_name             91473 non-null object
# Rows that carry an `involve_yishuv_symbol` but no `involve_yishuv_name`.
i_all[i_all['involve_yishuv_symbol'].notnull() & i_all['involve_yishuv_name'].isnull()].shape

(28480, 49)

In [26]:
# Summary statistics of the rows whose `involve_yishuv_symbol` has no name.
i_all[i_all['involve_yishuv_symbol'].notnull() & i_all['involve_yishuv_name'].isnull()].describe()

Unnamed: 0,accident_id,provider_and_id,provider_code,file_type_police,involved_type,license_acquiring_date,age_group,sex,vehicle_type,safety_measures,involve_yishuv_symbol,injury_severity,injured_type,injured_position,population_type,home_region,home_district,home_natural_area,home_municipal_status,home_yishuv_shape,hospital_time,medical_type,release_dest,safety_measures_use,late_deceased,car_id,involve_id,accident_year,accident_month
count,28480.0,28480.0,28480.0,28480.0,28480.0,28480.0,28480.0,28480.0,27813.0,28480.0,28480.0,28480.0,28480.0,28480.0,28480.0,9.0,28480.0,9.0,0.0,9.0,0.0,0.0,0.0,135.0,37.0,28480.0,28480.0,28480.0,28480.0
mean,2019050000.0,30193210000.0,2.817416,2.824719,1.191292,96.480021,90.560077,0.11401,6.178406,4.840941,1.135604,0.382514,0.40007,7.487921,1.10481,7.0,98.991819,731.111111,,40.111111,,,,1.437037,1.027027,1.552282,2.30611,2019.0,5.930302
std,29583.72,5760583000.0,0.576058,0.565553,0.52728,428.869149,26.514492,0.377703,7.245765,0.76667,64.026765,0.995493,1.288062,1.865014,0.53127,0.0,0.461027,16.914819,,10.540926,,,,0.759098,0.164399,0.709205,0.976664,0.0,3.075159
min,2018025000.0,12019000000.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,7.0,71.0,710.0,,29.0,,,,1.0,1.0,0.0,1.0,2019.0,1.0
25%,2019025000.0,32019020000.0,3.0,3.0,1.0,0.0,99.0,0.0,1.0,5.0,0.0,0.0,0.0,8.0,1.0,7.0,99.0,720.0,,29.0,,,,1.0,1.0,1.0,2.0,2019.0,3.0
50%,2019050000.0,32019050000.0,3.0,3.0,1.0,0.0,99.0,0.0,1.0,5.0,0.0,0.0,0.0,8.0,1.0,7.0,99.0,730.0,,49.0,,,,1.0,1.0,1.0,2.0,2019.0,6.0
75%,2019075000.0,32019070000.0,3.0,3.0,1.0,0.0,99.0,0.0,11.0,5.0,0.0,0.0,0.0,8.0,1.0,7.0,99.0,730.0,,49.0,,,,2.0,1.0,2.0,3.0,2019.0,9.0
max,2019100000.0,32019100000.0,3.0,3.0,3.0,2019.0,99.0,2.0,25.0,5.0,3900.0,3.0,9.0,9.0,4.0,7.0,99.0,770.0,,49.0,,,,3.0,2.0,12.0,20.0,2019.0,11.0


In [27]:
# Number of distinct `involve_yishuv_symbol` values that have no name.
yishuv_name_missing = i_all['involve_yishuv_symbol'].notnull() & i_all['involve_yishuv_name'].isnull()
i_all.loc[yishuv_name_missing, 'involve_yishuv_symbol'].nunique()

10

In [28]:
# Frequency of each `involve_yishuv_symbol` value that has no name.
yishuv_name_missing = i_all['involve_yishuv_symbol'].notnull() & i_all['involve_yishuv_name'].isnull()
i_all.loc[yishuv_name_missing, 'involve_yishuv_symbol'].value_counts()

0       28471
3086        1
3900        1
3800        1
3545        1
3305        1
3702        1
3700        1
3813        1
3491        1
Name: involve_yishuv_symbol, dtype: int64

**Null conclusion**: there are 10 different values of `involve_yishuv_symbol` that don't have `involve_yishuv_name`, value 0 is the vast majority, but there are others (above).

The values themselves were not compared for this pair.

## injury_severity and injury_severity_hebrew

In [30]:
# injury_severity                 119953 non-null int64
# injury_severity_hebrew          73122 non-null object
# Rows that carry an `injury_severity` code but no `injury_severity_hebrew` label.
i_all[i_all['injury_severity'].notnull() & i_all['injury_severity_hebrew'].isnull()].shape

(46831, 49)

In [31]:
# Summary statistics of the rows whose `injury_severity` code has no Hebrew label.
i_all[i_all['injury_severity'].notnull() & i_all['injury_severity_hebrew'].isnull()].describe()

Unnamed: 0,accident_id,provider_and_id,provider_code,file_type_police,involved_type,license_acquiring_date,age_group,sex,vehicle_type,safety_measures,involve_yishuv_symbol,injury_severity,injured_type,injured_position,population_type,home_region,home_district,home_natural_area,home_municipal_status,home_yishuv_shape,hospital_time,medical_type,release_dest,safety_measures_use,late_deceased,car_id,involve_id,accident_year,accident_month
count,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,22045.0,46831.0,22045.0,21913.0,22045.0,0.0,0.0,0.0,3.0,0.0,46831.0,46831.0,46831.0,46831.0
mean,2019050000.0,28011450000.0,2.59924,2.609148,1.0,924.733019,56.184258,0.594435,4.969102,4.47144,2050.86968,0.0,0.0,8.0,1.149623,3.8978,71.714804,412.586255,16.742664,17.32166,,,,1.666667,,1.511947,1.991395,2019.0,5.867652
std,29969.47,8005781000.0,0.800578,0.793065,0.0,996.499112,44.904978,0.689262,6.591536,1.34764,3007.423364,0.0,0.0,0.0,0.437037,1.689824,31.178618,169.66884,34.986897,6.621216,,,,1.154701,,0.669092,0.953673,0.0,3.079257
min,2018025000.0,12018050000.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,8.0,1.0,1.0,11.0,111.0,0.0,12.0,,,,1.0,,1.0,1.0,2019.0,1.0
25%,2019025000.0,32019010000.0,3.0,3.0,1.0,0.0,9.0,0.0,1.0,5.0,0.0,0.0,0.0,8.0,1.0,2.0,44.0,293.0,0.0,13.0,,,,1.0,,1.0,1.0,2019.0,3.0
50%,2019050000.0,32019040000.0,3.0,3.0,1.0,0.0,99.0,0.0,1.0,5.0,0.0,0.0,0.0,8.0,1.0,4.0,99.0,432.0,0.0,15.0,,,,1.0,,1.0,2.0,2019.0,6.0
75%,2019075000.0,32019070000.0,3.0,3.0,1.0,1998.0,99.0,1.0,11.0,5.0,3000.0,0.0,0.0,8.0,1.0,5.0,99.0,513.0,0.0,17.0,,,,2.0,,2.0,2.0,2019.0,8.0
max,2019100000.0,32019100000.0,3.0,3.0,1.0,2019.0,99.0,2.0,25.0,5.0,9800.0,0.0,0.0,8.0,4.0,7.0,99.0,999.0,99.0,53.0,,,,3.0,,12.0,26.0,2019.0,11.0


In [32]:
# Number of rows with `injury_severity` == 0, to compare with the missing-label count.
i_all.loc[i_all['injury_severity'].eq(0)].shape

(46831, 49)

**Null Conclusion:** `injury_severity` == 0 is missing the translation

In [34]:
# Rank-wise count difference between `injury_severity` and `injury_severity_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='injury_severity')

Shape of data: (73122, 49)


0    0
1    0
2    0
dtype: int64

## injured_type and injured_type_hebrew

In [35]:
# injured_type                    119953 non-null int64
# injured_type_hebrew             73122 non-null object

In [36]:
# Rows that carry an `injured_type` code but no `injured_type_hebrew` label.
i_all[i_all['injured_type'].notnull() & i_all['injured_type_hebrew'].isnull()].shape

(46831, 49)

In [37]:
# Summary statistics of the rows whose `injured_type` code has no Hebrew label.
i_all[i_all['injured_type'].notnull() & i_all['injured_type_hebrew'].isnull()].describe()

Unnamed: 0,accident_id,provider_and_id,provider_code,file_type_police,involved_type,license_acquiring_date,age_group,sex,vehicle_type,safety_measures,involve_yishuv_symbol,injury_severity,injured_type,injured_position,population_type,home_region,home_district,home_natural_area,home_municipal_status,home_yishuv_shape,hospital_time,medical_type,release_dest,safety_measures_use,late_deceased,car_id,involve_id,accident_year,accident_month
count,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,46831.0,22045.0,46831.0,22045.0,21913.0,22045.0,0.0,0.0,0.0,3.0,0.0,46831.0,46831.0,46831.0,46831.0
mean,2019050000.0,28011450000.0,2.59924,2.609148,1.0,924.733019,56.184258,0.594435,4.969102,4.47144,2050.86968,0.0,0.0,8.0,1.149623,3.8978,71.714804,412.586255,16.742664,17.32166,,,,1.666667,,1.511947,1.991395,2019.0,5.867652
std,29969.47,8005781000.0,0.800578,0.793065,0.0,996.499112,44.904978,0.689262,6.591536,1.34764,3007.423364,0.0,0.0,0.0,0.437037,1.689824,31.178618,169.66884,34.986897,6.621216,,,,1.154701,,0.669092,0.953673,0.0,3.079257
min,2018025000.0,12018050000.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,8.0,1.0,1.0,11.0,111.0,0.0,12.0,,,,1.0,,1.0,1.0,2019.0,1.0
25%,2019025000.0,32019010000.0,3.0,3.0,1.0,0.0,9.0,0.0,1.0,5.0,0.0,0.0,0.0,8.0,1.0,2.0,44.0,293.0,0.0,13.0,,,,1.0,,1.0,1.0,2019.0,3.0
50%,2019050000.0,32019040000.0,3.0,3.0,1.0,0.0,99.0,0.0,1.0,5.0,0.0,0.0,0.0,8.0,1.0,4.0,99.0,432.0,0.0,15.0,,,,1.0,,1.0,2.0,2019.0,6.0
75%,2019075000.0,32019070000.0,3.0,3.0,1.0,1998.0,99.0,1.0,11.0,5.0,3000.0,0.0,0.0,8.0,1.0,5.0,99.0,513.0,0.0,17.0,,,,2.0,,2.0,2.0,2019.0,8.0
max,2019100000.0,32019100000.0,3.0,3.0,1.0,2019.0,99.0,2.0,25.0,5.0,9800.0,0.0,0.0,8.0,4.0,7.0,99.0,999.0,99.0,53.0,,,,3.0,,12.0,26.0,2019.0,11.0


In [38]:
# Number of rows with `injured_type` == 0, to compare with the missing-label count.
i_all.loc[i_all['injured_type'].eq(0)].shape

(46831, 49)

**Null conclusion:** all `injured_type` == 0 have dictionary issues

In [40]:
# Rank-wise count difference between `injured_type` and `injured_type_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='injured_type')

Shape of data: (73122, 49)


0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

## injured_position and injured_position_hebrew

In [41]:
# injured_position                119953 non-null int64
# injured_position_hebrew         119953 non-null object

In [42]:
# Rank-wise count difference between `injured_position` and `injured_position_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='injured_position')

Shape of data: (119953, 49)


0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

## population_type and population_type_hebrew

In [44]:
# population_type                 119953 non-null int64
# population_type_hebrew          119953 non-null object

In [45]:
# Rank-wise count difference between `population_type` and `population_type_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='population_type')

Shape of data: (119953, 49)


0    0
1    0
2    0
3    0
dtype: int64

## home_region and home_region_hebrew

In [46]:
# home_region                     91482 non-null float64
# home_region_hebrew              91482 non-null object

In [47]:
# Rank-wise count difference between `home_region` and `home_region_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='home_region')

Shape of data: (91482, 49)


0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64

## home_district and home_district_hebrew

In [48]:
# home_district                   119953 non-null int64
# home_district_hebrew            91482 non-null object

In [49]:
# Rows that carry a `home_district` code but no `home_district_hebrew` label.
i_all[i_all['home_district'].notnull() & i_all['home_district_hebrew'].isnull()].shape

(28471, 49)

In [50]:
# Summary statistics of the rows whose `home_district` code has no Hebrew label.
i_all[i_all['home_district'].notnull() & i_all['home_district_hebrew'].isnull()].describe()

Unnamed: 0,accident_id,provider_and_id,provider_code,file_type_police,involved_type,license_acquiring_date,age_group,sex,vehicle_type,safety_measures,involve_yishuv_symbol,injury_severity,injured_type,injured_position,population_type,home_region,home_district,home_natural_area,home_municipal_status,home_yishuv_shape,hospital_time,medical_type,release_dest,safety_measures_use,late_deceased,car_id,involve_id,accident_year,accident_month
count,28471.0,28471.0,28471.0,28471.0,28471.0,28471.0,28471.0,28471.0,27805.0,28471.0,28471.0,28471.0,28471.0,28471.0,28471.0,0.0,28471.0,0.0,0.0,0.0,0.0,0.0,0.0,134.0,37.0,28471.0,28471.0,28471.0,28471.0
mean,2019050000.0,30196140000.0,2.817709,2.825015,1.190966,96.299182,90.58593,0.113695,6.178385,4.841453,0.0,0.381897,0.399494,7.488708,1.104211,,99.0,,,,,,,1.425373,1.027027,1.552352,2.305925,2019.0,5.930631
std,29582.3,5756416000.0,0.575642,0.565121,0.526859,428.486639,26.478711,0.377305,7.24585,0.765435,0.0,0.994799,1.287482,1.863691,0.529887,,0.0,,,,,,,0.749705,0.164399,0.709208,0.975596,0.0,3.075048
min,2018025000.0,12019000000.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,,99.0,,,,,,,1.0,1.0,0.0,1.0,2019.0,1.0
25%,2019025000.0,32019020000.0,3.0,3.0,1.0,0.0,99.0,0.0,1.0,5.0,0.0,0.0,0.0,8.0,1.0,,99.0,,,,,,,1.0,1.0,1.0,2.0,2019.0,3.0
50%,2019050000.0,32019050000.0,3.0,3.0,1.0,0.0,99.0,0.0,1.0,5.0,0.0,0.0,0.0,8.0,1.0,,99.0,,,,,,,1.0,1.0,1.0,2.0,2019.0,6.0
75%,2019075000.0,32019070000.0,3.0,3.0,1.0,0.0,99.0,0.0,11.0,5.0,0.0,0.0,0.0,8.0,1.0,,99.0,,,,,,,2.0,1.0,2.0,3.0,2019.0,9.0
max,2019100000.0,32019100000.0,3.0,3.0,3.0,2019.0,99.0,2.0,25.0,5.0,0.0,3.0,9.0,9.0,4.0,,99.0,,,,,,,3.0,2.0,12.0,20.0,2019.0,11.0


In [51]:
# Number of rows with `home_district` == 99, to compare with the missing-label count.
i_all.loc[i_all['home_district'].eq(99)].shape

(28471, 49)

**Null conclusion**: all appearances of `home_district` == 99 are missing in dictionary

In [53]:
# Rank-wise count difference between `home_district` and `home_district_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='home_district')

Shape of data: (91482, 49)


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
dtype: int64

## home_natural_area and home_natural_area_hebrew

In [54]:
# home_natural_area               91482 non-null float64
# home_natural_area_hebrew        91452 non-null object

In [55]:
# Rows that carry a `home_natural_area` code but no `home_natural_area_hebrew` label.
i_all[i_all['home_natural_area'].notnull() & i_all['home_natural_area_hebrew'].isnull()].shape

(30, 49)

In [56]:
# Summary statistics of the rows whose `home_natural_area` code has no Hebrew label.
i_all[i_all['home_natural_area'].notnull() & i_all['home_natural_area_hebrew'].isnull()].describe()

Unnamed: 0,accident_id,provider_and_id,provider_code,file_type_police,involved_type,license_acquiring_date,age_group,sex,vehicle_type,safety_measures,involve_yishuv_symbol,injury_severity,injured_type,injured_position,population_type,home_region,home_district,home_natural_area,home_municipal_status,home_yishuv_shape,hospital_time,medical_type,release_dest,safety_measures_use,late_deceased,car_id,involve_id,accident_year,accident_month
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,29.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,0.0,0.0,0.0,3.0,0.0,30.0,30.0,30.0,30.0
mean,2019048000.0,26019050000.0,2.4,2.4,2.033333,1335.033333,8.4,1.333333,4.310345,3.8,3823.0,2.1,2.066667,5.233333,1.0,7.0,74.0,999.0,73.0,19.0,,,,1.666667,,1.233333,1.566667,2019.0,5.8
std,29239.34,9321847000.0,0.932183,0.932183,0.808717,960.214074,3.663426,0.479463,4.759227,1.864366,0.0,1.398275,1.552158,3.520221,0.0,0.0,0.0,0.0,0.0,0.0,,,,1.154701,,0.568321,0.773854,0.0,2.952497
min,2019000000.0,12019000000.0,1.0,1.0,1.0,0.0,4.0,1.0,1.0,1.0,3823.0,0.0,0.0,1.0,1.0,7.0,74.0,999.0,73.0,19.0,,,,1.0,,0.0,1.0,2019.0,1.0
25%,2019026000.0,12019060000.0,1.0,1.0,1.0,0.0,5.0,1.0,1.0,1.0,3823.0,0.0,0.0,1.0,1.0,7.0,74.0,999.0,73.0,19.0,,,,1.0,,1.0,1.0,2019.0,4.0
50%,2019050000.0,32019050000.0,3.0,3.0,2.0,1992.5,7.5,1.0,1.0,5.0,3823.0,3.0,2.5,8.0,1.0,7.0,74.0,999.0,73.0,19.0,,,,1.0,,1.0,1.0,2019.0,6.0
75%,2019073000.0,32019070000.0,3.0,3.0,3.0,2007.75,11.75,2.0,10.0,5.0,3823.0,3.0,3.0,8.0,1.0,7.0,74.0,999.0,73.0,19.0,,,,2.0,,1.0,2.0,2019.0,8.0
max,2019098000.0,32019100000.0,3.0,3.0,3.0,2019.0,16.0,2.0,17.0,5.0,3823.0,3.0,4.0,9.0,1.0,7.0,74.0,999.0,73.0,19.0,,,,3.0,,3.0,4.0,2019.0,10.0


In [57]:
# Number of rows with `home_natural_area` == 999, to compare with the missing-label count.
i_all.loc[i_all['home_natural_area'].eq(999)].shape

(30, 49)

**Null conclusion**: all instances of `home_natural_area`== 999 are problematic

In [59]:
# Rank-wise count difference between `home_natural_area` and `home_natural_area_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='home_natural_area')

Shape of data: (91452, 49)


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
50    0
51    0
52    0
53    0
54    0
55    0
56    0
57    0
dtype: int64

## home_municipal_status and home_municipal_status_hebrew

In [60]:
# home_municipal_status           91133 non-null float64
# home_municipal_status_hebrew    91133 non-null object

In [61]:
# Rank-wise count difference between `home_municipal_status` and `home_municipal_status_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='home_municipal_status')

Shape of data: (91133, 49)


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
50    0
51    0
52    0
53    0
54    0
55    0
dtype: int64

## home_yishuv_shape and home_yishuv_shape_hebrew

In [62]:
# home_yishuv_shape               91482 non-null float64
# home_yishuv_shape_hebrew        91482 non-null object

In [63]:
# Rank-wise count difference between `home_yishuv_shape` and `home_yishuv_shape_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='home_yishuv_shape')

Shape of data: (91482, 49)


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
dtype: int64

## safety_measures_use and safety_measures_use_hebrew

In [64]:
# safety_measures_use             4937 non-null float64
# safety_measures_use_hebrew      4937 non-null object

In [65]:
# Rank-wise count difference between `safety_measures_use` and `safety_measures_use_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='safety_measures_use')

Shape of data: (4937, 49)


0    0
1    0
2    0
dtype: int64

## late_deceased and late_deceased_hebrew

In [66]:
# late_deceased                   327 non-null float64
# late_deceased_hebrew            327 non-null object

In [67]:
# Rank-wise count difference between `late_deceased` and `late_deceased_hebrew`.
calc_diff_counts_hebrew(data=i_all, feat_name='late_deceased')

Shape of data: (327, 49)


0    0
1    0
dtype: int64

## Categories with all values missing

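**Conclusion:** the following fields are completely empty (0 non-null values) in the 2019 data, for both the numeric column and its Hebrew counterpart: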

In [69]:
# hospital_time                   0 non-null float64
# hospital_time_hebrew            0 non-null object
# medical_type                    0 non-null float64
# medical_type_hebrew             0 non-null object
# release_dest                    0 non-null float64
# release_dest_hebrew             0 non-null object