In [7]:
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np


sample_size = 1000000 # choose 1000 to reduce runtime
raw_data = pd.read_csv("C://Users//user//PycharmProjects//anyway//data//views_2019//involved_markers_hebrew.csv",nrows=sample_size,low_memory=False)


In [25]:
input_type = 'road_segment_name' #place type: road1, accident_yishuv_name, 'road_segment_name',street1
input_value = 'כניסה למצפה שלם - צומת שדי תרומות'  #place name from news flash
filter1_column = 'road_type_hebrew' #(optional -  filter type, if not relevant write None)
filter1_value = 'לא-עירונית לא בצומת'#(optional -  filter value, if not relevant write None)
filter2_column = None   #(opional - another filter type, if not relevant write None)
filter2_value = None #(optional - another filter value, if not relevant write None)
hard_only = False# Only hard accidents?

segments_to_analyze = [
    'is_male',
    'is_over_65',
    'is_weekend',
    'is_jew',
    'is_daytime',
    'is_pedestrian',
    'is_summer',
    'is_head-on',
    'is_roll-over',
    'is_hard',
    'is_deadly',
]


In [26]:
#data cleaning 
def clean_data(data):
    data['is_male'] = data['sex'].replace(0,np.nan).replace(2,0) #male = 1, female = 0, remove unknown
    data['is_over_65'] = data['age_group'].replace(99,np.nan) #remove unknown
    data['is_over_65'] = np.where(data['is_over_65']>13,1,0) #above code 13 is over 65
    data['is_weekend'] = np.where(data['day_in_week']>5,1,0) #Fri and Sat are weekends
    data['is_jew'] = data['population_type'].replace([2,3,4,''],0)#convert non jews to 0
    data['is_daytime'] = data['day_night'].replace(5,0)#convert night (5) to 0
    data['is_pedestrian'] = data['injured_type'].replace([2,3,4,5,6,7,8,9],0) #convert non pedestrian to 0, pedestrian to 1
    data['is_summer'] = data['accident_month'].replace([10,11,12,1,2,3],0).replace([4,5,6,7,8,9],1) #convert winter to 0 ,summer to 1
    data['is_head-on'] = np.where(data['accident_type']==5,1,0) # type 5 is headon haziti
    data['is_hard'] = data['accident_severity'].replace(3,0).replace([1,2],1) # hard accidents will get 1
    data['is_deadly'] = data['accident_severity'].replace([3,2],0)# deadly accidents will get 1
    data['is_driver'] = data['injured_type'].replace([2,3,4,5,6,7,8,9],0) #convert non pedestrian to 0, pedestrian to 1
    data['is_roll-over'] = np.where(data['accident_type']==10,1,0)# type 10 is roll-over
    
    data = data[data[input_type].notnull()] # filter for not null of input type (type of place)
    data = data[data['involved_type'] != 1] # only NIFGAIM, not drivers
    
    value_dict = {} #set real values for the binary values in segments
    value_dict['is_male'] = ('female','male')
    value_dict['is_over_65'] = ('under_65','above_65')
    value_dict['is_weekend'] = ('weekday','weekend')
    value_dict['is_jew'] = ('non_jew','jew')
    value_dict['is_daytime'] = ('night-time','day-time')
    value_dict['is_pedestrian'] = ('not_pedestrian','pedestrian')
    value_dict['is_summer'] = ('winter','summer')
    value_dict['is_head-on'] = ('not head-on','head-on')
    value_dict['is_roll-over'] = ('not roll-over','roll-over')
    value_dict['is_hard'] = ('not hard accident','hard accident')
    value_dict['is_deadly'] = ('not deadly','deadly accident')
    value_dict['is_driver'] = ('not driver','driver')
    if filter1_value is not None:
        data = data[data[filter1_column] == filter1_value] # filter of 'road_type'
    if filter2_value is not None:
        data = data[data[filter2_column] == filter2_value] # filter of 'road_type'
    if hard_only:
        data = data[data['accident_severity']<3]  # filter of severity. under 3 its serious injury and death
    return data,value_dict

acc_data, value_dict = clean_data(raw_data)


In [27]:
def analyse_segment(data,i):
    message = None
    result = None  
    filtered_acc_data = data[data[input_type]==input_value] #table of only the value chosen (road number 90 for example)
    all_rows = data[i].dropna().reset_index(drop=True) #only the segment column
    filtered_rows = data[acc_data[input_type]==input_value][i].dropna() #only the segment column  - filtered
    s,p = ttest_ind(all_rows,filtered_rows) # ttest for thesegment columns
    #print("\n==Segment: [",i,"] ttest p-value is: ",np.round(p,3))
    if p > 0.1 or np.isnan(p):
        return message,result
    percent_filtered = 'percent_'+str(input_value) #create column name for percent in segmented data
    count_filtered = "count_"+str(input_value)#create column name for count in segmented data
    acc_data_tmp = data.groupby(i).size().reset_index().rename(columns={0: "count"}) #create groupby table
    acc_data_tmp['percent'] = acc_data_tmp["count"]/acc_data_tmp["count"].sum() # create percent column
    acc_data_tmp['percent'] = acc_data_tmp['percent'].astype(float).map(lambda n: '{:.0%}'.format(n)) #convert to percent format
    filtered_acc_data_tmp =  filtered_acc_data.groupby(i).size().reset_index().rename(columns={0: count_filtered})
    filtered_acc_data_tmp[percent_filtered] = \
        filtered_acc_data_tmp[count_filtered]/filtered_acc_data_tmp[count_filtered].sum()
    filtered_acc_data_tmp[percent_filtered] = filtered_acc_data_tmp[percent_filtered].astype(float).map(lambda n: '{:.0%}'.format(n))
    final_table = pd.concat([acc_data_tmp, filtered_acc_data_tmp.drop(i,axis = 1)], axis=1, sort=False)
    if final_table.isnull().values.any():
        return message,result
    for j in range(len(final_table)):
        filtered_percent = float(final_table.loc[j,percent_filtered].strip('%'))/100
        original_percent = float(final_table.loc[j,'percent'].strip('%'))/100
        if original_percent == 0:
            break 
        if filtered_percent/original_percent > 1.3: #if the difference is significant  - tell me
            message = "The percentage of %s is higher than average in %s (%s vs. %s)"\
                      %(value_dict[i][j],'{:.0%}'.format(filtered_percent/original_percent-1),'{:.0%}'.format(filtered_percent),'{:.0%}'.format(original_percent))
            result = final_table   
    return message,result
    

In [28]:

print("********General*******\n")
print("Checking significance for accidents when {%s} is {%s}"%(input_type,input_value))
print("Filters:")
if filter1_value is not None:
    print("%s = %s"%(filter1_column,filter1_value))
if filter2_value is not None:
    print("%s = %s"%(filter2_column,filter2_value))
if hard_only:
    print("accident severity = hard")
print("\n*******Significant segments in words*******\n")
results_container = []
for i in segments_to_analyze:
    message,result = analyse_segment(acc_data,i)
    if message is not None:
        print(message)
    if result is not None:
        results_container.append(result)
print("\n*******Significant segments in details*******\n")
for r in results_container:
    print(r.to_string(index=False))
print("\n")


********General*******

Checking significance for accidents when {road_segment_name} is {כניסה למצפה שלם - צומת שדי תרומות}
Filters:
road_type_hebrew = לא-עירונית לא בצומת

*******Significant segments in words*******



The percentage of pedestrian is higher than average in 100% (2% vs. 1%)
The percentage of head-on is higher than average in 217% (19% vs. 6%)


The percentage of roll-over is higher than average in 475% (23% vs. 4%)
The percentage of hard accident is higher than average in 200% (24% vs. 8%)
The percentage of deadly accident is higher than average in 267% (11% vs. 3%)

*******Significant segments in details*******

 is_pedestrian   count percent  count_כניסה למצפה שלם - צומת שדי תרומות percent_כניסה למצפה שלם - צומת שדי תרומות
             0  116259     99%                                      908                                       98%
             1    1269      1%                                       23                                        2%
 is_head-on   count percent  count_כניסה למצפה שלם - צומת שדי תרומות percent_כניסה למצפה שלם - צומת שדי תרומות
          0  110858     94%                                      753                                       81%
          1    6670      6%                                      178                                       19%
 is_roll-over   count percent  count_כניסה למצפה של


 is_hard   count percent  count_כניסה למצפה שלם - צומת שדי תרומות percent_כניסה למצפה שלם - צומת שדי תרומות
       0  107840     92%                                      711                                       76%
       1    9688      8%                                      220                                       24%
 is_deadly   count percent  count_כניסה למצפה שלם - צומת שדי תרומות percent_כניסה למצפה שלם - צומת שדי תרומות
         0  114531     97%                                      830                                       89%
         1    2997      3%                                      101                                       11%


