In [225]:
# t - test analysis to find significant features of a specific road segment (or municipality) compared to all other road segments (or municipalities).
# output is list of streets / roads / road segments and their outstanding values


import pandas as pd
from scipy.stats import ttest_ind
import numpy as np

sample_size = 10000000 # choose 1000 to reduce runtime
raw_data = pd.read_csv("C://Users//user//PycharmProjects//anyway//data//views_2019//involved_markers_hebrew.csv",nrows=sample_size,low_memory=False)

In [229]:
# get list of top X groups of category Y (top 20 yishiuvim)
def get_top(number,category):
    top_list= raw_data.groupby(category).size().reset_index().rename(columns={0: "count"}).sort_values(by='count',ascending=False)
    return top_list.head(number)[category]

#segments to analyze (everything is optional, remove segments that create unuseful output
demographic_segments = [
    'is_male',
    'is_over_70',
    'is_under_14',
   # 'is_pedestrian'
   # ,'is_jew'
]
accident_segments = [
    'is_daytime',
    'is_summer',
    'is_weekend',
    'is_head-on',
    'is_roll-over',
    'is_hard',
    'is_deadly',
    'is_animal',
    'is_truck',
    'is_electric'
    
]



In [230]:
#data cleaning 
def clean_data(data,category):
    data['is_male'] = data['sex'].replace(0,np.nan).replace(2,0) #male = 1, female = 0, remove unknown
    data['is_over_70'] = data['age_group'].replace(99,np.nan) #remove unknown
    data['is_over_70'] = np.where(data['is_over_70']>14,1,0) #above code 14 is over 70
    data['is_under_14'] = data['age_group'].replace(99,np.nan) #remove unknown
    data['is_under_14'] = np.where(data['is_under_14']<4,1,0) #under code 4 is over 14
    data['is_weekend'] = np.where(data['day_in_week']>5,1,0) #Fri and Sat are weekends
    data['is_jew'] = data['population_type'].replace([2,3,4,''],0)#convert non jews to 0
    data['is_daytime'] = data['day_night'].replace(5,0)#convert night (5) to 0
    data['is_pedestrian'] = data['injured_type'].replace([2,3,4,5,6,7,8,9],0) #convert non pedestrian to 0, pedestrian to 1
    data['is_summer'] = data['accident_month'].replace([10,11,12,1,2,3],0).replace([4,5,6,7,8,9],1) #convert winter to 0 ,summer to 1
    data['is_head-on'] = np.where(data['accident_type']==5,1,0) # type 5 is headon (TEUNA HAZITIT)
    data['is_animal'] = np.where(data['accident_type']==19,1,0) # type 19 is animal crash
    data['is_hard'] = data['accident_severity'].replace(3,0).replace([1,2],1) # hard accidents will get 1
    data['is_deadly'] = data['accident_severity'].replace([3,2],0)# deadly accidents will get 1
    data['is_driver'] = data['injured_type'].replace([2,3,4,5,6,7,8,9],0) #convert non drivers to 0, drivers to 1
    data['is_roll-over'] = np.where(data['accident_type']==10,1,0)# type 10 is roll-over
    data['is_motorcycle'] = np.where((data['vehicle_vehicle_type']>7) & (data['vehicle_vehicle_type']<11),1,0) # numbers that represent motorcycle
    data['is_truck'] = np.where((data['vehicle_vehicle_type']>3) & (data['vehicle_vehicle_type']<8),1,0) # numbers that represent truck
    data['is_electric'] = np.where((data['vehicle_vehicle_type']>20) & (data['vehicle_vehicle_type']<24),1,0) # numbers that represent truck
   
    data['full_street1_hebrew'] = data['accident_yishuv_name'] +"_" + data['street1_hebrew'] # new column of city+street name
    
    data = data[data[category].notnull()] # filter for not null of input type (type of place)
    
    data = data[data['involved_type'] != 1] # only NIFGAIM, not drivers
    
    data = data[data['accident_year'] >2008] # play with the dates
    
    #set real values for the binary values in segments
    
    value_dict = {} 
    value_dict['is_male'] = ('נקבה','זכר')
    value_dict['is_over_70'] = ('מתחת_70','מעל_70')
    value_dict['is_under_14'] = ('מעל_14','מתחת_14')
    value_dict['is_weekend'] = ('ימי_השבוע','סוף_שבוע')
    value_dict['is_jew'] = ('לא_יהודי','יהודי')
    value_dict['is_daytime'] = ('שעות_הלילה','שעות_היום')
    value_dict['is_pedestrian'] = ('לא_הולך_רגל','הולך_רגל')
    value_dict['is_summer'] = ('חורף','קיץ')
    value_dict['is_head-on'] = ('לא_התנגשות_חזיתית','התנגשות_חזיתית')
    value_dict['is_roll-over'] = ('לא_התהפכות','התהפכות')
    value_dict['is_hard'] = ('פגיעה_קלה','פגיעה_קשה')
    value_dict['is_deadly'] = ('תאונה_לא_קטלנית','תאונה_קטלנית')
    value_dict['is_driver'] = ('לא_נהג','נהג')
    value_dict['is_animal'] = ('לא_פגיעה_בבעל_חיים','פגיעה_בבעל_חיים')
    value_dict['is_motorcycle'] = ('לא_אופנוע','אופנוע')
    value_dict['is_truck'] = ('לא_משאית','משאית')
    value_dict['is_electric'] = ('לא_דו_גלגלי_חשמלי','דו_גלגלי_חשמלי')
    
    #if category is intercity - compare only to intercity accidents, if its innercity - compare to innercity
    
    if category == 'road_segment_name':
        data = data[data['road_type_hebrew'].isin(['לא-עירונית לא בצומת','לא-עירונית בצומת'])] 
    if category == 'accident_yishuv_name':
        data = data[data['road_type_hebrew'].isin(['עירונית לא בצומת','עירונית בצומת'])]
    if category == 'full_street1_hebrew':
        data = data[data['road_type_hebrew'].isin(['עירונית לא בצומת','עירונית בצומת'])]
        
    # create data table that contains only unique accidents - for analysing accident segments
    
    data_unique = data.drop_duplicates(subset ="provider_and_id") 
    
    return data,data_unique,value_dict
    





In [231]:
# function for analysing segments
def analyse_segment(data,segment,location,category):
    result = []
    filtered_data = data[data[category]==location] #table of only the value chosen (MIKTA X for example)
    all_rows = data[segment].dropna().reset_index(drop=True) #only the segment column
    filtered_rows = filtered_data[segment].dropna() #only the segment column  - filtered data
    s,p = ttest_ind(all_rows,filtered_rows,nan_policy='omit') # ttest for the segment columns
    if p > 0.1 or np.isnan(p):
        return result
    percent_filtered = 'percent_of_filtered' #create column name for percent in segmented data
    count_filtered = "count_of_filtered"#create column name for count in segmented data
    acc_data_tmp = data.groupby(segment).size().reset_index().rename(columns={0: "count"}) #create groupby table
    acc_data_tmp['percent'] = acc_data_tmp["count"]/acc_data_tmp["count"].sum() # create percent column
    acc_data_tmp['percent'] = acc_data_tmp['percent'].astype(float).map(lambda n: '{:.0%}'.format(n)) #convert to percent format
    filtered_acc_data_tmp =  filtered_data.groupby(segment).size().reset_index().rename(columns={0: count_filtered})
    filtered_acc_data_tmp[percent_filtered] = \
        filtered_acc_data_tmp[count_filtered]/filtered_acc_data_tmp[count_filtered].sum()
    filtered_acc_data_tmp[percent_filtered] = filtered_acc_data_tmp[percent_filtered].astype(float).map(lambda n: '{:.0%}'.format(n))
    
    #create new table: rows: 0,1 (values of specific segment). columns: count of accidents, percent of accidents per total accidents and per filtered accidents
    final_table = pd.concat([acc_data_tmp, filtered_acc_data_tmp.drop(segment,axis = 1)], axis=1, sort=False)
    if final_table.isnull().values.any():
        return result
    for j in range(len(final_table)):
        filtered_count = final_table.loc[j,count_filtered]
        filtered_percent = float(final_table.loc[j,percent_filtered].strip('%'))/100
        original_percent = float(final_table.loc[j,'percent'].strip('%'))/100
        if original_percent == 0:
            return result
        if (filtered_percent - original_percent > 0.10 or filtered_percent/original_percent > 2) and filtered_percent > 0.04 and filtered_count > 20: #if the difference is significant  - tell me
            significant_segment = value_dict[segment][j]
            percents = 'Number of accidents: {:d} = {:.0%} vs. {:.0%}'.format(filtered_count,filtered_percent,original_percent)
            result.append(significant_segment)
            result.append(percents)
            return result
    

In [232]:
for category in ['full_street1_hebrew','road_segment_name','road1']:
    data,data_unique,value_dict = clean_data(raw_data,category)
    top_list = get_top(50,category)
    print("\n******************************************************")
    print("Category:",category)
    print("******************************************************")
    for i in top_list:
        list = []
        for j in demographic_segments:
            result = analyse_segment(data,j,i,category)
            if result:
                list.append(result)
        for j in accident_segments:
            result = analyse_segment(data_unique,j,i,category)
            if result:
                list.append(result)
        if list:
            print("=====================")
            print(i)
            print("=====================")
            for x in list:
                print (x)
            


******************************************************
Category: full_street1_hebrew
******************************************************


תל אביב -יפו_יפת
['סוף_שבוע', 'Number of accidents: 297 = 28% vs. 18%']


תל אביב -יפו_דרך שלמה
['זכר', 'Number of accidents: 704 = 71% vs. 56%']


ירושלים_בית חנינה החדשה
['מתחת_14', 'Number of accidents: 252 = 23% vs. 9%']



******************************************************
Category: road_segment_name
******************************************************


צומת יסיף - צומת כברי
['סוף_שבוע', 'Number of accidents: 400 = 32% vs. 20%']


צומת יבור - צומת חנא
['התנגשות_חזיתית', 'Number of accidents: 72 = 8% vs. 3%']


מחלף לקייה - צומת ערד
['התהפכות', 'Number of accidents: 61 = 7% vs. 3%']
['תאונה_קטלנית', 'Number of accidents: 41 = 5% vs. 1%']


צומת לאלפי מנשה - יישוב שכם
['התנגשות_חזיתית', 'Number of accidents: 97 = 12% vs. 3%']


כניסה למצפה שלם - צומת שדי תרומות
['התנגשות_חזיתית', 'Number of accidents: 79 = 12% vs. 3%']
['התהפכות', 'Number of accidents: 140 = 21% vs. 3%']
['פגיעה_קשה', 'Number of accidents: 99 = 15% vs. 6%']
['תאונה_קטלנית', 'Number of accidents: 34 = 5% vs. 1%']


מחלף בן שמן - ב' - מחלף נחשונים
['משאית', 'Number of accidents: 38 = 7% vs. 2%']


מחלף ינאי - מחלף אולגה
['סוף_שבוע', 'Number of accidents: 178 = 31% vs. 20%']


צומת דימונה - צומת בית אשל
['התהפכות', 'Number of accidents: 71 = 10% vs. 3%']



******************************************************
Category: road1
******************************************************


6.0
['משאית', 'Number of accidents: 244 = 6% vs. 2%']


60.0
['התנגשות_חזיתית', 'Number of accidents: 426 = 11% vs. 3%']


90.0
['התהפכות', 'Number of accidents: 343 = 12% vs. 3%']


25.0
['התהפכות', 'Number of accidents: 178 = 9% vs. 3%']


35.0
['התהפכות', 'Number of accidents: 75 = 7% vs. 3%']


31.0
['התהפכות', 'Number of accidents: 110 = 8% vs. 3%']
['פגיעה_קשה', 'Number of accidents: 149 = 11% vs. 5%']
['תאונה_קטלנית', 'Number of accidents: 62 = 5% vs. 1%']


55.0
['התנגשות_חזיתית', 'Number of accidents: 108 = 10% vs. 3%']


784.0
['התנגשות_חזיתית', 'Number of accidents: 79 = 9% vs. 3%']


805.0
['התנגשות_חזיתית', 'Number of accidents: 73 = 8% vs. 3%']


232.0
['התהפכות', 'Number of accidents: 89 = 12% vs. 3%']
['משאית', 'Number of accidents: 33 = 5% vs. 2%']


71.0
['התנגשות_חזיתית', 'Number of accidents: 43 = 9% vs. 3%']
['פגיעה_קשה', 'Number of accidents: 65 = 14% vs. 5%']


437.0
['התנגשות_חזיתית', 'Number of accidents: 81 = 14% vs. 3%']
574.0
['התנגשות_חזיתית', 'Number of accidents: 43 = 8% vs. 3%']


446.0
['התנגשות_חזיתית', 'Number of accidents: 51 = 9% vs. 3%']


505.0
['התנגשות_חזיתית', 'Number of accidents: 48 = 8% vs. 3%']
['התהפכות', 'Number of accidents: 44 = 8% vs. 3%']
['פגיעה_קשה', 'Number of accidents: 64 = 11% vs. 5%']


854.0
['פגיעה_בבעל_חיים', 'Number of accidents: 27 = 5% vs. 1%']
34.0
['התהפכות', 'Number of accidents: 48 = 10% vs. 3%']


672.0
['סוף_שבוע', 'Number of accidents: 137 = 31% vs. 20%']
['התנגשות_חזיתית', 'Number of accidents: 57 = 13% vs. 3%']
9998.0
['התהפכות', 'Number of accidents: 59 = 10% vs. 3%']
