In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the bare file name to get the complete path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/step0-cleaned-france-road-accident/Step0_Cleaned-_2025_01_30_1331.parquet


In [9]:
%%time
## all imports
import dask
import pyarrow.parquet as pq
import dask.dataframe as dd
import os
import shutil
import json
from enum import Enum
from datetime import datetime
from ydata_profiling import ProfileReport
from pathlib import Path
import random
import joblib


from dask.distributed import LocalCluster
##Metrics
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import mutual_info_score, adjusted_rand_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
import shap
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pydot

#For excel stuff
import openpyxl
from openpyxl.drawing.image import Image


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from shapely.geometry import Point


##h3
import h3



# Concurrency
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import time

#  Warnings
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings that may point at real problems.
warnings.filterwarnings('ignore')

# Set random state
# Shared seed handed to the estimators and to train_test_split further down.
random_state = 42
# Set figure size
# Default matplotlib figure size in inches for every figure created afterwards.
plt.rcParams["figure.figsize"] = (20, 20)
import seaborn as sns
sns.set_style("whitegrid")
# Paris coordinates (latitude, longitude)
# Used as the map centre in save_map.
paris_lat = 48.8566
paris_long = 2.3522


# Parquet file read by load_data (the file listed by the input-directory walk above).
parquet_file = '/kaggle/input/step0-cleaned-france-road-accident/Step0_Cleaned-_2025_01_30_1331.parquet'


from dask.distributed import Client
# Create a Dask distributed client with default arguments.
client = Client()

# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

from kaggle_secrets import UserSecretsClient
# Tokens are fetched from Kaggle's secret store rather than being written into the notebook.
user_secrets = UserSecretsClient()
# NOTE(review): secret_value_0 and mapbox_alt are not referenced in the visible part of the notebook.
secret_value_0 = user_secrets.get_secret("MAP_TILER_USER")
mapbox_alt = user_secrets.get_secret("Mapbox_Alt")
# Token passed to the plotly mapbox figures (save_map / save_heatmap).
mapbox_token = user_secrets.get_secret("MAPBOX_TOKEN_TEST")





class ExtensionMethods:
    """Static helpers for building timestamped file names and stripping file extensions."""

    @staticmethod
    def generate_filename(filename=None, extension=None):
        """Build a file name stamped with the current local time.

        The stamp has the form ``YYYY_MM_DD_HHMM`` (minute resolution), so two
        calls with the same arguments inside the same minute return the same name.

        Args:
            filename: Base name; converted with ``str()``.
            extension: Extension without the leading dot; converted with ``str()``.

        Returns:
            ``"<filename>_<stamp>.<extension>"`` when both arguments are given.
            If either one is ``None`` only the bare stamp is returned, so a
            ``filename`` passed without an ``extension`` is dropped.
        """
        stamp = datetime.now().strftime("%Y_%m_%d_%H%M")
        if filename is None or extension is None:
            return stamp
        return f"{filename}_{stamp}.{extension}"

    @staticmethod
    def get_file_name_without_extension(filename):
        """Return the final path component of ``filename`` without its last suffix.

        Args:
            filename: A path-like value, or ``None``.

        Returns:
            ``Path(filename).stem``, or the literal string ``"Provide a file"``
            when ``filename`` is ``None`` (no exception is raised).
        """
        # Identity check: `== None` would defer to a custom __eq__ on the argument.
        if filename is None:
            return "Provide a file"
        return Path(filename).stem


def create_parquet(data, filename='test'):
    """Write ``data`` to a timestamped, zstd-compressed parquet file in the working directory.

    Object and category columns are cast to ``str`` before writing. The cast is
    applied to a new frame, so the DataFrame passed in keeps its original dtypes.

    Args:
        data: DataFrame to export.
        filename: Base name of the file; a timestamp and ``.parquet`` are appended
            by ``ExtensionMethods.generate_filename``.

    Raises:
        ValueError: If ``data`` is ``None``.
    """
    if data is None:
        raise ValueError("Data can't be None for Parquet Creation")
    obj_cols = data.select_dtypes(
        include=['object', 'category'],
        exclude=['datetime', 'datetime64', 'datetimetz'],
    ).columns
    # DataFrame.astype returns a new frame; the previous per-column assignment
    # rewrote the caller's columns (and turned categories into plain strings) in place.
    exportable = data.astype({col: str for col in obj_cols})
    filepath = os.path.join(os.getcwd(), ExtensionMethods.generate_filename(f"{filename}", "parquet"))
    exportable.to_parquet(filepath, engine='pyarrow', compression="zstd", compression_level=10, index=False)
    print(f"\n Finished Saving parquet to: {filepath}")

CPU times: user 7.16 s, sys: 996 ms, total: 8.15 s
Wall time: 10.8 s


In [10]:
%%time
class CategoryBaseEnum(Enum):
    # Base class for the code table of a single column: members are NAME = code pairs.
    # A subclass without members reports IsCategory() == False.
    # Documented with '#' comments on purpose: get_description() returns __doc__,
    # so a class docstring here would change what it returns for this class.

    @classmethod
    def Name(cls):
        """Return the name of the enum class."""
        return cls.__name__

    @classmethod
    def IsCategory(cls):
        """Return True when the enum declares at least one member, otherwise False."""
        return len(cls.__members__) > 0

    @classmethod
    def get_description(cls):
        """Return the class docstring, or a placeholder text when it is missing/empty."""
        return cls.__doc__ or "No description available"

    @classmethod
    def to_dict(cls):
        """Return a ``{member name: member value}`` dict in definition order."""
        return dict((member.name, member.value) for member in cls)

    @classmethod
    def to_json(cls, indent=4):
        """Return ``to_dict()`` serialised as JSON with the given indentation."""
        mapping = cls.to_dict()
        return json.dumps(mapping, indent=indent)

    @classmethod
    def enum_keys(cls):
        """Return the member names as a list, in definition order."""
        return list(cls.to_dict())


class FileTypeEnum(Enum):
    # Base class for a whole file/table: every member stands for one column and its
    # value must provide to_dict() and IsCategory() (see the calls below).
    # Documented with '#' comments on purpose: get_description() returns __doc__.
    # NOTE(review): the subclasses in this notebook declare their members as nested
    # classes; whether a nested class becomes an enum member depends on the Python
    # version (3.13 reportedly changed this) - confirm on the target runtime.

    @classmethod
    def Name(cls):
        """Return the name of the enum class."""
        return cls.__name__

    @classmethod
    def get_description(cls):
        """Return the class docstring, or a placeholder text when it is missing/empty."""
        return cls.__doc__ or "No description available"

    @classmethod
    def to_dict(cls):
        """Return ``{column name: that column's to_dict()}`` for every member."""
        return dict((member.name, member.value.to_dict()) for member in cls)

    @classmethod
    def enum_keys(cls):
        """Return the member (column) names as a list, in definition order."""
        return list(member.name for member in cls)

    @classmethod
    def IsCategory(cls):
        """Return ``{column name: that column's IsCategory()}`` for every member."""
        return dict((member.name, member.value.IsCategory()) for member in cls)

    @classmethod
    def MegaDictionary(cls):
        """Return ``{column name: [to_dict(), IsCategory()]}`` for every member."""
        combined = {}
        for member in cls:
            column = member.value
            combined[member.name] = [column.to_dict(), column.IsCategory()]
        return combined

class RoadAccidentEnum(FileTypeEnum):
    '''Collection of all the categories for file type 'vehicle.csv' '''
    # Raw code tables, one nested class per column. load_data() keeps only the columns
    # named here, and convert_to_categories() treats every nested class that has
    # members as a categorical column.
    # Nested classes without members (accident_id, speed_limit, lat, ...) only carry a
    # description; their IsCategory() is False.
    # The nested docstrings are left untouched because get_description() returns them.
    # NOTE(review): this relies on nested classes being collected as enum members,
    # which depends on the Python version - confirm on the target runtime.

    class accident_id(CategoryBaseEnum):
        '''The Index/Number of the Crash follows the pattern yyyyxxxxx and is the index column'''
    class vehicle_id(CategoryBaseEnum):
        '''The vehicle id  in terms of xxx-xxx'''

    class vehicle_category(CategoryBaseEnum):
        '''The Category of Vehicle involved in the crash'''
        UNDETERMINED = 0
        BICYCLE = 1
        MOPED_LESS_EQUAL_50CC = 2
        MICROCAR = 3
        TOURISM_VEHICLE = 7
        UTILITY_VEHICLE_PTAC_1_5T_3_5T = 10 #1.5<PTAC<3.5
        HEAVY_TRUCK_PTAC_3_5T_7_5T = 13
        HEAVY_TRUCK_PTAC_OVER_7_5T = 14
        HEAVY_TRUCK_OVER_3_5T_WITH_TRAILER = 15
        TRACTOR_ONLY = 16
        TRACTOR_WITH_SEMI_TRAILER = 17
        SPECIAL_VEHICLE = 20
        AGRICULTURAL_TRACTOR = 21
        SCOOTER_LESS_EQUAL_50CC = 30
        MOTORCYCLE_50CC_125CC = 31
        SCOOTER_50CC_125CC = 32
        MOTORCYCLE_OVER_125CC = 33
        SCOOTER_OVER_125CC = 34
        LIGHT_QUAD_LESS_EQUAL_50CC = 35
        HEAVY_QUAD_OVER_50CC = 36
        BUS = 37
        COACH = 38
        TRAIN = 39
        TRAMWAY = 40
        THREE_WHEELED_VEHICLE_LESS_EQUAL_50CC = 41
        THREE_WHEELED_VEHICLE_50CC_125CC = 42
        THREE_WHEELED_VEHICLE_OVER_125CC = 43
        MOTORIZED_PERSONAL_TRANSPORT = 50
        NON_MOTORIZED_PERSONAL_TRANSPORT = 60
        OTHER_VEHICLE = 99

    class obstacle_static(CategoryBaseEnum):
        ''' The Static/Stationary Obstacle Hit'''
        UNKNOWN = 0
        PARKED_VEHICLE = 1
        TREE_ON_ROADSIDE = 2
        METAL_BARRIER = 3
        CONCRETE_BARRIER = 4
        OTHER_BARRIER=5
        BUILDING_WALL_BRIDGE_PIER = 6
        VERTICAL_SIGNPOST_OR_EMERGENCY_CALLBOX = 7
        POLE = 8
        URBAN_FURNITURE = 9
        PARAPET = 10
        REFUGE_ISLAND_BOLLARD = 11
        SIDEWALK = 12
        DITCH = 13
        OTHER_OBS_ON_ROADWAY = 14
        OTHER_OBS_ON_SIDEWALK = 15
        # Member name is misspelled ("WITOUT"); it is part of the public name, so it is kept.
        ROADWAY_EXIT_WITOUT_OBSTACLES = 16
        AQUEDUCT_HEAD = 17

    class obstacle_mobile(CategoryBaseEnum):
        '''The Dynamic Obstacle Hit'''
        UNKNOWN = 0
        PEDESTRIAN =1
        VEHICLE = 2
        RAIL_VEHICLE = 3
        ANIMAL_DOMESTIC = 4
        ANIMAL_WILD = 5
        OTHER = 9

    class impact_point(CategoryBaseEnum):
        '''The Initial Point of Impact of the crash'''
        UNKNOWN = 0
        FRONT = 1
        FRONT_LEFT = 2
        FRONT_RIGHT = 3
        REAR = 4
        REAR_RIGHT = 5
        REAR_LEFT = 6
        SIDE_LEFT = 7
        SIDE_RIGHT = 8
        MULTIPLE = 9


    class action(CategoryBaseEnum):
        '''The Main Action performed by the user before the crash'''
        UNKNOWN = 0
        CIRCULATING_NO_DIRECTION_CHANGE =1
        CIRCULATING_SAME_DIRECTION = 2
        CIRCULATION_BETWEEN_2_LANES = 3
        CIRCULATING_REVERSING = 4
        # Member name is misspelled ("AGAINTS"); it is part of the public name, so it is kept.
        CIRCULATING_AGAINTS_FLOW_TRAFFIC = 5
        CIRCULATING_CROSSING_MEDIAN_STRIP = 6
        CIRCULATING_IN_BUSLANE_SAME_DIRECTION = 7
        CIRCULATING_IN_BUSLANE_OPP_DIRECTION = 8
        CIRCULATING_INSERTION =9
        CIRCULATING_TURNING_AROUND_CARRIAGE_WAY = 10
        CHANGING_LANE_LEFT = 11
        CHANGING_LANE_RIGHT = 12
        DEPORT_LEFT = 13
        DEPORT_RIGHT = 14
        TURNING_LEFT = 15
        TURNING_RIGHT = 16
        OVERTAKING_LEFT = 17
        OVERTAKING_RIGHT = 18
        CROSSING_ROAD = 19
        PARKING_ACTION = 20
        AVOIDANCE_ACTION = 21
        DOOR_OPENED = 22
        STOP_NO_PARKING = 23
        PARKED_WITH_PASS = 24
        DRIVING_SIDEWALK = 25
        OTHER_ACTIONS = 26


    class motor(CategoryBaseEnum):
        '''The Type of Motor(In terms of fuel type) involved in the crash'''
        UNKNOWN = 0
        CONVENTIONAL_FUEL = 1
        HYBRID_ELECTRIC = 2
        ELECTRIC = 3
        HYDROGEN = 4
        HUMAN_POWERED = 5
        OTHER = 6

    class road(CategoryBaseEnum):
        '''Category of the road'''
        MOTORWAY = 1
        NATIONAL_ROAD = 2
        DEPARTMENTAL_ROAD = 3
        MUNICIPAL_ROAD = 4
        OFF_NETWORK = 5
        PARKING_AREA = 6
        URBAN_METROPOLE_ROAD = 7
        OTHER = 9

    class traffic(CategoryBaseEnum):
        '''Traffic regime:'''
        NOT_PROVIDED = -1
        ONE_WAY = 1
        BIDIRECTIONAL = 2
        SEPARATE_LANES = 3
        VARIABLE_LANE = 4


    class road_surface(CategoryBaseEnum):
        '''Surface condition'''
        NOT_PROVIDED = -1
        NORMAL = 1
        WET = 2
        PUDDLES = 3
        FLOODED = 4
        SNOWY = 5
        MUDDY = 6
        ICY = 7
        GREASE = 8
        OTHER = 9

    # NOTE(review): infra, situation and is_urban_area have members (so they count as
    # categorical) but no entry in road_accident_dictionary; their codes are not relabelled.
    class infra(CategoryBaseEnum):
        '''Development - Infrastructure:'''
        NOT_PROVIDED = -1
        NONE = 0
        UNDERGROUND = 1
        BRIDGE = 2
        INTERCHANGE_RAMP = 3
        RAILROAD = 4
        AMENAGED_CROSSROAD = 5
        PEDESTRIAN_ZONE = 6
        TOLL_ZONE = 7
        WORKZONE = 8
        OTHERS = 9


    class situation(CategoryBaseEnum):
        '''Situation of the accident'''
        NOT_PROVIDED = -1
        NONE = 0
        ON_ROADWAY = 1
        ON_EMERGENCY_LANE = 2
        ON_SHOULDER = 3
        ON_SIDEWALK = 4
        ON_CYCLE_PATH = 5
        ON_OTHER_SPECIAL_LANE = 6
        OTHERS = 8

    class speed_limit(CategoryBaseEnum):
        '''Maximum speed permitted at the location and time of the accident.'''


    class user_category(CategoryBaseEnum):
        '''User category'''
        UNDETERMINED = 0
        DRIVER = 1
        PASSENGER = 2
        PEDESTRIAN = 3


    class severity(CategoryBaseEnum):
        ''' Severity of the accident: The injured users are classified into three categories of victims plus the uninjured'''
        # Raw codes are not ordered by gravity (KILLED = 2 comes before MINOR_INJURY = 4).
        UNKNOWN = 0
        NO_INJURY = 1
        KILLED = 2
        INJURED_HOSPITALIZED = 3
        MINOR_INJURY = 4


    class sex(CategoryBaseEnum):
        '''User's gender'''
        UNKNOWN = 0
        MALE =1
        FEMALE = 2


    class dob(CategoryBaseEnum):
        '''Year of birth of the user'''

    class safety_equipment(CategoryBaseEnum):
        '''The existence of a safety equipment'''
        NOT_PROVIDED = -1
        NO_EQUIPMENT = 0
        SEATBELT = 1
        HELMET = 2
        CHILD_DEVICE = 3
        REFLECTIVE_VEST = 4
        AIRBAG_2W_3W = 5
        GLOVES_2W_3W = 6
        GLOVES_AIRBAG_2W_3W = 7
        NOT_DETERMINABLE = 8
        OTHER = 9


    # This nested name lives in the class namespace only; it does not replace the
    # module-level `datetime` imported at the top of the notebook.
    class datetime(CategoryBaseEnum):
        '''Date Time of the accident'''

    class lum(CategoryBaseEnum):
        '''Lighting conditions during the accident'''
        DAYLIGHT = 1
        DUSK_OR_DAWN = 2
        NIGHT_WITHOUT_PUBLIC_LIGHT = 3
        NIGHT_WITH_PUBLIC_LIGHT_OFF = 4
        NIGHT_WITH_PUBLIC_LIGHT_ON = 5


    class dep(CategoryBaseEnum):
        '''Department code (INSEE code for French departments)'''

    class is_urban_area(CategoryBaseEnum):
        '''Location of the accident: inside or outside the urban area'''
        OUTSIDE_URBAN_AREA = 1
        INSIDE_URBAN_AREA = 2


    class intersection(CategoryBaseEnum):
        '''Type of intersection where the accident occurred'''
        NO_INTERSECTION = 1
        X_INTERSECTION = 2
        T_INTERSECTION = 3
        Y_INTERSECTION = 4
        INTERSECTION_WITH_MORE_THAN_4_BRANCHES = 5
        ROUNDABOUT = 6
        SQUARE = 7
        RAILWAY_CROSSING = 8
        OTHER_INTERSECTION = 9


    class weather(CategoryBaseEnum):
        '''Weather conditions during the accident'''
        NOT_PROVIDED = -1
        NORMAL = 1
        LIGHT_RAIN = 2
        HEAVY_RAIN = 3
        SNOW_HAIL = 4
        FOG_SMOKE = 5
        STRONG_WIND_STORM = 6
        BLINDING_WEATHER = 7
        CLOUDY_WEATHER = 8
        OTHER = 9


    class collision_type(CategoryBaseEnum):
        '''Type of collision'''
        NOT_PROVIDED = -1
        TWO_VEHICLES_FRONT = 1
        TWO_VEHICLES_REAR = 2
        TWO_VEHICLES_SIDE = 3
        THREE_VEHICLES_CHAIN = 4
        THREE_VEHICLES_MULTIPLE_COLLISIONS = 5
        OTHER_COLLISION = 6
        NO_COLLISION = 7


    class lat(CategoryBaseEnum):
        '''Latitude of the accident location'''

    class long(CategoryBaseEnum):
        '''Longitude of the accident location'''


# Per-column mapping from raw integer codes to coarse group labels.
# Passed as a nested dict to DataFrame.replace() in convert_to_categories(), so the
# outer keys are column names and the inner dicts are {raw code: label}.
# The labels match the member names of the grouped `RoadAccident` enums.
road_accident_dictionary = {
    'vehicle_category': {
        0: 'UNKNOWN',
        99: 'UNKNOWN',
        1: 'LIGHT',
        2: 'LIGHT',
        3: 'LIGHT',
        30: 'LIGHT',
        50: 'LIGHT',
        60: 'LIGHT',
        7: 'MEDIUM',
        10: 'MEDIUM',
        31: 'MEDIUM',
        32: 'MEDIUM',
        33: 'MEDIUM',
        34: 'MEDIUM',
        35: 'MEDIUM',
        36: 'MEDIUM',
        41: 'MEDIUM',
        42: 'MEDIUM',
        43: 'MEDIUM',
        13: 'HEAVY',
        14: 'HEAVY',
        15: 'HEAVY',
        16: 'HEAVY',
        17: 'HEAVY',
        37: 'HEAVY',
        38: 'HEAVY',
        39: 'HEAVY',
        40: 'HEAVY',
        20: 'MISC',
        21: 'MISC',
        # NOTE(review): code 80 has no member in RoadAccidentEnum.vehicle_category - confirm its meaning.
        80: 'MISC',
    },

    "obstacle_static": {
        0: 'UNKNOWN',
        1: 'MEDIUM',
        2: 'HEAVY',
        3: 'MEDIUM',
        4: 'MEDIUM',
        5: 'MEDIUM',
        6: 'HEAVY',
        7: 'LIGHT',
        8: 'LIGHT',
        9: 'LIGHT',
        10: 'HEAVY',
        11: 'LIGHT',
        12: 'MEDIUM',
        13: 'HEAVY',
        14: 'MEDIUM',
        15: 'LIGHT',
        16: 'NO_OBSTACLE',
        17: 'HEAVY',
    },

    "obstacle_mobile": {
        0: 'UNKNOWN',
        -1: 'UNKNOWN',
        1: 'PEDESTRIAN',
        2: 'VEHICLE',
        3: 'VEHICLE',
        4: 'ANIMAL_OR_OTHER',
        # NOTE(review): code 6 has no member in RoadAccidentEnum.obstacle_mobile - confirm its meaning.
        6: 'ANIMAL_OR_OTHER',
        5: 'ANIMAL_OR_OTHER',
        9: 'ANIMAL_OR_OTHER',
    },

    "impact_point": {
        0: 'UNKNOWN',
        1: 'FRONT_IMPACT',
        2: 'FRONT_IMPACT',
        3: 'FRONT_IMPACT',
        4: 'REAR_IMPACT',
        5: 'REAR_IMPACT',
        6: 'REAR_IMPACT',
        7: 'SIDE_IMPACT',
        8: 'SIDE_IMPACT',
        9: 'MULTIPLE_IMPACT',
    },

    "action": {
        0: 'UNKNOWN',
        1: 'NORMAL_RISK',
        2: 'NORMAL_RISK',
        3: 'MEDIUM_RISK',
        4: 'MEDIUM_RISK',
        5: 'HIGH_RISK',
        6: 'HIGH_RISK',
        7: 'MEDIUM_RISK',
        8: 'MEDIUM_RISK',
        9: 'MEDIUM_RISK',
        10: 'MEDIUM_RISK',
        11: 'MEDIUM_RISK',
        12: 'MEDIUM_RISK',
        13: 'MEDIUM_RISK',
        14: 'MEDIUM_RISK',
        15: 'MEDIUM_RISK',
        16: 'MEDIUM_RISK',
        17: 'HIGH_RISK',
        18: 'HIGH_RISK',
        19: 'HIGH_RISK',
        20: 'STATIC',
        21: 'HIGH_RISK',
        22: 'STATIC',
        23: 'STATIC',
        24: 'STATIC',
        25: 'HIGH_RISK',
        26: 'HIGH_RISK',
    },

    "motor": {
        0: 'UNKNOWN',
        1: 'TRADITIONAL',
        2: 'ELECTRIC_HYBRID',
        3: 'ELECTRIC_HYBRID',
        4: 'OTHER',
        5: 'NON_MOTORIZED',
        6: 'OTHER',
    },

    "road": {
        0: 'UNKNOWN',
        1: 'HIGHWAY',
        2: 'HIGHWAY',
        3: 'URBAN_ROAD',
        7: 'URBAN_ROAD',
        4: 'URBAN_ROAD',
        5: 'OTHER',
        9: 'OTHER',
        6: 'OTHER',
    },

    "road_surface": {
        -1: 'UNKNOWN',
        1: 'NORMAL',
        9: 'NORMAL',
        2: 'MODERATE',
        3: 'MODERATE',
        4: 'SEVERE',
        5: 'SEVERE',
        6: 'SEVERE',
        7: 'SEVERE',
        8: 'MODERATE',
    },

    "traffic": {
        -1: 'UNKNOWN',
        1: 'ONE_WAY',
        2: 'BIDIRECTIONAL',
        3: 'SEPARATE_LANES',
        4: 'VARIABLE_LANE',
    },

    "user_category": {
        -1: 'UNKNOWN',
        0: 'UNKNOWN',
        1: 'DRIVER',
        2: 'PASSENGER',
        3: 'PEDESTRIAN',
    },

    "safety_equipment": {
        -1: 'UNKNOWN',
        0: 'NORMAL',
        1: 'GOOD',
        2: 'GOOD',
        3: 'BEST',
        4: 'BEST',
        5: 'BEST',
        6: 'GOOD',
        7: 'BEST',
        8: 'UNKNOWN',
        9: 'NORMAL',
    },

    "lum": {
        -1: 'UNKNOWN',
        1: 'DAYLIGHT',
        2: 'LOW_LIGHT',
        3: 'NIGHT_DARK',
        # Raw code 4 is "night, public lighting OFF" and 5 is "night, public lighting ON"
        # (see RoadAccidentEnum.lum); the two labels used to be swapped.
        4: 'NIGHT_DARK',
        5: 'NIGHT_LIT',
    },

    "intersection": {
        -1: 'UNKNOWN',
        1: 'NO_INTERSECTION',
        2: 'TYPICAL_INTERSECTIONS',
        3: 'TYPICAL_INTERSECTIONS',
        4: 'TYPICAL_INTERSECTIONS',
        5: 'COMPLEX_INTERSECTIONS',
        6: 'COMPLEX_INTERSECTIONS',
        7: 'COMPLEX_INTERSECTIONS',
        8: 'COMPLEX_INTERSECTIONS',
        9: 'COMPLEX_INTERSECTIONS',
    },

    "weather": {
        -1: 'UNKNOWN',
        1: 'LIGHT',
        2: 'LIGHT',
        3: 'MEDIUM',
        4: 'SEVERE',
        5: 'SEVERE',
        6: 'SEVERE',
        7: 'SEVERE',
        8: 'MEDIUM',
        9: 'LIGHT',
    },

    "collision_type": {
        -1: 'UNKNOWN',
        1: 'SIMPLE',
        2: 'SIMPLE',
        3: 'SIMPLE',
        4: 'COMPLEX',
        5: 'COMPLEX',
        6: 'COMPLEX',
        7: 'NO_COLLISION',
    },

    "severity": {
        -1: 'UNKNOWN',
        0: 'UNKNOWN',
        1: 'NO_INJURY',
        2: 'KILLED',
        3: 'MAJOR_INJURY',
        4: 'MINOR_INJURY',
    },

    'sex': {
        -1: 'UNKNOWN',
        0: 'UNKNOWN',
        1: 'MALE',
        2: 'FEMALE',
    },
}


class RoadAccident(FileTypeEnum):
    '''Collection of all the categories for file type 'vehicle.csv' '''
    # Grouped / ordinal version of the code tables: member names are the labels
    # produced by road_accident_dictionary and the values are the ordinal codes used by
    # convert_to_ordinal() (and inverted again by remap_to_human()).
    # Nested classes without members only describe a column; they are still used as the
    # column whitelist in clean_data_for_export().
    # The nested docstrings are returned by get_description(), so they are runtime text.

    class accident_id(CategoryBaseEnum):
        '''The Index/Number of the Crash follows the pattern yyyyxxxxx and is the index column'''

    class vehicle_id(CategoryBaseEnum):
        '''The vehicle id  in terms of xxx-xxx'''

    class vehicle_category(CategoryBaseEnum):
        '''The Category of Vehicle involved in the crash'''
        UNKNOWN = 0
        LIGHT = 1
        MEDIUM = 2
        HEAVY = 3
        MISC = 4

    class obstacle_static(CategoryBaseEnum):
        ''' The Static/Stationary Obstacle Hit'''
        UNKNOWN = 0
        NO_OBSTACLE = 1
        LIGHT = 2
        MEDIUM = 3
        HEAVY = 4

    class obstacle_mobile(CategoryBaseEnum):
        '''The Dynamic Obstacle Hit'''
        UNKNOWN = 0
        ANIMAL_OR_OTHER = 1
        VEHICLE = 2
        PEDESTRIAN = 3

    class impact_point(CategoryBaseEnum):
        '''The Initial Point of Impact of the crash'''
        UNKNOWN = 0
        FRONT_IMPACT = 1
        SIDE_IMPACT = 2
        REAR_IMPACT = 3
        MULTIPLE_IMPACT = 4

    class action(CategoryBaseEnum):
        '''The Main Action performed by the user before the crash'''
        UNKNOWN = 0
        STATIC = 1
        NORMAL_RISK = 2
        MEDIUM_RISK = 3
        HIGH_RISK = 4

    class motor(CategoryBaseEnum):
        '''The Type of Motor(In terms of fuel type) involved in the crash'''
        UNKNOWN = 0
        OTHER = 1
        TRADITIONAL = 2
        ELECTRIC_HYBRID = 3
        NON_MOTORIZED = 4

    class road(CategoryBaseEnum):
        '''Category of the road'''
        UNKNOWN = 0
        OTHER = 1
        URBAN_ROAD = 2
        HIGHWAY = 3

    class road_surface(CategoryBaseEnum):
        '''Surface condition'''
        UNKNOWN = 0
        NORMAL = 1
        MODERATE = 2
        SEVERE = 3

    class speed_limit(CategoryBaseEnum):
        '''Maximum speed permitted at the location and time of the accident.'''

    class user_category(CategoryBaseEnum):
        '''User category'''
        UNKNOWN = 0
        DRIVER = 1
        PASSENGER = 2
        PEDESTRIAN = 3

    class severity(CategoryBaseEnum):
        ''' Severity of the accident: The injured users are classified into three categories of victims plus the uninjured'''
        # Unlike the raw codes, these values increase with gravity; the train/test
        # preparation treats values >= 3 as the positive class.
        UNKNOWN = 0
        NO_INJURY = 1
        MINOR_INJURY = 2
        MAJOR_INJURY = 3
        KILLED = 4

    class sex(CategoryBaseEnum):
        '''User's gender'''
        UNKNOWN = 0
        MALE = 1
        FEMALE = 2

    class dob(CategoryBaseEnum):
        '''Year of birth of the user'''

    class safety_equipment(CategoryBaseEnum):
        '''The existence of a safety equipment'''
        UNKNOWN = 0
        NORMAL = 1
        GOOD = 2
        BEST = 3

    # Class-scoped name only; the module-level `datetime` import is unaffected.
    class datetime(CategoryBaseEnum):
        '''Date Time of the accident'''

    class lum(CategoryBaseEnum):
        '''Lighting conditions during the accident'''
        UNKNOWN = 0
        DAYLIGHT = 1
        NIGHT_LIT = 2
        LOW_LIGHT = 3
        NIGHT_DARK = 4

    class weather(CategoryBaseEnum):
        '''Weather conditions during the accident'''
        UNKNOWN = 0
        LIGHT = 1
        MEDIUM = 2
        SEVERE = 3

    class collision_type(CategoryBaseEnum):
        '''Type of collision'''
        UNKNOWN = 0
        NO_COLLISION = 1
        SIMPLE = 2
        COMPLEX = 3

    class lat(CategoryBaseEnum):
        '''Latitude of the accident location'''

    class long(CategoryBaseEnum):
        '''Longitude of the accident location'''

    # Class-scoped name only; the module-level `h3` import is unaffected.
    class h3(CategoryBaseEnum):
        '''H3 Index of the accident'''

    class age(CategoryBaseEnum):
        '''Age of the person at time of accident'''

    # The two descriptions below used to be copies of the `age` description.
    class risk_score(CategoryBaseEnum):
        '''Composite risk score of the record, min-max scaled to the range 1-4'''

    class accident_hex_count(CategoryBaseEnum):
        '''Number of accident records located in the same H3 cell'''

CPU times: user 7.98 ms, sys: 0 ns, total: 7.98 ms
Wall time: 7.89 ms


In [11]:
def load_data():
    """Read the cleaned parquet file and keep only the columns described by RoadAccidentEnum.

    Returns:
        DataFrame restricted to the columns of ``parquet_file`` whose names are
        keys of ``RoadAccidentEnum.to_dict()``; the file's column order is preserved.
    """
    schema = RoadAccidentEnum.to_dict()
    raw = pd.read_parquet(parquet_file)
    # Anything the enum does not describe is dropped.
    keep = [name for name in raw.columns if name in schema]
    return raw[keep]
    

In [12]:
def convert_to_categories(data):
    """Replace raw codes by their group labels and cast the categorical columns.

    Codes are relabelled with ``road_accident_dictionary``. Every column that
    ``RoadAccidentEnum`` marks as categorical *and* that exists in ``data`` then
    gets its missing values set to ``"UNKNOWN"`` and is converted to a pandas
    Categorical.

    Missing values are only filled in those categorical columns: filling the
    whole frame would write the string ``"UNKNOWN"`` into numeric / datetime
    columns (lat, long, speed_limit, dob, ...) and break later arithmetic on them.

    Args:
        data: DataFrame holding the raw integer codes.

    Returns:
        Tuple ``(converted_frame, categorical_column_names)``. The list only
        contains columns that are present in the frame.
    """
    data = data.replace(road_accident_dictionary)
    category_flags = RoadAccidentEnum.IsCategory()
    # Skip enum columns that are absent from the frame instead of raising KeyError.
    cat_cols = [col for col, is_cat in category_flags.items() if is_cat and col in data.columns]
    for col in cat_cols:
        data[col] = pd.Categorical(data[col].fillna("UNKNOWN"))
    return data, cat_cols
    

In [13]:
def check_and_drop_duplicates(data):
    """Report how many fully duplicated rows ``data`` has and return it without them.

    Args:
        data: DataFrame to check.

    Returns:
        ``data`` itself when no duplicates exist, otherwise ``data.drop_duplicates()``
        (first occurrence of each row is kept).
    """
    num_duplicates = len(data.loc[data.duplicated()])
    print("Duplicate Check:")
    print(f"Number of duplicate rows: {num_duplicates}")
    if num_duplicates == 0:
        return data
    print("\nDropping Duplicate Rows:")
    return data.drop_duplicates()

In [14]:
def count_plots(df,cat_cols):
    """Save one seaborn count plot per column in ``cat_cols`` as a timestamped PNG.

    Args:
        df: DataFrame holding the columns to plot.
        cat_cols: Iterable of column names.
    """
    for col in cat_cols:
        # countplot draws on (and returns) the current axes.
        ax = sns.countplot(data=df, x=col)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        # NOTE(review): no labelled artists are created above, so this legend is likely empty.
        ax.legend()
        target = ExtensionMethods.generate_filename(f"{col}_countplot", 'png')
        plt.savefig(target)
        # Close the figure so the next column starts from a fresh one.
        plt.close()

In [15]:
def pie_charts(df,cat_cols):
    """Save one pie chart of the value counts of each column in ``cat_cols`` as a PNG.

    Args:
        df: DataFrame holding the columns to plot.
        cat_cols: Iterable of column names.
    """
    for col in cat_cols:
        shares = df[col].value_counts()
        # Slice labels show the percentage with one decimal.
        shares.plot.pie(autopct='%1.1f%%')
        plt.title(f'Pie Chart of {col}')
        plt.legend()
        target = ExtensionMethods.generate_filename(f"Pie Chart_{col}", 'png')
        plt.savefig(target)
        plt.close()

In [16]:
def clean_lat_long(data, h3_resolution=4):
    """Drop rows outside the lat/long bounding box and attach H3 hexagon information.

    Rows are kept when 40 <= lat <= 60 and -10 <= long <= 10 (both ends inclusive).
    Three columns are added to the returned frame:

    * ``h3`` - H3 cell of the point at ``h3_resolution``
    * ``centroid`` - value returned by ``h3.cell_to_latlng`` for that cell
    * ``accident_hex_count`` - number of kept rows that share the cell

    The input frame is not modified.

    Args:
        data: DataFrame with numeric ``lat`` and ``long`` columns.
        h3_resolution: H3 resolution. The default of 4 is the value the notebook
            used; the original author noted that higher resolutions produce too
            many hexagons for kepler.gl.

    Returns:
        A new DataFrame with the filtered rows and the three extra columns.
    """
    LAT_MIN, LAT_MAX = 40.0, 60.0
    LONG_MIN, LONG_MAX = -10.0, 10.0
    in_bounds = data['lat'].between(LAT_MIN, LAT_MAX) & data['long'].between(LONG_MIN, LONG_MAX)
    # Copy the slice so the column assignments below never write into a view of the input.
    data = data[in_bounds].copy()
    # Iterate over the two coordinate columns only; a row-wise DataFrame.apply builds a
    # Series per row and misbehaves on an empty frame.
    data['h3'] = [
        h3.latlng_to_cell(lat, lng, h3_resolution)
        for lat, lng in zip(data['lat'], data['long'])
    ]
    data['centroid'] = data['h3'].apply(h3.cell_to_latlng)
    data['accident_hex_count'] = data.groupby('h3')['h3'].transform('count')
    return data

In [17]:
def get_rid_of_crap(data, max_age=90, max_speed_limit=140):
    """Add an ``age`` column and drop rows with implausible age or speed limit.

    ``age`` is the accident year (``datetime`` column) minus the year of birth
    (``dob`` column). A row is kept when ``0 < age <= max_age`` and
    ``0 < speed_limit <= max_speed_limit``. Rows where either value is missing
    fail the comparison and are dropped.

    The input frame is left untouched; the ``age`` column only exists on the
    returned frame.

    Args:
        data: DataFrame with ``datetime`` (datetime64), ``dob`` and ``speed_limit`` columns.
        max_age: Upper bound (inclusive) for the age; the default 90 is the
            cut-off the notebook used.
        max_speed_limit: Upper bound (inclusive) for the speed limit; the
            default 140 is the cut-off the notebook used.

    Returns:
        A new DataFrame with the ``age`` column and only the plausible rows.
    """
    age = data['datetime'].dt.year - data['dob']
    plausible_age = age.gt(0) & age.le(max_age)
    plausible_speed = data['speed_limit'].gt(0) & data['speed_limit'].le(max_speed_limit)
    # assign() builds a new frame instead of adding the column to the caller's object.
    return data.assign(age=age)[plausible_age & plausible_speed]

In [18]:
def save_map(data):
    """Render every accident as a point on a mapbox scatter map and save it as HTML.

    Args:
        data: DataFrame with ``lat``, ``long`` and a datetime64 ``datetime`` column.
    """
    accident_year = data['datetime'].dt.year
    # NOTE(review): animation_group is passed without animation_frame - confirm whether
    # a per-year animation (animation_frame) was intended.
    fig = px.scatter_mapbox(
        data,
        lat="lat",
        lon="long",
        zoom=5,
        center={"lat": paris_lat, "lon": paris_long},
        mapbox_style="carto-positron",
        title="Traffic Accidents from 2019-2023",
        opacity=0.5,
        animation_group=accident_year,
    )

    # Small, mostly transparent markers; these values replace the opacity given above.
    marker_style = {"size": 2, "color": '#e57373', "opacity": 0.3}
    fig.update_traces(marker=marker_style)
    fig.update_layout(mapbox_accesstoken=mapbox_token)
    target = ExtensionMethods.generate_filename("Traffic Accidents from 2019-2023", 'html')
    fig.write_html(target)
    

In [19]:
def save_heatmap(data):
    """Show a density map with one marker per H3 hexagon, sized and coloured by accident count.

    Args:
        data: DataFrame with ``accident_hex_count`` and ``centroid`` columns, where
            ``centroid`` holds ``(lat, long)`` pairs (as produced by ``clean_lat_long``).
    """
    # Every accident row of a hexagon carries the same (count, centroid) pair, so one
    # row per hexagon is enough; .copy() keeps the assignments below off the input.
    hex_counts = data[['accident_hex_count', 'centroid']].drop_duplicates().copy()
    hex_counts['lat'] = hex_counts['centroid'].apply(lambda latlng: latlng[0])
    hex_counts['long'] = hex_counts['centroid'].apply(lambda latlng: latlng[1])
    fig = px.scatter_mapbox(
        hex_counts,
        lat="lat",
        lon="long",
        size="accident_hex_count",
        color="accident_hex_count",
        color_continuous_scale="thermal",
        zoom=5,
        mapbox_style="carto-positron",
        title="Traffic Accidents Density Map",
    )

    # Only the opacity is overridden. A second update_traces copied from save_map used to
    # force size=2 and a fixed colour, which discarded the count-based size/colour encoding.
    fig.update_traces(marker=dict(opacity=1))
    fig.update_layout(mapbox_accesstoken=mapbox_token)
    fig.show()
    #fig.write_html(ExtensionMethods.generate_filename("HeatMaps from 2019-2023",'html'))

In [20]:
def y_data_profiling(data):
    """Build a ydata-profiling report for ``data`` and save it as a timestamped HTML file.

    Args:
        data: DataFrame to profile.
    """
    # The title and file name previously contained typos ("Profle", "Raod").
    profile = ProfileReport(data, title="Road Accident Profile Report")
    profile.to_file(ExtensionMethods.generate_filename("Road Accident Profile from 2019-2023", 'html'))
    

In [21]:
def drop_duplicates(data):
    """Report fully duplicated rows, print a sample of them and return ``data`` without them.

    Args:
        data: DataFrame to check.

    Returns:
        ``data`` itself when no duplicates exist, otherwise ``data.drop_duplicates()``
        (first occurrence of each row is kept).
    """
    repeated_rows = data.loc[data.duplicated()]
    num_duplicates = len(repeated_rows)
    print("Duplicate Check:")
    print(f"Number of duplicate rows: {num_duplicates}")
    if num_duplicates == 0:
        return data
    print("\nDuplicate Rows:")
    print(repeated_rows.head())
    print("\nDropping Duplicate Rows:")
    return data.drop_duplicates()

In [22]:
def convert_to_ordinal(data):
    """Replace the group labels of every categorical column by their ordinal codes.

    The label -> code tables come from ``RoadAccident.MegaDictionary()``; only
    entries flagged as categorical are used. Missing values in the whole frame
    become 0, the categorical columns are converted to numeric, and any -1 left
    in them is turned into 0.

    Args:
        data: DataFrame holding the group labels.

    Returns:
        The converted DataFrame.
    """
    label_to_code = {
        column: labels
        for column, (labels, is_category) in RoadAccident.MegaDictionary().items()
        if is_category
    }
    data = data.replace(label_to_code).fillna(0)
    for column in label_to_code:
        data[column] = pd.to_numeric(data[column]).replace(-1, 0)
    return data

In [23]:
def MinMaxScaled(data):
    """Add a ``<column>_scaled`` column (min-max scaled to 1..4) for every categorical column.

    The categorical columns are the entries of ``RoadAccident.MegaDictionary()``
    flagged as categorical. ``data`` is modified in place and also returned.

    Args:
        data: DataFrame holding the ordinal codes.

    Returns:
        The same DataFrame object with the extra columns.
    """
    category_columns = [
        column
        for column, (_, is_category) in RoadAccident.MegaDictionary().items()
        if is_category
    ]
    for column in category_columns:
        # A fresh scaler per column: each column is scaled on its own min/max.
        scaler = MinMaxScaler(feature_range=(1, 4))
        data[f'{column}_scaled'] = scaler.fit_transform(data[[column]])
    return data

In [24]:
def create_accident_score(data):
    """Add ``hex_score`` and ``accident_score`` columns, both min-max scaled to 1..4.

    ``hex_score`` is the scaled ``accident_hex_count``. ``accident_score`` is
    ``speed_limit + hex_score**2 + collision_type_scaled + impact_point_scaled``,
    scaled afterwards. ``data`` is modified in place and also returned.

    Args:
        data: DataFrame with the columns named above.

    Returns:
        The same DataFrame object with the two extra columns.
    """
    hex_scaler = MinMaxScaler(feature_range=(1, 4))
    data['hex_score'] = hex_scaler.fit_transform(data[['accident_hex_count']])
    # NOTE(review): speed_limit enters unscaled while the other terms are in 1..16 - confirm intent.
    raw_score = (
        data['speed_limit']
        + data['hex_score']**2
        + data['collision_type_scaled']
        + data['impact_point_scaled']
    )
    score_scaler = MinMaxScaler(feature_range=(1, 4))
    data['accident_score'] = score_scaler.fit_transform(raw_score.to_frame('accident_score'))
    return data

In [25]:
def create_god_score(data):
    """Add ``god_factor_score``: a weighted sum of the scaled environment columns.

    Weights: lum 0.2, weather 0.4, road 0.15, road_surface 0.25.
    ``data`` is modified in place and also returned.

    Args:
        data: DataFrame with the four ``*_scaled`` columns.

    Returns:
        The same DataFrame object with the extra column.
    """
    weighted_columns = (
        ('lum_scaled', 0.2),
        ('weather_scaled', 0.4),
        ('road_scaled', 0.15),
        ('road_surface_scaled', 0.25),
    )
    first_column, first_weight = weighted_columns[0]
    total = data[first_column] * first_weight
    # Terms are accumulated left to right, in the order listed above.
    for column, weight in weighted_columns[1:]:
        total = total + data[column] * weight
    data['god_factor_score'] = total
    return data

In [26]:
def create_user_score(data):
    """Add ``user_score``: a weighted sum of the scaled user/vehicle columns.

    Weights: vehicle_category 0.35, obstacle_mobile 0.10, action 0.20, and 0.35 for
    the inverted safety equipment value (column maximum minus the row's value).
    ``data`` is modified in place and also returned.

    Args:
        data: DataFrame with the four ``*_scaled`` columns.

    Returns:
        The same DataFrame object with the extra column.
    """
    safety = data['safety_equipment_scaled']
    # Better equipment should lower the score, hence the distance to the column maximum.
    lack_of_protection = safety.max() - safety
    score = data['vehicle_category_scaled'] * 0.35
    score = score + data['obstacle_mobile_scaled'] * 0.10
    score = score + data['action_scaled'] * 0.20
    data['user_score'] = score + lack_of_protection * 0.35
    return data

In [27]:
def risk_score(data):
    """Add ``risk_score`` = accident_score**2 + god_factor_score + user_score, scaled to 1..4.

    ``data`` is modified in place and also returned.

    Args:
        data: DataFrame with ``accident_score``, ``god_factor_score`` and ``user_score``.

    Returns:
        The same DataFrame object with the extra column.
    """
    combined = data['accident_score']**2 + data['god_factor_score'] + data['user_score']
    scaler = MinMaxScaler(feature_range=(1, 4))
    data['risk_score'] = scaler.fit_transform(combined.to_frame('risk_score'))
    return data

In [28]:
import seaborn as sns
import matplotlib.pyplot as plt


def do_plot(data):
    """Overlay the density estimates of ``severity`` and ``risk_score`` and save the figure as PNG.

    Args:
        data: DataFrame with numeric ``severity`` and ``risk_score`` columns.
    """
    curves = (('severity', 'Severity'), ('risk_score', 'Score'))
    for column, label in curves:
        sns.kdeplot(data[column], fill=True, label=label)
    plt.title("Risk Score vs Severity Score")
    plt.xlabel("Value")
    plt.ylabel("Density")
    plt.legend()
    target = ExtensionMethods.generate_filename("RiskScore_Vs_SeverityScore", 'png')
    plt.savefig(target)
    plt.close()

In [29]:
def clean_data_for_export(data):
    """Keep only the columns of ``data`` that are described by ``RoadAccident``.

    Args:
        data: DataFrame to trim.

    Returns:
        DataFrame with the columns whose names are keys of ``RoadAccident.to_dict()``,
        in their original order.
    """
    exportable = RoadAccident.to_dict()
    kept = [name for name in data.columns if name in exportable]
    return data[kept]

In [30]:
def remap_to_human(data):
    """Translate the ordinal codes of every categorical column back into their labels.

    The code -> label tables are the inverted ``RoadAccident`` tables. Missing
    values in the whole frame become ``"UNKNOWN"``.

    Args:
        data: DataFrame holding the ordinal codes.

    Returns:
        The relabelled DataFrame.
    """
    code_to_label = {}
    for column, (labels, is_category) in RoadAccident.MegaDictionary().items():
        if not is_category:
            continue
        # Flip {label: code} into {code: label}.
        code_to_label[column] = {code: label for label, code in labels.items()}
    return data.replace(code_to_label).fillna("UNKNOWN")

In [31]:
def process_and_save_data(data, create_parquet):
    """Export the full frame and one slice per accident year through ``create_parquet``.

    The callback is called as ``create_parquet(frame, name)``: first with the
    whole frame under the name ``'merged'``, then once per distinct year of the
    ``datetime`` column (in order of first appearance), named by that year.

    Args:
        data: DataFrame with a datetime64 ``datetime`` column.
        create_parquet: Callable taking ``(frame, name)``.
    """
    accident_year = data['datetime'].dt.year
    # All slices are built before the first export, exactly like the single frame.
    frames = {'merged': data}
    frames.update({year: data[accident_year == year] for year in list(accident_year.unique())})
    for name, frame in frames.items():
        create_parquet(frame, name)

In [32]:
def prepare_train_test_split(data):
    """Build a leakage-free train/test split for the binary severity classifier.

    The target is 1 when ``severity >= 3`` and 0 otherwise. The steps are:

    1. select the feature columns (on a copy, so ``data`` is not modified);
    2. turn -1 into 0 in the ordinal features and cast them to ``int``;
    3. split 80/20, stratified on the target, seeded with ``random_state``;
    4. fit the ``StandardScaler`` on the training rows only and apply it to both parts;
    5. oversample the minority class with SMOTE on the training rows only.

    Scaling and oversampling used to run before the split. That let statistics and
    synthetic neighbours of test rows leak into training and put synthetic rows
    into the test set, so the reported metrics were optimistic. The test set now
    holds only real, untouched observations.

    Args:
        data: DataFrame with the feature columns listed below and ``severity``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; only the training part is resampled.
    """
    features = ['vehicle_category', 'obstacle_mobile', 'impact_point', 'action', 'safety_equipment',
                'road_surface', 'situation', 'speed_limit', 'lum', 'weather', 'collision_type',
                'accident_hex_count']
    target = 'severity'
    ordinal_features = ['vehicle_category', 'obstacle_mobile', 'impact_point', 'action',
                        'safety_equipment', 'road_surface', 'situation', 'lum', 'weather',
                        'collision_type']
    num_features = ['speed_limit', 'accident_hex_count']

    # Work on a copy: assigning into data[features] would write into a slice of the input.
    X = data[features].copy()
    y = (data[target] >= 3).astype(int)

    X[ordinal_features] = X[ordinal_features].replace(-1, 0).astype(int)

    # Split first so nothing fitted below ever sees the test rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=random_state, stratify=y)
    X_train, X_test = X_train.copy(), X_test.copy()

    scaler = StandardScaler()
    X_train[num_features] = scaler.fit_transform(X_train[num_features])
    X_test[num_features] = scaler.transform(X_test[num_features])

    # Seeded so repeated runs give the same synthetic samples.
    smote = SMOTE(random_state=random_state)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test


In [33]:
#https://medium.com/@sidakmenyadik/save-and-load-machine-learning-models-with-joblib-in-python-kneighborsclassifier-e474512e2683
def gradboost_classifier(X_train,X_test,y_train,y_test):
    """Fit a gradient-boosting classifier via a 3-fold search and report on it.

    The search space holds a single candidate (``n_estimators=200``,
    ``max_depth=10``).  After fitting on the training data the best
    parameters are printed, the test set is scored, and the classification
    report is written as CSV to the path produced by
    ``ExtensionMethods.generate_filename("GradientBoostingClassifier", 'csv')``.

    Returns the fitted best estimator.
    """
    search_space = {
        'n_estimators': [200],
        'max_depth': [10],
    }
    searcher = RandomizedSearchCV(
        GradientBoostingClassifier(random_state=random_state),
        param_distributions=search_space,
        cv=3,
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)
    print(f"The best parameters: {searcher.best_params_}")
    # Persisting the estimator with joblib was tried and dropped: too heavy.

    winner = searcher.best_estimator_
    predicted = winner.predict(X_test)

    report_frame = pd.DataFrame(
        classification_report(y_test, predicted, output_dict=True)
    ).T
    report_frame.to_csv(
        ExtensionMethods.generate_filename("GradientBoostingClassifier",'csv'),
        index=True,
    )

    # SHAP explanation/plotting is intentionally not done in here; the kernel
    # could not cope with the load.
    return winner
    

In [34]:
#https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format
def rando_classifier(X_train,X_test,y_train,y_test):
    """Fit a random-forest classifier via a 3-fold search and report on it.

    Only one parameter combination is searched (``n_estimators=200``,
    ``max_depth=10``).  The best parameters are printed, predictions are made
    for ``X_test``, and the classification report is saved as CSV under the
    name returned by
    ``ExtensionMethods.generate_filename("RandomForestClassifier", 'csv')``.

    Returns the fitted best estimator.
    """
    candidate_grid = {
        'n_estimators': [200],
        'max_depth': [10],
    }
    tuner = RandomizedSearchCV(
        RandomForestClassifier(random_state=random_state),
        param_distributions=candidate_grid,
        cv=3,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)
    print(f"The best parameters: {tuner.best_params_}")

    forest = tuner.best_estimator_
    predictions = forest.predict(X_test)

    # output_dict=True gives a nested dict; transposed so each class is a row.
    summary = pd.DataFrame(
        classification_report(y_test, predictions, output_dict=True)
    ).T
    summary.to_csv(
        ExtensionMethods.generate_filename("RandomForestClassifier",'csv'),
        index=True,
    )

    # SHAP plotting is left out of this function.
    return forest

In [35]:
# Load-and-clean pipeline. Every step rebinds the global `data`, so the
# statements must run in exactly this order starting from a fresh load_data().
# The helpers are defined in earlier cells; their internals are not restated here.
data = load_data()
data = check_and_drop_duplicates(data)
data = clean_lat_long(data)
data = get_rid_of_crap(data)
# Value substitution driven by the notebook-level `road_accident_dictionary`.
data = data.replace(road_accident_dictionary)

data = convert_to_ordinal(data)
# Second duplicate pass: the recorded output of this cell reports 0 duplicates
# on the first check and 21 on this one, i.e. they only show up after the
# replace/ordinal steps above.
data = check_and_drop_duplicates(data)


Duplicate Check:
Number of duplicate rows: 0
Duplicate Check:
Number of duplicate rows: 21

Dropping Duplicate Rows:


In [36]:
# Scaling and score-building chain; each helper takes the frame and returns
# the frame that is bound back to `data`, so order matters and re-running this
# cell alone re-applies every step to already-processed data.
# NOTE(review): prepare_train_test_split later thresholds 'severity' at >= 3.
# If MinMaxScaled also rescales that column the threshold would never match -
# confirm which columns MinMaxScaled touches.
data = MinMaxScaled(data)
data = create_accident_score(data)
data = create_god_score(data)
data = create_user_score(data)
data = risk_score(data)
# Plotting step currently disabled.
#do_plot(data)


In [37]:
#%%time
#https://www.geeksforgeeks.org/ways-to-visualize-individual-decision-trees-in-a-random-forest/
#X_train, X_test, y_train, y_test = prepare_train_test_split(data)
#best_est_rand = rando_classifier(X_train, X_test, y_train, y_test)


The best parameters: {'n_estimators': 200, 'max_depth': 10}
CPU times: user 5min 43s, sys: 9.12 s, total: 5min 52s
Wall time: 8min 26s


In [None]:
%%time
# Build the train/test matrices from the processed `data` frame and fit the
# gradient-boosting model. `best_est`, `X_train` and `X_test` are reused by the
# explanation/visualisation code further down.
X_train, X_test, y_train, y_test = prepare_train_test_split(data)
best_est = gradboost_classifier(X_train, X_test, y_train, y_test)





# SHAP summary for the model currently bound to `best_est`. At this point in
# the notebook that is the estimator returned by gradboost_classifier, so the
# artifact is labelled accordingly (it was previously saved under a
# RandomForest name, which mislabelled the plot).
explainer = shap.TreeExplainer(best_est)
shap_values = explainer(X_test)
# show=False keeps the figure open so that savefig below captures it.
shap.summary_plot(shap_values,X_test,show=False)

plt.savefig(ExtensionMethods.generate_filename("GradientBoostingClassifierShapPlot",'png'))
plt.close()

# Render one individual tree of the fitted ensemble as a PNG.
# Requires the graphviz binaries: https://www.graphviz.org/download/
# These two imports are only needed for this visualisation step; `Path` is
# already imported in the notebook's import cell.
from pydotplus import graph_from_dot_data
from IPython.display import Image

# Two-subscript indexing matches GradientBoostingClassifier, whose
# `estimators_` is a 2-D array (boosting stage, tree per stage); a random
# forest exposes a flat list and would need `estimators_[42]` instead.
sub_tree_42 = best_est.estimators_[42, 0]

dot_data = export_graphviz(
    sub_tree_42,
    out_file=None, filled=True, rounded=True,
    # Passed as a plain list: some scikit-learn releases reject a pandas Index.
    feature_names=list(X_train.columns),
    special_characters=True,
    proportion=False, impurity=False, # enable them if you want
)
graph = graph_from_dot_data(dot_data)
png = graph.create_png()
# Save (optional)
Path('./out.png').write_bytes(png)
# Display
Image(png)

# NOTE(review): `grad_model` is not assigned anywhere in the visible part of
# this notebook - confirm it exists on a fresh kernel, or drop this rebinding
# and keep using the `best_est` returned by gradboost_classifier.
best_est = grad_model
explainer = shap.Explainer(best_est,X_train)
shap_values = explainer(X_test)
# show=False is required here: with the default, summary_plot displays and
# releases the figure, and the savefig call below would write an empty image.
shap.summary_plot(shap_values,X_test,show=False)

plt.savefig(ExtensionMethods.generate_filename("GradientBoosterShapPlot",'png'))
# Still display the plot inline, now that it has been written to disk.
plt.show()
plt.close()

# Random-forest hyper-parameter search over a 3 x 3 grid, scored with 3-fold CV.
# RandomForestClassifier, RandomizedSearchCV and classification_report come
# from the notebook's import cell, so they are not re-imported here.
# NOTE: this rebinds `best_est` and `y_pred` to the random-forest results.
params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 2, 5]}

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=random_state),
    param_distributions=params,
    cv=3,
    n_jobs=-1,
    # Seed the candidate sampling too, so the evaluation order (and therefore
    # tie-breaking between equally ranked candidates) is reproducible.
    random_state=random_state,
)
random_search.fit(X_train,y_train)

best_est = random_search.best_estimator_

y_pred = best_est.predict(X_test)

print(classification_report(y_test,y_pred))


In [None]:
#data = remap_to_human(data)

In [None]:
#data = clean_data_for_export(data)


In [None]:
#process_and_save_data(data,create_parquet)