In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/france-road-merged/merged-vehicles-users-places-characteristics_2025_01_06_2138.csv
/kaggle/input/france-road-merged/preprocessedMerged-_2025_01_20_1655.parquet
/kaggle/input/france-road-merged/merged-vehicles-users-places-characteristics_2025_01_06_2141.parquet
/kaggle/input/france-road-merged/merged-vehicles-users-places-characteristics_2025_01_06_2140.feather


In [25]:
%%time
## all imports
# NOTE: this cell also uses `pd` (last line), which is imported in the first cell of the
# notebook, so that cell has to run before this one.
import dask
import pyarrow.parquet as pq
import dask.dataframe as dd
import os  # already imported in the first cell; repeated here
import shutil
import json
from enum import Enum
from datetime import datetime
from ydata_profiling import ProfileReport
from pathlib import Path
import random


from dask.distributed import LocalCluster
##Metrics
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import mutual_info_score, adjusted_rand_score
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

import plotly.express as px
# Excel export helpers
import openpyxl
from openpyxl.drawing.image import Image


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # duplicate of the import a few lines above

from plotly_resampler import register_plotly_resampler

# Call the register function once; according to the plotly_resampler docs, all
# Figures/FigureWidgets are then wrapped according to its `mode` argument.
register_plotly_resampler(mode='auto')

# Concurrency
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import time

# Warnings
import warnings
# Silences every warning category for the rest of the session, including deprecation
# and pandas copy warnings raised by later cells.
warnings.filterwarnings('ignore')

# Seed value intended for reproducibility. It is only stored here; nothing in this cell
# passes it to `random`, numpy or any estimator.
random_state = 42
# Default matplotlib figure size (width, height) in inches
plt.rcParams["figure.figsize"] = (20, 20)

# Kaggle input files: the raw merged CSV and the preprocessed parquet version of it.
csv_file = '/kaggle/input/france-road-merged/merged-vehicles-users-places-characteristics_2025_01_06_2138.csv'
pre_file = '/kaggle/input/france-road-merged/preprocessedMerged-_2025_01_20_1655.parquet'


from dask.distributed import Client
# Creates a Dask distributed client with default arguments every time this cell runs.
# NOTE(review): presumably this starts a new local cluster on each re-run — confirm and
# consider reusing or closing the previous client.
client = Client()

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)



class ExtensionMethods:
    """Small file-name helpers used when writing notebook outputs."""

    @staticmethod
    def generate_filename(filename=None,extension=None):
        """Build a file name stamped with the current local time.

        Args:
            filename: Base name to put in front of the timestamp.
            extension: File extension without the leading dot.

        Returns:
            ``"<filename>_<YYYY_MM_DD_HHMM>.<extension>"`` when both arguments are
            given; otherwise only the ``"YYYY_MM_DD_HHMM"`` timestamp string.
        """
        timestamp = datetime.now().strftime("%Y_%m_%d_%H%M")
        if filename is None or extension is None:
            return timestamp
        return f"{filename}_{timestamp}.{extension}"

    @staticmethod
    def get_file_name_without_extension(filename):
        """Return the final path component of ``filename`` without its last suffix.

        Args:
            filename: A path string or path-like object, or ``None``.

        Returns:
            The stem of the path (``"a/b/c.tar.gz"`` -> ``"c.tar"``), or the
            placeholder string ``"Provide a file"`` when ``filename`` is ``None``.
        """
        # Identity check: `== None` would also fire for objects whose __eq__ is permissive.
        if filename is None:
            return "Provide a file"
        return Path(filename).stem


def create_parquet(data, prefix="Step0_Cleaned-", output_dir=None):
    """Write ``data`` to a timestamped, zstd-compressed parquet file.

    Object-dtype columns are cast to ``str`` before writing. The cast is done on a
    copy, so the DataFrame passed in by the caller is left unchanged.

    Args:
        data: DataFrame to save.
        prefix: Base name placed in front of the timestamp in the file name.
        output_dir: Directory to write into; defaults to the current working directory.

    Returns:
        The path of the parquet file that was written.

    Raises:
        ValueError: If ``data`` is ``None``.
    """
    if data is None:
        raise ValueError("Data can't be None for Parquet Creation")

    # Work on a copy: casting in place would silently change the caller's frame.
    frame = data.copy()
    obj_cols = frame.select_dtypes(include=['object']).columns
    for col in obj_cols:
        # NOTE: astype(str) also turns missing values into literal strings such as 'nan'.
        frame[col] = frame[col].astype(str)

    target_dir = os.getcwd() if output_dir is None else os.fspath(output_dir)
    filepath = os.path.join(target_dir, ExtensionMethods.generate_filename(prefix, "parquet"))
    frame.to_parquet(filepath, engine='pyarrow', compression="zstd", compression_level=10, index=False)
    print(f"\n Finished Saving parquet to: {filepath}")
    return filepath

CPU times: user 808 ms, sys: 222 ms, total: 1.03 s
Wall time: 2.8 s


In [36]:
%%time
# Base class for the per-column code enums: each subclass is named after one column and
# its members map a readable label to the code used in that column.
class CategoryBaseEnum(Enum):
    @classmethod
    def Name(cls):
        """Return the name of the enum class."""
        return cls.__name__

    @classmethod
    def IsCategory(cls):
        """Return True when the enum defines at least one member (aliases included)."""
        return len(cls.__members__) > 0

    @classmethod
    def get_description(cls):
        """Return the class docstring, or a placeholder when it is missing or empty."""
        description = cls.__doc__
        if description:
            return description
        return "No description available"

    @classmethod
    def to_dict(cls):
        """Return ``{member_name: member_value}`` in definition order."""
        mapping = {}
        for entry in cls:
            mapping[entry.name] = entry.value
        return mapping

    @classmethod
    def to_json(cls, indent=4):
        """Serialise :meth:`to_dict` as a JSON string using the given indent."""
        payload = cls.to_dict()
        return json.dumps(payload, indent=indent)

    @classmethod
    def enum_name_list(cls):
        """Return the member names in definition order."""
        names = []
        for entry in cls:
            names.append(entry.name)
        return names


# Base class for a file-level code book: subclasses declare one nested category enum
# per column, and the helpers below expose those nested enums by name.
class FileTypeEnum(Enum):
    @classmethod
    def _nested_categories(cls):
        """Return ``{name: nested_enum_class}`` in definition order.

        Older Python versions turn classes nested in an Enum body into members, so
        iterating the enum yields them. From Python 3.13 nested classes stay plain
        class attributes, and iterating the enum alone would yield nothing. Both
        layouts are collected here so callers get the same result either way.
        """
        found = {member.name: member.value for member in cls}
        for attr_name, attr in vars(cls).items():
            if attr_name in found:
                continue
            # Only nested Enum subclasses count; helper attributes are skipped.
            if isinstance(attr, type) and issubclass(attr, Enum):
                found[attr_name] = attr
        return found

    @classmethod
    def Name(cls):
        """Return the name of the enum class."""
        return cls.__name__

    @classmethod
    def get_description(cls):
        """Return the class docstring, or a placeholder when it is missing or empty."""
        return cls.__doc__ or "No description available"

    @classmethod
    def to_dict(cls):
        """Return ``{category_name: category.to_dict()}`` for every nested category."""
        return {
            name: category.to_dict()
            for name, category in cls._nested_categories().items()
        }

    @classmethod
    def enum_name_list(cls):
        """Return the names of the nested categories in definition order."""
        return list(cls._nested_categories())
    


# Code book for the accident table, keyed by the original (French) column names.
# Each nested class is named after one column; its members, when present, map a readable
# label to the code used in that column. Nested classes that contain only a docstring
# (e.g. Num_Acc, vma, lat) define no codes, so CategoryBaseEnum.IsCategory() returns
# False for them.
class Road_Accident_Enum(FileTypeEnum):
    '''Collection of all the categories for file type 'vehicle.csv' '''
    # NOTE(review): the docstring names 'vehicle.csv', but the nested classes below also
    # describe user, place and characteristic columns (catu, catr, lum, ...) — confirm wording.

    class Num_Acc(CategoryBaseEnum):
        '''The Index/Number of the Crash follows the pattern yyyyxxxxx and is the index column'''
    class id_vehicule(CategoryBaseEnum):
        '''The vehicle id  in terms of xxx-xxx'''

    class catv(CategoryBaseEnum): # vehicle category
        '''The Category of Vehicle involved in the crash'''
        UNDETERMINED = 0
        BICYCLE = 1
        MOPED_LESS_EQUAL_50CC = 2
        MICROCAR = 3
        TOURISM_VEHICLE = 7
        UTILITY_VEHICLE_PTAC_1_5T_3_5T = 10 # 1.5 < PTAC < 3.5
        HEAVY_TRUCK_PTAC_3_5T_7_5T = 13
        HEAVY_TRUCK_PTAC_OVER_7_5T = 14
        HEAVY_TRUCK_OVER_3_5T_WITH_TRAILER = 15
        TRACTOR_ONLY = 16
        TRACTOR_WITH_SEMI_TRAILER = 17
        SPECIAL_VEHICLE = 20
        AGRICULTURAL_TRACTOR = 21
        SCOOTER_LESS_EQUAL_50CC = 30
        MOTORCYCLE_50CC_125CC = 31
        SCOOTER_50CC_125CC = 32
        MOTORCYCLE_OVER_125CC = 33
        SCOOTER_OVER_125CC = 34
        LIGHT_QUAD_LESS_EQUAL_50CC = 35
        HEAVY_QUAD_OVER_50CC = 36
        BUS = 37
        COACH = 38
        TRAIN = 39
        TRAMWAY = 40
        THREE_WHEELED_VEHICLE_LESS_EQUAL_50CC = 41
        THREE_WHEELED_VEHICLE_50CC_125CC = 42
        THREE_WHEELED_VEHICLE_OVER_125CC = 43
        MOTORIZED_PERSONAL_TRANSPORT = 50
        NON_MOTORIZED_PERSONAL_TRANSPORT = 60
        OTHER_VEHICLE = 99

    class obs(CategoryBaseEnum):  # static obstacle hit
        ''' The Static/Stationary Obstacle Hit'''
        UNKNOWN = 0
        PARKED_VEHICLE = 1
        TREE_ON_ROADSIDE = 2
        METAL_BARRIER = 3
        CONCRETE_BARRIER = 4
        OTHER_BARRIER=5
        BUILDING_WALL_BRIDGE_PIER = 6 # building, wall or bridge pier
        VERTICAL_SIGNPOST_OR_EMERGENCY_CALLBOX = 7
        POLE = 8
        URBAN_FURNITURE = 9
        PARAPET = 10
        REFUGE_ISLAND_BOLLARD = 11 # road island / bollard
        SIDEWALK = 12 # sidewalk or curb
        DITCH = 13 # ditch or embankment
        OTHER_OBS_ON_ROADWAY = 14
        OTHER_OBS_ON_SIDEWALK = 15
        ROADWAY_EXIT_WITOUT_OBSTACLES = 16
        AQUEDUCT_HEAD = 17

    class obsm(CategoryBaseEnum): # mobile obstacle hit
        '''The Dynamic Obstacle Hit'''
        # NOTE(review): the official data dictionary appears to use 4 / 5 / 6 (not 3 / 4 / 5)
        # for rail vehicle / domestic animal / wild animal — verify before decoding with this.
        UNKNOWN = 0
        PEDESTRIAN =1
        VEHICLE = 2
        RAIL_VEHICLE = 3
        ANIMAL_DOMESTIC = 4
        ANIMAL_WILD = 5
        OTHER = 9

    class choc(CategoryBaseEnum): # initial point of impact
        '''The Initial Point of Impact of the crash'''
        # NOTE(review): the official data dictionary appears to list "right" before "left"
        # for codes 2/3 and 7/8 (as this table already does for 5/6) — verify the labels.
        UNKNOWN = 0
        FRONT = 1
        FRONT_LEFT = 2
        FRONT_RIGHT = 3
        REAR = 4
        REAR_RIGHT = 5
        REAR_LEFT = 6
        SIDE_LEFT = 7
        SIDE_RIGHT = 8
        MULTIPLE = 9


    class manv(CategoryBaseEnum): # main manoeuvre before the crash
        '''The Main Action performed by the user before the crash'''
        UNKNOWN = 0
        CIRCULATING_NO_DIRECTION_CHANGE =1
        CIRCULATING_SAME_DIRECTION = 2 # same direction, same lane
        CIRCULATION_BETWEEN_2_LANES = 3
        CIRCULATING_REVERSING = 4
        CIRCULATING_AGAINTS_FLOW_TRAFFIC = 5
        CIRCULATING_CROSSING_MEDIAN_STRIP = 6
        CIRCULATING_IN_BUSLANE_SAME_DIRECTION = 7
        CIRCULATING_IN_BUSLANE_OPP_DIRECTION = 8
        CIRCULATING_INSERTION =9
        CIRCULATING_TURNING_AROUND_CARRIAGE_WAY = 10
        CHANGING_LANE_LEFT = 11
        CHANGING_LANE_RIGHT = 12
        DEPORT_LEFT = 13
        DEPORT_RIGHT = 14
        TURNING_LEFT = 15
        TURNING_RIGHT = 16
        OVERTAKING_LEFT = 17
        OVERTAKING_RIGHT = 18
        # Miscellaneous actions (codes 19-26)
        CROSSING_ROAD = 19
        PARKING_ACTION = 20
        AVOIDANCE_ACTION = 21
        DOOR_OPENED = 22
        STOP_NO_PARKING = 23
        PARKED_WITH_PASS = 24 # parked with passengers
        DRIVING_SIDEWALK = 25
        OTHER_ACTIONS = 26


    class motor(CategoryBaseEnum): # engine / energy type
        '''The Type of Motor(In terms of fuel type) involved in the crash'''
        UNKNOWN = 0
        CONVENTIONAL_FUEL = 1 # petrol, diesel, etc.
        HYBRID_ELECTRIC = 2
        ELECTRIC = 3
        HYDROGEN = 4
        HUMAN_POWERED = 5
        OTHER = 6

    class catr(CategoryBaseEnum):
        '''Category of the road'''
        MOTORWAY = 1
        NATIONAL_ROAD = 2
        DEPARTMENTAL_ROAD = 3
        MUNICIPAL_ROAD = 4
        OFF_NETWORK = 5
        PARKING_AREA = 6
        URBAN_METROPOLE_ROAD = 7
        OTHER = 9

    class circ(CategoryBaseEnum):
        '''Traffic regime:'''
        NOT_PROVIDED = -1
        ONE_WAY = 1
        BIDIRECTIONAL = 2
        SEPARATE_LANES = 3
        VARIABLE_LANE = 4


    class surf(CategoryBaseEnum):
        '''Surface condition'''
        NOT_PROVIDED = -1
        NORMAL = 1
        WET = 2
        PUDDLES = 3
        FLOODED = 4
        SNOWY = 5
        MUDDY = 6
        ICY = 7
        GREASE = 8
        OTHER = 9

    class infra(CategoryBaseEnum):
        '''Development - Infrastructure:'''
        NOT_PROVIDED = -1
        NONE = 0
        UNDERGROUND = 1
        BRIDGE = 2
        INTERCHANGE_RAMP = 3
        RAILROAD = 4
        AMENAGED_CROSSROAD = 5
        PEDESTRIAN_ZONE = 6
        TOLL_ZONE = 7
        WORKZONE = 8
        OTHERS = 9


    class situ(CategoryBaseEnum):
        '''Situation of the accident'''
        NOT_PROVIDED = -1
        NONE = 0
        ON_ROADWAY = 1
        ON_EMERGENCY_LANE = 2
        ON_SHOULDER = 3
        ON_SIDEWALK = 4
        ON_CYCLE_PATH = 5
        ON_OTHER_SPECIAL_LANE = 6
        OTHERS = 8

    class vma(CategoryBaseEnum):
        '''Maximum speed permitted at the location and time of the accident.'''


    class catu(CategoryBaseEnum):
        '''User category'''
        UNDETERMINED = 0
        DRIVER = 1
        PASSENGER = 2
        PEDESTRIAN = 3


    class grav(CategoryBaseEnum):
        ''' Severity of the accident: The injured users are classified into three categories of victims plus the uninjured'''
        UNKNOWN = 0
        NO_INJURY = 1
        KILLED = 2
        INJURED_HOSPITALIZED = 3
        MINOR_INJURY = 4


    class sexe(CategoryBaseEnum):
        '''User's gender'''
        UNKNOWN = 0
        MALE =1
        FEMALE = 2


    class an_nais(CategoryBaseEnum):
        '''Year of birth of the user'''

    class secu1(CategoryBaseEnum):
        '''The existence of a safety equipment'''
        NOT_PROVIDED = -1
        NO_EQUIPMENT = 0
        SEATBELT = 1
        HELMET = 2
        CHILD_DEVICE = 3
        REFLECTIVE_VEST = 4
        AIRBAG_2W_3W = 5
        GLOVES_2W_3W = 6
        GLOVES_AIRBAG_2W_3W = 7
        NOT_DETERMINABLE = 8
        OTHER = 9


    class jour(CategoryBaseEnum):
        '''Day of the accident'''

    class mois(CategoryBaseEnum):
        '''Month of the accident'''

    class an(CategoryBaseEnum):
        '''Year of the accident'''

    class hrmn(CategoryBaseEnum):
        '''Hour and minute of the accident'''

    class lum(CategoryBaseEnum):
        '''Lighting conditions during the accident'''
        DAYLIGHT = 1
        DUSK_OR_DAWN = 2
        NIGHT_WITHOUT_PUBLIC_LIGHT = 3
        NIGHT_WITH_PUBLIC_LIGHT_OFF = 4
        NIGHT_WITH_PUBLIC_LIGHT_ON = 5


    class dep(CategoryBaseEnum):
        '''Department code (INSEE code for French departments)'''

    class agg(CategoryBaseEnum):
        '''Location of the accident: inside or outside the urban area'''
        OUTSIDE_URBAN_AREA = 1
        INSIDE_URBAN_AREA = 2


    # Named after the `int` column; the name is bound inside this class body only.
    class int(CategoryBaseEnum):
        '''Type of intersection where the accident occurred'''
        NO_INTERSECTION = 1
        X_INTERSECTION = 2
        T_INTERSECTION = 3
        Y_INTERSECTION = 4
        INTERSECTION_WITH_MORE_THAN_4_BRANCHES = 5
        ROUNDABOUT = 6
        SQUARE = 7
        RAILWAY_CROSSING = 8
        OTHER_INTERSECTION = 9


    class atm(CategoryBaseEnum):
        '''Atmospheric conditions during the accident'''
        NOT_PROVIDED = -1
        NORMAL = 1
        LIGHT_RAIN = 2
        HEAVY_RAIN = 3
        SNOW_HAIL = 4
        FOG_SMOKE = 5
        STRONG_WIND_STORM = 6
        BLINDING_WEATHER = 7
        CLOUDY_WEATHER = 8
        OTHER = 9


    class col(CategoryBaseEnum):
        '''Type of collision'''
        NOT_PROVIDED = -1
        TWO_VEHICLES_FRONT = 1
        TWO_VEHICLES_REAR = 2
        TWO_VEHICLES_SIDE = 3
        THREE_VEHICLES_CHAIN = 4
        THREE_VEHICLES_MULTIPLE_COLLISIONS = 5
        OTHER_COLLISION = 6
        NO_COLLISION = 7


    class lat(CategoryBaseEnum):
        '''Latitude of the accident location'''

    class long(CategoryBaseEnum):
        '''Longitude of the accident location'''

CPU times: user 7.85 ms, sys: 931 µs, total: 8.78 ms
Wall time: 7.8 ms


In [38]:
# English-named counterpart of Road_Accident_Enum: the nested classes carry the renamed
# column names (see `road_accident_remap` later in the notebook) with the same labels and
# codes. The separate day / month / year / hrmn classes of the French code book are
# replaced here by a single `datetime` class. Nested classes that contain only a
# docstring define no codes, so CategoryBaseEnum.IsCategory() returns False for them.
class Road_Accident_English(FileTypeEnum):
    '''Collection of all the categories for file type 'vehicle.csv' '''
    # NOTE(review): the docstring names 'vehicle.csv', but the nested classes below also
    # describe user, place and characteristic columns — confirm wording.

    class accident_id(CategoryBaseEnum):
        '''The Index/Number of the Crash follows the pattern yyyyxxxxx and is the index column'''
    class vehicle_id(CategoryBaseEnum):
        '''The vehicle id  in terms of xxx-xxx'''

    class vehicle_category(CategoryBaseEnum): # was `catv`
        '''The Category of Vehicle involved in the crash'''
        UNDETERMINED = 0
        BICYCLE = 1
        MOPED_LESS_EQUAL_50CC = 2
        MICROCAR = 3
        TOURISM_VEHICLE = 7
        UTILITY_VEHICLE_PTAC_1_5T_3_5T = 10 # 1.5 < PTAC < 3.5
        HEAVY_TRUCK_PTAC_3_5T_7_5T = 13
        HEAVY_TRUCK_PTAC_OVER_7_5T = 14
        HEAVY_TRUCK_OVER_3_5T_WITH_TRAILER = 15
        TRACTOR_ONLY = 16
        TRACTOR_WITH_SEMI_TRAILER = 17
        SPECIAL_VEHICLE = 20
        AGRICULTURAL_TRACTOR = 21
        SCOOTER_LESS_EQUAL_50CC = 30
        MOTORCYCLE_50CC_125CC = 31
        SCOOTER_50CC_125CC = 32
        MOTORCYCLE_OVER_125CC = 33
        SCOOTER_OVER_125CC = 34
        LIGHT_QUAD_LESS_EQUAL_50CC = 35
        HEAVY_QUAD_OVER_50CC = 36
        BUS = 37
        COACH = 38
        TRAIN = 39
        TRAMWAY = 40
        THREE_WHEELED_VEHICLE_LESS_EQUAL_50CC = 41
        THREE_WHEELED_VEHICLE_50CC_125CC = 42
        THREE_WHEELED_VEHICLE_OVER_125CC = 43
        MOTORIZED_PERSONAL_TRANSPORT = 50
        NON_MOTORIZED_PERSONAL_TRANSPORT = 60
        OTHER_VEHICLE = 99

    class obstacle_static(CategoryBaseEnum):  # was `obs`
        ''' The Static/Stationary Obstacle Hit'''
        UNKNOWN = 0
        PARKED_VEHICLE = 1
        TREE_ON_ROADSIDE = 2
        METAL_BARRIER = 3
        CONCRETE_BARRIER = 4
        OTHER_BARRIER=5
        BUILDING_WALL_BRIDGE_PIER = 6 # building, wall or bridge pier
        VERTICAL_SIGNPOST_OR_EMERGENCY_CALLBOX = 7
        POLE = 8
        URBAN_FURNITURE = 9
        PARAPET = 10
        REFUGE_ISLAND_BOLLARD = 11 # road island / bollard
        SIDEWALK = 12 # sidewalk or curb
        DITCH = 13 # ditch or embankment
        OTHER_OBS_ON_ROADWAY = 14
        OTHER_OBS_ON_SIDEWALK = 15
        ROADWAY_EXIT_WITOUT_OBSTACLES = 16
        AQUEDUCT_HEAD = 17

    class obstacle_mobile(CategoryBaseEnum): # was `obsm`
        '''The Dynamic Obstacle Hit'''
        # NOTE(review): the official data dictionary appears to use 4 / 5 / 6 (not 3 / 4 / 5)
        # for rail vehicle / domestic animal / wild animal — verify before decoding with this.
        UNKNOWN = 0
        PEDESTRIAN =1
        VEHICLE = 2
        RAIL_VEHICLE = 3
        ANIMAL_DOMESTIC = 4
        ANIMAL_WILD = 5
        OTHER = 9

    class impact_point(CategoryBaseEnum): # was `choc`
        '''The Initial Point of Impact of the crash'''
        # NOTE(review): the official data dictionary appears to list "right" before "left"
        # for codes 2/3 and 7/8 (as this table already does for 5/6) — verify the labels.
        UNKNOWN = 0
        FRONT = 1
        FRONT_LEFT = 2
        FRONT_RIGHT = 3
        REAR = 4
        REAR_RIGHT = 5
        REAR_LEFT = 6
        SIDE_LEFT = 7
        SIDE_RIGHT = 8
        MULTIPLE = 9


    class action(CategoryBaseEnum): # was `manv`
        '''The Main Action performed by the user before the crash'''
        UNKNOWN = 0
        CIRCULATING_NO_DIRECTION_CHANGE =1
        CIRCULATING_SAME_DIRECTION = 2 # same direction, same lane
        CIRCULATION_BETWEEN_2_LANES = 3
        CIRCULATING_REVERSING = 4
        CIRCULATING_AGAINTS_FLOW_TRAFFIC = 5
        CIRCULATING_CROSSING_MEDIAN_STRIP = 6
        CIRCULATING_IN_BUSLANE_SAME_DIRECTION = 7
        CIRCULATING_IN_BUSLANE_OPP_DIRECTION = 8
        CIRCULATING_INSERTION =9
        CIRCULATING_TURNING_AROUND_CARRIAGE_WAY = 10
        CHANGING_LANE_LEFT = 11
        CHANGING_LANE_RIGHT = 12
        DEPORT_LEFT = 13
        DEPORT_RIGHT = 14
        TURNING_LEFT = 15
        TURNING_RIGHT = 16
        OVERTAKING_LEFT = 17
        OVERTAKING_RIGHT = 18
        # Miscellaneous actions (codes 19-26)
        CROSSING_ROAD = 19
        PARKING_ACTION = 20
        AVOIDANCE_ACTION = 21
        DOOR_OPENED = 22
        STOP_NO_PARKING = 23
        PARKED_WITH_PASS = 24 # parked with passengers
        DRIVING_SIDEWALK = 25
        OTHER_ACTIONS = 26


    class motor(CategoryBaseEnum): # engine / energy type
        '''The Type of Motor(In terms of fuel type) involved in the crash'''
        UNKNOWN = 0
        CONVENTIONAL_FUEL = 1 # petrol, diesel, etc.
        HYBRID_ELECTRIC = 2
        ELECTRIC = 3
        HYDROGEN = 4
        HUMAN_POWERED = 5
        OTHER = 6

    class road(CategoryBaseEnum): # was `catr`
        '''Category of the road'''
        MOTORWAY = 1
        NATIONAL_ROAD = 2
        DEPARTMENTAL_ROAD = 3
        MUNICIPAL_ROAD = 4
        OFF_NETWORK = 5
        PARKING_AREA = 6
        URBAN_METROPOLE_ROAD = 7
        OTHER = 9

    class traffic(CategoryBaseEnum): # was `circ`
        '''Traffic regime:'''
        NOT_PROVIDED = -1
        ONE_WAY = 1
        BIDIRECTIONAL = 2
        SEPARATE_LANES = 3
        VARIABLE_LANE = 4


    class road_surface(CategoryBaseEnum): # was `surf`
        '''Surface condition'''
        NOT_PROVIDED = -1
        NORMAL = 1
        WET = 2
        PUDDLES = 3
        FLOODED = 4
        SNOWY = 5
        MUDDY = 6
        ICY = 7
        GREASE = 8
        OTHER = 9

    class infra(CategoryBaseEnum):
        '''Development - Infrastructure:'''
        NOT_PROVIDED = -1
        NONE = 0
        UNDERGROUND = 1
        BRIDGE = 2
        INTERCHANGE_RAMP = 3
        RAILROAD = 4
        AMENAGED_CROSSROAD = 5
        PEDESTRIAN_ZONE = 6
        TOLL_ZONE = 7
        WORKZONE = 8
        OTHERS = 9


    class situation(CategoryBaseEnum): # was `situ`
        '''Situation of the accident'''
        NOT_PROVIDED = -1
        NONE = 0
        ON_ROADWAY = 1
        ON_EMERGENCY_LANE = 2
        ON_SHOULDER = 3
        ON_SIDEWALK = 4
        ON_CYCLE_PATH = 5
        ON_OTHER_SPECIAL_LANE = 6
        OTHERS = 8

    class speed_limit(CategoryBaseEnum): # was `vma`
        '''Maximum speed permitted at the location and time of the accident.'''


    class user_category(CategoryBaseEnum): # was `catu`
        '''User category'''
        UNDETERMINED = 0
        DRIVER = 1
        PASSENGER = 2
        PEDESTRIAN = 3


    class severity(CategoryBaseEnum): # was `grav`
        ''' Severity of the accident: The injured users are classified into three categories of victims plus the uninjured'''
        UNKNOWN = 0
        NO_INJURY = 1
        KILLED = 2
        INJURED_HOSPITALIZED = 3
        MINOR_INJURY = 4


    class sex(CategoryBaseEnum): # was `sexe`
        '''User's gender'''
        UNKNOWN = 0
        MALE =1
        FEMALE = 2


    class dob(CategoryBaseEnum): # was `an_nais`
        '''Year of birth of the user'''

    class safety_equipment(CategoryBaseEnum): # was `secu1`
        '''The existence of a safety equipment'''
        NOT_PROVIDED = -1
        NO_EQUIPMENT = 0
        SEATBELT = 1
        HELMET = 2
        CHILD_DEVICE = 3
        REFLECTIVE_VEST = 4
        AIRBAG_2W_3W = 5
        GLOVES_2W_3W = 6
        GLOVES_AIRBAG_2W_3W = 7
        NOT_DETERMINABLE = 8
        OTHER = 9


    # Named after the `datetime` column; the name is bound inside this class body only.
    class datetime(CategoryBaseEnum):
        '''Date Time of the accident'''

    class lum(CategoryBaseEnum):
        '''Lighting conditions during the accident'''
        DAYLIGHT = 1
        DUSK_OR_DAWN = 2
        NIGHT_WITHOUT_PUBLIC_LIGHT = 3
        NIGHT_WITH_PUBLIC_LIGHT_OFF = 4
        NIGHT_WITH_PUBLIC_LIGHT_ON = 5


    class dep(CategoryBaseEnum):
        '''Department code (INSEE code for French departments)'''

    class is_urban_area(CategoryBaseEnum): # was `agg`
        '''Location of the accident: inside or outside the urban area'''
        OUTSIDE_URBAN_AREA = 1
        INSIDE_URBAN_AREA = 2


    class intersection(CategoryBaseEnum): # was `int`
        '''Type of intersection where the accident occurred'''
        NO_INTERSECTION = 1
        X_INTERSECTION = 2
        T_INTERSECTION = 3
        Y_INTERSECTION = 4
        INTERSECTION_WITH_MORE_THAN_4_BRANCHES = 5
        ROUNDABOUT = 6
        SQUARE = 7
        RAILWAY_CROSSING = 8
        OTHER_INTERSECTION = 9


    class weather(CategoryBaseEnum): # was `atm`
        '''Weather conditions during the accident'''
        NOT_PROVIDED = -1
        NORMAL = 1
        LIGHT_RAIN = 2
        HEAVY_RAIN = 3
        SNOW_HAIL = 4
        FOG_SMOKE = 5
        STRONG_WIND_STORM = 6
        BLINDING_WEATHER = 7
        CLOUDY_WEATHER = 8
        OTHER = 9


    class collision_type(CategoryBaseEnum): # was `col`
        '''Type of collision'''
        NOT_PROVIDED = -1
        TWO_VEHICLES_FRONT = 1
        TWO_VEHICLES_REAR = 2
        TWO_VEHICLES_SIDE = 3
        THREE_VEHICLES_CHAIN = 4
        THREE_VEHICLES_MULTIPLE_COLLISIONS = 5
        OTHER_COLLISION = 6
        NO_COLLISION = 7


    class lat(CategoryBaseEnum):
        '''Latitude of the accident location'''

    class long(CategoryBaseEnum):
        '''Longitude of the accident location'''
    

In [32]:
# Build {column_name: {label: code}} from the French code book. Its keys are the column
# names that the loading cell keeps when it filters the parquet file.
valid_columns_dict = Road_Accident_Enum.to_dict()
valid_columns = valid_columns_dict.keys()  # dict view, used below for `in` membership tests
print(valid_columns)

dict_keys(['Num_Acc', 'id_vehicule', 'catv', 'obs', 'obsm', 'choc', 'manv', 'motor', 'catr', 'circ', 'surf', 'infra', 'situ', 'vma', 'catu', 'grav', 'sexe', 'an_nais', 'secu1', 'jour', 'mois', 'an', 'hrmn', 'lum', 'dep', 'agg', 'int', 'atm', 'col', 'lat', 'long'])


In [14]:
# Load the preprocessed table and keep only the columns described by the code book,
# preserving the column order of the parquet file.
data = pd.read_parquet(pre_file).pipe(
    lambda frame: frame[[name for name in frame.columns if name in valid_columns]]
)
data

Unnamed: 0,Num_Acc,id_vehicule,catv,obs,obsm,choc,manv,motor,catu,grav,sexe,an_nais,secu1,catr,circ,surf,infra,situ,vma,jour,mois,an,hrmn,lum,dep,agg,int,atm,col,lat,long
0,201900000001,138 306 524,7,0,2,5,23,1,2,4,2,2002.0,1,1,3,1,2,1,70,30,11,2019,01:30,4,93,1,1,1,2,48.896210,2.470120
1,201900000001,138 306 524,7,0,2,5,23,1,1,4,2,1993.0,1,1,3,1,2,1,70,30,11,2019,01:30,4,93,1,1,1,2,48.896210,2.470120
2,201900000001,138 306 525,17,1,0,3,11,1,1,1,1,1959.0,1,1,3,1,2,1,70,30,11,2019,01:30,4,93,1,1,1,2,48.896210,2.470120
3,201900000002,138 306 523,7,4,0,1,0,1,1,4,2,1994.0,1,1,1,1,0,1,70,30,11,2019,02:50,3,93,1,1,1,6,48.930700,2.368800
4,201900000003,138 306 520,7,0,2,1,2,1,1,1,1,1996.0,1,1,3,1,0,1,90,28,11,2019,15:15,1,92,1,1,1,4,48.935872,2.319174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612931,202300054816,155 583 354,7,0,2,1,1,1,1,3,2,1976.0,1,3,2,1,0,1,80,28,6,2023,12:25,1,13,1,1,1,1,43.620450,5.491925
612932,202300054816,155 583 354,7,0,2,1,1,1,2,4,2,1985.0,1,3,2,1,0,1,80,28,6,2023,12:25,1,13,1,1,1,1,43.620450,5.491925
612933,202300054822,155 583 344,7,0,2,7,22,1,2,1,2,2003.0,1,4,2,1,0,5,30,20,10,2023,16:30,1,69,2,1,6,3,45.733060,4.825400
612934,202300054822,155 583 344,7,0,2,7,22,1,1,1,2,2002.0,1,4,2,1,0,5,30,20,10,2023,16:30,1,69,2,1,6,3,45.733060,4.825400


In [15]:
## Convert to English
# Maps each original (French) column name to the English name used from here on.
# "trajet" and "adr" are not among the columns kept by the `valid_columns` filter, so
# those two entries match nothing in `data`; the rename below leaves unmatched keys alone.
road_accident_remap = {
    "Num_Acc": "accident_id",
    "id_vehicule": "vehicle_id",
    "catv": "vehicle_category",
    "obs": "obstacle_static",
    "obsm": "obstacle_mobile",
    "choc": "impact_point",
    "manv": "action",
    "motor": "motor",
    "catr": "road",
    "circ": "traffic",
    "surf": "road_surface",
    "infra": "infra",
    "situ": "situation",
    "vma": "speed_limit",
    "catu": "user_category",
    "grav": "severity",
    "sexe": "sex",
    "an_nais": "dob",
    "trajet": "travel_reason",
    "secu1": "safety_equipment",
    "jour": "day",
    "mois": "month",
    "an": "year",
    "hrmn": "hrmn",
    "lum": "lum",
    "dep": "dep",
    "agg": "is_urban_area",
    "int": "intersection",
    "atm": "weather",
    "col": "collision_type",
    "adr": "adr",
    "lat": "lat",
    "long": "long",
}


# Returns a renamed copy; `data` is rebound to it.
data = data.rename(columns=road_accident_remap)
data

Unnamed: 0,accident_id,vehicle_id,vehicle_category,obstacle_static,obstacle_mobile,impact_point,action,motor,user_category,severity,sex,dob,safety_equipment,road,traffic,road_surface,infra,situation,speed_limit,day,month,year,hrmn,lum,dep,is_urban_area,intersection,weather,collision_type,lat,long
0,201900000001,138 306 524,7,0,2,5,23,1,2,4,2,2002.0,1,1,3,1,2,1,70,30,11,2019,01:30,4,93,1,1,1,2,48.896210,2.470120
1,201900000001,138 306 524,7,0,2,5,23,1,1,4,2,1993.0,1,1,3,1,2,1,70,30,11,2019,01:30,4,93,1,1,1,2,48.896210,2.470120
2,201900000001,138 306 525,17,1,0,3,11,1,1,1,1,1959.0,1,1,3,1,2,1,70,30,11,2019,01:30,4,93,1,1,1,2,48.896210,2.470120
3,201900000002,138 306 523,7,4,0,1,0,1,1,4,2,1994.0,1,1,1,1,0,1,70,30,11,2019,02:50,3,93,1,1,1,6,48.930700,2.368800
4,201900000003,138 306 520,7,0,2,1,2,1,1,1,1,1996.0,1,1,3,1,0,1,90,28,11,2019,15:15,1,92,1,1,1,4,48.935872,2.319174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612931,202300054816,155 583 354,7,0,2,1,1,1,1,3,2,1976.0,1,3,2,1,0,1,80,28,6,2023,12:25,1,13,1,1,1,1,43.620450,5.491925
612932,202300054816,155 583 354,7,0,2,1,1,1,2,4,2,1985.0,1,3,2,1,0,1,80,28,6,2023,12:25,1,13,1,1,1,1,43.620450,5.491925
612933,202300054822,155 583 344,7,0,2,7,22,1,2,1,2,2003.0,1,4,2,1,0,5,30,20,10,2023,16:30,1,69,2,1,6,3,45.733060,4.825400
612934,202300054822,155 583 344,7,0,2,7,22,1,1,1,2,2002.0,1,4,2,1,0,5,30,20,10,2023,16:30,1,69,2,1,6,3,45.733060,4.825400


In [16]:
##drop duplicates
def report_duplicates(frame):
    """Print how many fully duplicated rows ``frame`` has and return those rows."""
    print("Duplicate Check:")
    repeated = frame[frame.duplicated()]
    print(f"Number of duplicate rows: {repeated.shape[0]}")
    return repeated


# First pass: report, show a sample, and drop rows that repeat an earlier row exactly.
duplicates = report_duplicates(data)
if not duplicates.empty:
    print("\nDuplicate Rows:")
    print(duplicates.head())
    print("\nDropping Duplicate Rows:")
    data = data.drop_duplicates()

# Second pass: confirm that nothing is left after the drop.
duplicates = report_duplicates(data)

Duplicate Check:
Number of duplicate rows: 15513

Duplicate Rows:
      accident_id   vehicle_id  vehicle_category  obstacle_static  \
399  201900000191  138 306 163                 7                0   
470  201900000222  138 306 107                 7                0   
562  201900000260  138 306 034                 7                7   
609  201900000283  138 306 001                 7                0   
947  201900000450  138 305 700                 3                0   

     obstacle_mobile  impact_point  action  motor  user_category  severity  \
399                2             1       0      2              2         1   
470                2             1      13      1              2         1   
562                0             7       1      1              2         1   
609                2             1       1      0              2         3   
947                2             8       1      6              2         3   

     sex     dob  safety_equipment  road  traffic 

In [17]:
##convert hrmn
def add_accident_datetime(frame):
    """Combine year / month / day and the ``hrmn`` time column into one ``datetime`` column.

    Steps:
      1. Strip non-digits from ``hrmn`` ("01:30" -> "0130"), left-pad to four digits
         and split into ``hour`` and ``minute``.
      2. Coerce all five components to numbers, report how many are missing, and drop
         the rows where any component is missing.
      3. Assemble the timestamp and drop the component columns and ``hrmn``.

    Differences from the previous inline version:
      * A missing or empty ``hrmn`` is kept as missing. Before, it was stringified to
        'nan', stripped to '' and zero-padded to '0000', i.e. silently turned into 00:00.
      * The missing-value report runs after numeric coercion, so it also counts
        unparsable components.
      * Hours outside 0-23 and minutes outside 0-59 are treated as missing instead of
        rolling over into a different day or hour.
      * The ``format=`` argument is dropped: it does not apply when a timestamp is
        assembled from component columns.
      * Calling the function on an already converted frame returns it unchanged, so the
        cell can be re-run.

    Args:
        frame: DataFrame with ``year``, ``month``, ``day`` and ``hrmn`` columns.

    Returns:
        A new DataFrame with a ``datetime`` column and without the component columns.
        The input frame is not modified.
    """
    # Already converted (cell re-run): nothing left to do.
    if 'datetime' in frame.columns and 'hrmn' not in frame.columns:
        return frame

    date_cols = ['year', 'month', 'day', 'hour', 'minute']

    hrmn_raw = frame['hrmn']
    digits = hrmn_raw.astype(str).str.replace(r'\D', '', regex=True)
    # Keep real gaps as NaN so that zfill cannot turn them into "0000".
    digits = digits.where(hrmn_raw.notna() & digits.ne('')).str.zfill(4)

    out = frame.assign(hour=digits.str[:2], minute=digits.str[2:])
    out = out.assign(**{col: pd.to_numeric(out[col], errors='coerce') for col in date_cols})
    # Out-of-range clock values would otherwise be added as offsets and shift the date.
    out = out.assign(
        hour=out['hour'].where(out['hour'].between(0, 23)),
        minute=out['minute'].where(out['minute'].between(0, 59)),
    )

    ##Missing values
    missing_values = out[date_cols].isna().sum()
    print("Missing values:\n", missing_values)

    #Drop NA, then restore integer dtypes (coercion may have produced floats)
    out = out.dropna(subset=date_cols).astype({col: 'int64' for col in date_cols})

    #Convert to datetime; impossible dates (e.g. 31 February) become NaT
    out = out.assign(datetime=pd.to_datetime(out[date_cols], errors='coerce'))
    unparsed = int(out['datetime'].isna().sum())
    if unparsed:
        print(f"Rows with an invalid date/time (set to NaT): {unparsed}")

    ## Drop cols
    return out.drop(columns=date_cols + ['hrmn'])


data = add_accident_datetime(data)

Missing values:
 year      0
month     0
day       0
hour      0
minute    0
dtype: int64


In [18]:
# Year of birth: only the year is kept here — no age is computed in this cell.
# Non-numeric entries become NaN first; the value is then round-tripped through
# pd.to_datetime(format='%Y') and reduced back to its year component.
# NOTE(review): presumably the round trip is meant to blank out impossible years — confirm,
# and confirm that float years (e.g. 2002.0) parse with this format on the pandas version in use.
data['dob'] = pd.to_numeric(data['dob'], errors='coerce')
data['dob'] = pd.to_datetime(data['dob'], format='%Y', errors='coerce').dt.year


In [40]:
# Display the cleaned frame (pandas truncates the rendered rows) before saving it.
data

Unnamed: 0,accident_id,vehicle_id,vehicle_category,obstacle_static,obstacle_mobile,impact_point,action,motor,user_category,severity,sex,dob,safety_equipment,road,traffic,road_surface,infra,situation,speed_limit,lum,dep,is_urban_area,intersection,weather,collision_type,lat,long,datetime
0,201900000001,138 306 524,7,0,2,5,23,1,2,4,2,2002,1,1,3,1,2,1,70,4,93,1,1,1,2,48.896210,2.470120,2019-11-30 01:30:00
1,201900000001,138 306 524,7,0,2,5,23,1,1,4,2,1993,1,1,3,1,2,1,70,4,93,1,1,1,2,48.896210,2.470120,2019-11-30 01:30:00
2,201900000001,138 306 525,17,1,0,3,11,1,1,1,1,1959,1,1,3,1,2,1,70,4,93,1,1,1,2,48.896210,2.470120,2019-11-30 01:30:00
3,201900000002,138 306 523,7,4,0,1,0,1,1,4,2,1994,1,1,1,1,0,1,70,3,93,1,1,1,6,48.930700,2.368800,2019-11-30 02:50:00
4,201900000003,138 306 520,7,0,2,1,2,1,1,1,1,1996,1,1,3,1,0,1,90,1,92,1,1,1,4,48.935872,2.319174,2019-11-28 15:15:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612931,202300054816,155 583 354,7,0,2,1,1,1,1,3,2,1976,1,3,2,1,0,1,80,1,13,1,1,1,1,43.620450,5.491925,2023-06-28 12:25:00
612932,202300054816,155 583 354,7,0,2,1,1,1,2,4,2,1985,1,3,2,1,0,1,80,1,13,1,1,1,1,43.620450,5.491925,2023-06-28 12:25:00
612933,202300054822,155 583 344,7,0,2,7,22,1,2,1,2,2003,1,4,2,1,0,5,30,1,69,2,1,6,3,45.733060,4.825400,2023-10-20 16:30:00
612934,202300054822,155 583 344,7,0,2,7,22,1,1,1,2,2002,1,4,2,1,0,5,30,1,69,2,1,6,3,45.733060,4.825400,2023-10-20 16:30:00


In [41]:
# Write the cleaned frame to a timestamped parquet file in the current working directory.
create_parquet(data)


 Finished Saving parquet to: /kaggle/working/Step0_Cleaned-_2025_01_30_1331.parquet
