In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found under it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory and the file name so the printed value is a complete path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/merged-vehicles/vehicles-merged_2025_01_05_1818.csv
/kaggle/input/nov24-bds-roadies-users/vehicules-2019.csv
/kaggle/input/nov24-bds-roadies-users/vehicules-2022.csv
/kaggle/input/nov24-bds-roadies-users/vehicules-2021.csv
/kaggle/input/nov24-bds-roadies-users/vehicules-2020.csv
/kaggle/input/nov24-bds-roadies-users/vehicules-2023.csv


In [2]:
%%time
## all imports
import polars as pl
import pyarrow.parquet as pq
import os
import shutil
import json
from enum import Enum
from datetime import datetime
from ydata_profiling import ProfileReport
from pathlib import Path

#For excel stuff
import openpyxl
from openpyxl.drawing.image import Image


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Concurrency
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import time

#  Warnings
import warnings
# Silence every warning for the rest of the session (note: this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Seed value; DataFrameExplorer.get_slice reads this module-level name when sampling rows.
random_state = 42
# Default matplotlib figure size applied to figures created after this point.
plt.rcParams["figure.figsize"] = (20, 20)

from abc import ABC, abstractmethod


## Static file locations, keyed by file type and then (for vehicles) by year.
## Only the vehicles files are registered so far; the other file types are empty placeholders.
filepaths = {
    'vehicles': {
        year: f'/kaggle/input/nov24-bds-roadies-users/vehicules-{year}.csv'
        for year in range(2019, 2024)
    },
    'users': {},
    'places': {},
    'characteristics': {},
}

CPU times: user 3.8 s, sys: 636 ms, total: 4.44 s
Wall time: 5.62 s


In [3]:
class ExtensionMethods:
    """Stateless helpers for building timestamped file names and stripping file extensions."""

    @staticmethod
    def generate_filename(filename=None, extension=None):
        """Build a file name stamped with the current local time (minute resolution).

        Args:
            filename: Base name to prefix the timestamp with.
            extension: Extension to append, without the leading dot.

        Returns:
            ``"<filename>_<YYYY_MM_DD_HHMM>.<extension>"`` when both arguments are given;
            otherwise just the ``"YYYY_MM_DD_HHMM"`` timestamp.
        """
        timestamp = datetime.now().strftime("%Y_%m_%d_%H%M")
        # A bare timestamp is returned unless BOTH parts are supplied.
        if filename is None or extension is None:
            return timestamp
        return f"{filename}_{timestamp}.{extension}"

    @staticmethod
    def get_file_name_without_extension(filename):
        """Return the final path component of ``filename`` without its last suffix.

        Args:
            filename: A path-like value, or None.

        Returns:
            The stem of the path, or the message ``"Provide a file"`` when ``filename`` is None.
        """
        # Identity check: ``== None`` can be hijacked by objects that overload ``__eq__``.
        if filename is None:
            return "Provide a file"
        return Path(filename).stem
        
        

In [92]:
class CategoryBaseEnum(Enum):
    # Base for leaf enums that describe one categorical column.
    # Deliberately has no class docstring: get_description() reports ``__doc__``.

    @classmethod
    def Name(cls):
        """Return the class name of the enum."""
        return cls.__name__

    @classmethod
    def IsCategory(cls):
        """Return True when the enum declares at least one member."""
        return len(cls.__members__) > 0

    @classmethod
    def get_description(cls):
        """Return the enum's docstring, or a fixed fallback when it has none."""
        doc = cls.__doc__
        return doc if doc else "No description available"

    @classmethod
    def to_dict(cls):
        """Return a ``{member name: member value}`` mapping in definition order."""
        mapping = {}
        for member in cls:
            mapping[member.name] = member.value
        return mapping

    @classmethod
    def to_json(cls, indent=4):
        """Serialise ``to_dict()`` as a JSON string with the given indent."""
        as_dict = cls.to_dict()
        return json.dumps(as_dict, indent=indent)

In [102]:
class FileTypeEnum(Enum):
    # Base for grouping enums whose member VALUES are themselves classes exposing to_dict().
    # Deliberately has no class docstring: get_description() reports ``__doc__``.

    @classmethod
    def Name(cls):
        """Return the class name of the enum."""
        return cls.__name__

    @classmethod
    def get_description(cls):
        """Return the enum's docstring, or a fixed fallback when it has none."""
        doc = cls.__doc__
        return doc if doc else "No description available"

    @classmethod
    def to_dict(cls):
        """Return ``{member name: member.value.to_dict()}`` in definition order."""
        nested = {}
        for member in cls:
            nested[member.name] = member.value.to_dict()
        return nested

In [103]:
# Demo hierarchy: a FileTypeEnum whose members are the nested CategoryBaseEnum classes below.
# The docstrings are runtime data (get_description() returns ``__doc__``), so they are left as-is.
# NOTE(review): this relies on classes nested in an Enum body becoming members automatically;
# Python 3.13 is reported to stop doing that unless ``enum.member`` is used — TODO confirm
# against the interpreter version this notebook targets.
class Mammals(FileTypeEnum):
    """All the Mammals."""
    class Dog(CategoryBaseEnum):
        """All dogs."""
        A = 1
        B = 2
        C = 3

    class Cat(CategoryBaseEnum):
        """All the cats."""
        F = 1
        G = 2
        HH = 3

    # Declares no members, so Timon.IsCategory() evaluates to False.
    class Timon(CategoryBaseEnum):
        """The Timon stuff."""
        # Add any members if needed


# Second demo group, built the same way: nested CategoryBaseEnum classes inside a FileTypeEnum.
# NOTE(review): depends on nested classes becoming Enum members automatically — reported to
# change in Python 3.13; TODO confirm against the target interpreter version.
class Insects(FileTypeEnum):
    """All the Insects."""
    class Bug(CategoryBaseEnum):
        """Insects."""
        A = 1
        BB = 2

    # Declares no members, so Anty.IsCategory() evaluates to False.
    class Anty(CategoryBaseEnum):
        """Insecty Anty."""
        # Add any members if needed


class Animals(Enum):
    """Top-level class for grouping animals."""
    MAMMALS = Mammals
    INSECTS = Insects

    @classmethod
    def to_dict(cls):
        """Return ``{group name: group.value.to_dict()}`` for every group, in definition order."""
        nested = {}
        for group in cls:
            nested[group.name] = group.value.to_dict()
        return nested

    @classmethod
    def to_json(cls,indent=4):
        """Serialise ``to_dict()`` as a JSON string with the given indent."""
        as_dict = cls.to_dict()
        return json.dumps(as_dict, indent=indent)

In [4]:
%%time
## Encode the column metadata as enums so the numeric codes become human-readable. TODO: consider a dedicated data structure instead.
try:
    # Python 3.11+: ``enum.member`` explicitly marks a nested class as an Enum member.
    # Newer interpreters no longer promote nested classes to members on their own.
    from enum import member as _enum_member
except ImportError:
    def _enum_member(obj):
        """Identity fallback: before Python 3.11 nested classes become members automatically."""
        return obj


class Vehicle_Enum(Enum):
    """Metadata for the vehicles file: one member per column, whose value is a nested Enum.

    Each nested Enum maps readable names to the codes stored in that column; a nested Enum
    with no members marks a non-categorical column.  The nested docstrings are runtime data
    (they are returned by ``get_desc_dic`` / ``get_description``), so they must not be edited
    casually.
    """

    @_enum_member
    class Num_Acc(Enum):  # Not ideal as an Enum: Num_Acc values are not categorical
        '''The Index/Number of the Crash follows the pattern yyyyxxxxx and is the index column'''
        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @_enum_member
    class id_vehicule(Enum):
        '''The vehicle id  in terms of xxx-xxx'''
        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @_enum_member
    class num_veh(Enum):
        '''The Number of the vehicle category , something in french '''
        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @_enum_member
    class senc(Enum):  # senc: direction of traffic
        '''The Direction of Traffic '''
        UNKNOWN = 0
        PK_INCREASING = 1  # PK ascending or address ascending
        PK_DECREASING = 2
        NO_REFERENCE_POINT = 3
        RANDOM = -1  # TODO verify this

        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @_enum_member
    class catv(Enum):  # catv: vehicle category
        '''The Category of Vehicle involved in the crash'''
        UNDETERMINED = 0
        BICYCLE = 1
        MOPED_LESS_EQUAL_50CC = 2
        MICROCAR = 3
        TOURISM_VEHICLE = 7
        UTILITY_VEHICLE_PTAC_1_5T_3_5T = 10  # 1.5 < PTAC < 3.5
        HEAVY_TRUCK_PTAC_3_5T_7_5T = 13
        HEAVY_TRUCK_PTAC_OVER_7_5T = 14
        HEAVY_TRUCK_OVER_3_5T_WITH_TRAILER = 15
        TRACTOR_ONLY = 16
        TRACTOR_WITH_SEMI_TRAILER = 17
        SPECIAL_VEHICLE = 20
        AGRICULTURAL_TRACTOR = 21
        SCOOTER_LESS_EQUAL_50CC = 30
        MOTORCYCLE_50CC_125CC = 31
        SCOOTER_50CC_125CC = 32
        MOTORCYCLE_OVER_125CC = 33
        SCOOTER_OVER_125CC = 34
        LIGHT_QUAD_LESS_EQUAL_50CC = 35
        HEAVY_QUAD_OVER_50CC = 36
        BUS = 37
        COACH = 38
        TRAIN = 39
        TRAMWAY = 40
        THREE_WHEELED_VEHICLE_LESS_EQUAL_50CC = 41
        THREE_WHEELED_VEHICLE_50CC_125CC = 42
        THREE_WHEELED_VEHICLE_OVER_125CC = 43
        MOTORIZED_PERSONAL_TRANSPORT = 50
        NON_MOTORIZED_PERSONAL_TRANSPORT = 60
        OTHER_VEHICLE = 99

        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @_enum_member
    class obs(Enum):  # obs: static obstacle hit
        ''' The Static/Stationary Obstacle Hit'''
        UNKNOWN = 0
        PARKED_VEHICLE = 1
        TREE_ON_ROADSIDE = 2
        METAL_BARRIER = 3
        CONCRETE_BARRIER = 4
        OTHER_BARRIER = 5
        BUILDING_WALL_BRIDGE_PIER = 6  # building, wall or bridge pier
        VERTICAL_SIGNPOST_OR_EMERGENCY_CALLBOX = 7
        POLE = 8
        URBAN_FURNITURE = 9  # unclear what counts as urban furniture
        PARAPET = 10
        REFUGE_ISLAND_BOLLARD = 11  # the road island / bollard
        SIDEWALK = 12  # sidewalk or curb
        DITCH = 13  # ditch or embankment
        OTHER_OBS_ON_ROADWAY = 14
        OTHER_OBS_ON_SIDEWALK = 15
        # Name kept as-is (typo included): member names are emitted as category labels.
        ROADWAY_EXIT_WITOUT_OBSTACLES = 16
        AQUEDUCT_HEAD = 17

        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @_enum_member
    class obsm(Enum):  # obsm: mobile obstacle hit
        '''The Dynamic Obstacle Hit'''
        UNKNOWN = 0
        PEDESTRIAN = 1
        VEHICLE = 2
        RAIL_VEHICLE = 3
        ANIMAL_DOMESTIC = 4
        ANIMAL_WILD = 5
        OTHER = 9

        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @_enum_member
    class choc(Enum):  # choc: initial point of impact
        '''The Initial Point of Impact of the crash'''
        UNKNOWN = 0
        FRONT = 1
        FRONT_LEFT = 2
        FRONT_RIGHT = 3
        REAR = 4
        REAR_RIGHT = 5
        REAR_LEFT = 6
        SIDE_LEFT = 7
        SIDE_RIGHT = 8
        MULTIPLE = 9

        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @_enum_member
    class manv(Enum):  # manv: main action before the crash
        '''The Main Action performed by the user before the crash'''
        UNKNOWN = 0
        CIRCULATING_NO_DIRECTION_CHANGE = 1
        CIRCULATING_SAME_DIRECTION = 2  # same direction, same lane
        CIRCULATION_BETWEEN_2_LANES = 3
        CIRCULATING_REVERSING = 4
        # Name kept as-is (typo included): member names are emitted as category labels.
        CIRCULATING_AGAINTS_FLOW_TRAFFIC = 5
        CIRCULATING_CROSSING_MEDIAN_STRIP = 6
        CIRCULATING_IN_BUSLANE_SAME_DIRECTION = 7
        CIRCULATING_IN_BUSLANE_OPP_DIRECTION = 8
        CIRCULATING_INSERTION = 9
        CIRCULATING_TURNING_AROUND_CARRIAGE_WAY = 10
        CHANGING_LANE_LEFT = 11
        CHANGING_LANE_RIGHT = 12
        DEPORT_LEFT = 13
        DEPORT_RIGHT = 14
        TURNING_LEFT = 15
        TURNING_RIGHT = 16
        OVERTAKING_LEFT = 17
        OVERTAKING_RIGHT = 18
        # Various other actions
        CROSSING_ROAD = 19
        PARKING_ACTION = 20
        AVOIDANCE_ACTION = 21
        DOOR_OPENED = 22
        STOP_NO_PARKING = 23
        PARKED_WITH_PASS = 24  # parked with passengers
        DRIVING_SIDEWALK = 25
        OTHER_ACTIONS = 26

        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @_enum_member
    class motor(Enum):  # motor: engine / fuel type
        '''The Type of Motor(In terms of fuel type) involved in the crash'''
        UNKNOWN = 0
        CONVENTIONAL_FUEL = 1  # petrol, diesel, etc.
        HYBRID_ELECTRIC = 2
        ELECTRIC = 3
        HYDROGEN = 4
        HUMAN_POWERED = 5
        OTHER = 6

        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    # occutc: number of occupants in public transport; a count, hence no members.
    @_enum_member
    class occutc(Enum):
        '''Number of occupants in the Public transport (bus) including the driver irrespective if they were injured or not'''

        @classmethod
        def IsCategory(cls):
            return len(cls.__members__) > 0

    @staticmethod
    def get_dic():
        """Return ``{column: {member name: code}}`` for every sub-enum, in definition order."""
        return {
            sub_enum.name: {item.name: item.value for item in sub_enum.value}
            for sub_enum in Vehicle_Enum
        }

    @staticmethod
    def to_json():
        """Return ``get_dic()`` serialised as a JSON string indented by four spaces."""
        return json.dumps(Vehicle_Enum.get_dic(), indent=4)

    @staticmethod  # Temporary convenience; use to_json() when the data only needs displaying
    def save_json():
        """Write ``to_json()`` to a timestamped ``vehicle_enum_*.json`` file in the current directory."""
        filename = ExtensionMethods.generate_filename("vehicle_enum", "json")
        filepath = os.path.join(os.getcwd(), filename)
        # Context manager guarantees the handle is closed even if the write fails.
        with open(filepath, "w", encoding="utf-8") as file:
            file.write(Vehicle_Enum.to_json())
        print(f"The data has been printed to {filepath}")

    @staticmethod
    def get_desc_dic():
        '''Get the descriptions of the sub enums of the Vehicle Enum '''
        return {sub_enum.name: sub_enum.value.__doc__ for sub_enum in Vehicle_Enum}

    @staticmethod
    def get_description(col=None):
        """Return the docstring of sub-enum ``col``, or ``"Key Not Found"`` if there is none."""
        # Rebuilds the whole description mapping on every call.
        return Vehicle_Enum.get_desc_dic().get(col, "Key Not Found")

    @staticmethod
    def get_col_dic(col=None):
        """Return the ``{member name: code}`` mapping of sub-enum ``col``, or ``"Key Not Found"``."""
        return Vehicle_Enum.get_dic().get(col, "Key Not Found")

    @staticmethod
    def enum_category_check(col=None):
        """Report whether sub-enum ``col`` declares any members.

        Returns:
            True/False from the sub-enum's ``IsCategory()``; the string
            ``"Sub Enum not found"`` when ``col`` is None; ``"Col not found"`` when
            ``col`` is not a member of ``Vehicle_Enum``.
        """
        if col is None:
            return "Sub Enum not found"
        if col not in Vehicle_Enum.__members__:
            return "Col not found"
        return Vehicle_Enum[col].value.IsCategory()

    @staticmethod
    def display():
        """Print a fixed marker line identifying this enum."""
        print("Vehicles Printed")
            

CPU times: user 2.63 ms, sys: 0 ns, total: 2.63 ms
Wall time: 2.16 ms


In [5]:
'''These are dummy inplace enum definitions for the corresponding enums '''
class User_Enum(Enum):
    '''User stuff'''  # TODO add sub-enums as required
    # Placeholder for the users file.  The lookup helpers below mirror Vehicle_Enum's interface
    # so that DataFrameExplorer can be used with which_type='USERS' without an AttributeError.

    @staticmethod
    def display():
        """Print a fixed marker line identifying this enum."""
        print("User")

    @staticmethod
    def get_dic():
        """Return ``{column: {member name: code}}`` for every sub-enum (empty until some are added)."""
        return {sub.name: {item.name: item.value for item in sub.value} for sub in User_Enum}

    @staticmethod
    def get_description(col=None):
        """Return the docstring of sub-enum ``col``, or ``"Key Not Found"`` if there is none."""
        descriptions = {sub.name: sub.value.__doc__ for sub in User_Enum}
        return descriptions.get(col, "Key Not Found")

    @staticmethod
    def enum_category_check(col=None):
        """Report whether sub-enum ``col`` declares members; strings signal a missing column."""
        if col is None:
            return "Sub Enum not found"
        if col not in User_Enum.__members__:
            return "Col not found"
        return User_Enum[col].value.IsCategory()
        
class Place_Enum(Enum):
    '''Place Enum'''
    # Placeholder for the places file.  The lookup helpers below mirror Vehicle_Enum's interface
    # so that DataFrameExplorer can be used with which_type='PLACE' without an AttributeError.

    @staticmethod
    def display():
        """Print a fixed marker line identifying this enum."""
        print("Place")

    @staticmethod
    def get_dic():
        """Return ``{column: {member name: code}}`` for every sub-enum (empty until some are added)."""
        return {sub.name: {item.name: item.value for item in sub.value} for sub in Place_Enum}

    @staticmethod
    def get_description(col=None):
        """Return the docstring of sub-enum ``col``, or ``"Key Not Found"`` if there is none."""
        descriptions = {sub.name: sub.value.__doc__ for sub in Place_Enum}
        return descriptions.get(col, "Key Not Found")

    @staticmethod
    def enum_category_check(col=None):
        """Report whether sub-enum ``col`` declares members; strings signal a missing column."""
        if col is None:
            return "Sub Enum not found"
        if col not in Place_Enum.__members__:
            return "Col not found"
        return Place_Enum[col].value.IsCategory()

class Character_Enum(Enum):
    '''Characteristic Enum'''
    # Placeholder for the characteristics file.  The lookup helpers below mirror Vehicle_Enum's
    # interface so that DataFrameExplorer can be used with which_type='CHARACTER' without an
    # AttributeError.

    @staticmethod
    def display():
        """Print a fixed marker line identifying this enum."""
        print("Character")

    @staticmethod
    def get_dic():
        """Return ``{column: {member name: code}}`` for every sub-enum (empty until some are added)."""
        return {sub.name: {item.name: item.value for item in sub.value} for sub in Character_Enum}

    @staticmethod
    def get_description(col=None):
        """Return the docstring of sub-enum ``col``, or ``"Key Not Found"`` if there is none."""
        descriptions = {sub.name: sub.value.__doc__ for sub in Character_Enum}
        return descriptions.get(col, "Key Not Found")

    @staticmethod
    def enum_category_check(col=None):
        """Report whether sub-enum ``col`` declares members; strings signal a missing column."""
        if col is None:
            return "Sub Enum not found"
        if col not in Character_Enum.__members__:
            return "Col not found"
        return Character_Enum[col].value.IsCategory()

In [6]:
### IMPORTANT
### Meta_Enum maps each file type to its enum class so the rest of the code can stay generic; change or add enum classes here.
### TODO: Refactor: introduce a shared base class (overkill?)
class Meta_Enum(Enum):
    # Registry: member name -> enum class describing that file type.
    VEHICLES = Vehicle_Enum
    USERS = User_Enum
    PLACE = Place_Enum
    CHARACTER = Character_Enum

    @staticmethod
    def display(en = 'VEHICLES'):
        """Call ``display()`` on the enum class registered under ``en``; print a notice if unknown."""
        # Guard clause first: unknown names are reported, not raised.
        if en not in Meta_Enum.__members__:
            print(f"'{en}' is not a valid member of Meta_Enum.")
            return
        registered_class = Meta_Enum[en].value
        registered_class.display()


In [7]:
%%time
class DataFrameExplorer:
    """Run a data audit on one DataFrame and collect the artefacts in a per-file folder.

    The audit consists of console exploration, an Excel summary sheet, distribution /
    correlation plots, a ydata-profiling HTML report and a zip archive of the folder.
    Column descriptions and category codes come from the enum class registered in
    ``Meta_Enum`` under ``which_type``.
    """

    def __init__(self, df, filename=None, which_type='VEHICLES'):
        """Prepare the output folder and the summary workbook.

        Args:
            df: pandas DataFrame to audit.  A polars DataFrame is converted to pandas
                once here, because every later step uses the pandas API.
            filename: Path of the source file; its stem names the output folder.
                When None a timestamped placeholder name is used.
            which_type: Name of the ``Meta_Enum`` member describing this file type.
        """
        if isinstance(df, pl.DataFrame):
            df = df.to_pandas()
        self.df = df
        self.which_type = which_type  # selects the metadata enum // see Meta_Enum
        # TODO add filepaths and name : currently we decide the path and name
        self.normal_columns = [
            'Column_Number', 'Name_of_Column', 'Variable_Type', 'Description',
            'Percentage_Missing_Values', 'Cat_or_Quant', 'Comments', 'Enum_Category_Check'
        ]
        if filename is None:
            self.filename = "DFE_UNAMED" + ExtensionMethods.generate_filename()
        else:
            self.filename = ExtensionMethods.get_file_name_without_extension(filename)
        self.dirpath = self.create_dir()
        self.work_tuple = self.run_workbook()

    def create_dir(self):
        """Create (if needed) and return the output folder ``<cwd>/<self.filename>``."""
        dirpath = os.path.join(os.getcwd(), self.filename)
        os.makedirs(dirpath, exist_ok=True)
        return dirpath

    def run_workbook(self):
        """Create the summary workbook with its header rows and return ``(workbook, sheet)``."""
        wb = openpyxl.Workbook()
        ws = wb.active
        header_rows = (
            ("Name of the file", self.filename),
            ("Num of Rows of the DataFrame", len(self.df.index)),
            ("Index col of the DataFrame", self.df.index.name),
            # str() so non-string column labels do not break the join.
            ("Columns of the DataFrame", ", ".join(map(str, self.df.columns))),
        )
        for row in header_rows:
            ws.append(row)
        ws.append(self.normal_columns)
        return wb, ws

    def explore(self):
        """Print an exploration of ``self.df``, dispatching on the frame type."""
        if isinstance(self.df, pd.DataFrame):
            self.explore_pandas()
        elif isinstance(self.df, pl.DataFrame):
            self.explore_polars()
        else:
            print("Unsupported DataFrame type.")

    def explore_polars(self):
        """Explore a polars frame by converting it to pandas and reusing ``explore_pandas``."""
        self.df = self.df.to_pandas()
        self.explore_pandas()

    def explore_pandas(self):
        """Print info, null / non-null counts, statistics, value counts and duplicates."""
        print("DataFrame Info:")
        self.df.info()

        print("\nNull Values per Column:")
        print(self.df.isnull().sum())

        print("\nNormal Values per Column:")
        print(self.df.notnull().sum())

        print("\nDescriptive Statistics:")
        print(self.df.describe(include='all'))

        categorical_cols = self.df.select_dtypes(include=['object']).columns
        print("\nValue Counts for Categorical Columns:")
        for col in categorical_cols:
            print(f"\nColumn: {col}")
            print(self.df[col].value_counts())

        print("Duplicate Check:")
        duplicates = self.df[self.df.duplicated()]
        print(f"Number of duplicate rows: {duplicates.shape[0]}")
        if not duplicates.empty:
            print("\nDuplicate Rows:")
            print(duplicates)

    def save_summary(self):
        """Append one audit row per column to the workbook and save it as an .xlsx file."""
        wb, ws = self.work_tuple
        meta = Meta_Enum[self.which_type].value  # loop-invariant, looked up once
        for index, col in enumerate(self.df.columns):
            col_type = self.df[col].dtype
            percentage_missing = (self.df[col].isnull().sum() / len(self.df)) * 100
            cat_or_quant = 'Categorical' if (col_type == 'object' or col_type == 'category') else 'Quantitative'
            ws.append([
                index + 1, str(col), str(col_type), str(meta.get_description(col)),
                str(percentage_missing), str(cat_or_quant), "Add Comments",
                str(meta.enum_category_check(col)),
            ])
            print(f"Summary for {col} is done")

        fp = os.path.join(self.dirpath, f"Data_Audit_Summary_for_{self.filename}.xlsx")
        wb.save(fp)
        print(f"Finished creating the excel worksheet for {self.filename}")

    def convert_single_col_to_cat(self, data=None, col_to_convert=None):
        """Map the codes of one column to their enum member names.

        Returns:
            A new Series of member names (codes without a member become NaN), or an
            explanatory string when the conversion is not possible.
        """
        if data is None:
            return "Data Not found cannot convert single column"
        if col_to_convert is None:
            return "Nothing to Convert"
        if col_to_convert not in data.columns:
            return "Column not Found"

        meta = Meta_Enum[self.which_type].value
        enum_dic = meta.get_dic()
        if col_to_convert not in enum_dic:
            return "Column not Found in Enum Key Values"
        if not meta.enum_category_check(col_to_convert):
            return "Cannot convert a Non Categorical data"

        # Invert {name: code} to {code: name}; Series.map returns a new Series.
        code_to_name = {code: name for name, code in enum_dic[col_to_convert].items()}
        return data[col_to_convert].map(code_to_name)

    def convert_cols(self, data=None):
        """Add a ``new_<col>`` column of member names for every convertible column.

        ``data`` is modified in place and returned; for columns that cannot be converted
        the reason is printed instead.
        """
        if data is None:
            return "No data Found hence conversion to Categories will fail"
        # Snapshot the labels: columns are added to ``data`` while looping.
        for col in list(data.columns):
            converted = self.convert_single_col_to_cat(data, col)
            if isinstance(converted, (pd.DataFrame, pd.Series)):
                data["new_" + str(col)] = converted
            else:
                print(converted)
        return data

    def get_slice(self, data=None, percentage=0.25):
        """Return a random sample of ``percentage`` of the rows, seeded by the global ``random_state``."""
        if data is None:
            return "Error: Need data"
        num_samples = int(len(data) * percentage)
        return data.sample(n=num_samples, random_state=random_state)

    def _render_plot(self, stem, label, draw):
        """Create a figure, let ``draw(ax)`` fill it, save it as a timestamped PNG.

        Errors are printed rather than raised so one bad column does not stop the
        remaining plots; the figure is always closed.  Returns True on success.
        """
        fig = None
        try:
            fig, ax = plt.subplots()
            draw(ax)
            filepath = os.path.join(self.dirpath, ExtensionMethods.generate_filename(stem, "png"))
            fig.savefig(filepath)
            return True
        except Exception as ex:
            print(f"Error creating plot for {label}: {ex}")
            return False
        finally:
            if fig is not None:
                plt.close(fig)

    def _plot_summary(self, data=None):
        '''This creates plots for different combinations of data , This is private but not strictly enforced'''
        if data is None:
            return "Data not found"
        cat_cols = data.select_dtypes(include=['category', 'object']).columns
        num_cols = data.select_dtypes(exclude=['category', 'object', 'datetime']).columns
        print(f"cat cols {cat_cols}")
        print(f"num cols {num_cols}")

        if num_cols.size > 0:
            def draw_heatmap(ax):
                sns.heatmap(data[num_cols].corr(), annot=True, ax=ax)
                ax.set(xlabel="", ylabel="")
                ax.xaxis.tick_top()
                ax.set_title("Heatmap of Corr Data")
            self._render_plot("heatmap", "heatmap", draw_heatmap)

        for cat in cat_cols:
            def draw_hist(ax, cat=cat):
                sns.histplot(data[cat], ax=ax)
                # Label the axis with the column name (was the literal text "cat").
                ax.set(xlabel=str(cat), ylabel="Freq")
                ax.set_title(f"Distribution of {cat}")
            if self._render_plot(f"Distribution_{cat}", cat, draw_hist):
                print(f"Finished dis plot of {cat}")

        for num in num_cols:
            def draw_kde(ax, num=num):
                sns.kdeplot(data=data, x=num, ax=ax)
                ax.set_title(f"KDE Plot of {num}")
            if self._render_plot(f"KDEPlot_{num}", num, draw_kde):
                print(f"Finished kde plot of {num}")

    def save_profiling_report(self):
        """Write a ydata-profiling HTML report for ``self.df`` into the output folder."""
        filename = f"ProfilingReport_{self.filename}"
        filepath = os.path.join(self.dirpath, ExtensionMethods.generate_filename(filename, "html"))
        profile = ProfileReport(self.df, title=filename)
        profile.to_file(filepath)
        print(f"Finished Profile Report {filename}")

    def plot_me(self, col_drop=None):
        """Plot a copy of the data after dropping ``col_drop`` and all-unique columns."""
        data = self.df.copy()
        if col_drop is not None:
            data = data.drop(columns=col_drop)
        unique_cols = [col for col in data.columns if data[col].is_unique]
        for col in unique_cols:
            print(f"Dropping {col} as its not categorical or is unique")
        data = data.drop(columns=unique_cols)
        # Only the columns that are actually kept are announced as converted.
        for col in data.columns:
            print(f"Converting {col} to a category for better comprehension")
        self._plot_summary(self.convert_cols(data))

    def save_enum(self):
        """Dump the metadata enum for this file type to JSON via its ``save_json``."""
        Meta_Enum[self.which_type].value.save_json()
        print(f"Saved the Enums for type{self.which_type}")

    def _do_zip(self):  # Just for testing // remove this later
        """Zip the output folder into ``<self.filename>.zip`` in the current directory."""
        dirpath = os.path.join(os.getcwd(), self.filename)
        if not os.path.exists(dirpath):
            raise FileNotFoundError(f"Directory not found: {dirpath}")
        archive_filename = self.filename
        shutil.make_archive(archive_filename, 'zip', root_dir=dirpath)
        print(f"Zipped Up: {archive_filename}.zip")

    def _do_magic(self):  # TODO multithread this
        '''This a magic function that does everything '''
        steps = (self.explore, self.save_summary, self.plot_me, self.save_profiling_report)
        try:
            print("Do all the magic in this function")
            print(f"\n=== Starting the spell for {self.which_type} ===")
            for step in steps:
                step()
                print(f"\n=== Finished {step.__name__} ===")
            self._do_zip()
            print(f"\n=== Finished the Zip Process")
        except Exception as ex:
            # No ``finally: return`` here: that would overwrite this error message with
            # the success message and make every failure look like a clean run.
            return (f"Error as {ex}")
        return (f"\n=== All Done for {self.filename} ===")
   


CPU times: user 34 µs, sys: 0 ns, total: 34 µs
Wall time: 38.4 µs


In [8]:
%%time

# Audit the merged vehicles CSV: load it with pandas, build an explorer, run the full pipeline.
filename = '/kaggle/input/merged-vehicles/vehicles-merged_2025_01_05_1818.csv'
_df = pd.read_csv(filename,sep=',')
# The explorer names its output folder after the file stem of `filename`.
explorer = DataFrameExplorer(_df,filename = filename)
explorer._do_magic()
# Alternative kept for reference: audit the single 2019 file (';'-separated, indexed by Num_Acc).
#filename_2019 = filepaths['vehicles'][2019]
#vehicles_df  = pd.read_csv(filename_2019, sep=';',index_col ='Num_Acc')
#explorer  = DataFrameExplorer(vehicles_df,filename=filename_2019)
#explorer._do_magic()

Do all the magic in this function

=== Starting the spell for VEHICLES ===
DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467169 entries, 0 to 467168
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Num_Acc      467169 non-null  int64  
 1   id_vehicule  467169 non-null  object 
 2   num_veh      467169 non-null  object 
 3   senc         467169 non-null  int64  
 4   catv         467169 non-null  int64  
 5   obs          467169 non-null  int64  
 6   obsm         467169 non-null  int64  
 7   choc         467169 non-null  int64  
 8   manv         467169 non-null  int64  
 9   motor        467169 non-null  int64  
 10  occutc       3912 non-null    float64
 11  csv_info     467169 non-null  object 
dtypes: float64(1), int64(8), object(3)
memory usage: 42.8+ MB

Null Values per Column:
Num_Acc             0
id_vehicule         0
num_veh             0
senc                0
catv                

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Finished Profile Report ProfilingReport_vehicles-merged_2025_01_05_1818

=== Finished save_profiling_report ===
Zipped Up: vehicles-merged_2025_01_05_1818.zip

=== Finished the Zip Process
CPU times: user 1min 36s, sys: 35.7 s, total: 2min 11s
Wall time: 1min 48s


'\n=== All Done for vehicles-merged_2025_01_05_1818 ==='

In [None]:
%%time
## Run the audit for every yearly file in parallel with a ProcessPoolExecutor
# Years to process; each must be a key of filepaths[file].
years = [2019,2020,2021,2022,2023]
file = 'vehicles' # change this to whatever your file is


def do_something(_file='vehicles',_year=2019,_index='Num_Acc',_which_type='VEHICLES'):
    """Load one yearly CSV and run the full DataFrameExplorer pipeline on it.

    Args:
        _file: Key of the file type in ``filepaths``.
        _year: Year key under ``filepaths[_file]``.
        _index: Column used as the DataFrame index when reading the CSV.
        _which_type: ``Meta_Enum`` member name describing the file type.

    Returns:
        The status string produced by ``DataFrameExplorer._do_magic()``, so the
        submitting process can report it (previously nothing was returned and every
        future resolved to None).
    """
    _filename = filepaths[_file][_year]
    print(f"\n=== File {_filename} is starting up ===")
    _df = pd.read_csv(_filename, sep=';', index_col=_index)
    explorer = DataFrameExplorer(_df, filename=_filename, which_type=_which_type)
    return explorer._do_magic()
    

print("\n=== ProcessPoolExecutor Started ===")
# At most three worker processes run at a time; one task is submitted per year.
with ProcessPoolExecutor(max_workers=3) as executor:
    ## Don't forget to change the stuff below
    futures = list(map(lambda year: executor.submit(do_something, file, year, 'Num_Acc'), years))
    # Report each task as soon as it finishes, in completion order.
    for future in as_completed(futures):
        try:
            result = future.result()
        except Exception as ex:
            print(f"Error in {ex}")
        else:
            print("Future Result:", result)
        
    