## DataTransformer:
A notebook to help with the transformation of `vehicle` data into a merged format and possibly a parquet format


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/road-accidents-2019-2023/vehicles-2020.csv
/kaggle/input/road-accidents-2019-2023/char-2019.csv
/kaggle/input/road-accidents-2019-2023/char-2021.csv
/kaggle/input/road-accidents-2019-2023/place-2022.csv
/kaggle/input/road-accidents-2019-2023/users-2023.csv
/kaggle/input/road-accidents-2019-2023/users-2021.csv
/kaggle/input/road-accidents-2019-2023/vehicles-2021.csv
/kaggle/input/road-accidents-2019-2023/char-2020.csv
/kaggle/input/road-accidents-2019-2023/place-2019.csv
/kaggle/input/road-accidents-2019-2023/users-2019.csv
/kaggle/input/road-accidents-2019-2023/place-2021.csv
/kaggle/input/road-accidents-2019-2023/users-2020.csv
/kaggle/input/road-accidents-2019-2023/char-2023.csv
/kaggle/input/road-accidents-2019-2023/users-2022.csv
/kaggle/input/road-accidents-2019-2023/vehicles-2022.csv
/kaggle/input/road-accidents-2019-2023/vehicles-2019.csv
/kaggle/input/road-accidents-2019-2023/place-2020.csv
/kaggle/input/road-accidents-2019-2023/place-2023.csv
/kaggle/input/road-a

In [2]:
%%time
## all imports
import polars as pl
import pyarrow.parquet as pq
import dask.dataframe as dd
import os
import shutil
import json
from enum import Enum
from datetime import datetime
from ydata_profiling import ProfileReport
from pathlib import Path

#For excel stuff
import openpyxl
from openpyxl.drawing.image import Image


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Concurrency
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import time

#  Warnings
import warnings
warnings.filterwarnings('ignore')

# Fixed seed constant for reproducibility (no code visible in this section consumes it yet).
random_state = 42
# Default matplotlib figure size as (width, height) in inches, applied to every figure.
plt.rcParams["figure.figsize"] = (20, 20)


## Static file locations: {dataset name -> {year -> CSV path}} for 2019-2023.
## The on-disk file prefix differs from the dataset name for two entries
## ('places' -> 'place-YYYY.csv', 'characteristics' -> 'char-YYYY.csv').
filepaths = {
    dataset: {
        year: f'/kaggle/input/road-accidents-2019-2023/{prefix}-{year}.csv'
        for year in range(2019, 2024)
    }
    for dataset, prefix in (
        ('vehicles', 'vehicles'),
        ('users', 'users'),
        ('places', 'place'),
        ('characteristics', 'char'),
    )
}

CPU times: user 4.65 s, sys: 973 ms, total: 5.62 s
Wall time: 8.67 s


In [3]:
class ExtensionMethods:
    """Small filename helpers used when naming output files."""

    @staticmethod
    def generate_filename(filename=None, extension=None):
        """Build a file name stamped with the current local date and time.

        Args:
            filename: Base name placed in front of the timestamp.
            extension: File extension, with or without a leading dot.

        Returns:
            ``"<filename>_<YYYY_MM_DD_HHMM>.<extension>"`` when both arguments
            are given; otherwise only the ``YYYY_MM_DD_HHMM`` timestamp.

        Note:
            The timestamp has minute resolution, so two calls with the same
            arguments inside the same minute return the same name.
        """
        timestamp = datetime.now().strftime("%Y_%m_%d_%H%M")
        if filename is None or extension is None:
            return timestamp
        # Accept ".csv" as well as "csv" so the result never contains "..".
        suffix = str(extension).lstrip(".")
        return f"{filename}_{timestamp}.{suffix}"

    @staticmethod
    def get_file_name_without_extension(filename):
        """Return the last path component of ``filename`` without its final suffix.

        Args:
            filename: A path-like value, or ``None``.

        Returns:
            ``Path(filename).stem``, or the literal string ``"Provide a file"``
            when ``filename`` is ``None``.
        """
        if filename is None:
            return "Provide a file"
        return Path(filename).stem
        
        

In [4]:
class DataTransformer:
    """Merge per-year CSV datasets and export the combined table.

    For each year, every dataset registered in ``filedict`` is read and the
    resulting frames are merged one after another with ``pd.merge``. The yearly
    results are then concatenated and can be written as CSV, Parquet, Feather
    or HDF5 into a ``Merged`` directory below the current working directory.
    """

    def __init__(self, filedict, sep=';'):
        """Store the configuration and make sure the output directory exists.

        Args:
            filedict: Mapping ``{dataset_name: {year: csv_path}}``.
            sep: Field separator of the input CSV files.

        Raises:
            ValueError: If ``filedict`` is ``None`` or empty.
        """
        if not filedict:
            raise ValueError("Provide a File Path Dictionary")
        self.filedict = filedict
        self.sep = sep
        # Dataset names (e.g. vehicles, users, ...) in registration order.
        self.key = list(self.filedict)
        self.years = self.get_years()
        self.dirpath = self.create_dir()

    def get_years(self):
        """Return every year listed under any dataset, in ascending order."""
        # Sorted so the processing order never depends on set iteration order.
        return sorted({year for per_year in self.filedict.values() for year in per_year})

    def create_dir(self):
        """Create ``<cwd>/Merged`` if needed and return its path."""
        merged_path = os.path.join(os.getcwd(), 'Merged')
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(merged_path, exist_ok=True)
        return merged_path

    def _datalist_creator(self, year=None):
        """Read one DataFrame per dataset for ``year``.

        Args:
            year: Year key to look up under every dataset of ``filedict``.

        Returns:
            List of DataFrames, in the iteration order of ``filedict``.

        Raises:
            ValueError: If ``year`` is ``None``.
            KeyError: If a dataset has no file registered for ``year``.
        """
        if year is None:
            raise ValueError("year cannot be None")
        frames = []
        for key, paths_by_year in self.filedict.items():
            if year not in paths_by_year:
                raise KeyError(f"No '{key}' file registered for year {year}")
            print(f"\n Reading {key} for year {year}")
            frames.append(pd.read_csv(paths_by_year[year], sep=self.sep))
        return frames

    def _merge(self, year=None):
        """Merge all datasets of ``year`` into a single DataFrame.

        Raises:
            ValueError: If ``year`` is ``None`` or no dataset was read.
        """
        if year is None:
            raise ValueError("year cannot be None")
        frames = self._datalist_creator(year)
        if not frames:
            raise ValueError("_Datalist is empty")
        merged = frames[0]
        for frame in frames[1:]:
            # pd.merge with default arguments is an inner join on every column
            # name the two frames have in common.
            merged = pd.merge(merged, frame)
        return merged

    def concat_all_merged(self):
        """Merge every year and stack the results into one DataFrame.

        Returns:
            The concatenation of all yearly merges, with an added ``csv_info``
            column holding the year each row came from. Index labels of the
            yearly frames are kept as-is, so they repeat across years.
        """
        merged_by_year = {}
        for year in self.years:
            print(f"\n Merging Year {year}")
            merged_by_year[year] = self._merge(year)
        frames = []
        for year, frame in merged_by_year.items():
            print(f"\n Concating for {year}")
            frame['csv_info'] = year
            frames.append(frame)
        return pd.concat(frames)

    def _output_path(self, extension):
        """Return a timestamped path inside the output directory for ``extension``."""
        name = ExtensionMethods.generate_filename(f"merged-{'-'.join(self.key)}", extension)
        return os.path.join(self.dirpath, name)

    def _resolve_data(self, data):
        """Return ``data`` if given, otherwise build the merged frame from the CSVs."""
        return self.concat_all_merged() if data is None else data

    @staticmethod
    def _stringify_object_columns(data):
        """Return a copy of ``data`` with every object-dtype column cast to ``str``.

        A copy is returned (instead of assigning columns in place) so a frame
        shared between several writers is never modified.
        NOTE(review): the cast may turn missing values into the text 'nan',
        depending on the pandas version - confirm that is acceptable downstream.
        """
        obj_cols = data.select_dtypes(include=['object']).columns
        return data.astype({col: str for col in obj_cols})

    def create_csv(self, data=None):
        """Write the merged table as CSV and return the output path.

        Args:
            data: Already merged DataFrame; built from the CSVs when omitted.
        """
        data = self._resolve_data(data)
        filepath = self._output_path("csv")
        data.to_csv(filepath, index=False)
        print(f"\n Finished Saving csv to: {filepath}")
        return filepath

    def create_parquet(self, data=None):
        """Write the merged table as zstd-compressed Parquet and return the output path.

        Args:
            data: Already merged DataFrame; built from the CSVs when omitted.
        """
        data = self._stringify_object_columns(self._resolve_data(data))
        filepath = self._output_path("parquet")
        data.to_parquet(filepath, engine='pyarrow', compression="zstd", compression_level=10, index=False)
        print(f"\n Finished Saving parquet to: {filepath}")
        return filepath

    def create_feather(self, data=None):
        """Write the merged table as a zstd-compressed Feather file and return the output path.

        Args:
            data: Already merged DataFrame; built from the CSVs when omitted.
        """
        data = self._stringify_object_columns(self._resolve_data(data))
        # The concatenated frame carries repeated index labels; reset to a
        # default index, which is what the Feather writer expects.
        data = data.reset_index(drop=True)
        filepath = self._output_path("feather")
        # to_feather (not to_parquet) so the file content matches its extension.
        data.to_feather(filepath, compression="zstd", compression_level=10)
        print(f"\n Finished Saving feather to: {filepath}")
        return filepath

    def create_h5(self, data=None):
        """Write the merged table to an HDF5 store under key ``data`` and return the output path.

        Args:
            data: Already merged DataFrame; built from the CSVs when omitted.
        """
        data = self._resolve_data(data)
        filepath = self._output_path("h5")
        # The context manager closes the store even if put() fails.
        with pd.HDFStore(filepath, 'w') as store:
            store.put('data', data)
        print(f"\n Finished Saving HDF5 to: {filepath}")
        return filepath

    def _do_magic(self):
        """Export the merged table in all four formats concurrently.

        The CSVs are read and merged once; the resulting frame is shared by the
        writers, which run in threads so nothing has to be pickled or re-read.
        Any exception raised by a writer is re-raised here.
        """
        data = self.concat_all_merged()
        funcs = [self.create_feather, self.create_parquet, self.create_h5, self.create_csv]
        with ThreadPoolExecutor(max_workers=len(funcs)) as executor:
            futures = [executor.submit(func, data) for func in funcs]
            for future in as_completed(futures):
                print(f"Result: {future.result()}")
            
    

In [5]:
%%time
# Build the transformer over every CSV registered in `filepaths`
# (this also creates the `Merged` output directory), then run the
# merge-and-export step via create_feather().
dtransformer = DataTransformer(filedict=filepaths)
dtransformer.create_feather()


 Merging Year 2019

 Reading vehicles for year 2019

 Reading users for year 2019

 Reading places for year 2019

 Reading characteristics for year 2019

 Merging Year 2020

 Reading vehicles for year 2020

 Reading users for year 2020

 Reading places for year 2020

 Reading characteristics for year 2020

 Merging Year 2021

 Reading vehicles for year 2021

 Reading users for year 2021

 Reading places for year 2021

 Reading characteristics for year 2021

 Merging Year 2022

 Reading vehicles for year 2022

 Reading users for year 2022

 Reading places for year 2022

 Reading characteristics for year 2022

 Merging Year 2023

 Reading vehicles for year 2023

 Reading users for year 2023

 Reading places for year 2023

 Reading characteristics for year 2023

 Concating for 2019

 Concating for 2020

 Concating for 2021

 Concating for 2022

 Concating for 2023

 Finished Saving feather to: /kaggle/working/Merged/merged-vehicles-users-places-characteristics_2025_01_06_2145.feather
CPU