## DataTransformer:
A notebook to help transform the `vehicle` data into a merged format and possibly a Parquet format.


In [48]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk descends into sub-directories, so files in nested dataset folders are printed too.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory with the bare filename to print the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/road-accidents-2019-2023/vehicles-2020.csv
/kaggle/input/road-accidents-2019-2023/char-2019.csv
/kaggle/input/road-accidents-2019-2023/char-2021.csv
/kaggle/input/road-accidents-2019-2023/place-2022.csv
/kaggle/input/road-accidents-2019-2023/users-2023.csv
/kaggle/input/road-accidents-2019-2023/users-2021.csv
/kaggle/input/road-accidents-2019-2023/vehicles-2021.csv
/kaggle/input/road-accidents-2019-2023/char-2020.csv
/kaggle/input/road-accidents-2019-2023/place-2019.csv
/kaggle/input/road-accidents-2019-2023/users-2019.csv
/kaggle/input/road-accidents-2019-2023/place-2021.csv
/kaggle/input/road-accidents-2019-2023/users-2020.csv
/kaggle/input/road-accidents-2019-2023/char-2023.csv
/kaggle/input/road-accidents-2019-2023/users-2022.csv
/kaggle/input/road-accidents-2019-2023/vehicles-2022.csv
/kaggle/input/road-accidents-2019-2023/vehicles-2019.csv
/kaggle/input/road-accidents-2019-2023/place-2020.csv
/kaggle/input/road-accidents-2019-2023/place-2023.csv
/kaggle/input/road-a

In [49]:
%%time
## all imports
import polars as pl
import pyarrow.parquet as pq
import dask.dataframe as dd
import os
import shutil
import json
from enum import Enum
from datetime import datetime
from ydata_profiling import ProfileReport
from pathlib import Path

#For excel stuff
import openpyxl
from openpyxl.drawing.image import Image


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Concurrency
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import time

# Warnings: the 'ignore' filter silences every warning category for the rest of the session.
# NOTE(review): this may also hide deprecation or dtype warnings emitted while reading the CSVs.
import warnings
warnings.filterwarnings('ignore')

# Shared seed value; nothing in this cell passes it to a library yet.
random_state = 42
# Default figure size applied to matplotlib figures created after this point.
plt.rcParams["figure.figsize"] = (20, 20)


## Set the static file locations
# Every dataset ships one CSV per year in the same Kaggle input folder; only the
# filename prefix differs. Note that the dictionary key and the filename prefix
# are not always the same ('places' -> 'place-', 'characteristics' -> 'char-').
# Resulting shape: filepaths[dataset][year] -> absolute CSV path.
filepaths = {
    dataset: {
        year: f'/kaggle/input/road-accidents-2019-2023/{prefix}-{year}.csv'
        for year in range(2019, 2024)  # 2019..2023 inclusive
    }
    for dataset, prefix in (
        ('vehicles', 'vehicles'),
        ('users', 'users'),
        ('places', 'place'),
        ('characteristics', 'char'),
    )
}

CPU times: user 125 µs, sys: 0 ns, total: 125 µs
Wall time: 129 µs


In [6]:
class ExtensionMethods:
    """Small filename helpers used when naming and tagging merged output files."""

    @staticmethod
    def generate_filename(filename=None, extension=None):
        """Build a name stamped with the current local time, to the minute.

        Args:
            filename: Base name placed in front of the timestamp.
            extension: File extension without the leading dot.

        Returns:
            ``"<filename>_<YYYY_MM_DD_HHMM>.<extension>"`` when both arguments
            are given; otherwise only the ``YYYY_MM_DD_HHMM`` timestamp.
        """
        timestamp = datetime.now().strftime("%Y_%m_%d_%H%M")
        if filename is None or extension is None:
            return timestamp
        # The f-string converts non-string arguments with str(), as before.
        return f"{filename}_{timestamp}.{extension}"

    @staticmethod
    def get_file_name_without_extension(filename):
        """Return the final path component of ``filename`` without its last suffix.

        Args:
            filename: A path string or path-like object.

        Returns:
            The stem of the path, or the literal message ``"Provide a file"``
            when ``filename`` is ``None``.
        """
        # Identity check: `== None` can be overridden by a custom __eq__.
        if filename is None:
            return "Provide a file"
        return Path(filename).stem
        
        

In [67]:
%%time
# Years to merge and the `filepaths` key of the dataset to load.
# Both are read as globals by load_and_merge() defined later in this cell.
years = [2019,2020,2021,2022,2023]
file = 'characteristics'

### Just testing

def load_and_merge():
    """Read the yearly CSVs of the dataset named by the global ``file`` and stack them.

    One path per entry of the global ``years`` is looked up in
    ``filepaths[file]``. Each file is read as a ';'-separated CSV and gets a
    ``csv_info`` column holding the source file's name without extension.

    Returns:
        The row-wise concatenation of all yearly frames.
    """
    yearly_paths = [filepaths[file][year] for year in years]
    tagged_frames = [
        pd.read_csv(path, sep=';').assign(
            csv_info=ExtensionMethods.get_file_name_without_extension(path)
        )
        for path in yearly_paths
    ]
    return pd.concat(tagged_frames)

# Run the merge and write the combined frame to test.csv (relative path, row index omitted).
a = load_and_merge()
a.to_csv('test.csv', index=False)

CPU times: user 2.58 s, sys: 120 ms, total: 2.7 s
Wall time: 2.8 s


In [60]:
class DataTransformer:
    """Merge the per-year CSV files of one or more datasets and write them to disk.

    ``file_dict`` maps a dataset key to a ``{year: csv_path}`` mapping. For each
    selected key, a directory named after the key is created under the current
    working directory and used as the output location.
    """

    def __init__(self, file_dict, key=None):
        """Select the datasets to process and create their output directories.

        Args:
            file_dict: Mapping of dataset key -> ``{year: csv_path}``.
            key: A single dataset key, a list/tuple/set of keys, or ``None``
                to process every key in ``file_dict``.

        Raises:
            ValueError: If ``file_dict`` is ``None`` or empty.
        """
        if not file_dict:  # covers both None and an empty mapping
            # A bare string cannot be raised in Python 3; use a real exception.
            raise ValueError("Please provide a file path dictionary")
        self.file_dict = file_dict
        if key is None:
            # Do it for all keys, in the dictionary's own order.
            self.key = list(self.file_dict)
        elif isinstance(key, (list, tuple, set, frozenset)):
            # A collection of keys is used as-is instead of being nested in a list.
            self.key = list(key)
        else:
            self.key = [key]  # always stored as a list

        self.dirpath = self.create_dir()

    def create_dir(self):
        """Create one directory per selected key under the current working directory.

        Returns:
            Dict mapping each key to the path of its directory.
        """
        dirpaths = {}
        for k in self.key:
            dirpath = os.path.join(os.getcwd(), k)
            # exist_ok makes re-running the cell safe when the directory is already there.
            os.makedirs(dirpath, exist_ok=True)
            dirpaths[k] = dirpath
        return dirpaths

    def _concat(self, key='vehicles'):
        """Read every yearly CSV registered under ``key`` and stack them row-wise.

        Each file is read with ';' as separator and gets a ``csv_info`` column
        holding the source file's name without extension.

        Returns:
            The concatenated DataFrame for ``key``.
        """
        dfs = []
        for filename in self.file_dict[key].values():
            _df = pd.read_csv(filename, sep=';')
            _df['csv_info'] = ExtensionMethods.get_file_name_without_extension(filename)
            dfs.append(_df)
        return pd.concat(dfs)

    def _concat_all(self):
        """Return ``{key: concatenated DataFrame}`` for every selected key."""
        return {key: self._concat(key) for key in self.key}

    def _output_path(self, key, extension):
        """Build the timestamped ``<key>-merged_<stamp>.<extension>`` path in the key's directory."""
        return os.path.join(
            self.dirpath[key],
            ExtensionMethods.generate_filename(f"{key}-merged", extension),
        )

    def create_csv(self):
        """Write one merged CSV per selected key into that key's directory."""
        for key, frame in self._concat_all().items():
            frame.to_csv(self._output_path(key, "csv"), index=False)

    def create_parquet(self):
        """Write one merged Parquet file per selected key into that key's directory.

        ``_concat_all`` returns a dict of pandas DataFrames, so each frame is
        written individually with the pyarrow engine.
        """
        for key, frame in self._concat_all().items():
            frame.to_parquet(self._output_path(key, "parquet"), engine='pyarrow', index=False)

In [61]:
# No key is given, so every dataset in `filepaths` is merged; create_csv writes one
# timestamped CSV per dataset into a directory named after the dataset key.
dtransformer = DataTransformer(file_dict=filepaths)
dtransformer.create_csv()

vehicles
{2019: '/kaggle/input/road-accidents-2019-2023/vehicles-2019.csv', 2020: '/kaggle/input/road-accidents-2019-2023/vehicles-2020.csv', 2021: '/kaggle/input/road-accidents-2019-2023/vehicles-2021.csv', 2022: '/kaggle/input/road-accidents-2019-2023/vehicles-2022.csv', 2023: '/kaggle/input/road-accidents-2019-2023/vehicles-2023.csv'}
users
{2019: '/kaggle/input/road-accidents-2019-2023/users-2019.csv', 2020: '/kaggle/input/road-accidents-2019-2023/users-2020.csv', 2021: '/kaggle/input/road-accidents-2019-2023/users-2021.csv', 2022: '/kaggle/input/road-accidents-2019-2023/users-2022.csv', 2023: '/kaggle/input/road-accidents-2019-2023/users-2023.csv'}
places
{2019: '/kaggle/input/road-accidents-2019-2023/place-2019.csv', 2020: '/kaggle/input/road-accidents-2019-2023/place-2020.csv', 2021: '/kaggle/input/road-accidents-2019-2023/place-2021.csv', 2022: '/kaggle/input/road-accidents-2019-2023/place-2022.csv', 2023: '/kaggle/input/road-accidents-2019-2023/place-2023.csv'}
characteristics

In [28]:
# Scratch check of dict truthiness: the message prints only when `d` has no entries.
d = {"a": [1, 2, 3]}
c = None

if not d:
    print("dfc")