## DataTransformer:
A notebook that helps transform the yearly `vehicle` data into a single merged format and possibly a Parquet format


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print them one per line in walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nov24-bds-roadies-users/vehicules-2019.csv
/kaggle/input/nov24-bds-roadies-users/vehicules-2022.csv
/kaggle/input/nov24-bds-roadies-users/vehicules-2021.csv
/kaggle/input/nov24-bds-roadies-users/vehicules-2020.csv
/kaggle/input/nov24-bds-roadies-users/vehicules-2023.csv


In [2]:
%%time
## all imports
import polars as pl
import pyarrow.parquet as pq
import dask.dataframe as dd
import os
import shutil
import json
from enum import Enum
from datetime import datetime
from ydata_profiling import ProfileReport
from pathlib import Path

#For excel stuff
import openpyxl
from openpyxl.drawing.image import Image


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Concurrency
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import time

# Warnings: install a blanket "ignore" filter so library warnings do not
# clutter cell output. NOTE(review): this also hides deprecation and dtype
# warnings that may matter while debugging.
import warnings
warnings.filterwarnings('ignore')

# Seed value intended for reproducible sampling/modelling; it is not consumed
# by any cell visible in this part of the notebook.
random_state = 42
# Default size (width, height) applied to every matplotlib figure created afterwards.
plt.rcParams["figure.figsize"] = (20, 20)


## Set the static file locations
# Yearly vehicle CSV exports, keyed by calendar year.
vehicle_csv_by_year = {
    2019: '/kaggle/input/nov24-bds-roadies-users/vehicules-2019.csv',
    2020: '/kaggle/input/nov24-bds-roadies-users/vehicules-2020.csv',
    2021: '/kaggle/input/nov24-bds-roadies-users/vehicules-2021.csv',
    2022: '/kaggle/input/nov24-bds-roadies-users/vehicules-2022.csv',
    2023: '/kaggle/input/nov24-bds-roadies-users/vehicules-2023.csv',
}

# One entry per dataset; the datasets other than 'vehicles' have no files
# registered yet and are kept as empty placeholders.
filepaths = dict(
    vehicles=vehicle_csv_by_year,
    users={},
    places={},
    characteristics={},
)

CPU times: user 3.95 s, sys: 635 ms, total: 4.58 s
Wall time: 5.57 s


In [3]:
class ExtensionMethods:
    """Stateless helpers for building and inspecting file names."""

    @staticmethod
    def generate_filename(filename=None, extension=None):
        """Build a file name stamped with the current local date and time.

        Args:
            filename: Base name placed before the timestamp; converted with
                ``str``.
            extension: File extension, with or without a leading dot;
                converted with ``str``.

        Returns:
            ``"<filename>_<YYYY_MM_DD_HHMM>.<extension>"`` when both arguments
            are given, otherwise only the ``"YYYY_MM_DD_HHMM"`` timestamp.
        """
        # Minute resolution: two names generated for the same base name and
        # extension within one minute are identical.
        timestamp = datetime.now().strftime("%Y_%m_%d_%H%M")
        if filename is None or extension is None:
            return timestamp
        # Drop leading dots so ".csv" and "csv" both yield a single separator
        # instead of "name..csv".
        suffix = str(extension).lstrip(".")
        return f"{filename!s}_{timestamp}.{suffix}"

    @staticmethod
    def get_file_name_without_extension(filename):
        """Return the last path component of ``filename`` without its final suffix.

        Args:
            filename: A path string or path-like object, or ``None``.

        Returns:
            The stem of the path (``"dir/data.csv"`` -> ``"data"``). When
            ``filename`` is ``None`` the placeholder string
            ``"Provide a file"`` is returned instead of raising.
        """
        # Identity check: `== None` would invoke a user-defined __eq__ and can
        # misfire for objects that compare equal to everything.
        if filename is None:
            return "Provide a file"
        return Path(filename).stem
        
        

In [4]:
%%time
# Calendar years covered by the yearly vehicle CSV files (2019 through 2023).
years = list(range(2019, 2024))
# Dataset key; not referenced by the other statements visible in this notebook.
file = 'vehicles'

### Just testing

def load_and_merge():
    """Read every yearly vehicle CSV listed in ``years`` and concatenate them.

    Relies on the notebook-level ``years`` list and ``filepaths`` mapping.
    Each yearly frame gets a ``csv_info`` column holding the source file name
    without its extension, so rows stay traceable after the merge.

    Returns:
        The Dask DataFrame produced by ``dd.concat`` over the yearly frames.
    """
    def _read_year(year):
        # Load one semicolon-separated yearly file and tag it with its origin.
        csv_path = filepaths['vehicles'][year]
        yearly = dd.read_csv(csv_path, sep=';')
        yearly['csv_info'] = ExtensionMethods.get_file_name_without_extension(csv_path)
        return yearly

    return dd.concat([_read_year(year) for year in years])

CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 16.2 µs


In [5]:
class DataTransformer:
    """Merge the per-year CSV files of one dataset and export the result.

    Args:
        filepath_dict: Mapping of dataset key -> mapping of year -> CSV path.
        key: Dataset to process; also used as the output directory name and
            as the prefix of the generated file names.

    Constructing an instance creates the output directory
    ``<current working directory>/<key>`` if it does not exist yet.
    """

    def __init__(self, filepath_dict, key='vehicles'):
        self.filepath_dict = filepath_dict
        self.key = key
        self.dirpath = self.create_dir()

    def create_dir(self):
        """Ensure the output directory exists and return its path."""
        dirpath = os.path.join(os.getcwd(), self.key)
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(dirpath, exist_ok=True)
        return dirpath

    def _output_path(self, extension):
        """Return a timestamped ``<key>-merged_<timestamp>.<extension>`` path in the output directory."""
        return os.path.join(
            self.dirpath,
            ExtensionMethods.generate_filename(f"{self.key}-merged", extension),
        )

    def _merge_all(self):
        """Read every CSV registered for ``self.key`` and concatenate them lazily.

        Each source frame gets a ``csv_info`` column holding the source file
        name without its extension, so rows stay traceable after the merge.

        Returns:
            The Dask DataFrame produced by ``dd.concat``.

        Raises:
            KeyError: If ``self.key`` is not present in ``filepath_dict``.
            ValueError: If no files are registered for ``self.key``.
        """
        sources = self.filepath_dict[self.key]
        if not sources:
            # Fail with a clear message instead of an obscure error from
            # concatenating an empty list of frames.
            raise ValueError(f"No input files configured for key '{self.key}'")
        frames = []
        for csv_path in sources.values():
            frame = dd.read_csv(csv_path, sep=';')
            frame['csv_info'] = ExtensionMethods.get_file_name_without_extension(csv_path)
            frames.append(frame)
        return dd.concat(frames)

    def create_csv(self):
        """Write the merged data to a single timestamped CSV file in the output directory."""
        merged = self._merge_all()
        merged.to_csv(self._output_path("csv"), single_file=True, index=False)

    def create_parquet(self):
        """Write the merged data to a single timestamped Parquet file in the output directory."""
        # compute() materializes the lazy Dask frame before writing, so the
        # whole merged dataset has to fit in memory.
        merged = self._merge_all().compute()
        merged.to_parquet(self._output_path("parquet"), engine='pyarrow', index=False)

In [6]:
# Build a transformer for the default 'vehicles' key; construction also makes
# sure the ./vehicles output directory exists under the working directory.
dtransformer = DataTransformer(filepath_dict=filepaths)
# Export the merged data as CSV, then as Parquet. Each call re-reads and
# re-merges the source CSV files independently.
dtransformer.create_csv()
dtransformer.create_parquet()