# Convert LDBC dataset from CSV to Parquet
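This notebook walks the LDBC CSV directory tree and converts every `*.csv` file it finds to Parquet, writing the results under the output directory with the same relative folder layout.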

In [1]:
import os
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
import pyarrow

# Root directory that holds the LDBC datasets. Set the LDBC_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# author's location is used, so existing runs behave exactly as before.
base_path = Path(os.environ.get('LDBC_DATA_DIR', '/Users/andrew/datasets'))

# Path fragments shared by the CSV input tree and the Parquet output tree.
# Defining them once keeps the two paths from drifting apart.
scale_factor = 'sf-0.003'
layout = 'bi/composite-projected-fk'

dataset_path = base_path / scale_factor / 'csv' / layout
output_path = base_path / scale_factor / 'parquet' / layout

In [2]:
from datetime import date
def csv_to_parquet(input_path:Path, output_path:Path, output_name_function):
    """Read one '|'-delimited CSV with dask and write it out as Parquet.

    Parameters
    ----------
    input_path : Path
        CSV location handed to ``dd.read_csv``.
    output_path : Path
        Destination handed to ``DataFrame.to_parquet``.
    output_name_function : callable
        Forwarded as ``name_function`` to ``to_parquet``; it decides the file
        name of each written partition.

    Returns
    -------
    None
        The function is called for its side effect of writing Parquet files.
    """
    long_dtype = np.dtype('long')
    object_dtype = np.dtype('O')

    # Explicit dtypes for the columns listed here; any other column is left
    # to the reader's own type inference.
    column_dtypes = {
        'id': int,
        'length': long_dtype,
        'content': object_dtype,
        'imageFile': object_dtype,
        'classYear': long_dtype,
        'workFrom': long_dtype,
        # NOTE(review): language and email stay as raw strings. A ';' split
        # into lists was prototyped here and left disabled - confirm whether
        # list-typed columns are wanted.
        'language': object_dtype,
        'email': object_dtype,
    }

    def parse_timestamp(raw_value):
        # Per-cell conversion of a timestamp column to a pandas datetime.
        return pd.to_datetime(raw_value, unit='ns')

    def parse_birthday(raw_value):
        # Per-cell conversion that keeps only the calendar date.
        return pd.to_datetime(raw_value).date()

    column_converters = {
        'creationDate': parse_timestamp,
        'deletionDate': parse_timestamp,
        'birthday': parse_birthday,
    }

    frame = dd.read_csv(
        input_path,
        delimiter='|',
        dtype=column_dtypes,
        converters=column_converters,
    )

    # Write the data files only: no index column and no _metadata file.
    frame.to_parquet(
        output_path,
        name_function=output_name_function,
        write_index=False,
        write_metadata_file=False,
        engine="pyarrow",
        version="2.6",
    )
    
def _partition_namer(stem: str):
    """Build a ``name_function`` that names partition ``i`` ``<stem><i>.parquet``."""
    def name_partition(partition_index):
        return f"{stem}{partition_index}.parquet"
    return name_partition


def process_directory(input_directory:Path, output_directory:Path):
    """Convert every ``*.csv`` file below ``input_directory`` to Parquet.

    The tree is walked iteratively. Each CSV file is passed to
    ``csv_to_parquet`` together with the matching sub-directory of
    ``output_directory``, so the output mirrors the input folder layout.
    Written partitions are named ``<csv stem><partition index>.parquet``.

    Parameters
    ----------
    input_directory : Path
        Root of the CSV tree. If it is not a directory, nothing is done.
    output_directory : Path
        Root under which the mirrored Parquet tree is written.

    Returns
    -------
    None
    """
    if not input_directory.is_dir():
        return None

    pending = [input_directory]
    while pending:
        current = pending.pop()
        for entry in current.iterdir():
            if entry.is_dir():
                # Directories are only ever descended into, even when their
                # name happens to end in ".csv"; they are never handed to
                # the converter as if they were a file.
                pending.append(entry)
            elif entry.match('*.csv'):
                target_directory = output_directory / entry.relative_to(input_directory).parent
                # The stem is bound now rather than read from the loop
                # variable later, so the namer stays correct even if it is
                # invoked after the loop has moved on.
                csv_to_parquet(entry, target_directory, _partition_namer(entry.stem))
    return None

In [3]:
# Convert the whole CSV tree under dataset_path, writing Parquet under output_path.
process_directory(input_directory=dataset_path, output_directory=output_path)