In [1]:
import glob

In [2]:
# Collect every path named 'part*' found exactly one directory level below
# data/retail_db (the '*' component matches any immediate subdirectory).
src_file_names = glob.glob('data/retail_db/*/part*')

In [3]:
import re

In [4]:
# Show the components of every source path. The character class matches a
# forward slash or a backslash, so either separator style is split.
for file in src_file_names:
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence '\]', which triggers a SyntaxWarning on recent Python versions.
    file_path_list = re.split(r'[/\\]', file)
    print(file_path_list)

['data', 'retail_db', 'customers', 'part-00000']
['data', 'retail_db', 'products', 'part-00000']
['data', 'retail_db', 'departments', 'part-00000']
['data', 'retail_db', 'order_items', 'part-00000']
['data', 'retail_db', 'orders', 'part-00000']
['data', 'retail_db', 'categories', 'part-00000']


In [5]:
# Root directory used as the prefix when target JSON file paths are built.
base_dir = 'data/retail_db_json'

In [6]:
# Preview the target path of every source file:
#   <base_dir>/<dataset directory name>/<source file name>
for file in src_file_names:
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence '\]'. The pattern splits on '/' as well as on '\'.
    file_path_list = re.split(r'[/\\]', file)
    ds_name = file_path_list[-2]    # parent directory = dataset name
    file_name = file_path_list[-1]  # last component = file name
    json_file_path = f'{base_dir}/{ds_name}/{file_name}'
    print(json_file_path)

data/retail_db_json/customers/part-00000
data/retail_db_json/products/part-00000
data/retail_db_json/departments/part-00000
data/retail_db_json/order_items/part-00000
data/retail_db_json/orders/part-00000
data/retail_db_json/categories/part-00000


In [7]:
import json

In [8]:
def get_column_names(schemas, ds_name, sorting_key='column_position'):
    """Return the column names of one dataset in sorted order.

    Args:
        schemas: Mapping from dataset name to an iterable of column
            descriptors; each descriptor is indexed by ``sorting_key`` and
            by ``'column_name'``.
        ds_name: Key of the dataset to look up in ``schemas``.
        sorting_key: Descriptor key whose value determines the ordering.

    Returns:
        A list with the ``'column_name'`` value of every descriptor,
        ordered ascending by ``sorting_key``.

    Raises:
        KeyError: If ``ds_name`` is not in ``schemas`` or a descriptor lacks
            ``sorting_key`` / ``'column_name'``.
    """
    ordered = list(schemas[ds_name])
    # list.sort is stable, just like sorted(): ties keep their input order.
    ordered.sort(key=lambda entry: entry[sorting_key])
    names = []
    for entry in ordered:
        names.append(entry['column_name'])
    return names

In [9]:
# Parse the schema definitions. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open until garbage
# collection.
with open('data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)

In [10]:
import pandas as pd

In [11]:
# Read every source file into a DataFrame and report its shape. Column names
# are passed explicitly via `names=`, taken from the schema of the dataset.
for file in src_file_names:
    print(f'Processing {file}')
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence '\]'. The pattern splits on '/' as well as on '\'.
    file_path_list = re.split(r'[/\\]', file)
    ds_name = file_path_list[-2]  # parent directory = dataset name
    columns = get_column_names(schemas, ds_name)
    df = pd.read_csv(file, names=columns)
    print(f'Shape of {ds_name} is {df.shape}')

Processing data/retail_db/customers/part-00000
Shape of customers is (12435, 9)
Processing data/retail_db/products/part-00000
Shape of products is (1345, 6)
Processing data/retail_db/departments/part-00000
Shape of departments is (6, 2)
Processing data/retail_db/order_items/part-00000
Shape of order_items is (172198, 6)
Processing data/retail_db/orders/part-00000
Shape of orders is (68883, 4)
Processing data/retail_db/categories/part-00000
Shape of categories is (58, 3)


In [12]:
import os

In [13]:
# Convert every source CSV file into a JSON-lines file (one record per line)
# stored at <tgt_base_dir>/<dataset name>/<source file name>.
tgt_base_dir = 'data/retail_db_json'
for file in src_file_names:
    print(f'Processing {file}')
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence '\]'. The pattern splits on '/' as well as on '\'.
    file_path_list = re.split(r'[/\\]', file)
    ds_name = file_path_list[-2]    # parent directory = dataset name
    file_name = file_path_list[-1]  # last component = file name
    # Derive the directory and the file path from the same variable, so the
    # directory that is created is always the one that is written to
    # (previously the file path was built from `base_dir` of another cell).
    tgt_dir = f'{tgt_base_dir}/{ds_name}'
    json_file_path = f'{tgt_dir}/{file_name}'
    columns = get_column_names(schemas, ds_name)
    df = pd.read_csv(file, names=columns)
    os.makedirs(tgt_dir, exist_ok=True)
    df.to_json(
        json_file_path,
        orient='records',
        lines=True
    )

Processing data/retail_db/customers/part-00000
Processing data/retail_db/products/part-00000
Processing data/retail_db/departments/part-00000
Processing data/retail_db/order_items/part-00000
Processing data/retail_db/orders/part-00000
Processing data/retail_db/categories/part-00000
