In [153]:
import json
import glob
import pandas as pd
import re
import os

In [154]:
def get_column_name(schemas, ds_name, sorting_key='column_position'):
    """Return the column names of one dataset, ordered by ``sorting_key``.

    Parameters
    ----------
    schemas : mapping
        Maps a dataset name to an iterable of per-column dicts. Each dict
        must contain ``'column_name'`` and the key named by ``sorting_key``.
    ds_name : str
        Dataset whose columns are requested.
    sorting_key : str, default 'column_position'
        Key inside each column dict used to order the columns.

    Returns
    -------
    list
        The ``'column_name'`` values in ascending ``sorting_key`` order.

    Raises
    ------
    KeyError
        If ``ds_name`` is not in ``schemas`` or a column dict lacks a
        required key.
    """
    # Copy first so the schema entry itself is never reordered.
    ordered = list(schemas[ds_name])
    ordered.sort(key=lambda entry: entry[sorting_key])

    names = []
    for entry in ordered:
        names.append(entry['column_name'])
    return names

In [155]:
# Load the schema definitions. The context manager closes the file
# deterministically instead of leaving that to garbage collection.
with open('retail_db/schemas.json') as schema_file:
    schemas = json.load(schema_file)

In [156]:
# Collect every path matching part* exactly one directory below retail_db.
src_file_names = glob.glob('retail_db/*/part*')

In [157]:
# Display the matched paths.
src_file_names

['retail_db\\categories\\part-00000',
 'retail_db\\customers\\part-00000',
 'retail_db\\departments\\part-00000',
 'retail_db\\orders\\part-00000',
 'retail_db\\order_items\\part-00000',
 'retail_db\\products\\part-00000']

In [158]:
# Split the second matched path on backslashes only; a path written with
# forward slashes would come back as a single component.
sp = src_file_names[1].split('\\')

In [159]:
# Display the resulting path components.
sp

['retail_db', 'customers', 'part-00000']

In [160]:
# Sample string mixing a forward slash with backslash separators.
s = 's/a\\b\\c'

In [161]:
# Split on either separator. The raw string spells the same pattern without
# the invalid "\]" escape that the non-raw literal depended on.
re.split(r'[/\\]', s)

['s', 'a', 'b', 'c']

In [162]:
# Split every matched path into components, whichever separator it uses.
for file in src_file_names:
    # Raw string: same pattern, without the invalid "\]" escape.
    print(re.split(r'[/\\]', file))

['retail_db', 'categories', 'part-00000']
['retail_db', 'customers', 'part-00000']
['retail_db', 'departments', 'part-00000']
['retail_db', 'orders', 'part-00000']
['retail_db', 'order_items', 'part-00000']
['retail_db', 'products', 'part-00000']


In [163]:
# Take the first matched path.
file = src_file_names[0]

In [164]:
# Display the selected path.
file

'retail_db\\categories\\part-00000'

In [165]:
# Split the path on '/' or '\'. The raw string spells the same pattern
# without the invalid "\]" escape that the non-raw literal depended on.
file_details = re.split(r'[/\\]', file)

In [166]:
# The second-to-last path component (the directory holding the file) is
# used as the dataset name.
ds_name = file_details[-2]

In [167]:
# Look up this dataset's column names in the loaded schemas.
columns = get_column_name(schemas, ds_name)

In [168]:
# Display the column names.
columns

['category_id', 'category_department_id', 'category_name']

In [169]:
# Read the file as CSV with the schema-derived column names.
# NOTE(review): presumably the part files carry no header row of their own — confirm.
df = pd.read_csv(file, names=columns)

In [170]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,category_id,category_department_id,category_name
0,1,2,Football
1,2,2,Soccer
2,3,2,Baseball & Softball
3,4,2,Basketball
4,5,2,Lacrosse


In [171]:
# Load every source file with its schema-derived column names and report
# the shape of the resulting frame.
for file in src_file_names:
    print(f'Processing {file}')
    # Raw string: same pattern, without the invalid "\]" escape.
    file_path_list = re.split(r'[/\\]', file)
    # The directory holding the file is used as the dataset name.
    ds_name = file_path_list[-2]
    columns = get_column_name(schemas, ds_name)
    df = pd.read_csv(file, names=columns)
    print(f'Shape of {ds_name} is {df.shape}')

Processing retail_db\categories\part-00000
Shape of categories is (58, 3)
Processing retail_db\customers\part-00000
Shape of customers is (12435, 9)
Processing retail_db\departments\part-00000
Shape of departments is (6, 2)
Processing retail_db\orders\part-00000
Shape of orders is (68883, 4)
Processing retail_db\order_items\part-00000
Shape of order_items is (172198, 6)
Processing retail_db\products\part-00000
Shape of products is (1345, 6)


In [176]:
# Base directory under which the JSON output is written.
tgt_base_dir = 'retail_db_json'

In [177]:
# Convert every source file to line-delimited JSON under tgt_base_dir,
# keeping the <dataset>/<file name> layout of the source paths.
for file in src_file_names:
    # Raw string: same pattern, without the invalid "\]" escape.
    file_path_list = re.split(r'[/\\]', file)
    ds_name = file_path_list[-2]
    file_name = file_path_list[-1]
    # Build the target directory once and reuse it for the output path.
    tgt_dir = f'{tgt_base_dir}/{ds_name}'
    json_file_path = f'{tgt_dir}/{file_name}'
    columns = get_column_name(schemas, ds_name)
    df = pd.read_csv(file, names=columns)
    # exist_ok=True keeps the cell re-runnable once the directory exists.
    os.makedirs(tgt_dir, exist_ok=True)
    # orient='records' with lines=True writes one JSON object per row, one per line.
    df.to_json(json_file_path, orient='records', lines=True)