**<mark>Generate Data</mark>**

In [1]:
# Install the tpchgen-cli package quietly (-q); the data-generation script in this
# notebook invokes the `tpchgen-cli` command.
# NOTE(review): the version is not pinned — consider pinning it for reproducible runs.
!pip install -q tpchgen-cli

In [2]:
# TPC-H scale factor: passed to tpchgen-cli via `-s` and also used to scale the
# number of parts each table is split into.
sf = 1000
# Root folder for this scale factor; every table is written to its own
# sub-folder beneath it and later converted to Delta in place.
BASE_OUTPUT_DIR = f"/lakehouse/default/Tables/CH{sf}"

In [3]:
%%time
import os
lineitem_parts = max(2, int(100 * (sf / 1000)))
orders_parts = max(2, int(72 * (sf / 1000)))
partsupp_parts = max(1, int(40 * (sf / 1000)))
part_parts = max(1, int(6 * (sf / 1000)))
customer_parts = max(1, int(12 * (sf / 1000)))

shell_script = f"""
set -e

SCALE_FACTOR={sf}
BASE_OUTPUT_DIR="{BASE_OUTPUT_DIR}"
FORMAT="parquet"

declare -A TABLES_PARTS=(
    ["lineitem"]={lineitem_parts}
    ["orders"]={orders_parts}
    ["partsupp"]={partsupp_parts}
    ["part"]={part_parts}
    ["customer"]={customer_parts}
    ["nation"]=1
    ["region"]=1
    ["supplier"]=1
)

echo "--- Starting TPC-H Data Generation ---"
echo "Scale Factor: ${{SCALE_FACTOR}}"
echo "Base Output Directory: ${{BASE_OUTPUT_DIR}}"
echo "Output Format: ${{FORMAT}}"
echo ""

for table_name in "${{!TABLES_PARTS[@]}}"; do
    num_parts_to_generate="${{TABLES_PARTS[${{table_name}}]}}"
    table_output_dir="${{BASE_OUTPUT_DIR}}/${{table_name}}"

    echo "--- Generating data for table: ${{table_name}} (${{num_parts_to_generate}} parts) ---"
    mkdir -p "${{table_output_dir}}"

    for ((i=1; i<=num_parts_to_generate; i++)); do
        PART_NUM=$(printf "%03d" $i)
        OUTPUT_PART_DIR="${{table_output_dir}}/part=${{PART_NUM}}"

        echo "  -> Generating ${{table_name}} part ${{i}}/${{num_parts_to_generate}} to ${{OUTPUT_PART_DIR}}"

        tpchgen-cli \\
            -s "${{SCALE_FACTOR}}" \\
            --tables "${{table_name}}" \\
            --output-dir "${{OUTPUT_PART_DIR}}" \\
            --parts "${{num_parts_to_generate}}" \\
            --part "${{i}}" \\
            --format="${{FORMAT}}"
    done
    echo "--- Finished generating data for table: ${{table_name}} ---"
    echo ""
done

echo "--- TPC-H Data Generation Complete! ---"
"""

if not os.path.isdir(BASE_OUTPUT_DIR):
    print(f"Directory {BASE_OUTPUT_DIR} does not exist. Generating data...")
    get_ipython().system(shell_script)
else:
    print(f"Directory {BASE_OUTPUT_DIR} already exists. Skipping data generation.")


Directory /lakehouse/default/Tables/CH1000 does not exist. Generating data...
--- Starting TPC-H Data Generation ---
Scale Factor: 1000
Base Output Directory: /lakehouse/default/Tables/CH1000
Output Format: parquet

--- Generating data for table: nation (1 parts) ---
  -> Generating nation part 1/1 to /lakehouse/default/Tables/CH1000/nation/part=001
--- Finished generating data for table: nation ---

--- Generating data for table: lineitem (100 parts) ---
  -> Generating lineitem part 1/100 to /lakehouse/default/Tables/CH1000/lineitem/part=001
  -> Generating lineitem part 2/100 to /lakehouse/default/Tables/CH1000/lineitem/part=002
  -> Generating lineitem part 3/100 to /lakehouse/default/Tables/CH1000/lineitem/part=003
  -> Generating lineitem part 4/100 to /lakehouse/default/Tables/CH1000/lineitem/part=004
  -> Generating lineitem part 5/100 to /lakehouse/default/Tables/CH1000/lineitem/part=005
  -> Generating lineitem part 6/100 to /lakehouse/default/Tables/CH1000/lineitem/part=006


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  -> Generating orders part 26/72 to /lakehouse/default/Tables/CH1000/orders/part=026
  -> Generating orders part 27/72 to /lakehouse/default/Tables/CH1000/orders/part=027
  -> Generating orders part 28/72 to /lakehouse/default/Tables/CH1000/orders/part=028
  -> Generating orders part 29/72 to /lakehouse/default/Tables/CH1000/orders/part=029
  -> Generating orders part 30/72 to /lakehouse/default/Tables/CH1000/orders/part=030
  -> Generating orders part 31/72 to /lakehouse/default/Tables/CH1000/orders/part=031
  -> Generating orders part 32/72 to /lakehouse/default/Tables/CH1000/orders/part=032
  -> Generating orders part 33/72 to /lakehouse/default/Tables/CH1000/orders/part=033
  -> Generating orders part 34/72 to /lakehouse/default/Tables/CH1000/orders/part=034
  -> Generating orders part 35/72 to /lakehouse/default/Tables/CH1000/orders/part=035
  -> Generating orders part 36/72 to /lakehouse/default/Tables/CH1000/orders/part=036
  -> Generating orders part 37/72 to /lakehouse/defaul

**<mark>Convert to Delta</mark>**

In [4]:
%%time
from   deltalake   import convert_to_deltalake
import pyarrow as pa 
partition_schema = pa.schema([ pa.field("part", pa.string()) ])
for tbl in ['region','nation','lineitem','orders','partsupp','part','customer','supplier']: 
 convert_to_deltalake(BASE_OUTPUT_DIR +'/'+ tbl,partition_by= partition_schema,partition_strategy='hive',storage_options={"allow_unsafe_rename":"true"} )
 print(tbl)

rr /synfs/lakehouse/default/Tables/CH1000/region/_delta_log/_commit_9ffdaa54-69c6-4201-b4b2-4102002f4fbf.json.tmp -> /synfs/lakehouse/default/Tables/CH1000/region/_delta_log/00000000000000000000.json
region
rr /synfs/lakehouse/default/Tables/CH1000/nation/_delta_log/_commit_557e5fdc-1885-45a1-9f3e-4f272f9cafd2.json.tmp -> /synfs/lakehouse/default/Tables/CH1000/nation/_delta_log/00000000000000000000.json
nation
rr /synfs/lakehouse/default/Tables/CH1000/lineitem/_delta_log/_commit_74b2ecc3-0924-4f75-bb61-99479cd22758.json.tmp -> /synfs/lakehouse/default/Tables/CH1000/lineitem/_delta_log/00000000000000000000.json
lineitem
rr /synfs/lakehouse/default/Tables/CH1000/orders/_delta_log/_commit_26e99606-814d-4298-a6e1-13ce905e2020.json.tmp -> /synfs/lakehouse/default/Tables/CH1000/orders/_delta_log/00000000000000000000.json
orders
rr /synfs/lakehouse/default/Tables/CH1000/partsupp/_delta_log/_commit_d27a2be4-0f0f-4930-912a-1255290e7a40.json.tmp -> /synfs/lakehouse/default/Tables/CH1000/partsupp