**<mark>Generate Data</mark>**

In [1]:
# Install the tpchgen-cli package quietly (-q); the data-generation cell
# further down shells out to the `tpchgen-cli` command it provides.
!pip install -q tpchgen-cli

In [2]:
# TPC-H scale factor; it is passed to the generator and also used to size
# the number of output parts per table.
sf = 1000

# Root folder that receives one sub-folder per TPC-H table; the scale
# factor is appended so different scales land in different folders.
BASE_OUTPUT_DIR = "/lakehouse/default/Tables/TPCH" + str(sf)

In [3]:
%%time
import os
import subprocess

# Number of parts each large table is split into. The counts are scaled
# linearly from the values used at sf=1000 (200/140/80/12/20), with a floor
# of 2 parts for lineitem and orders and 1 part for the other tables.
lineitem_parts = max(2, int(200 * (sf / 1000)))
orders_parts   = max(2, int(140 * (sf / 1000)))
partsupp_parts = max(1, int(80 * (sf / 1000)))
part_parts     = max(1, int(12 * (sf / 1000)))
customer_parts = max(1, int(20 * (sf / 1000)))

# Bash script that calls tpchgen-cli once per (table, part) and writes each
# part into its own hive-style folder "<table>/parquetfile=NNN".
# Doubled braces are f-string escapes and reach bash as single braces; only
# the single-brace fields are filled in from Python. The script relies on
# bash-only features (associative arrays, C-style for loops).
shell_script = f"""
set -e

SCALE_FACTOR={sf}
BASE_OUTPUT_DIR="{BASE_OUTPUT_DIR}"
FORMAT="parquet"

declare -A TABLES_PARTS=(
    ["lineitem"]={lineitem_parts}
    ["orders"]={orders_parts}
    ["partsupp"]={partsupp_parts}
    ["part"]={part_parts}
    ["customer"]={customer_parts}
    ["nation"]=1
    ["region"]=1
    ["supplier"]=1
)

echo "--- Starting TPC-H Data Generation ---"
echo "Scale Factor: ${{SCALE_FACTOR}}"
echo "Base Output Directory: ${{BASE_OUTPUT_DIR}}"
echo "Output Format: ${{FORMAT}}"
echo ""

for table_name in "${{!TABLES_PARTS[@]}}"; do
    num_parts_to_generate="${{TABLES_PARTS[${{table_name}}]}}"
    table_output_dir="${{BASE_OUTPUT_DIR}}/${{table_name}}"

    echo "--- Generating data for table: ${{table_name}} (${{num_parts_to_generate}} parts) ---"
    mkdir -p "${{table_output_dir}}"

    for ((i=1; i<=num_parts_to_generate; i++)); do
        PART_NUM=$(printf "%03d" $i)
        OUTPUT_PART_DIR="${{table_output_dir}}/parquetfile=${{PART_NUM}}"

        echo "  -> Generating ${{table_name}} part ${{i}}/${{num_parts_to_generate}} to ${{OUTPUT_PART_DIR}}"

        tpchgen-cli \\
            -s "${{SCALE_FACTOR}}" \\
            --tables "${{table_name}}" \\
            --output-dir "${{OUTPUT_PART_DIR}}" \\
            --parts "${{num_parts_to_generate}}" \\
            --part "${{i}}" \\
            --format="${{FORMAT}}"
    done
    echo "--- Finished generating data for table: ${{table_name}} ---"
    echo ""
done

echo "--- TPC-H Data Generation Complete! ---"
"""

# Generation is skipped when the output folder already exists, so a re-run of
# the notebook does not regenerate the data set.
if not os.path.isdir(BASE_OUTPUT_DIR):
    print(f"Directory {BASE_OUTPUT_DIR} does not exist. Generating data...")
    # Run the script explicitly under bash (it uses bash-only syntax) instead
    # of IPython's system(), which picks the shell from the environment,
    # applies its own variable expansion to the text, and never raises when
    # the command fails.
    with subprocess.Popen(
        ["bash", "-c", shell_script],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    ) as proc:
        # Stream the script's output line by line so progress stays visible.
        for line in proc.stdout:
            print(line, end="")
    # Leaving the `with` block waits for the process, so returncode is set.
    if proc.returncode != 0:
        # `set -e` aborts the script on the first failing command. Surface
        # that here; otherwise the partially filled folder would make the
        # guard above skip generation on the next run.
        raise RuntimeError(
            f"TPC-H data generation failed with exit code {proc.returncode}. "
            f"Remove the partial output in {BASE_OUTPUT_DIR} before re-running, "
            "otherwise generation will be skipped."
        )
else:
    print(f"Directory {BASE_OUTPUT_DIR} already exists. Skipping data generation.")


Directory /lakehouse/default/Tables/TPCH1000 does not exist. Generating data...
--- Starting TPC-H Data Generation ---
Scale Factor: 1000
Base Output Directory: /lakehouse/default/Tables/TPCH1000
Output Format: parquet

--- Generating data for table: nation (1 parts) ---
  -> Generating nation part 1/1 to /lakehouse/default/Tables/TPCH1000/nation/parquetfile=001
--- Finished generating data for table: nation ---

--- Generating data for table: lineitem (200 parts) ---
  -> Generating lineitem part 1/200 to /lakehouse/default/Tables/TPCH1000/lineitem/parquetfile=001
  -> Generating lineitem part 2/200 to /lakehouse/default/Tables/TPCH1000/lineitem/parquetfile=002
  -> Generating lineitem part 3/200 to /lakehouse/default/Tables/TPCH1000/lineitem/parquetfile=003
  -> Generating lineitem part 4/200 to /lakehouse/default/Tables/TPCH1000/lineitem/parquetfile=004
  -> Generating lineitem part 5/200 to /lakehouse/default/Tables/TPCH1000/lineitem/parquetfile=005
  -> Generating lineitem part 6/

**<mark>Convert to Delta</mark>**

In [5]:
%%time
from   deltalake   import convert_to_deltalake
import pyarrow as pa 
# The generator cell writes each part into a hive-style "parquetfile=NNN"
# folder, so every table is registered with a single string partition column.
partition_schema = pa.schema([pa.field("parquetfile", pa.string())])

# Convert each generated table folder in place into a Delta table.
for tbl in ['region', 'nation', 'lineitem', 'orders', 'partsupp', 'part', 'customer', 'supplier']:
    convert_to_deltalake(
        BASE_OUTPUT_DIR + '/' + tbl,
        # "ignore" leaves an already converted table untouched instead of
        # raising, so this cell can be re-run like the generation cell.
        mode="ignore",
        partition_by=partition_schema,
        partition_strategy='hive',
        storage_options={"allow_unsafe_rename": "true"},
    )
    # Printed for every table, whether it was converted now or already existed.
    print(tbl)

rr /synfs/lakehouse/default/Tables/TPCH1000/region/_delta_log/_commit_8943d88f-4a76-4f82-8c62-d1fc629bb13e.json.tmp -> /synfs/lakehouse/default/Tables/TPCH1000/region/_delta_log/00000000000000000000.json
region
rr /synfs/lakehouse/default/Tables/TPCH1000/nation/_delta_log/_commit_a24b3bdc-ee8f-4f4e-b3a2-eb47a0a2399b.json.tmp -> /synfs/lakehouse/default/Tables/TPCH1000/nation/_delta_log/00000000000000000000.json
nation
rr /synfs/lakehouse/default/Tables/TPCH1000/lineitem/_delta_log/_commit_5d1c0f2f-327e-4584-a070-63e577c5dc81.json.tmp -> /synfs/lakehouse/default/Tables/TPCH1000/lineitem/_delta_log/00000000000000000000.json
lineitem
rr /synfs/lakehouse/default/Tables/TPCH1000/orders/_delta_log/_commit_79cc8782-73f5-4ae3-b3ac-2fd55bde3a6a.json.tmp -> /synfs/lakehouse/default/Tables/TPCH1000/orders/_delta_log/00000000000000000000.json
orders
rr /synfs/lakehouse/default/Tables/TPCH1000/partsupp/_delta_log/_commit_4e1ca16f-2efb-4388-ae25-616908de2863.json.tmp -> /synfs/lakehouse/default/Tabl