**<mark>Generate Data</mark>**

In [None]:
!pip install -q tpchgen-cli

In [None]:
# TPC-H scale factor; also used to size the number of output parts per table.
sf = 1000
# Root directory for the generated tables; the scale factor is appended to the name.
BASE_OUTPUT_DIR = "/lakehouse/default/Tables/TPCH{}".format(sf)

In [None]:
%%time
import os
lineitem_parts = max(2, int(200 * (sf / 1000)))
orders_parts   = max(2, int(140 * (sf / 1000)))
partsupp_parts = max(1, int(80 * (sf / 1000)))
part_parts     = max(1, int(12 * (sf / 1000)))
customer_parts = max(1, int(20 * (sf / 1000)))

shell_script = f"""
set -e

SCALE_FACTOR={sf}
BASE_OUTPUT_DIR="{BASE_OUTPUT_DIR}"
FORMAT="parquet"

declare -A TABLES_PARTS=(
    ["lineitem"]={lineitem_parts}
    ["orders"]={orders_parts}
    ["partsupp"]={partsupp_parts}
    ["part"]={part_parts}
    ["customer"]={customer_parts}
    ["nation"]=1
    ["region"]=1
    ["supplier"]=1
)

echo "--- Starting TPC-H Data Generation ---"
echo "Scale Factor: ${{SCALE_FACTOR}}"
echo "Base Output Directory: ${{BASE_OUTPUT_DIR}}"
echo "Output Format: ${{FORMAT}}"
echo ""

for table_name in "${{!TABLES_PARTS[@]}}"; do
    num_parts_to_generate="${{TABLES_PARTS[${{table_name}}]}}"
    table_output_dir="${{BASE_OUTPUT_DIR}}/${{table_name}}"

    echo "--- Generating data for table: ${{table_name}} (${{num_parts_to_generate}} parts) ---"
    mkdir -p "${{table_output_dir}}"

    for ((i=1; i<=num_parts_to_generate; i++)); do
        PART_NUM=$(printf "%03d" $i)
        OUTPUT_PART_DIR="${{table_output_dir}}/parquetfile=${{PART_NUM}}"

        echo "  -> Generating ${{table_name}} part ${{i}}/${{num_parts_to_generate}} to ${{OUTPUT_PART_DIR}}"

        tpchgen-cli \\
            -s "${{SCALE_FACTOR}}" \\
            --tables "${{table_name}}" \\
            --output-dir "${{OUTPUT_PART_DIR}}" \\
            --parts "${{num_parts_to_generate}}" \\
            --part "${{i}}" \\
            --format="${{FORMAT}}"
    done
    echo "--- Finished generating data for table: ${{table_name}} ---"
    echo ""
done

echo "--- TPC-H Data Generation Complete! ---"
"""

if not os.path.isdir(BASE_OUTPUT_DIR):
    print(f"Directory {BASE_OUTPUT_DIR} does not exist. Generating data...")
    get_ipython().system(shell_script)
else:
    print(f"Directory {BASE_OUTPUT_DIR} already exists. Skipping data generation.")


**<mark>Convert to Delta</mark>**

In [None]:
%%time
from   deltalake   import convert_to_deltalake
import pyarrow as pa 
partition_schema = pa.schema([ pa.field("parquetfile", pa.string()) ])
for tbl in ['region','nation','lineitem','orders','partsupp','part','customer','supplier']: 
 convert_to_deltalake(BASE_OUTPUT_DIR +'/'+ tbl,partition_by= partition_schema,partition_strategy='hive',storage_options={"allow_unsafe_rename":"true"} )
 print(tbl)