In [0]:
%pip install s3fs --quiet

In [0]:
# Notebook parameter: the export day to process. The value is interpolated
# into S3 prefixes and into a local path that is deleted recursively, so it is
# validated before any other cell uses it.
import re

default_date = "01-01-2023"
dbutils.widgets.text("day", default_date, "day")
day_str = dbutils.widgets.get("day").strip()

# Accept only the NN-NN-NNNN shape of the default value. This rejects empty
# strings, "..", path separators and shell metacharacters.
if not re.fullmatch(r"[0-9]{2}-[0-9]{2}-[0-9]{4}", day_str):
    raise ValueError(f"Widget 'day' must look like {default_date!r}, got {day_str!r}")

In [0]:
import s3fs
import shutil
import os
import subprocess
from time import strftime

from datetime import date, timedelta
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
import json
import s3fs

##### Download ClickHouse, an open-source real-time data warehouse platform; we will use it to convert the files

In [0]:
%sh
# Install the ClickHouse binary into /home/ubuntu/ using the official installer script.
# Abort on the first failing command instead of silently continuing.
set -e
cd /home/ubuntu/
# Skip the download when the binary is already there, so re-running the cell is cheap.
if [ ! -x ./clickhouse ]; then
    # Save the installer first: -f makes an HTTP error fail the cell rather than
    # feeding an empty or error page to sh.
    curl -fsSL https://clickhouse.com/ -o install_clickhouse.sh
    sh install_clickhouse.sh
    rm -f install_clickhouse.sh
fi

In [0]:
%sh
# Show the contents of /home/ubuntu/ to confirm the clickhouse binary is present.
cd /home/ubuntu/; ls .

clickhouse
databricks


In [0]:
def remove_dir(directory):
    """Recursively delete ``directory`` on a best-effort basis.

    A directory that does not exist is ignored silently. Any other ``OSError``
    raised by the removal is printed as ``Error: ...`` and not re-raised.
    Always returns ``None``.
    """
    try:
        shutil.rmtree(directory)
    except OSError as exc:
        # FileNotFoundError is an OSError subclass; it is the one case that stays quiet.
        if not isinstance(exc, FileNotFoundError):
            print(f"Error: {exc}")

In [0]:
# Delete if day folder exists in Linux folder and recreate
remove_dir(f'/home/ubuntu/created_folder/{day_str}/')
os.makedirs(f'/home/ubuntu/created_folder/{day_str}/', exist_ok=True)

##### Option 1 - Download the JSON files from S3 to a local Linux folder, convert them to Parquet with ClickHouse, and upload the result to S3

In [0]:
# Get the JSON files list
fs = s3fs.S3FileSystem()
files = fs.ls(f"s3a://consents-export/{day_str}")
files[:2]

['consents-export/01-01-2024/consents-01-01-2024 00:00:06 UTC.json',
 'consents-export/01-01-2024/consents-01-01-2024 00:15:07 UTC.json']

In [None]:
# Option 1: for every listed object, download it, convert it to Parquet with the
# local ClickHouse binary, upload the Parquet file and delete both local copies.
local_folder = f"/home/ubuntu/created_folder/{day_str}"
os.makedirs(local_folder, exist_ok=True)

# Same timezone settings as before, passed through the process environment
# instead of a shell prefix.
clickhouse_env = {**os.environ, "TZ": "Europe/Berlin", "TZDIR": "/usr/share/zoneinfo"}

for file in files:
    # Last path component, so keys nested deeper than bucket/day/name still resolve.
    file_name = file.split('/')[-1]
    json_file = f"{local_folder}/{file_name}"
    parquet_file = f"{local_folder}/{file_name}.parquet"

    try:
        print(f"downloading {file_name} from S3")
        fs.download(file, json_file)
        print(f"finished downloading {file_name} from S3")

        print(f"converting {file_name} to parquet")
        # Argument list, no shell: object names contain spaces and could contain
        # characters a shell would interpret.
        subprocess.run(
            [
                "/home/ubuntu/clickhouse",
                "--input_format_allow_errors_num", "10000",
                "-q", f"SELECT * FROM file('{json_file}') INTO OUTFILE '{parquet_file}'",
            ],
            env=clickhouse_env,
            check=True,
        )

        print(f"uploading {file_name} to s3")
        fs.upload(parquet_file, f"s3://tracking-analytics/consents-export-parquet/{day_str}/{file_name}.parquet")
    finally:
        # Remove local copies even when a step fails, so failed runs leave nothing behind.
        for leftover in (parquet_file, json_file):
            if os.path.exists(leftover):
                os.remove(leftover)
    print(f"removed {file_name} from local folder")

##### Option 2 - Read the JSON files directly from S3, convert them to Parquet with ClickHouse, and upload the result to S3

In [0]:
from urllib.parse import quote

In [0]:
# Option 2: let ClickHouse read each JSON object straight from S3, write the
# Parquet output into the local working folder, upload it and delete the local copy.

# Bound here rather than relying on the Option 1 cell having been run first.
local_folder = f"/home/ubuntu/created_folder/{day_str}"
os.makedirs(local_folder, exist_ok=True)

# Same timezone settings as before, passed through the process environment
# instead of a shell prefix.
clickhouse_env = {**os.environ, "TZ": "Europe/Berlin", "TZDIR": "/usr/share/zoneinfo"}

for file in files:
    file_name = file.split('/')[-1]
    # Percent-encode the name for the URL; object names contain spaces and colons.
    encoded_file_name = quote(file_name)
    parquet_file = f"{local_folder}/{file_name}.parquet"

    try:
        print(f"converting {file_name} to parquet")
        # The query emits Parquet on stdout; send it to the file without a shell redirect.
        with open(parquet_file, "wb") as parquet_out:
            subprocess.run(
                [
                    "/home/ubuntu/clickhouse",
                    "-q", f"SELECT * FROM s3('s3://consents-export/{day_str}/{encoded_file_name}') FORMAT Parquet",
                ],
                env=clickhouse_env,
                stdout=parquet_out,
                check=True,
            )

        print(f"uploading {file_name} to s3")
        fs.upload(parquet_file, f"s3://tracking-analytics/consents-export-parquet/{day_str}/{file_name}.parquet")
    finally:
        # Remove the local copy even when conversion or upload fails.
        if os.path.exists(parquet_file):
            os.remove(parquet_file)
    print(f"removed {file_name} from local folder")