<mark>Download Parquet</mark>

In [1]:
# Lakehouse locations: destination of the Delta table and folder holding the raw Parquet files.
Table_Path = '/lakehouse/default/Tables/ny/taxi/'
Parquet_Path = '/lakehouse/default/Files/ny/taxi/'

# Date range passed to the download step and interpolated into the load query's row filter.
start = "2011-01-01"
end = "2024-06-30"

StatementMeta(, b89f67fd-cea3-413c-826b-3c1067f78607, 3, Finished, Available, Finished)

In [2]:
import pandas as pd
import random
import time
import os
import glob
import requests
from tqdm import tqdm
import sys

def _fetch_to_file(session, url, file_path, timeout=(10, 120), chunk_size=1024 * 1024):
    """Stream ``url`` into ``file_path``; return True on success, False otherwise.

    The body is written to ``file_path + ".part"`` and only renamed to its
    final name once the transfer completed, so an interrupted download never
    leaves a truncated ``*.parquet`` file behind.
    """
    partial_path = file_path + ".part"
    try:
        with session.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                return False
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, file_path)
        return True
    except (requests.RequestException, OSError):
        # Best-effort cleanup: the caller only needs to know the download failed.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        return False


def download_NYT(start, end, output_dir):
    """Download the monthly yellow-taxi Parquet files missing from ``output_dir``.

    One file name is built per month end that falls within ``[start, end]``.
    Files whose name already exists in ``output_dir`` are skipped; the rest
    are fetched in random order.

    Args:
        start: First date of the range (anything ``pd.date_range`` accepts).
        end: Last date of the range (anything ``pd.date_range`` accepts).
        output_dir: Directory receiving the files; created when absent.

    Returns:
        List of URLs that could not be downloaded (empty when all succeeded
        or nothing was missing). A non-200 response, a network error or a
        write error all count as a failed download instead of raising.
    """
    base_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/'
    # An explicit MonthEnd offset yields the same month-end dates as the 'M'
    # alias without depending on which alias spelling the pandas version accepts.
    months = pd.date_range(start, end, freq=pd.offsets.MonthEnd()).strftime("%Y-%m")
    wanted = {f'yellow_tripdata_{month}.parquet' for month in months}
    present = {os.path.basename(path)
               for path in glob.glob(os.path.join(output_dir, '*.parquet'))}

    urls = [base_url + name for name in wanted - present]
    random.shuffle(urls)
    os.makedirs(output_dir, exist_ok=True)

    failed_downloads = []
    if not urls:
        return failed_downloads

    # The session is closed on exit so pooled connections are released.
    with requests.Session() as session:
        for url in tqdm(urls, desc="Downloading Parquet files"):
            file_name = url.split("/")[-1]
            file_path = os.path.join(output_dir, file_name)
            if _fetch_to_file(session, url, file_path):
                print(f"Downloaded {file_name} to {file_path}")
            else:
                failed_downloads.append(url)

    return failed_downloads


# Fetch whatever monthly files are still missing; when some could not be
# retrieved, report the count and call notebookutils.session.stop().
failed_urls = download_NYT(start, end, Parquet_Path)
if failed_urls:
    print(f"{len(failed_urls)} files failed to download. please run the notebook again ...")
    notebookutils.session.stop()
else:
    print("All files downloaded successfully.")


StatementMeta(, b89f67fd-cea3-413c-826b-3c1067f78607, 4, Finished, Available, Finished)

Downloading Parquet files: 100%|██████████| 34/34 [03:52<00:00,  6.84s/it]


Downloaded yellow_tripdata_2015-03.parquet to /lakehouse/default/Files/ny/taxi/yellow_tripdata_2015-03.parquet
Downloaded yellow_tripdata_2011-02.parquet to /lakehouse/default/Files/ny/taxi/yellow_tripdata_2011-02.parquet
Downloaded yellow_tripdata_2024-01.parquet to /lakehouse/default/Files/ny/taxi/yellow_tripdata_2024-01.parquet
Downloaded yellow_tripdata_2022-10.parquet to /lakehouse/default/Files/ny/taxi/yellow_tripdata_2022-10.parquet
Downloaded yellow_tripdata_2020-10.parquet to /lakehouse/default/Files/ny/taxi/yellow_tripdata_2020-10.parquet
Downloaded yellow_tripdata_2024-03.parquet to /lakehouse/default/Files/ny/taxi/yellow_tripdata_2024-03.parquet
Downloaded yellow_tripdata_2019-01.parquet to /lakehouse/default/Files/ny/taxi/yellow_tripdata_2019-01.parquet
Downloaded yellow_tripdata_2013-02.parquet to /lakehouse/default/Files/ny/taxi/yellow_tripdata_2013-02.parquet
Downloaded yellow_tripdata_2011-04.parquet to /lakehouse/default/Files/ny/taxi/yellow_tripdata_2011-04.parquet
D

**<mark>Create Delta Table</mark>**

In [3]:
import time
from psutil import *
# Build a one-line description of the machine (CPU count, total memory in GiB
# rounded to a whole number) plus today's date, and print it.
vCPU = f"{cpu_count()} vCPU"
mem = round(virtual_memory().total / 1024 ** 3, 0)
runtime = f"{vCPU} {mem}GB Run date: {time.strftime('%Y-%m-%d')}"
print(runtime)

StatementMeta(, b89f67fd-cea3-413c-826b-3c1067f78607, 5, Finished, Available, Finished)

8 vCPU 63.0GB Run date: 2024-08-30


In [4]:
### Maximum number of Parquet files read and appended to the Delta table in one batch.
### Adjust this number based on your available RAM.
max_batch     = 162

StatementMeta(, b89f67fd-cea3-413c-826b-3c1067f78607, 6, Finished, Available, Finished)

In [5]:
# Install the packages imported by the following cell.
# duckdb is upgraded to whatever release is current; deltalake is pinned to 0.17.4.
# NOTE(review): duckdb is not pinned, so the installed version can differ between runs — consider pinning it.
!pip install duckdb --upgrade
!pip install deltalake==0.17.4

StatementMeta(, b89f67fd-cea3-413c-826b-3c1067f78607, 7, Finished, Available, Finished)

Collecting duckdb
  Downloading duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m119.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.0.0
Collecting deltalake==0.17.4
  Downloading deltalake-0.17.4-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pyarrow-hotfix (from deltalake==0.17.4)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, deltalake
Successfully installed deltalake-0.17.4 pyarrow-hotfix-0.6


In [6]:
import duckdb,glob,os,math
from deltalake.writer import write_deltalake, try_get_deltatable
# Options handed to the deltalake calls below.
# NOTE(review): presumably needed because the lakehouse mount lacks atomic rename — confirm.
storage_options = dict(allow_unsafe_rename="true")

# Row-group size, used as both the minimum and the maximum when writing the table.
RG = 8_000_000

# Point DuckDB's temp directory at the lakehouse and turn off identifier-case preservation.
duckdb.sql(
    """ set temp_directory = '/lakehouse/default/Files/temp' ; SET preserve_identifier_case = false  """
)
def generate_delta():
    """Append one batch of not-yet-loaded Parquet files to the Delta table.

    The names already loaded are read from the table's ``file`` column (empty
    when the table does not exist yet). Of the remaining ``*.parquet`` files
    under ``Parquet_Path``, at most ``max_batch`` are taken in reverse sorted
    order, cast to a uniform schema and appended to ``Table_Path``.

    Rows are kept only when ``fare_amount`` and ``passenger_count`` are
    positive, the pickup hour lies between ``start`` and ``end``, and the
    pickup month matches the last 7 characters of the source file name.
    Does nothing beyond printing an empty list when no file is pending.
    """
    table = try_get_deltatable(Table_Path, storage_options=storage_options)
    if table is None:
        loaded_files = []
    else:
        loaded_files = (
            duckdb.sql(f""" select distinct file as file from delta_scan('{Table_Path}') """)
            .df()['file']
            .tolist()
        )

    on_disk = [os.path.basename(path) for path in glob.glob(Parquet_Path + '*.parquet')]
    pending = list(set(on_disk) - set(loaded_files))
    files_to_download_Path = sorted((Parquet_Path + name for name in pending), reverse=True)[:max_batch]
    print(files_to_download_Path)
    if len(files_to_download_Path) == 0:
        return

    # NOTE(review): `hour <= '{end}'` compares a timestamp with a bare date string;
    # if that string is cast to midnight, pickups later on the `end` day are dropped — confirm intent.
    batches = duckdb.sql(f""" 
        select
        *exclude(filename,Airport_fee,congestion_surcharge,tpep_pickup_datetime,tpep_dropoff_datetime,VendorID,passenger_count,PULocationID,DOLocationID,payment_type,RateCodeID),
        cast(Airport_fee as double)                                             as airport_fee,
        cast(congestion_surcharge as double)                                    as congestion_surcharge,
        cast(VendorID as integer)                                               as VendorID,
        cast(passenger_count as integer)                                        as passenger_count ,
        cast (PULocationID   as integer)                                        as PULocationID ,
        cast(DOLocationID    as integer)                                        as DOLocationID,
        cast(payment_type    as integer)                                        as payment_type,
        cast(RateCodeID      as integer)                                        as RateCodeID,
        cast (tpep_pickup_datetime as TIMESTAMPTZ )                             as tpep_pickup_datetime,
        cast (tpep_dropoff_datetime as TIMESTAMPTZ )                            as tpep_dropoff_datetime,
        cast(date_trunc('hour',tpep_pickup_datetime) as TIMESTAMPTZ)            as hour ,
        cast(date_trunc('day',tpep_pickup_datetime) as TIMESTAMPTZ)             as date ,
        parse_filename(filename)                                                as file,
        isoyear (cast (tpep_pickup_datetime as timestamp))                      as year
        from read_parquet ({files_to_download_Path},filename = True) 
        where hour >='{start}' and hour <='{end}' and fare_amount >0 and passenger_count > 0 
        and strftime(tpep_pickup_datetime,'%Y-%m') = right(parse_filename(filename,True),7)

                  """).record_batch()
    write_deltalake(
        Table_Path,
        batches,
        mode="append",
        max_rows_per_group=RG,
        min_rows_per_group=RG,
        storage_options=storage_options,
    )
# Load the table only when every download succeeded; otherwise ask for a re-run.
if failed_urls:
    print(f"{len(failed_urls)} files failed to download. please run the notebook again to download the remaining files...")
else:
    # One generate_delta() call per group of at most max_batch Parquet files on disk.
    batch_count = math.ceil(len(glob.glob(Parquet_Path + '*.parquet')) / max_batch)
    for _ in range(batch_count):
        generate_delta()

StatementMeta(, b89f67fd-cea3-413c-826b-3c1067f78607, 8, Finished, Available, Finished)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[]


StatementMeta(, b89f67fd-cea3-413c-826b-3c1067f78607, 9, Finished, Available, Finished)