#DOWNLOAD FUNCTIONS

# In [0]:
import os, re, requests, xml.etree.ElementTree as ET
from typing import Optional, Tuple

# File names accepted for ingestion: a prefix of exactly 4 or 6 digits followed
# by "-citibike-tripdata.zip". Group 1 captures the digit prefix. The pattern
# carries no anchors of its own; download_data applies it with fullmatch() and
# reads the first four characters of the name as the year for year_filter.
ZIP_RE = re.compile(r"(\d{4}(?:\d{2})?)-citibike-tripdata\.zip")

def _session() -> requests.Session:
    """Build a fresh ``requests.Session`` tagged with this job's User-Agent.

    Returns:
        A new session whose default headers carry
        ``User-Agent: citibike-ingestion/1.0``.
    """
    session = requests.Session()
    session.headers["User-Agent"] = "citibike-ingestion/1.0"
    return session

def _download_stream(sess: requests.Session, src_url: str, dest_path: str, chunk=8*1024*1024):
    """Stream ``src_url`` to ``dest_path`` chunk by chunk.

    The response body is written to ``dest_path + ".part"`` first and renamed
    onto ``dest_path`` only after the whole body has been consumed. A failed
    or interrupted transfer therefore never leaves a truncated file under the
    final name, and never clobbers a file that was already there.

    Args:
        sess: Session used to issue the GET request.
        src_url: URL to download.
        dest_path: Local filesystem path the finished file should have.
        chunk: Chunk size in bytes handed to ``iter_content`` (default 8 MiB).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
        OSError: If the local file cannot be written or renamed.
        Any other exception raised by the session while reading the body is
        propagated unchanged after the partial file has been removed.
    """
    tmp_path = dest_path + ".part"
    try:
        with sess.get(src_url, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            with open(tmp_path, "wb") as f:
                for part in resp.iter_content(chunk_size=chunk):
                    # Skip empty chunks so nothing zero-length is written.
                    if part:
                        f.write(part)
        # Same-directory rename: the final name appears only for a complete file.
        os.replace(tmp_path, dest_path)
    except BaseException:
        # Best-effort removal of the partial file; the original error is what
        # the caller needs to see, so a cleanup failure must not replace it.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

def _list_bucket_keys(sess, base_url):
    """Return the text of every ``<Key>`` element of the bucket listing, all pages.

    Issues ``GET {base_url}/?list-type=2`` and keeps requesting further pages
    with ``continuation-token`` while the response reports
    ``<IsTruncated>true</IsTruncated>`` together with a
    ``<NextContinuationToken>``.

    Args:
        sess: Session-like object used for the GET requests (the caller passes
            the session created by ``_session()``).
        base_url: Bucket URL without a trailing slash.

    Returns:
        List of key strings in listing order (an entry is ``None`` if a
        ``<Key>`` element is empty).
    """
    keys = []
    token = None
    while True:
        params = {"list-type": "2"}
        if token:
            params["continuation-token"] = token
        resp = sess.get(f"{base_url}/", params=params, timeout=20)
        resp.raise_for_status()
        root = ET.fromstring(resp.text)
        # "{*}" matches the element regardless of its XML namespace.
        keys.extend(e.text for e in root.iterfind(".//{*}Key"))
        truncated = (root.findtext(".//{*}IsTruncated") or "").strip().lower() == "true"
        token = root.findtext(".//{*}NextContinuationToken")
        # Stop on the last page, or if a truncated page carries no token
        # (which would otherwise loop forever on the same page).
        if not (truncated and token):
            return keys


def download_data(
    s3_url: str,
    adls_dir_url: str,
    dbutils,
    local_dir: str,
    year_filter: Optional[Tuple[int, int]] = None,
    limit: Optional[int] = None
):
    """Copy matching trip-data archives from an HTTP bucket listing into ADLS.

    Lists the bucket at ``s3_url``, keeps the keys whose base name fully
    matches ``ZIP_RE``, and for every such file not already present in
    ``adls_dir_url`` downloads it to ``local_dir`` and moves it to ADLS with
    ``dbutils.fs.mv``. Progress is reported with ``print``.

    Args:
        s3_url: Base URL of the bucket (a trailing slash is tolerated).
        adls_dir_url: Destination directory URL (a trailing slash is tolerated).
        dbutils: Object exposing ``fs.mkdirs``, ``fs.ls`` and ``fs.mv``
            (presumably the Databricks ``dbutils`` handle).
        local_dir: Local staging directory; created if missing.
        year_filter: Optional inclusive ``(first_year, last_year)`` window
            applied to the first four digits of the file name.
        limit: Optional cap on the number of successful downloads; a falsy
            value (``None`` or ``0``) means no cap.

    Returns:
        ``{"downloaded": int, "skipped": int, "failed": int}``. Files rejected
        by the name pattern or the year window are not counted.

    Raises:
        requests.HTTPError: If the bucket listing request fails.
        xml.etree.ElementTree.ParseError: If the listing is not valid XML.
        Per-file download/move errors are caught, printed and counted as
        ``failed`` rather than raised.
    """
    # Function-scope import: quote() is only needed here and the file's
    # top-level import line does not provide it.
    from urllib.parse import quote

    base_url = s3_url.rstrip("/")
    adls_root = adls_dir_url.rstrip("/")  # avoid "//" when joining file names

    # local dir
    os.makedirs(local_dir, exist_ok=True)
    # cloud dir
    dbutils.fs.mkdirs(adls_dir_url)

    # existing ADLS filenames
    try:
        existing = {fi.name.rstrip("/") for fi in dbutils.fs.ls(adls_dir_url)}
    except Exception:
        # Listing failed: (re)create the directory and treat it as empty.
        dbutils.fs.mkdirs(adls_dir_url)
        existing = set()

    downloaded = skipped = failed = 0

    # The context manager closes the session's pooled connections on exit.
    with _session() as sess:
        # list all files in the bucket (XML, <Key>), following pagination
        keys = _list_bucket_keys(sess, base_url)

        # loop through all files and download if not already in ADLS
        for i, key in enumerate(keys, start=1):
            fname = os.path.basename(key or "")
            match = ZIP_RE.fullmatch(fname)
            if match is None:
                continue

            # filter by year (optional)
            if year_filter:
                # Group 1 is all digits, so the int() conversion cannot fail.
                yr = int(match.group(1)[:4])
                y1, y2 = year_filter
                if not (y1 <= yr <= y2):
                    continue

            # if already exists in ADLS, skip
            if fname in existing:
                print(f"[{i}] SKIP (ADLS): {fname}")
                skipped += 1
                continue

            # Use the full key, not just the base name, so objects stored
            # under a prefix are fetched from their real location.
            file_url = f"{base_url}/{quote(key)}"
            local_path = os.path.join(local_dir, fname)
            adls_path = f"{adls_root}/{fname}"

            # download single file
            print(f"[{i}] Downloading: {fname}")
            try:
                _download_stream(sess, file_url, local_path)
                # move to ADLS ("file:" marks the source as a local path)
                dbutils.fs.mv("file:" + local_path, adls_path, True)
                print(f"[{i}] MOVED → ADLS: {fname}")
                existing.add(fname)
                downloaded += 1
            except Exception as e:
                print(f"[{i}] FAILED {fname}: {e}")
                failed += 1
                # Drop whatever was staged locally; a missing file is fine,
                # any other cleanup problem is reported instead of hidden.
                try:
                    os.remove(local_path)
                except FileNotFoundError:
                    pass
                except OSError as cleanup_err:
                    print(f"[{i}] WARN could not remove {local_path}: {cleanup_err}")

            if limit and downloaded >= limit:
                break

    return {"downloaded": downloaded, "skipped": skipped, "failed": failed}
    

#EXECUTION

# In [0]:
# Run configuration: source bucket URL, ADLS landing directory, and the local
# staging directory files are downloaded to before being moved to ADLS.
S3_BUCKET = "https://s3.amazonaws.com/tripdata"
ADLS_DOWNLOADS = "abfss://citibike@databricksjm.dfs.core.windows.net/downloads"
LOCAL_TMP = "/local_disk0/tmp/citibike"

# `dbutils` is not defined in this file; presumably it is the global that the
# Databricks notebook runtime injects. No year window and no download cap.
stats = download_data(
    S3_BUCKET,
    ADLS_DOWNLOADS,
    dbutils,
    LOCAL_TMP,
    year_filter=None,
    limit=None,
)

# Summarise the counters returned by download_data.
print("Ingestion finished.")
print(f"Downloaded: {stats['downloaded']} | Skipped: {stats['skipped']} | Failed: {stats['failed']}")