In [1]:
import glob
import sqlite3

In [2]:
# Open the SQLite database at /data/tables/processing.db.
# `con` is the connection; `cur` is the single cursor the query cells in this notebook run on.
con = sqlite3.connect("/data/tables/processing.db")
cur = con.cursor()

In [3]:
# Rows of `downloaded` whose last_processed column is still NULL.
# Cursor.execute() returns the cursor itself, so the fetch is chained onto it
# and the resulting list is displayed as the cell's value.
cur.execute("""
SELECT product_id, last_processed FROM downloaded
WHERE last_processed IS NULL
""").fetchall()

[('98100297', None), ('98100103', None)]

In [4]:
# Total number of rows in `downloaded`; the one-row result is shown as the cell's value.
cur.execute("""
SELECT count(*) FROM downloaded
""").fetchall()

[(7889,)]

In [5]:
# Collect every distinct product_id recorded in `downloaded`.
cur.execute("""
SELECT DISTINCT product_id FROM downloaded
""")

# Each result row is a one-column tuple; unpack it while iterating the cursor directly.
product_ids_processed = [processed_id for (processed_id,) in cur]
print(len(product_ids_processed))

7889


# This is the entire universe of productIds according to Statistics Canada

In [6]:
# Count the distinct product_id values listed in `cubes`.
cur.execute("""
SELECT DISTINCT product_id FROM cubes
""")

# Tally the rows straight off the cursor instead of materialising them first.
print(sum(1 for _ in cur))

7917


# The remaining productIds that I need to download

In [8]:
# Product ids that appear in `cubes` but have no matching row in `downloaded`.
cur.execute("""
SELECT DISTINCT product_id FROM cubes
WHERE product_id NOT IN (SELECT product_id FROM downloaded)
""")

# Each result row is a one-column tuple; keep just the id.
to_download = [missing_id for (missing_id,) in cur]

In [14]:
from datetime import datetime
import glob
from multiprocessing import Pool
import json
import os
import sqlite3
import zipfile
from zoneinfo import ZoneInfo

import pandas as pd
import requests
from tqdm import tqdm

# Root of the on-disk working area; the three stage folders below are derived from it.
data_folder = "/data/tables"
input_folder, scratch_folder, output_folder = (
    f"{data_folder}/input",
    f"{data_folder}/scratch",
    f"{data_folder}/output",
)


def download_cube(product_id, language="en", timeout=60):
    """
    Downloads the full-table CSV zip archive for a specific table.

    The Statistics Canada WDS endpoint ``getFullTableDownloadCSV`` is asked for
    the archive URL of ``product_id`` in ``language``; that archive is then
    streamed to ``{input_folder}/{language}/june_25_2025/{product_id}.zip``
    while a tqdm progress bar reports the bytes written.

    Parameters
    ----------
    product_id : str or int
        Table identifier; interpolated into the request URL and the file name.
    language : str, default "en"
        Language segment used in both the WDS request and the output path.
    timeout : float or tuple, default 60
        Forwarded to ``requests.get`` for both requests. It bounds each
        connect/read wait, not the total download time.

    Raises
    ------
    requests.HTTPError
        If either HTTP request returns an error status code.
    RuntimeError
        If the WDS JSON payload does not report ``"status": "SUCCESS"``.
    """
    # Honour the requested language instead of always asking for the English table.
    download_url = (
        "https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/"
        f"{product_id}/{language}"
    )
    lookup = requests.get(download_url, timeout=timeout)
    lookup.raise_for_status()
    payload = lookup.json()
    if payload.get("status") != "SUCCESS":
        raise RuntimeError(
            f"WDS lookup failed for product {product_id}: {payload!r}"
        )
    zip_url = payload["object"]

    zip_file_name = f"{input_folder}/{language}/june_25_2025/{product_id}.zip"
    os.makedirs(os.path.dirname(zip_file_name), exist_ok=True)
    # Stream into a side file and rename at the end, so an interrupted download
    # never leaves a truncated archive under the final *.zip name.
    partial_file_name = f"{zip_file_name}.part"
    print(f"Downloading {zip_url} to {zip_file_name}")

    # A header value of None tells requests to omit its default User-Agent.
    with requests.get(
        zip_url, stream=True, headers={"user-agent": None}, timeout=timeout
    ) as response:
        # Without this check an HTTP error page would be written to disk as if
        # it were the archive.
        response.raise_for_status()
        with open(partial_file_name, "wb") as handle:
            with tqdm(
                desc=zip_file_name,
                total=int(response.headers.get("content-length", 0)),
                unit="B",
                unit_scale=True
            ) as progress_bar:
                # 1 MiB chunks: archives can be several GB, and tiny chunks
                # spend most of the time on per-chunk overhead.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:  # filter out keep-alive new chunks
                        handle.write(chunk)
                        progress_bar.update(len(chunk))

    os.replace(partial_file_name, zip_file_name)

In [15]:
# Download every outstanding table. A failure for one product id is recorded
# and reported instead of aborting the rest of a batch that can run for hours.
failed_downloads = {}
for product_id in to_download:
    try:
        download_cube(product_id)
    except (requests.RequestException, KeyError, ValueError, OSError, RuntimeError) as error:
        failed_downloads[product_id] = error
        print(f"Failed to download {product_id}: {error!r}")

if failed_downloads:
    print(
        f"{len(failed_downloads)} of {len(to_download)} downloads failed: "
        f"{list(failed_downloads)}"
    )

Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100147-eng.zip to /data/tables/input/en/june_25_2025/12100147.zip


/data/tables/input/en/june_25_2025/12100147.zip: 100%|█████████████████| 312M/312M [07:33<00:00, 688kB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100148-eng.zip to /data/tables/input/en/june_25_2025/12100148.zip


/data/tables/input/en/june_25_2025/12100148.zip: 10


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100149-eng.zip to /data/tables/input/en/june_25_2025/12100149.zip


/data/tables/input/en/june_25_2025/12100149.zip: 100%|██████████████| 1.42G/1.42G [14:39<00:00, 1.62MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100150-eng.zip to /data/tables/input/en/june_25_2025/12100150.zip


/data/tables/input/en/june_25_2025/12100150.zip: 100%|████████████████| 317M/317M [04:48<00:00, 1.10MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100151-eng.zip to /data/tables/input/en/june_25_2025/12100151.zip


/data/tables/input/en/june_25_2025/12100151.zip: 100%|██████████████| 2.13G/2.13G [24:42<00:00, 1.43MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100152-eng.zip to /data/tables/input/en/june_25_2025/12100152.zip


/data/tables/input/en/june_25_2025/12100152.zip: 100%|██████████████| 6.94G/6.94G [57:20<00:00, 2.02MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100153-eng.zip to /data/tables/input/en/june_25_2025/12100153.zip


/data/tables/input/en/june_25_2025/12100153.zip: 3.48kB [00:00, 17.1MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100154-eng.zip to /data/tables/input/en/june_25_2025/12100154.zip


/data/tables/input/en/june_25_2025/12100154.zip: 3.48kB [00:00, 19.5MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100155-eng.zip to /data/tables/input/en/june_25_2025/12100155.zip


/data/tables/input/en/june_25_2025/12100155.zip: 3.48kB [00:00, 16.2MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/12100156-eng.zip to /data/tables/input/en/june_25_2025/12100156.zip


/data/tables/input/en/june_25_2025/12100156.zip: 3.48kB [00:00, 16.3MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100442-eng.zip to /data/tables/input/en/june_25_2025/13100442.zip


/data/tables/input/en/june_25_2025/13100442.zip: 100%|██████████████| 1.02M/1.02M [00:00<00:00, 1.15MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/13100958-eng.zip to /data/tables/input/en/june_25_2025/13100958.zip


/data/tables/input/en/june_25_2025/13100958.zip: 100%|██████████████| 11.8k/11.8k [00:00<00:00, 52.3MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/33100852-eng.zip to /data/tables/input/en/june_25_2025/33100852.zip


/data/tables/input/en/june_25_2025/33100852.zip: 100%|██████████████| 5.26k/5.26k [00:00<00:00, 26.0MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100293-eng.zip to /data/tables/input/en/june_25_2025/34100293.zip


/data/tables/input/en/june_25_2025/34100293.zip: 100%|██████████████| 49.3M/49.3M [00:47<00:00, 1.04MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100294-eng.zip to /data/tables/input/en/june_25_2025/34100294.zip


/data/tables/input/en/june_25_2025/34100294.zip: 100%|███████████████| 50.6k/50.6k [00:00<00:00, 102MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/34100295-eng.zip to /data/tables/input/en/june_25_2025/34100295.zip


/data/tables/input/en/june_25_2025/34100295.zip: 100%|███████████████| 40.9k/40.9k [00:00<00:00, 101MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100271-eng.zip to /data/tables/input/en/june_25_2025/37100271.zip


/data/tables/input/en/june_25_2025/37100271.zip: 100%|███████████████| 5.92M/5.92M [00:07<00:00, 780kB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100272-eng.zip to /data/tables/input/en/june_25_2025/37100272.zip


/data/tables/input/en/june_25_2025/37100272.zip: 100%|███████████████| 14.8M/14.8M [00:19<00:00, 763kB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100273-eng.zip to /data/tables/input/en/june_25_2025/37100273.zip


/data/tables/input/en/june_25_2025/37100273.zip: 100%|███████████████| 9.07M/9.07M [00:12<00:00, 711kB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100274-eng.zip to /data/tables/input/en/june_25_2025/37100274.zip


/data/tables/input/en/june_25_2025/37100274.zip: 100%|██████████████| 23.5M/23.5M [00:07<00:00, 3.03MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100289-eng.zip to /data/tables/input/en/june_25_2025/37100289.zip


/data/tables/input/en/june_25_2025/37100289.zip: 100%|█████████████████| 544k/544k [00:01<00:00, 470kB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100290-eng.zip to /data/tables/input/en/june_25_2025/37100290.zip


/data/tables/input/en/june_25_2025/37100290.zip: 100%|███████████████| 10.4M/10.4M [00:11<00:00, 927kB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100291-eng.zip to /data/tables/input/en/june_25_2025/37100291.zip


/data/tables/input/en/june_25_2025/37100291.zip: 100%|████████████████| 129M/129M [01:41<00:00, 1.28MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/37100292-eng.zip to /data/tables/input/en/june_25_2025/37100292.zip


/data/tables/input/en/june_25_2025/37100292.zip: 100%|██████████████| 1.57M/1.57M [00:00<00:00, 4.13MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100182-eng.zip to /data/tables/input/en/june_25_2025/38100182.zip


/data/tables/input/en/june_25_2025/38100182.zip: 100%|█████████████████| 403k/403k [00:00<00:00, 937kB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/38100183-eng.zip to /data/tables/input/en/june_25_2025/38100183.zip


/data/tables/input/en/june_25_2025/38100183.zip: 100%|████████████████| 105k/105k [00:00<00:00, 2.88MB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100110-eng.zip to /data/tables/input/en/june_25_2025/45100110.zip


/data/tables/input/en/june_25_2025/45100110.zip: 100%|███████████████| 16.2M/16.2M [00:19<00:00, 817kB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/45100111-eng.zip to /data/tables/input/en/june_25_2025/45100111.zip


/data/tables/input/en/june_25_2025/45100111.zip: 100%|███████████████| 5.20M/5.20M [00:11<00:00, 435kB/s]


Downloading https://www150.statcan.gc.ca/n1/tbl/csv/46100092-eng.zip to /data/tables/input/en/june_25_2025/46100092.zip


/data/tables/input/en/june_25_2025/46100092.zip: 100%|██████████████| 73.9k/73.9k [00:00<00:00, 1.90MB/s]


# The remaining productIds that I need to process from input data directory

In [9]:
# Find archives sitting in the input directory whose product id is not yet in
# `product_ids_processed`.
# A set makes each membership test O(1) rather than a scan of the whole list.
already_processed = set(product_ids_processed)

remaining_to_process = []
# glob returns paths in arbitrary order; sorting keeps the result reproducible.
for file in sorted(glob.glob("/data/tables/input/en/*.zip")):
    # ".../input/en/12100147.zip" -> "12100147"
    product_id = file.split("/")[-1].split(".zip")[0]
    if product_id not in already_processed:
        remaining_to_process.append(product_id)

In [10]:
# Number of archives in the input directory that still need processing.
print(len(remaining_to_process))

1


In [11]:
# The product ids themselves, for inspection.
print(remaining_to_process)

['13100442']
