In [26]:
import pandas as pd

# Root directory for downloaded artifacts: the functions below write
# "<table_id>.json" metadata files here and read "<table_id>.zip" archives from here.
data_folder="/data/tables"

In [27]:
# Disabled one-off step, kept as a bare string so it is evaluated but never run:
# it would de-duplicate tables.json into tables_unique.json.
"""
tables = pd.read_json('tables.json')
tables_unique = tables.drop_duplicates()
tables_unique.to_json('tables_unique.json', index=False)
"""

"\ntables = pd.read_json('tables.json')\ntables_unique = tables.drop_duplicates()\ntables_unique.to_json('tables_unique.json', index=False)\n"

In [28]:
# Load the de-duplicated table list that the download loop iterates over.
tables_unique = pd.read_json('tables_unique.json')

In [38]:
# Number of rows in the de-duplicated table list.
len(tables_unique)

12179

In [29]:
import sqlite3

import buckaroo

# SQLite database that holds the `processed` and `dguid_table_id` bookkeeping tables.
con = sqlite3.connect('tables_data.db')

In [30]:
# Shared cursor, reused by later cells for bookkeeping queries.
cur = con.cursor()

# Create the bookkeeping tables if they do not exist yet. The DROP statements
# inside the script are SQL comments, so existing rows are preserved.
#   processed      - (table_id, message) rows written after each processing attempt
#   dguid_table_id - (dguid, table_id) pairs
cur.executescript("""
-- DROP TABLE IF EXISTS processed;
CREATE TABLE IF NOT EXISTS processed (
  table_id TEXT,
  message TEXT
);

/*
Maps unique dguids to which tables they are found in
*/
-- DROP TABLE IF EXISTS dguid_table_id;
CREATE TABLE IF NOT EXISTS dguid_table_id (
  dguid TEXT,
  table_id TEXT
);
""")

con.commit()

In [34]:
import time
import aria2p

# Client for the aria2 RPC endpoint at http://localhost:6800 with an empty
# secret; these are the default values.
aria2 = aria2p.API(
    aria2p.Client(
        host="http://localhost",
        port=6800,
        secret=""
    )
)

# Helper function to wait for download to finish
def wait_for_download(download):
    """Poll aria2 every 10 seconds until the download is no longer pending.

    Args:
        download: download handle exposing ``status``, ``gid``, ``name`` and
            ``progress_string()``.

    Returns:
        The most recently refreshed download handle. Callers should inspect its
        ``status``: this also returns for downloads that ended in a state other
        than "complete" (for example "error", "paused" or "removed").
    """
    # A download that was just queued can still report "waiting" before aria2
    # starts it; keep polling in that state instead of returning immediately
    # and reporting an unfinished download as completed.
    while download.status in ("active", "waiting"):
        print(f"Progress: {download.progress_string()} - {download.status}")
        time.sleep(10)
        download = aria2.get_download(download.gid)  # refresh state
    print(f"Download completed: {download.name} - Status: {download.status}")
    return download

In [35]:
import os
import json
import re
import requests

def parse_table(table):
    """Return the first eight digits found in ``table``, ignoring every non-digit character."""
    digits_only = "".join(re.findall(r"\d", table))
    return digits_only[:8]

def download_table(table_id):
    """Fetch the cube metadata and the full-table CSV archive for one table.

    The id is normalised with ``parse_table``. Ids that do not reduce to 8
    digits, and tables whose metadata file already exists, are skipped. The
    metadata file doubles as the "already downloaded" marker, so it is written
    only after the archive download has completed; a failed download is then
    retried on the next run instead of being skipped forever.

    Args:
        table_id: table identifier such as ``"33-10-0036-01"``.

    Returns:
        The 8-digit product id when the archive was downloaded, otherwise None
        (invalid id, already downloaded, failed request or failed download).
    """
    print(f"Input table_id {table_id}")
    table_id = parse_table(table_id)
    if len(table_id) != 8:
        print(f"productId: {table_id} not length 8")
        return
    print(f"Getting metadata for {table_id}")

    metadata_path = f"{data_folder}/{table_id}.json"
    if os.path.exists(metadata_path):
        print(f"Metadata and data for {table_id} already exists")
        return

    # Step 1: request the cube metadata (kept in memory until the download succeeds).
    url = "https://www150.statcan.gc.ca/t1/wds/rest/getCubeMetadata"
    payload = [{"productId": table_id}]
    response = requests.post(url, json=payload)
    if response.status_code != 200:
        print(f"Failed to request. Status code: {response.status_code}")
        return

    # It can still be a failed search even if it is status_code 200
    metadata = response.json()[0]
    if metadata.get("status") == "FAILED":
        print(metadata.get("object"))
        return

    # Step 2: resolve the URL of the zipped CSV.
    url = f"https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/{table_id}/en"
    print(f"Getting the download url for {table_id}")
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to request. Status code: {response.status_code}")
        return

    link_info = response.json()
    download_url = link_info.get("object")
    # On failure "object" carries an error message rather than a URL, and a
    # response without "object" has nothing to download at all.
    if link_info.get("status") == "FAILED" or not download_url:
        print(f"Could not get a download url for {table_id}: {download_url}")
        return

    # Step 3: hand the URL to aria2 and wait for the transfer to end.
    # NOTE(review): no "dir" option is passed, so the archive lands in the aria2
    # daemon's own download directory -- confirm that it matches data_folder.
    print(f"Downloading {download_url} to {data_folder}/{table_id}.zip")
    download = aria2.add_uris([download_url], options={"out": f"{table_id}.zip"})
    download = wait_for_download(download)
    if download.status != "complete":
        print(f"Download of {table_id} did not complete. Status: {download.status}")
        return

    # Step 4: write the metadata last so its presence implies the data is there too.
    with open(metadata_path, "w") as metadata_fp:
        json.dump(metadata, metadata_fp)

    return table_id

In [36]:
import zipfile

def process_table(table_id):
    """Extract the distinct DGUID values of a downloaded table into a parquet file.

    Reads ``<table_id>.csv`` from ``{data_folder}/<table_id>.zip`` and writes the
    de-duplicated ``DGUID`` column to
    ``{data_folder}/universe/<table_id>.parquet`` (zstd-compressed, no index).

    Args:
        table_id: id used for the archive, the CSV member and the output file name.
    """
    table_path = f"{data_folder}/{table_id}.zip"
    csv_file = f"{table_id}.csv"
    print(f"Processing {table_path}")
    with zipfile.ZipFile(table_path) as myzip:
        # Only the DGUID column is needed, so the member is read in a single pass.
        with myzip.open(csv_file) as myfile:
            df = pd.read_csv(myfile, usecols=["DGUID"]).drop_duplicates()

    universe_path = f"{data_folder}/universe/{table_id}.parquet"
    # The output directory may not exist on a fresh data folder.
    os.makedirs(os.path.dirname(universe_path), exist_ok=True)
    df.to_parquet(universe_path, compression="zstd", index=False)
    # Report only after the file has actually been written.
    print(f"Wrote to {universe_path}")

In [37]:
# Download every table in the list. to_records() yields (index, value, ...)
# tuples, so the two-name unpacking below requires tables_unique to have
# exactly one column.
for record in tables_unique.to_records():
    index, table_id = record
    download_table(table_id)

Input table_id 33-10-0036-01
Getting metadata for 33100036
Metadata and data for 33100036 already exists
Input table_id 13-10-0946-01
Getting metadata for 13100946
Metadata and data for 13100946 already exists
Input table_id 13-10-0959-01
Getting metadata for 13100959
Metadata and data for 13100959 already exists
Input table_id 13-10-0960-01
Getting metadata for 13100960
Metadata and data for 13100960 already exists
Input table_id 23-10-0287-01
Getting metadata for 23100287
Metadata and data for 23100287 already exists
Input table_id 10-10-0139-01
Getting metadata for 10100139
Metadata and data for 10100139 already exists
Input table_id 10-10-0141-01
Getting metadata for 10100141
Metadata and data for 10100141 already exists
Input table_id 11-10-0084-01
Getting metadata for 11100084
Metadata and data for 11100084 already exists
Input table_id 11-10-0085-01
Getting metadata for 11100085
Metadata and data for 11100085 already exists
Input table_id 11-10-0086-01
Getting metadata for 11100

In [None]:
def process_table(table_id):
    """Record which DGUIDs occur in a table, skipping ids already logged.

    If ``processed`` already has a row for ``table_id`` nothing is done.
    Otherwise the distinct ``DGUID`` values from ``sc.table_to_df(table_id)``
    are appended to ``dguid_table_id`` and the outcome is logged in
    ``processed``: the message is ``'success'`` or the text of the exception.

    NOTE(review): failed attempts are logged in ``processed`` too, so they are
    also skipped on later calls -- confirm that this is intended.
    NOTE(review): this shares its name with the zip-based ``process_table``
    from an earlier cell and replaces it once this cell runs.
    """
    insert_sql = "INSERT INTO processed VALUES(?, ?)"

    prior_rows = cur.execute("SELECT * FROM processed WHERE table_id = ?", (table_id,)).fetchall()
    if prior_rows:
        print(f"Already processed {table_id}")
        return

    print(f"Processing {table_id}")
    try:
        dguids = (
            sc.table_to_df(table_id)[['DGUID']]
            .drop_duplicates()
            .rename(columns=str.lower)
            .assign(table_id=table_id)
        )
        dguids.to_sql(name='dguid_table_id', con=con, if_exists='append', index=False)
        cur.execute(insert_sql, (table_id, 'success'))
        con.commit()
    except Exception as e:
        failure = (table_id, str(e))
        print(f"Failed to process {table_id}")
        cur.execute(insert_sql, failure)
        con.commit()