In [1]:
import asyncio
import os
import re

import pandas as pd
import requests 
from dotenv import load_dotenv
from tqdm import tqdm

from autogen_core.models import UserMessage
from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.messages import StructuredMessage
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient

In [2]:
# Single seed value intended for every random operation in this notebook.
seed = 42

# Read variables from the local .env file, then stop early if the
# Together API key did not end up in the environment.
load_dotenv()
assert "TOGETHER_API_KEY" in os.environ

# Model name and sampling parameters; unpacked into the client constructor below.
settings = dict(
    model="Qwen/Qwen3-235B-A22B-fp8-tput",
    temperature=0.1,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    max_tokens=10000,
)

# OpenAI-compatible chat client pointed at the Together endpoint.
# `model_info` declares the capabilities of the model; the family is
# reported as "unknown".
model_client = OpenAIChatCompletionClient(
    base_url="https://api.together.xyz/v1/",
    api_key=os.getenv("TOGETHER_API_KEY", ""),
    model_info=dict(
        vision=False,
        function_calling=True,
        json_output=True,
        family="unknown",
        structured_output=True,
    ),
    **settings,
)

In [3]:
# Read the JoinSelector prompt template from the backend prompts folder.
# The encoding is pinned to UTF-8 so the Markdown text is decoded the same
# way on every platform instead of depending on the locale default.
with open(os.path.join('..', 'backend', 'prompts', 'JoinSelector.md'), encoding='utf-8') as file:
    JOIN_SELECTOR_PROMPT_TEMPLATE = file.read()

In [4]:
# Load the keywords ground-truth table from the data folder.
keywords_gt_path = os.path.join('..', 'data', 'keywords_gt_with_examples.csv')
keywords_gt = pd.read_csv(keywords_gt_path)

In [5]:
# Rows to keep: exactly two keywords, presence flag set to 1, at least 15
# results, and an r_id_index below 20.
selection_mask = (
    keywords_gt['n_keywords'].eq(2)
    & keywords_gt['presence'].eq(1)
    & keywords_gt['result_count'].ge(15)
    & keywords_gt['r_id_index'].lt(20)
)

# Only the first matching row is used; the index is reset so it sits at label 0.
data = keywords_gt.loc[selection_mask].head(1).reset_index(drop=True)

data

Unnamed: 0,country_tag,type,difficulty,r_rsc_id,s_rsc_id,r_pkg_id,s_pkg_id,r_col_name,s_col_name,nl,top_k,n_keywords,keywords,presence,result_count,r_id_index,s_id_index,prompt_tokens,completion_tokens,response
0,CAN,multi-table-join,challenging,a7c95445-e336-41e3-945c-1bc938ff0612,ae2204c7-db9e-450e-8158-c70eedada7ae,12c89aa5-063a-f0ed-334a-3019133b9ee8,4879e498-c9ac-44a3-d7d1-21af22e2224c,department_/_minista_re,department_/_minista_re,How has the payment amount changed for the sam...,1000,2,suppliers+payments,1,17,14.0,12.0,648,15,<think>\n\n</think>\n\n<keywords>suppliers+pay...


In [1]:
from ckan import CanadaCKAN

# Client object used by the following cells for resource_show, package_search,
# package_show and download_tables_from_package_search calls.
ckan_client = CanadaCKAN() 

In [2]:
# Look up the resource metadata for the `s_rsc_id` of the first row in `data`.
# `data` must already exist in the current kernel session, otherwise this
# cell stops with a NameError.
s_rsc_id = data['s_rsc_id'][0]
s_rsc_metadata = ckan_client.resource_show(resource_id=s_rsc_id)

NameError: name 'data' is not defined

In [5]:
# Search packages matching "payment+supplier". The second positional
# argument (3) is presumably the number of rows to return -- TODO confirm
# against CanadaCKAN.package_search. The returned object exposes .json().
response = ckan_client.package_search("payment+supplier", 3)

In [3]:
import os
import shutil

# Start from an empty 'tmp' download directory. The removal is guarded so
# the cell also runs when 'tmp' does not exist yet (e.g. fresh checkout);
# an unguarded shutil.rmtree raises FileNotFoundError in that case.
if os.path.isdir('tmp'):
    shutil.rmtree('tmp')
os.makedirs('tmp', exist_ok=True)

# Download the tables found by the package search into 'tmp'.
# NOTE(review): the meaning of the positional arguments ('csv', 1) and of the
# returned value is defined by CanadaCKAN; `cnt` is presumably the number of
# downloaded tables -- confirm against the helper.
cnt = ckan_client.download_tables_from_package_search('tmp', 'csv', 1, q='payment+supplier', rows=3)

cnt

Method <function download_resource.<locals>.csv at 0x707180d7fce0> failed with resource bde3d810-d91b-4f7b-8231-b617500d0f7a: 
Method <function download_resource.<locals>.csv at 0x7071b4d98720> failed with resource d8c63f81-6c46-4a32-967e-5a41f8a39c53: 
Method <function download_resource.<locals>.csv at 0x7071beeceb60> failed with resource 266d4f70-5c55-4f2c-90a8-2d606cda5db6: 
Method <function download_resource.<locals>.csv at 0x7071b4d98720> failed with resource 9b605bf4-c67c-4f29-9b8e-38a8822411db: 
Method <function download_resource.<locals>.csv at 0x7071b4436c00> failed with resource e60778cf-0599-4ed9-b326-8336bbb9ea8d: 
Method <function download_resource.<locals>.csv at 0x7071bdcdbe20> failed with resource 19bb2d14-3703-4211-b0f2-fc4b270c859a: 
Method <function download_resource.<locals>.csv at 0x7071beeceb60> failed with resource c7bfd1f3-cffd-446c-92fe-b0e0e2526927: 
Method <function download_resource.<locals>.csv at 0x7071bdcdbe20> failed with resource 9a9b965c-261c-48b4-885d

14

In [9]:
# Decode the package-search response into its own name. Binding it to `data`
# would overwrite the filtered DataFrame that other cells index with
# data['r_pkg_id'][0] / data['s_rsc_id'][0], breaking them on an in-order run.
search_payload = response.json()

# Resources attached to the first package in the search results.
search_payload['result']['results'][0]['resources']

[{'cache_last_updated': None,
  'cache_url': None,
  'created': '2025-02-28T16:26:36.056023',
  'data_quality': [],
  'datastore_active': False,
  'description': None,
  'format': 'CSV',
  'hash': '',
  'id': '3f483dd4-345e-45c3-9424-970f1be85ba9',
  'language': ['fr'],
  'last_modified': None,
  'metadata_modified': '2025-02-28T16:26:36.038668',
  'mimetype': None,
  'mimetype_inner': None,
  'name': 'Dataset',
  'name_translated': {'fr': 'Ensembles de données', 'en': 'Dataset'},
  'package_id': '8be07a6b-2f19-48e5-a4ba-024e5e4933c5',
  'position': 0,
  'resource_type': 'dataset',
  'state': 'active',
  'url': 'https://www150.statcan.gc.ca/n1/tbl/csv/33100937-fra.zip',
  'url_type': None},
 {'cache_last_updated': None,
  'cache_url': None,
  'created': '2025-02-28T16:26:36.056029',
  'data_quality': [],
  'datastore_active': False,
  'description': None,
  'format': 'CSV',
  'hash': '',
  'id': 'b5a4e35f-0a1e-42d0-9f4c-826c3f478adb',
  'language': ['en'],
  'last_modified': None,
  'm

In [None]:
# Fetch the package record for the `r_pkg_id` of the first row in `data`
# and display the decoded JSON body.
r_pkg_id = data['r_pkg_id'][0]
result = ckan_client.package_show(id=r_pkg_id)
result.json()

{'help': 'https://open.canada.ca/data/api/3/action/help_show?name=package_show',
 'success': True,
 'result': {'aggregate_identifier': '',
  'association_type': [],
  'audience': [],
  'author': None,
  'author_email': None,
  'collection': 'federated',
  'contact_information': '{"fr": {"organization_name": "Finances et Conseil du Trésor","electronic_mail_address": "opendatadonneesouvertes@snb.ca"},"en": {"organization_name": "Finance and Treasury Board","electronic_mail_address": "opendatadonneesouvertes@snb.ca"}}',
  'contributor': {},
  'creator_user_id': 'b88b9549-3a4b-43a9-9375-5f3fc20fe782',
  'data_series_issue_identification': {},
  'data_series_name': {},
  'date_published': '2021-10-28 00:00:00',
  'display_flags': [],
  'federated_date_modified': '2024-07-09',
  'file_id': '12c89aa5-063a-f0ed-334a-3019133b9ee8',
  'frequency': 'unknown',
  'geographic_region': [],
  'hierarchy_level': 'dataset; jeuDonnées',
  'id': '12c89aa5-063a-f0ed-334a-3019133b9ee8',
  'imso_approval': '