In [None]:
import pandas as pd
from openai import OpenAI
from tqdm.notebook import tqdm

### Load policies

In [3]:
# Load the sampled papers (indexed by openalex_id) with their discussion text
# and the list of tagged policies extracted from it.
policies_path = '../data/sample_10k_with_policies_extracted_tagged_v4_2026-01-30.parquet'
df = pd.read_parquet(policies_path)
df

Unnamed: 0_level_0,discussion,policies
openalex_id,Unnamed: 1_level_1,Unnamed: 2_level_1
W4387168299,This is the first study to assess the associat...,[]
W3014778543,The main conclusions extracted from this study...,[]
W3048162401,"LPG, CNG, ethanol and biodiesel are good candi...",[[MOBILITY] [REGULATORY] [NATIONAL] Policy sup...
W1005805659,The results of the study revealed that growth ...,[[FOOD] [ECONOMIC] [NATIONAL] Investment in ag...
W4319601041,"In our use of participatory video, following R...",[[NATURE] [PARTICIPATORY] [NGO] Use of partici...
...,...,...
W4390273040,Erosion is a significant issue impacting upon ...,[]
W4298143484,"In this article, the individual and collective...",[]
W4387449772,Research findings\nAccording to the context of...,[[ENERGY] [REGULATORY] [NATIONAL] China has pr...
W2623732815,A FDS Input Files\nList of Figures\nPredicted ...,[]


In [4]:
# One row per (paper, policy): unnest the policy lists, move openalex_id into a
# column, then discard rows with missing values (an empty policy list explodes to NaN).
pdf = (
    df
    .explode("policies")
    .reset_index()
    .dropna()
)
pdf

Unnamed: 0,openalex_id,discussion,policies
2,W3048162401,"LPG, CNG, ethanol and biodiesel are good candi...",[MOBILITY] [REGULATORY] [NATIONAL] Policy supp...
3,W1005805659,The results of the study revealed that growth ...,[FOOD] [ECONOMIC] [NATIONAL] Investment in agr...
4,W1005805659,The results of the study revealed that growth ...,[FOOD] [ECONOMIC] [NATIONAL] Providing support...
5,W1005805659,The results of the study revealed that growth ...,[FOOD] [ECONOMIC] [NATIONAL] Targeting subsidi...
6,W4319601041,"In our use of participatory video, following R...",[NATURE] [PARTICIPATORY] [NGO] Use of particip...
...,...,...,...
20905,W4387449772,Research findings\nAccording to the context of...,[SOCIAL] [ECONOMIC] [NATIONAL] China should co...
20906,W4387449772,Research findings\nAccording to the context of...,[SOCIAL] [ECONOMIC] [NATIONAL] China should vi...
20908,W3005356321,"In this paper, we explored the causal relation...",[INDUSTRY] [ORGANISATIONAL] [COMPANIES] Manage...
20909,W3005356321,"In this paper, we explored the causal relation...",[INDUSTRY] [INFORMATIONAL] [COMPANIES] Service...


In [5]:
def extract_tags(policy):
    """Split a tagged policy string into its three tags and its free text.

    The expected form is ``[DOMAIN] [POLICY_TYPE] [ACTOR] text``.

    Args:
        policy: Tagged policy string.

    Returns:
        dict with keys ``domain``, ``policy_type`` and ``actor`` (each tag without
        its leading '[') and ``policy_text`` (everything after the third tag).

    Raises:
        ValueError: If the string does not contain three "] "-terminated tags
            followed by text.
    """
    # Split on the first three "] " separators only, so that a "] " occurring
    # inside the free text stays part of policy_text instead of truncating it.
    parts = policy.split("] ", 3)
    if len(parts) < 4:
        raise ValueError(f"Policy does not have enough parts: {policy}")
    domain, policy_type, actor = (tag[1:] for tag in parts[:3])  # drop the leading '['
    return {
        "domain": domain,
        "policy_type": policy_type,
        "actor": actor,
        "policy_text": parts[3],
    }

In [6]:
# Number of "] "-separated pieces in each policy string; a well-formed
# "[DOMAIN] [POLICY_TYPE] [ACTOR] text" entry yields exactly 4.
parts = pdf['policies'].map(lambda policy: policy.count("] ") + 1)

In [7]:
# Entries whose piece count differs from the expected 4 are malformed.
parts.loc[parts.ne(4)]

2139     19
2649     25
3645      5
5036      7
7761      3
11239    22
13187     6
13188     5
15645    13
19715     5
20387    13
20536     5
Name: policies, dtype: int64

In [8]:
# Print every malformed policy string in full for inspection.
malformed_policies = pdf.loc[parts != 4, 'policies']
for idx, policy in malformed_policies.items():
    print(f"Row {idx}: {policy}")

Row 2139: [SOCIAL] [ORGANISATIONAL] [NATIONAL] Policies to better support traditional healing practices in First Nation communities, including systemic federal support. [SOCIAL] [ORGANISATIONAL] [NATIONAL] Implementation of the Truth and Reconciliation Commission of Canada’s Calls to Action 18 and 19, which involves expanding the scope and mandate of health facilities in southern communities to include essential PHC services in all First Nation communities. [SOCIAL] [ORGANISATIONAL] [NATIONAL] Equitable rights-based funding and equitable access to PHC in each First Nation community. [SOCIAL] [ORGANISATIONAL] [NATIONAL] Active involvement of Indigenous Peoples in Canada in developing long-term structural changes to address systematic social and health inequities. [SOCIAL] [ORGANISATIONAL] [NATIONAL] Improving the living conditions for all First Nations living on reserve, reducing systemic and jurisdictional barriers, and addressing racism within the healthcare system. [SOCIAL] [ORGANISA

Most of these could be repaired, but they're few enough to just throw them away.

In [9]:
# Keep only the well-formed policies (exactly 4 parts) and parse their tags.
well_formed = parts == 4
parsed_tags = pdf.loc[well_formed, "policies"].map(extract_tags)
# Build the tag columns in one go from the list of dicts; `.apply(pd.Series)`
# constructs one Series per row and is much slower for the same result.
tags_df = pd.DataFrame(parsed_tags.tolist(), index=parsed_tags.index)
pdf = pd.concat([pdf[well_formed], tags_df], axis=1)
pdf

Unnamed: 0,openalex_id,discussion,policies,domain,policy_type,actor,policy_text
2,W3048162401,"LPG, CNG, ethanol and biodiesel are good candi...",[MOBILITY] [REGULATORY] [NATIONAL] Policy supp...,MOBILITY,REGULATORY,NATIONAL,Policy support for the implementation of biodi...
3,W1005805659,The results of the study revealed that growth ...,[FOOD] [ECONOMIC] [NATIONAL] Investment in agr...,FOOD,ECONOMIC,NATIONAL,Investment in agriculture technology developme...
4,W1005805659,The results of the study revealed that growth ...,[FOOD] [ECONOMIC] [NATIONAL] Providing support...,FOOD,ECONOMIC,NATIONAL,"Providing support like credit, cheap power, tu..."
5,W1005805659,The results of the study revealed that growth ...,[FOOD] [ECONOMIC] [NATIONAL] Targeting subsidi...,FOOD,ECONOMIC,NATIONAL,Targeting subsidies to the poor and backward r...
6,W4319601041,"In our use of participatory video, following R...",[NATURE] [PARTICIPATORY] [NGO] Use of particip...,NATURE,PARTICIPATORY,NGO,Use of participatory video to bring Indigenous...
...,...,...,...,...,...,...,...
20905,W4387449772,Research findings\nAccording to the context of...,[SOCIAL] [ECONOMIC] [NATIONAL] China should co...,SOCIAL,ECONOMIC,NATIONAL,China should consistently narrow income gaps t...
20906,W4387449772,Research findings\nAccording to the context of...,[SOCIAL] [ECONOMIC] [NATIONAL] China should vi...,SOCIAL,ECONOMIC,NATIONAL,China should vigorously promote the sharing ec...
20908,W3005356321,"In this paper, we explored the causal relation...",[INDUSTRY] [ORGANISATIONAL] [COMPANIES] Manage...,INDUSTRY,ORGANISATIONAL,COMPANIES,Managers should take cultural intelligence int...
20909,W3005356321,"In this paper, we explored the causal relation...",[INDUSTRY] [INFORMATIONAL] [COMPANIES] Service...,INDUSTRY,INFORMATIONAL,COMPANIES,Service firms need to provide multicultural tr...


### Clean invalid tags

In [10]:
# Domain tags that are kept as they are.
accepted_domains = {
    "BUILDING", "URBAN", "MOBILITY", "ENERGY", "MATERIALS", "FOOD",
    "INDUSTRY", "LOGISTICS", "NATURE", "SOCIAL", "MACROECONOMIC",
}
# Known non-standard tags and the accepted domain each one is folded into
# (None means the tag has no accepted counterpart).
domain_aliases = {
    "AGRICULTURE": "FOOD",
    "ECONOMIC": "MACROECONOMIC",
    "FINANCE": "MACROECONOMIC",
    "INDIVIDUALS": "SOCIAL",
    "SCHOOL": "SOCIAL",
    "WATER": "MATERIALS",
    "TOURISM": None,
}
# Full lookup table: the aliases first, then every accepted domain mapped to itself.
domain_mapping = {**domain_aliases, **{name: name for name in accepted_domains}}

In [11]:
# Normalise the domain tag: accepted names are kept, known aliases are remapped,
# and any tag absent from the mapping (or mapped to None) becomes missing.
pdf["domain"] = pdf["domain"].map(domain_mapping)

In [12]:
# Number of policies per normalised domain.
domain_counts = pdf.groupby('domain').size()
domain_counts

domain
BUILDING          455
ENERGY           1568
FOOD             2531
INDUSTRY         1137
LOGISTICS         102
MACROECONOMIC     278
MATERIALS         579
MOBILITY          730
NATURE           1687
SOCIAL           4426
URBAN            1285
dtype: int64

In [18]:
# Persist the per-policy table without the raw tagged string and the discussion
# text, renumbering the rows from 0.
cleaned_policies = pdf.drop(columns=['policies', 'discussion']).reset_index(drop=True)
cleaned_policies.to_parquet('../data/cleaned_policies_from_sample10k_tagged_v4_2026-01-30.parquet')

### LLM-based clustering

In [13]:
import os
from pydantic import BaseModel
from dotenv import load_dotenv
# Load settings from a .env file, if there is one, into the process environment
# (presumably where GENERATION_API_URL and SCW_SECRET_KEY come from — confirm).
load_dotenv()

True

In [14]:
# Client for the chat-completions endpoint. Both settings are read from the
# environment; os.getenv yields None for a variable that is not set.
client = OpenAI(api_key=os.getenv("SCW_SECRET_KEY"), base_url=os.getenv("GENERATION_API_URL"))

In [29]:
# One subdomain of a policy domain in the taxonomy requested from the model.
# NOTE: documented with `#` comments rather than a docstring on purpose — the JSON
# schema of these models is sent with each request, and a class docstring would
# presumably be emitted into that schema as a description (confirm before changing).
class PolicySubdomain(BaseModel):
    name: str  # subdomain label
    clusters: list[str]  # names of the policy clusters in this subdomain (names only, not the policies)

# Structured response expected from the model: the full taxonomy for one domain.
# Used both to build the response JSON schema and to validate the returned JSON.
class PolicyHierarchy(BaseModel):
    subdomains: list[PolicySubdomain]  # every subdomain with its cluster names

In [None]:
def extract_clusters(text: str, prompt: str, model_name: str, client: OpenAI = client) -> "PolicyHierarchy | str":
    """Request a policy taxonomy from the model and parse it into a PolicyHierarchy.

    Args:
        text: Content of the user message (policy texts, or serialized taxonomies to merge).
        prompt: System prompt; surrounding whitespace is stripped before sending.
        model_name: Model identifier passed to the chat-completions endpoint.
        client: Client used for the request. The default is bound when this function
            is defined, so re-run this cell after re-creating the global `client`.

    Returns:
        The validated PolicyHierarchy on success. On any failure (request error, empty
        or invalid response) the error is printed and a string starting with
        "Error: " is returned instead, so callers must check the type of the result.
    """
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": prompt.strip()},
                {"role": "user", "content": text},
            ],
            temperature=0,
            max_tokens=1024 * 16,
            # Constrain the output to the JSON schema of PolicyHierarchy.
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": PolicyHierarchy.__name__,
                    "schema": PolicyHierarchy.model_json_schema(),
                },
            },
            timeout=300,
            stream=False,
        )
        choice = response.choices[0]
        content = choice.message.content
        if not content:
            # Name the cause explicitly instead of surfacing an opaque validation
            # error raised while parsing a missing body.
            raise ValueError(f"Empty response content (finish_reason={choice.finish_reason})")
        return PolicyHierarchy.model_validate_json(content)
    except Exception as e:
        # Best-effort by design: a failed request must not abort a long batch run,
        # so the failure is reported and returned as a string.
        print("Error:", e)
        return f"Error: {e}"

In [None]:
# Model used for both the clustering and the merge requests.
#model_name = "mistral-small-3.2-24b-instruct-2506"
model_name = "qwen3-235b-a22b-instruct-2507"  # much better than the above model at identifying good clusters
# System prompt for clustering one chunk of policies from a single domain;
# the policy texts themselves are sent as the user message.
cluster_prompt = """Group the following policies into clusters of roughly similar policies.
All policies belong to the same domain, your task is to output of list of subdomains, each with a list of clusters of similar policies.
Do not include the policies themselves in the output, only the names of the subdomains and clusters.

Roughly similar means policies sharing a similar mechanism.
For instance, carbon pricing policies would all be in the same cluster.
Still, the granularity of the clusters should be rather fine. 

The following example clusters are too broad:
- pollution control -> we're interested in the how, not the what, so we want to separate pollution control policies by their mechanism (tax, regulation, etc.)
- circular economy and waste management -> these are two different objectives, we want to separate them. Even separated, they would still be too broad, as they include very different policies (e.g. extended producer responsibility and landfill tax are both circular economy policies but very different in nature).
- sustainable farming practices -> again, way too broad, which practices ? or which policies to promote these practices ?
- Building materials and construction techniques -> which ones?

Our goal is to get a set of unique policies.

Here are the policies:
"""

# System prompt for merging the per-chunk taxonomies of one domain;
# the serialized taxonomies are sent as the user message.
merge_clusters_prompt = """
The following taxonomies are exracted from the same process applied to different samples of policies.
There will likely be some overlap / duplicates at all hierarchy levels.
Merge them into a single taxonomy.
"""

In [None]:
max_policies = 1000  # maximum number of policies sent to the model in one request
outputs = {}  # domain -> list of PolicyHierarchy, one per successfully clustered chunk
failed_chunks = []  # (domain, chunk start offset) of every request that returned an error
for domain, group in tqdm(pdf.groupby('domain')):
    policies = group['policy_text'].tolist()
    print(f"{domain} - {len(policies)} policies |", end=' ')

    for i in range(0, len(policies), max_policies):
        chunk = policies[i:i+max_policies]
        print(len(chunk), end=' ')
        query = "\n\n".join(chunk)
        output = extract_clusters(query, cluster_prompt, model_name)
        if isinstance(output, PolicyHierarchy):
            outputs.setdefault(domain, []).append(output)
        else:
            # extract_clusters reports a failure as an "Error: ..." string. Storing it
            # in `outputs` would break any later step that treats every entry as a
            # PolicyHierarchy, so failures are tracked separately.
            failed_chunks.append((domain, i))
    print()

if failed_chunks:
    print(f"{len(failed_chunks)} chunk(s) failed and are missing from `outputs`: {failed_chunks}")

SOCIAL - 4426 policies | 1000 1000 Error: Request timed out.
1000 1000 426 
URBAN - 1285 policies | 1000 285 


In [None]:
final = {}  # domain -> single merged PolicyHierarchy
for domain, taxonomies in outputs.items():
    # extract_clusters returns an "Error: ..." string for a failed request; such
    # entries have no model_dump_json() and cannot be merged, so only parsed
    # taxonomies are used.
    valid = [taxonomy for taxonomy in taxonomies if isinstance(taxonomy, PolicyHierarchy)]
    if len(valid) < len(taxonomies):
        print(f"{domain}: ignoring {len(taxonomies) - len(valid)} failed chunk(s)")
    if not valid:
        continue
    if len(valid) == 1:
        # A single taxonomy needs no merge request.
        final[domain] = valid[0]
        continue
    str_taxonomies = "\n\n".join(taxonomy.model_dump_json() for taxonomy in valid)
    merged = extract_clusters(str_taxonomies, merge_clusters_prompt, model_name)
    if isinstance(merged, PolicyHierarchy):
        final[domain] = merged
    else:
        # Keep `final` free of error strings so later cells can rely on `.subdomains`.
        print(f"{domain}: merge failed, domain left out of `final`; re-run the merge for it")

In [None]:
# Used to manually re-run the merge step for individual domains after errors in the cell above.

#str_taxonomies = "\n\n".join([taxonomy.model_dump_json() for taxonomy in outputs['SOCIAL']])
#final['SOCIAL'] = extract_clusters(str_taxonomies, merge_clusters_prompt, model_name)

#str_taxonomies = "\n\n".join([taxonomy.model_dump_json() for taxonomy in outputs['URBAN']])
#final['URBAN'] = extract_clusters(str_taxonomies, merge_clusters_prompt, model_name)

In [95]:
# Flatten the merged taxonomies into (domain, subdomain name, cluster names) records.
tmp = [
    (domain, subdomain.name, subdomain.clusters)
    for domain, hierarchy in final.items()
    for subdomain in hierarchy.subdomains
]
# Summary line per subdomain, in the same order as the records.
for domain, subdomain_name, cluster_names in tmp:
    print(f"{domain} - {subdomain_name} - {len(cluster_names)} clusters")

BUILDING - Building Energy Efficiency and Decarbonization - 5 clusters
BUILDING - Housing Affordability, Access, and Social Equity - 5 clusters
BUILDING - Passive and Climate-Responsive Design - 5 clusters
BUILDING - Structural Safety, Resilience, and Geotechnical Engineering - 5 clusters
BUILDING - Sustainable Construction Materials and Methods - 5 clusters
BUILDING - Indoor Environmental Quality and Health - 5 clusters
BUILDING - Urban Green Infrastructure and Water Management - 5 clusters
BUILDING - Digitalization and Smart Building Technologies - 5 clusters
BUILDING - Geotechnical and Foundation Engineering - 5 clusters
BUILDING - Fire Safety in Buildings - 4 clusters
BUILDING - Financing and Incentive Mechanisms for Building Upgrades - 5 clusters
BUILDING - Housing Flexibility, Adaptability, and Inclusivity - 5 clusters
BUILDING - Well-being and Human-Centered Design - 4 clusters
BUILDING - Construction Sector Policy and Governance - 4 clusters
BUILDING - Sustainable Tourism and H

In [101]:
# One row per (domain, subdomain); the `cluster` column still holds the list of cluster names.
cluster_columns = ['domain', 'subdomain', 'cluster']
cdf = pd.DataFrame.from_records(tmp, columns=cluster_columns)
cdf

Unnamed: 0,domain,subdomain,cluster
0,BUILDING,Building Energy Efficiency and Decarbonization,[Building Energy Performance Certification and...
1,BUILDING,"Housing Affordability, Access, and Social Equity",[Affordable and Social Housing Provision and A...
2,BUILDING,Passive and Climate-Responsive Design,"[Bioclimatic and Passive Design Principles, Da..."
3,BUILDING,"Structural Safety, Resilience, and Geotechnica...",[Soil Stabilization and Geotechnical Engineeri...
4,BUILDING,Sustainable Construction Materials and Methods,[Green Building Certification Schemes and Ince...
...,...,...,...
434,URBAN,Regional Development and Territorial Cohesion,"[National and Regional Development Strategies,..."
435,URBAN,Urban Forestry and Ecological Restoration,[Tree inventory and urban forest management (e...
436,URBAN,Urban Finance and Investment Mechanisms,[Public-private partnerships (PPPs) and open t...
437,URBAN,Built Environment Safety and Resilience,[Construction standards and hazard zoning (e.g...


In [104]:
# Unnest the cluster lists: one row per (domain, subdomain, cluster name),
# with the index renumbered from 0.
cdf = cdf.explode(column='cluster', ignore_index=True)
cdf

Unnamed: 0,domain,subdomain,cluster
0,BUILDING,Building Energy Efficiency and Decarbonization,Building Energy Performance Certification and ...
1,BUILDING,Building Energy Efficiency and Decarbonization,Energy Efficiency Standards and Codes for New ...
2,BUILDING,Building Energy Efficiency and Decarbonization,Thermal and Envelope Retrofit Programs for Exi...
3,BUILDING,Building Energy Efficiency and Decarbonization,Operational Energy Optimization and HVAC Contr...
4,BUILDING,Building Energy Efficiency and Decarbonization,Embodied Energy and Life Cycle Assessment (LCA...
...,...,...,...
2180,URBAN,Built Environment Safety and Resilience,"Building safety and resilience (e.g., earthqua..."
2181,URBAN,Built Environment Safety and Resilience,"Material and design innovation (e.g., reflecti..."
2182,URBAN,Rural Development and Livelihood Support,"Rural infrastructure development (e.g., roads,..."
2183,URBAN,Rural Development and Livelihood Support,Support for aging farmers and flexible employm...


In [105]:
# CSV export of the flattened cluster table.
clusters_csv_path = '../data/sample_10k_policy_clusters_llm_2026-02-12.csv'
cdf.to_csv(clusters_csv_path)

In [106]:
# Parquet export of the same flattened cluster table.
clusters_parquet_path = '../data/sample_10k_policy_clusters_llm_2026-02-12.parquet'
cdf.to_parquet(clusters_parquet_path)