In [1]:
import asyncio
import os
import time
from dotenv import load_dotenv

from openai import AsyncOpenAI
import pandas as pd
from pydantic import BaseModel, Field
from tqdm.notebook import tqdm

from pprint import pprint

load_dotenv()

True

In [2]:
# OpenAI-compatible async client aimed at a fixed Scaleway inference endpoint.
# The API key comes from the SCW_SECRET_KEY environment variable; the
# GENERATION_API_URL lookup is currently bypassed in favour of the literal URL.
client = AsyncOpenAI(
    api_key=os.getenv("SCW_SECRET_KEY"),
    base_url="https://f3948611-98f8-4738-b973-30b4a657cffa.ifr.fr-par.scaleway.com/v1",
    # base_url=os.getenv("GENERATION_API_URL"),
)

In [3]:
# Load the 10k sample of texts to process; the preview below shows one
# `discussion` text column indexed by `openalex_id`.
df = pd.read_parquet('data/conclusions_sample_10k_2026-01-27.parquet')

In [4]:
df

Unnamed: 0_level_0,discussion
openalex_id,Unnamed: 1_level_1
W4387168299,This is the first study to assess the associat...
W3014778543,The main conclusions extracted from this study...
W3048162401,"LPG, CNG, ethanol and biodiesel are good candi..."
W1005805659,The results of the study revealed that growth ...
W4319601041,"In our use of participatory video, following R..."
...,...
W4390273040,Erosion is a significant issue impacting upon ...
W4298143484,"In this article, the individual and collective..."
W4387449772,Research findings\nAccording to the context of...
W2623732815,A FDS Input Files\nList of Figures\nPredicted ...


In [11]:
# Structured-output schema the model must fill in for each text.
# Its JSON schema (including the Field descriptions below) is sent to the API
# as `response_format`, so the descriptions act as instructions to the model.
# NOTE: documented with comments rather than a class docstring on purpose —
# pydantic would copy a docstring into the generated schema's "description".
class PolicyExtractionResponse(BaseModel):
    # True when the text mentions at least one policy.
    contains_policies: bool = Field(..., description="Whether the text mentions at least one policy.")
    # One string per extracted policy; expected to be empty when contains_policies is False.
    policies: list[str] = Field(..., description="A list of policies mentioned in the text. If contains_policies is False, this list should be empty.")
    

# Name of the chat model requested from the inference endpoint.
model_name = "mistral-small-3.2-24b-instruct-2506"
#model_name = "qwen/qwen3-235b-a22b-instruct-2507"

# System prompt for the extraction task. The encoding is given explicitly so
# the prompt is decoded the same way on every platform instead of relying on
# the locale-dependent default of open().
with open('POLICIES_EXTRACTION_PROMPT.txt', 'r', encoding='utf-8') as f:
    prompt = f.read()

In [12]:
async def extract_policies(
    text: str, prompt: str, model_name: str, client: AsyncOpenAI = client
) -> "PolicyExtractionResponse | str":
    """Extract the policies mentioned in a single text with the chat model.

    The stripped `prompt` is sent as the system message and `text` as the user
    message. The reply is constrained to the JSON schema of
    `PolicyExtractionResponse` and validated against it.

    Args:
        text: The document text to analyse.
        prompt: System prompt describing the extraction task.
        model_name: Model identifier passed to the chat completions API.
        client: Async OpenAI-compatible client; defaults to the notebook-level
            `client`.

    Returns:
        The validated `PolicyExtractionResponse` on success. On any failure
        (API error, timeout, empty or invalid JSON reply) this function does
        NOT raise: it returns the string ``"Error: <exception>"`` so a single
        bad item does not abort a whole `asyncio.gather` batch. Callers must
        therefore check the type of each result.
    """
    try:
        response = await client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": prompt.strip()},
                {"role": "user", "content": text},
            ],
            temperature=0,
            max_tokens=1024,
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": PolicyExtractionResponse.__name__,
                    "schema": PolicyExtractionResponse.model_json_schema(),
                },
            },
            timeout=60,
            stream=False,
        )
        return PolicyExtractionResponse.model_validate_json(response.choices[0].message.content)
    except Exception as e:
        # Report the actual cause; a bare "Error" line makes failures in a
        # large batch impossible to diagnose.
        print(f"Error: {type(e).__name__}: {e}")
        return f"Error: {e}"


async def batch_extract_policies(
    texts: list[str], prompt: str, model_name: str, client: AsyncOpenAI = client
) -> list[PolicyExtractionResponse]:
    """Run `extract_policies` concurrently over every text in `texts`.

    All requests are started together and awaited with `asyncio.gather`, so
    the returned list has one entry per input text, in input order. Each
    entry is whatever `extract_policies` returned for that text.
    """
    pending = (extract_policies(item, prompt, model_name, client) for item in texts)
    return await asyncio.gather(*pending)

In [15]:
# Show one sample text (row position 650) in full before running the model on it.
pprint(df["discussion"].iloc[650])

('The Global Burden of Disease project (GBD) found that household solid fuel '
 'use accounted for 12% of ambient\n'
 '\xadPM2.5 globally in \xad201020, with higher contribution in China (19% in '
 '2013) and India (24% in 2015)21,22. Exposure assessment for the GBD also '
 'showed substantial exposures occurring in rural \xadareas23. Potential '
 'sources of rural ambient air pollution may be from households using solid '
 'fuels for cooking and heating, from nearby urban and rural sources, and from '
 'secondary pollutants at intercontinental \xadscales8.\n'
 'Only a handful of studies have reported ambient concentrations in rural '
 'areas, as health-damaging air pollu- tion has been considered a largely '
 'urban \xadphenomenon8. Our study revealed high levels of outdoor air '
 'pollution in a rural area of China with a high incidence of lung cancer. The '
 '24-h geometric mean outdoor \xadPM2.5 concentra- tions in villages of the '
 'two rural counties, 51.6 µg/m3 in Xuanwei and 4

In [16]:
res = await extract_policies(df.discussion.iloc[650], prompt, model_name, client)
res

PolicyExtractionResponse(contains_policies=True, policies=['[SOCIAL] [INFORMATIONAL] [NATIONAL] The World Health Organization (WHO) set a guideline value for outdoor PM2.5 concentrations at 25 µg/m3.', '[SOCIAL] [REGULATORY] [NATIONAL] The Chinese national standard for 24-h criterion of BaP is 2.5 ng/m3.', '[ENERGY] [ORGANISATIONAL] [INDIVIDUALS] Converting from unvented stoves to either stoves with chimneys or portable stoves (which were intended to be lit outdoors before being carried inside for use).', '[ENERGY] [ORGANISATIONAL] [INDIVIDUALS] Installing chimneys to discharge pollutants from inside homes to the outdoors.', '[ENERGY] [ORGANISATIONAL] [INDIVIDUALS] Moving populations up the “energy ladder” towards the use of cleaner fuels (e.g. biogas and electricity).'])

In [17]:
# process by batch
batch_size = 100
#rpm_quota = 600
#wait_time = 60 / (rpm_quota / batch_size)
results = []
for i in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[i:i+batch_size]
    texts = batch.discussion.values.tolist()
    preds = await batch_extract_policies(texts, prompt, model_name, client)
    results.extend(preds)
    time.sleep(0.1)

  0%|          | 0/100 [00:00<?, ?it/s]

Error
Error
Error
Error
Error
Error


In [18]:
# One list of policies per result, aligned with the rows of `df`.
# Failed extractions are returned as plain error strings, which have no
# `.policies` attribute; they map to an empty list. Only the missing-attribute
# case is handled, so unrelated errors and KeyboardInterrupt are no longer
# swallowed by a bare `except:`.
policies = [getattr(r, "policies", []) for r in results]

In [19]:
# Attach the extracted policies as a new column (one list per row; mutates `df` in place).
df['policies'] = policies

In [20]:
# Persist the frame with the new `policies` column as Parquet.
df.to_parquet('data/sample_10k_with_policies_extracted_tagged_v4_2026-01-30.parquet')

In [21]:
# Same data exported as CSV.
df.to_csv('data/sample_10k_with_policies_extracted_tagged_v4_2026-01-30.csv')

In [3]:
# NOTE(review): this reloads the v2 export dated 2026-01-28, not the v4 file
# (2026-01-30) written earlier in this notebook, and the cell's execution
# count (3) shows it was run out of order — confirm which version is meant
# to be inspected below.
df = pd.read_parquet('data/sample_10k_with_policies_extracted_tagged_v2_2026-01-28.parquet')

In [5]:
# Manual inspection: print the text and its extracted policies for the first
# 100 rows, separated by a ruler line.
i = 0
for i, (oaid, row) in enumerate(df.iterrows(), start=1):
    print(oaid, 'TEXT', row.discussion, sep='\n')
    print('Extracted policies:', row.policies)
    print('=' * 80)
    if i >= 100:
        break

W4387168299
TEXT
This is the first study to assess the association between frailty status and insulin resistance estimated by the TyG index in older urban residents. The present study investi­ gated the association between TyG index and frailty sta­ tus from cross-sectional, retrospective and prospective levels using an ongoing aging cohort dataset. Elevated
TyG index and the high-stable trajectory group of the
TyG index was found to be associated with a significantly increased risk of frailty, which persisted even after adjust­ ing for potential confounders such as BMI, nutritional status, exercise and cardiovascular-metabolic factors. In the subgroup analysis, subjects with a higher BMI and who follow a high-stable trajectory of TyG index run a greater risk of developing prefrailty or frailty. Notably, these results remained robust in the sensitivity analyses that excluded subjects with the use of hypoglycemic or lipid-lowering agents, further accentuating the consis­ tency of this a