In [1]:
import requests
import pandas as pd
from math import ceil
from time import sleep
import time
import json
from IPython.display import display

# 1. Data Retrieval and Integration

In [3]:
BASE = "https://api.openelectricity.org.au/v4"
TOKEN = "oe_3ZPA7phVarPYZc7Qks7uijfE" 
NETWORK = "NEM"     

HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/json",
}


DATE_START = "2025-10-01T00:00:00"  
DATE_END   = "2025-10-08T00:00:00" 


In [4]:
#get all facilities and related units in NEM
def get_nem_facilities():
    url = f"{BASE}/facilities/"
    # 按文档：network_id 可传数组；也可用 status_id / fueltech_id 做更细过滤
    params = {
        "network_id": ["NEM"],          # 只要 NEM
        "status_id": ["operating"],     # 只要运营中
       
    }
    r = requests.get(url, headers=HEADERS, params=params, timeout=60)
    r.raise_for_status()
    js = r.json()
 
    fac_df = pd.json_normalize(js["data"])  # 设施层
    # 展开 units 列为行（record_path=units），带上 facility 的 code 作为 meta
    units_df = pd.json_normalize(
    js["data"],
    record_path="units",
    meta=["code"],
    meta_prefix="facility_",    # ← 防止重复名
    errors="ignore"
    ).rename(columns={"code": "unit_code", "facility_code": "facility_code"})

    # 某些版本列名就是 'code'（unit）与 'facility_code'（meta），做个兜底：
    if "unit_code" not in units_df.columns:
        # 常见字段名：'code'（unit 的 code）
        if "code" in units_df.columns:
            units_df = units_df.rename(columns={"code": "unit_code"})

    return fac_df, units_df

fac_df, units_df = get_nem_facilities()

units_df.head()

Unnamed: 0,unit_code,fueltech_id,status_id,capacity_registered,capacity_maximum,data_first_seen,data_last_seen,dispatch_type,created_at,updated_at,capacity_storage,emissions_factor_co2,facility_code
0,ADPPV1,solar_utility,operating,24.75,19.0,2021-05-18T13:10:00+10:00,2025-10-27T19:30:00+10:00,GENERATOR,2023-10-18T04:34:30Z,2024-12-16T23:52:12Z,,,ADP
1,ADPPV2,solar_utility,operating,0.2,0.2,,,GENERATOR,2023-10-18T04:34:30Z,2024-12-16T23:50:10Z,,,ADP
2,ADPPV3,solar_utility,operating,0.02,0.02,,,GENERATOR,2023-10-18T04:34:30Z,2024-12-16T23:51:11Z,,,ADP
3,ADPBA1G,battery_discharging,operating,7.76,6.15,2021-05-18T10:55:00+10:00,2025-10-27T20:45:00+10:00,GENERATOR,2023-10-18T04:34:30Z,2025-06-23T05:34:25Z,12.6,,ADP
4,ADPBA1L,battery_charging,operating,7.76,6.15,2021-05-18T09:55:00+10:00,2025-10-27T20:35:00+10:00,LOAD,2023-10-18T04:34:30Z,2025-06-23T05:34:16Z,12.6,,ADP


In [5]:
fac_df.head()

Unnamed: 0,code,name,network_id,network_region,description,units,updated_at,created_at,location.lat,location.lng
0,ADP,Adelaide Desalination,NEM,SA1,"<p>The Adelaide Desalination plant (ADP), form...","[{'code': 'ADPPV1', 'fueltech_id': 'solar_util...",2025-08-05T06:08:12Z,2023-10-18T04:34:30Z,-35.096948,138.484061
1,ALDGASF,Aldoga,NEM,QLD1,<p>The Aldoga Solar Farm will be approximately...,"[{'code': 'ALDGASF1', 'fueltech_id': 'solar_ut...",2025-03-25T00:52:44Z,2025-01-31T04:19:33Z,-23.839544,151.0849
2,ANGASTON,Angaston,NEM,SA1,<p>Angaston Power Station is a diesel-powered ...,"[{'code': 'ANGAST1', 'fueltech_id': 'distillat...",2025-09-07T01:53:13Z,2023-10-18T04:34:32Z,-34.503948,139.024296
3,APPIN,Appin,NEM,NSW1,"<p>In a world first, EDL developed the largest...","[{'code': 'APPIN', 'fueltech_id': 'gas_wcmg', ...",2025-09-07T01:53:15Z,2023-10-18T04:34:32Z,-34.210868,150.792711
4,ARWF,Ararat,NEM,VIC1,<p>Ararat Wind Farm is wind farm in western Vi...,"[{'code': 'ARWF1', 'fueltech_id': 'wind', 'sta...",2025-07-08T03:42:06Z,2023-10-18T04:34:32Z,-37.263393,143.082116


In [6]:
#get specific facilities time-series data
def fetch_facility_timeseries(facility_codes, date_start=DATE_START, date_end=DATE_END):
    """对一批 facility_codes 请求两种指标（power, emissions），5m 粒度"""
    url = f"{BASE}/data/facilities/{NETWORK}"
    params = {
        "metrics": ["power", "emissions"],
        "interval": "5m",
        "facility_code": facility_codes,  # 批量
        "date_start": date_start,
        "date_end": date_end,
    }
    r = requests.get(url, headers=HEADERS, params=params, timeout=120)
    r.raise_for_status()
    return r.json()

def timeseries_to_df(js):
    """把返回扁平化为：
       timestamp | unit_code | metric | value | unit
    """
    rows = []
    data_block = js.get("data", [])
    if not isinstance(data_block, list):
        return pd.DataFrame(rows)  # 可能是 '-'，直接空表

    for blk in data_block:
        if not isinstance(blk, dict):
            continue
        metric = blk.get("metric")  # 'power' / 'emissions'
        unit   = blk.get("unit")    # 'MW' / 'tCO2e'
        results = blk.get("results", [])
        for res in results:
            if not isinstance(res, dict):
                continue
            # unit_code 优先从 columns 取；没有就从 name 拆
            unit_code = None
            cols = res.get("columns")
            if isinstance(cols, dict):
                unit_code = cols.get("unit_code")
            name = res.get("name")
            if not unit_code and isinstance(name, str) and "_" in name:
                unit_code = name.split("_", 1)[1]  # e.g. power_ADPBA1 → ADPBA1

            for item in res.get("data", []):  # item = [timestamp, value]
                if isinstance(item, (list, tuple)) and len(item) >= 2:
                    ts, val = item[0], item[1]
                    rows.append({
                        "timestamp": ts,
                        "unit_code": unit_code,
                        "metric": metric,
                        "value": val,
                        "unit": unit
                    })
    return pd.DataFrame(rows)

# 先拿到 facility_code 列表（可按需过滤 fueltech/network_region 来减少范围）
facility_codes = fac_df["code"].dropna().unique().tolist()

# 日配额考虑：把 batch 调大一点（但单次响应也会更大）
BATCH   = 30
SLEEP_S = 0.25

all_parts = []
req_count = 0

for i in range(0, len(facility_codes), BATCH):
    batch = facility_codes[i:i+BATCH]
    js = fetch_facility_timeseries(batch)
    req_count += 1
    df_part = timeseries_to_df(js)
    all_parts.append(df_part)
    time.sleep(SLEEP_S)

df_unit = pd.concat(all_parts, ignore_index=True) if all_parts else pd.DataFrame()
print("requests used:", req_count, "rows:", len(df_unit))
display(df_unit.head())

requests used: 14 rows: 1958730


Unnamed: 0,timestamp,unit_code,metric,value,unit
0,2025-10-01T00:00:00+10:00,ADPBA1,power,-0.004,MW
1,2025-10-01T00:05:00+10:00,ADPBA1,power,-0.046,MW
2,2025-10-01T00:10:00+10:00,ADPBA1,power,0.0,MW
3,2025-10-01T00:15:00+10:00,ADPBA1,power,0.003,MW
4,2025-10-01T00:20:00+10:00,ADPBA1,power,-0.018,MW


In [7]:
# 建立 unit_code → facility_code 的映射
unit_to_fac = units_df[["unit_code", "facility_code"]].dropna().drop_duplicates()
df_unit = df_unit.merge(unit_to_fac, on="unit_code", how="left")

# per-facility 宽表：同一时间点、同一 facility 的 power/emissions 各一列（把多单元求和）
df_fac_wide = (
    df_unit
    .pivot_table(index=["timestamp", "facility_code"], columns="metric", values="value", aggfunc="sum")
    .rename_axis(columns=None)
    .reset_index()
    .sort_values(["timestamp", "facility_code"])
)

# 将 fac_df 中需要的列挑出来
fac_info = fac_df[["code", "name", "location.lat", "location.lng","network_region"]].rename(
    columns={"code": "facility_code", "name": "facility_name"}
)

# 左连接合并（保留 df_fac_wide 中的所有行）
df_fac_wide = df_fac_wide.merge(fac_info, on="facility_code", how="left")

# 查看结果
display(df_fac_wide.head())


Unnamed: 0,timestamp,facility_code,emissions,power,facility_name,location.lat,location.lng,network_region
0,2025-10-01T00:00:00+10:00,0MREH,0.0,0.0,Melbourne A1,-37.661274,144.726302,VIC1
1,2025-10-01T00:00:00+10:00,0MREHA2,0.0,0.0,Melbourne A2,-37.663934,144.726927,VIC1
2,2025-10-01T00:00:00+10:00,0TARONGBESS,0.0,0.0,Tarong,-26.780051,151.912068,QLD1
3,2025-10-01T00:00:00+10:00,0WAMBOWF,0.0,65.23,Wambo,-26.603045,151.246876,QLD1
4,2025-10-01T00:00:00+10:00,ADP,0.0,0.0,Adelaide Desalination,-35.096948,138.484061,SA1


In [8]:
def fetch_market_network(network_region=None,date_start=DATE_START, date_end=DATE_END):
    url = f"{BASE}/market/network/{NETWORK}"
    params = {
        "metrics": ["price", "demand"],
        "interval": "5m",
        "date_start": date_start,
        "date_end": date_end,
        "primary_grouping": "network_region",  # "network" 或 "network_region"
    }
    if network_region:
        params["network_region"] = network_region

    r = requests.get(url, headers=HEADERS, params=params, timeout=60)
    r.raise_for_status()
    return r.json()

# -------- 2) 扁平化为 DataFrame（长表：timestamp / metric / value / unit / network_region） --------
def market_to_df(js):
    """
    将 /v4/market/network 返回的数据扁平化为长表：
    列：timestamp | network_region | metric | value | unit
    兼容 columns.region / columns.network_region；必要时从 name 里解析区域。
    """
    rows = []
    data_block = js.get("data", [])
    if not isinstance(data_block, list):
        return pd.DataFrame(rows)

    for blk in data_block:
        metric = blk.get("metric")         # e.g. "price" / "demand"
        unit   = blk.get("unit")           # e.g. "$/MWh" / "MW"

        for res in blk.get("results", []):
            cols = res.get("columns") or {}
            name = res.get("name") or ""

            # 优先按你截图：columns 里是 {"region": "NSW1"}
            region = cols.get("region") or cols.get("network_region")

            # 兜底：从 name 里解析（如 "price_NSW1"）
            if region is None and isinstance(name, str):
                m = re.search(r'_(NSW1|VIC1|QLD1|SA1|TAS1)\b', name)
                region = m.group(1) if m else None

            # 展开该区域的时序点
            for item in res.get("data", []):
                if isinstance(item, (list, tuple)) and len(item) >= 2:
                    ts, val = item[0], item[1]
                    rows.append({
                        "timestamp": ts,
                        "network_region": region,
                        "metric": metric,
                        "value": val,
                        "unit": unit
                    })

    df = pd.DataFrame(rows)
    return df


js = fetch_market_network()

df_market = market_to_df(js)

# # 如需宽表（每行一个时间+区域，price/demand 两列）：
df_market_wide = (
    df_market
    .pivot_table(
        index=["timestamp", "network_region"],  # 每个区域、每个时间点一行
        columns="metric",                      # 每种指标（price, demand）变成一列
        values="value",                        # 数值列
        aggfunc="mean"                         # 遇到重复取最后一个（或可换成 'mean'）
    )
    .rename_axis(columns=None)                 # 去掉多余 axis 名称
    .reset_index()                             # 把索引恢复成普通列
    .sort_values(["timestamp", "network_region"])
)
display(df_market_wide.head(10))

Unnamed: 0,timestamp,network_region,demand,price
0,2025-10-01T00:00:00+10:00,NSW1,7105.57,56.98
1,2025-10-01T00:00:00+10:00,QLD1,5989.24,54.82
2,2025-10-01T00:00:00+10:00,SA1,1564.92,8.11
3,2025-10-01T00:00:00+10:00,TAS1,898.71,0.12
4,2025-10-01T00:00:00+10:00,VIC1,4893.49,8.95
5,2025-10-01T00:05:00+10:00,NSW1,7170.68,80.01
6,2025-10-01T00:05:00+10:00,QLD1,5920.4,67.3
7,2025-10-01T00:05:00+10:00,SA1,1565.38,0.01
8,2025-10-01T00:05:00+10:00,TAS1,897.18,0.2
9,2025-10-01T00:05:00+10:00,VIC1,4889.73,0.01


In [9]:
df_fac_wide = (
    df_fac_wide
    .merge(
        df_market_wide[["timestamp", "network_region", "demand", "price"]],
        on=["timestamp", "network_region"],
        how="left"
    )
)
df_fac_wide = df_fac_wide.rename(columns={"network_region": "market(network_region)"})
df_fac_wide.head()

Unnamed: 0,timestamp,facility_code,emissions,power,facility_name,location.lat,location.lng,market(network_region),demand,price
0,2025-10-01T00:00:00+10:00,0MREH,0.0,0.0,Melbourne A1,-37.661274,144.726302,VIC1,4893.49,8.95
1,2025-10-01T00:00:00+10:00,0MREHA2,0.0,0.0,Melbourne A2,-37.663934,144.726927,VIC1,4893.49,8.95
2,2025-10-01T00:00:00+10:00,0TARONGBESS,0.0,0.0,Tarong,-26.780051,151.912068,QLD1,5989.24,54.82
3,2025-10-01T00:00:00+10:00,0WAMBOWF,0.0,65.23,Wambo,-26.603045,151.246876,QLD1,5989.24,54.82
4,2025-10-01T00:00:00+10:00,ADP,0.0,0.0,Adelaide Desalination,-35.096948,138.484061,SA1,1564.92,8.11


# 2. Data Preprocessing

In [11]:
df_fac_wide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668353 entries, 0 to 668352
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   timestamp               668353 non-null  object 
 1   facility_code           668353 non-null  object 
 2   emissions               668353 non-null  float64
 3   power                   668353 non-null  float64
 4   facility_name           668353 non-null  object 
 5   location.lat            668353 non-null  float64
 6   location.lng            668353 non-null  float64
 7   market(network_region)  668353 non-null  object 
 8   demand                  667033 non-null  float64
 9   price                   654823 non-null  float64
dtypes: float64(6), object(4)
memory usage: 51.0+ MB


In [12]:
#把时间转换成澳东时区
df_fac_wide["timestamp"] = pd.to_datetime(df_fac_wide["timestamp"], utc=True)
df_fac_wide["timestamp"] = df_fac_wide["timestamp"].dt.tz_convert("Australia/Sydney")

df_fac_wide["facility_code"]  = df_fac_wide["facility_code"].astype("string")
df_fac_wide["facility_name"]  = df_fac_wide["facility_name"].astype("string")
df_fac_wide["market(network_region)"]  = df_fac_wide["facility_code"].astype("string")

In [13]:
neg_demand = df_fac_wide[df_fac_wide["demand"] < 0]
print(f"Negative demand rows: {len(neg_demand)}")
display(neg_demand.head())

Negative demand rows: 290


Unnamed: 0,timestamp,facility_code,emissions,power,facility_name,location.lat,location.lng,market(network_region),demand,price
333607,2025-10-04 12:15:00+10:00,ADP,0.0,0.088,Adelaide Desalination,-35.096948,138.484061,ADP,-12.17,-31.19
333608,2025-10-04 12:15:00+10:00,AGLHAL,0.0,0.0,Hallett,-33.34931,138.752633,AGLHAL,-12.17,-31.19
333611,2025-10-04 12:15:00+10:00,ANGASTON,0.0,0.0,Angaston,-34.503948,139.024296,ANGASTON,-12.17,-31.19
333620,2025-10-04 12:15:00+10:00,BARKIPS,0.0,0.0,Barker Inlet,-34.804,138.524,BARKIPS,-12.17,-31.19
333632,2025-10-04 12:15:00+10:00,BLUFF,0.0,0.0,The Bluff,-33.367741,138.79379,BLUFF,-12.17,-31.19


In [14]:
neg_demand = df_fac_wide[df_fac_wide["demand"] < 0]
print(f"Negative demand rows: {len(neg_demand)}")

ratio = (len(neg_demand) / len(df_fac_wide) * 100)
print(f"Negative demand ratio (%): {round(ratio, 2)}")

display(neg_demand.head())

Negative demand rows: 290
Negative demand ratio (%): 0.04


Unnamed: 0,timestamp,facility_code,emissions,power,facility_name,location.lat,location.lng,market(network_region),demand,price
333607,2025-10-04 12:15:00+10:00,ADP,0.0,0.088,Adelaide Desalination,-35.096948,138.484061,ADP,-12.17,-31.19
333608,2025-10-04 12:15:00+10:00,AGLHAL,0.0,0.0,Hallett,-33.34931,138.752633,AGLHAL,-12.17,-31.19
333611,2025-10-04 12:15:00+10:00,ANGASTON,0.0,0.0,Angaston,-34.503948,139.024296,ANGASTON,-12.17,-31.19
333620,2025-10-04 12:15:00+10:00,BARKIPS,0.0,0.0,Barker Inlet,-34.804,138.524,BARKIPS,-12.17,-31.19
333632,2025-10-04 12:15:00+10:00,BLUFF,0.0,0.0,The Bluff,-33.367741,138.79379,BLUFF,-12.17,-31.19


In [15]:
neg_price = df_fac_wide[df_fac_wide["price"] < 0]
print(f"Negative price rows: {len(neg_price)}")
display(neg_price.head())

Negative price rows: 279826


Unnamed: 0,timestamp,facility_code,emissions,power,facility_name,location.lat,location.lng,market(network_region),demand,price
7260,2025-10-01 01:50:00+10:00,0MREH,0.0,1.4704,Melbourne A1,-37.661274,144.726302,0MREH,4300.21,-0.06
7261,2025-10-01 01:50:00+10:00,0MREHA2,0.0,0.0,Melbourne A2,-37.663934,144.726927,0MREHA2,4300.21,-0.06
7264,2025-10-01 01:50:00+10:00,ADP,0.0,0.0,Adelaide Desalination,-35.096948,138.484061,ADP,1513.45,-0.05
7265,2025-10-01 01:50:00+10:00,AGLHAL,0.0,0.0,Hallett,-33.34931,138.752633,AGLHAL,1513.45,-0.05
7266,2025-10-01 01:50:00+10:00,AGLSOM,0.0,0.0,Somerton,-37.630949,144.953098,AGLSOM,4300.21,-0.06


In [16]:
df_fac_wide.to_csv("nem_per_facility_power_emissions_5m_2025-10-01_to_08.csv", index=False)

# 3.Data Publishing via MQTT

In [18]:
# 1: MQTT broker 
import socket

host, port = "test.mosquitto.org", 1883
# host, port = "broker.hivemq.com", 1883

s = socket.socket(); s.settimeout(3)
try:
    s.connect((host, port))
    print(f"MQTT broker reachable: {host}:{port}")
finally:
    s.close()

print(f"\nMQTT broker candidate: {host}:{port}")




MQTT broker reachable: test.mosquitto.org:1883

MQTT broker candidate: test.mosquitto.org:1883


In [19]:
# 2: subscriber 
import paho.mqtt.client as mqtt
import json, uuid

HOST, PORT = "test.mosquitto.org", 1883
TOPIC = "nem/yjia0057/power_emissions"

# 1) stop previous subscriber if exists (safe re-run)
try:
    c.loop_stop()
    c.disconnect()
except Exception:
    pass

# 2) use random client_id to avoid kicking each other
client_id = f"nem-tail-{uuid.uuid4().hex[:6]}"

count = 0

def on_connect_quiet(client, userdata, flags, rc, properties=None):
    # re-connect callback after first connect: subscribe only, no printing
    client.subscribe(TOPIC, qos=1)

def on_connect_once(client, userdata, flags, rc, properties=None):
    # print only once, then switch to quiet callback to suppress further logs
    print("connected, rc=", rc)
    client.on_connect = on_connect_quiet
    client.subscribe(TOPIC, qos=1)

def on_message(client, userdata, msg):
    global count
    count += 1
    if count <= 3:
        try:
            print("[sub] sample:", json.loads(msg.payload.decode("utf-8")))
        except Exception:
            print("[sub] sample(non-JSON):", msg.payload[:120])
    if count % 10000 == 0:
        print(f"[sub] received {count} messages")

c = mqtt.Client(
    client_id=client_id,
    protocol=mqtt.MQTTv311,
    transport="tcp",
    callback_api_version=mqtt.CallbackAPIVersion.VERSION2
)

# lower-frequency reconnect to reduce noise if broker resets the connection
c.reconnect_delay_set(min_delay=2, max_delay=30)

c.on_connect = on_connect_once
c.on_message = on_message
c.connect(HOST, PORT, keepalive=60)
c.loop_start()
print(f"subscriber started (id={client_id}).")




subscriber started (id=nem-tail-e844d5).


In [20]:
# 3: 发布端工具函数
import json, time
import pandas as pd

_REGION_CANDS = ["market(network_region)", "network_region", "region"]

def _detect_region_col(df: pd.DataFrame) -> str:
    for c in _REGION_CANDS:
        if c in df.columns:
            return c
    raise KeyError("region column not found (market(network_region)/network_region/region)")

def _parse_ts_to_utc(ts_val):
    try:
        ts = pd.to_datetime(ts_val, errors="raise")
        if getattr(ts, "tzinfo", None) is None:
            ts = ts.tz_localize("UTC")
        else:
            ts = ts.tz_convert("UTC")
        return ts
    except Exception:
        return pd.NaT

def _connect_client(host="localhost", port=1883, user="", password=""):
    # 创建 MQTT 客户端，兼容 paho 2.x 的回调 API 版本
    cli = mqtt.Client(
        client_id="nem-pub",
        protocol=mqtt.MQTTv311,
        transport="tcp",
        callback_api_version=mqtt.CallbackAPIVersion.VERSION2
    )
    if user:
        cli.username_pw_set(user, password or "")
    cli.reconnect_delay_set(min_delay=1, max_delay=10)
    cli.will_set("nem/system/nem-pub", payload="offline", qos=1, retain=False)
    cli.connect(host, port, keepalive=60)
    return cli

def _row_to_payload(row: pd.Series, region_col: str, seq: int) -> dict:
    # 将一行数据转为发布载荷
    ts = row["timestamp"]
    ts_iso = ts.tz_convert("UTC").isoformat().replace("+00:00", "Z") if pd.notna(ts) else None
    return {
        "timestamp_utc": ts_iso,             
        "facility_code": row.get("facility_code"),
        "facility_name": row.get("facility_name"),
        "region": row.get(region_col),
        "lat": row.get("location.lat", None),
        "lng": row.get("location.lng", None),
        "power": row.get("power"),
        "emissions": row.get("emissions"),
        "price": row.get("price", None),
        "demand": row.get("demand", None),
        "seq": seq
    }


In [21]:
# 4: 从 DataFrame 顺序发布到 MQTT
def publish_from_df_progress(
    df: pd.DataFrame,
    *,
    host="localhost",
    port=1883,
    topic="nem/power_emissions",
    delay=0.1,
    also_split=True,
    user="",
    password="",
    log_every=1000
) -> int:
    """
    Publish rows from df (df_fac_wide) to MQTT in event-time order.
    - stable sort by ['timestamp','facility_code']
    - at least 0.1s delay between messages (assignment requirement)
    - also_split=True: additionally publish to 'nem/{REGION}/{FACILITY}/power_emissions'
    - console output is single-line progress to avoid flooding
    """
    if "timestamp" not in df.columns:
        raise KeyError("DataFrame lacks `timestamp` column")

    # 拷贝数据并统一时间为 UTC
    dfx = df.copy()
    dfx["timestamp"] = dfx["timestamp"].apply(_parse_ts_to_utc)

    # 稳定排序：确保严格的事件时间顺序
    dfx = dfx.sort_values(["timestamp", "facility_code"], kind="mergesort").reset_index(drop=True)

    # 自动识别区域列名
    region_col = _detect_region_col(dfx)

    # 建立连接并开启网络循环
    cli = _connect_client(host=host, port=port, user=user, password=password)
    cli.loop_start()

    n = len(dfx)
    print(f"[publisher] start: total rows = {n}, topic = {topic}, delay = {delay}s")
    t0 = time.time()
    wrote_inline = False

    # 逐行发布（QoS=1；不保留消息；满足≥0.1s 间隔）
    for i, row in enumerate(dfx.itertuples(index=False), start=1):
        payload = _row_to_payload(pd.Series(row._asdict()), region_col, i-1)
        j = json.dumps(payload, ensure_ascii=False)

        # 汇总主题
        cli.publish(topic, j, qos=1, retain=False)

        # 分层主题：nem/{REGION}/{FACILITY}/power_emissions
        if also_split:
            reg = str(payload.get("region", "NA")).replace("/", "_")
            fac = str(payload.get("facility_code", "NA")).replace("/", "_")
            split_topic = f"nem/{reg}/{fac}/power_emissions"
            cli.publish(split_topic, j, qos=1, retain=False)

        # 单行覆盖式进度输出
        if i % max(1, int(log_every)) == 0:
            elapsed = time.time() - t0
            rate = i / max(elapsed, 1e-9)
            pct = (i * 100.0 / n) if n else 100.0
            eta = (n - i) / max(rate, 1e-9)
            print(
                f"\r[publisher] {i}/{n} ({pct:5.2f}%)  {rate:6.1f} msg/s  ETA {eta/60:5.1f} min",
                end="",
                flush=True
            )
            wrote_inline = True

        # 至少 0.1 秒的间隔
        time.sleep(max(0.1, float(delay)))

    # 关闭循环与连接
    cli.loop_stop(); cli.disconnect()

    # 若使用单行覆盖式打印，收尾补一个换行
    if wrote_inline:
        print()

    print(f"[publisher] done. sent {n} rows in {time.time()-t0:.1f}s")
    return n



In [22]:
# 5: 小样本演示
sample = (
    df_fac_wide
    .sort_values(["timestamp", "facility_code"])
    .head(300)
    .copy()
)

publish_from_df_progress(
    sample,
    host="test.mosquitto.org", port=1883,
    topic="nem/yjia0057/power_emissions",
    delay=0.1,
    also_split=True,
    log_every=100  
)



[publisher] start: total rows = 300, topic = nem/yjia0057/power_emissions, delay = 0.1s
connected, rc= Success
[sub] sample: {'timestamp_utc': '2025-09-30T14:00:00Z', 'facility_code': 'AVLSF', 'facility_name': 'Avonlie', 'region': None, 'lat': None, 'lng': None, 'power': 0.0, 'emissions': 0.0, 'price': 56.98, 'demand': 7105.57, 'seq': 10}
[sub] sample: {'timestamp_utc': '2025-09-30T14:00:00Z', 'facility_code': 'B2PS', 'facility_name': 'Braemar 2', 'region': None, 'lat': None, 'lng': None, 'power': 0.0806, 'emissions': 0.0039, 'price': 54.82, 'demand': 5989.24, 'seq': 11}
[sub] sample: {'timestamp_utc': '2025-09-30T14:00:00Z', 'facility_code': 'BALBESS', 'facility_name': 'Ballarat', 'region': None, 'lat': None, 'lng': None, 'power': 0.0, 'emissions': 0.0, 'price': 8.95, 'demand': 4893.49, 'seq': 12}
[publisher] 300/300 (100.00%)     9.9 msg/s  ETA   0.0 min
[publisher] done. sent 300 rows in 30.8s


300

# 5.Continuous Execution 

In [23]:
# === Cell 7: Task 5 连续执行运行器 ===
import time

def run_continuous_publish(
    df,
    *,
    rounds: int | None = 2,      # 轮数
    round_sleep: int = 60,       # 每轮结束后等待60秒
    host="test.mosquitto.org",
    port=1883,
    topic="nem/yjia0057/power_emissions",
    delay=0.1,                   # 单条 0.1s
    also_split=True,
    log_every=50000             
):

    i = 0
    try:
        while True:
            i += 1
            print(f"\n[runner] round {i} start")
            sent = publish_from_df_progress(
                df,
                host=host, port=port,
                topic=topic,
                delay=delay,
                also_split=also_split,
                log_every=log_every
            )
            print(f"[runner] round {i} done: published {sent} rows")
            if (rounds is not None) and (i >= rounds):
                print("[runner] finished all rounds.")
                break
            print(f"[runner] sleeping {round_sleep}s before next round ...")
            time.sleep(round_sleep)
    except KeyboardInterrupt:
        print("\n[runner] stopped by user.")


In [45]:
run_continuous_publish(
    df=sample,            
    rounds=2,             # 跑两轮
    round_sleep=60,       # 每轮额外等待 60 秒
    topic="nem/yjia0057/power_emissions",
    delay=0.1,
    log_every=200       
)



[runner] round 1 start
[publisher] start: total rows = 300, topic = nem/yjia0057/power_emissions, delay = 0.1s
[publisher] 200/300 (66.67%)    10.0 msg/s  ETA   0.2 min
[publisher] done. sent 300 rows in 31.0s
[runner] round 1 done: published 300 rows
[runner] sleeping 60s before next round ...

[runner] round 2 start
[publisher] start: total rows = 300, topic = nem/yjia0057/power_emissions, delay = 0.1s
[publisher] 200/300 (66.67%)    10.0 msg/s  ETA   0.2 min
[publisher] done. sent 300 rows in 30.5s
[runner] round 2 done: published 300 rows
[runner] finished all rounds.
