In [1]:
from contextlib import nullcontext

import pandas as pd

In [2]:
df = pd.read_csv("../data/gt.csv")
df.rename(columns={
    'Pieces1': 'Pieces',
    'Manufacturer1': "Manufacturer",
    "SubType1": "SubType",
    "HxType1": "HxType",
    "NominelEffectEach1": "NominalEffectEach",
    "Year1": "Year"
}, inplace=True)
df.head()

Unnamed: 0,S_text,L_text,Pieces,Manufacturer,SubType,HxType,NominalEffectEach,Year
0,Fjernvarme med isoleret veksler (indirekte anl...,Bygningen opvarmes med fjernvarme. Anlægget er...,1.0,Unknown,,Isoleret varmeveksler,,
1,Eksisterende fjernvarme,,1.0,Unknown,,,,
2,Fjernvarme med uisoleret veksler (indirekte an...,Bygningen opvarmes med fjernvarme. Anlægget er...,1.0,Unknown,,Uisoleret varmeveksler,,After 1980
3,Fjernvarme med isoleret veksler (indirekte anl...,Ejendommen opvarmes med fjernvarme fra HOFOR.\...,1.0,Danfoss Redan,,Isoleret varmeveksler,,
4,Fjernvarme med isoleret veksler (indirekte anl...,Bygningen opvarmes med fjernvarme. Anlægget er...,1.0,Unknown,,Isoleret varmeveksler,,Before 1970


Create a number for each label in the dataset. This approach is not extensible: if the manufacturer is not present in the training data, the model won't be able to predict it.

In [3]:
target_cols = df.columns[2:]

label_maps = {}
for col in target_cols:
    if df[col].dtype == "object":
        df.fillna({col: "NaN"})
        label_maps[col] = {label: idx for idx, label in enumerate(df[col].unique())}
        df[f"{col}_idx"] = df[col].map(label_maps[col])

In [4]:
import sys
sys.path.append("../scripts")
from local_model import chatbot_extracter

In [5]:
MODEL_PATH = "../models/Qwen3-0.6B-Q5_K_M.gguf"  # Path to your GGUF file
N_CTX = 2048  # Context window size
N_THREADS = 8  # CPU threads

def build_prompt(text):
    return f"""
    Example 1:
    Input: "Fjernvarme med isoleret veksler (indirekte anlæg) - nyere.
    Bygningen opvarmes med fjernvarme.
    Anlægget er udført med isoleret varmeveksler og indirekte centralvarmevand i fordelingsnettet.
    Anlægget er opstillet i Badensgade 41."
    Output: {{
        "Pieces": 1,
        "Manufacturer": "Unknown",
        "SubType": null,
        "HxType": "Eksisterende fjernvarme",
        "NominalEffectEach": null,
        "Year": null
    }}

    Example 2:
    Input: "Fjernvarme med isoleret veksler (indirekte anlæg) - nyere. Ejendommen opvarmes med fjernvarme fra HOFOR.
Til opvarmning af radiatorerne er der 1 stk. isoleret varmevekslere monteret i fjernvarmeunit,  fabrikat Redan.
Fjernvarmeunit er placeret i kælderen."
    Output: {{
        "Pieces": 1,
        "Manufacturer": "Danfoss Redan",
        "SubType": null,
        "HxType": "Isoleret varmeveksler",
        "NominalEffectEach": null,
        "Year": null
    }}

    Example 3:
    Input: "Fjv. Installation efter 1980 (isoleret). Ejendommen opvarmes med indirekte fjernvarme.
Bygningen opvarmes med fjernvarme med veksler.
Veksleren er af fabrikat WPH, type SL70TL-1-90CC fra 2008 og vurderes isoleret med 40 mm PUR.
Jf. tidligere energimærkerapport er der brændeovne i de enkelte boliger.
I værkstedet i baghuset mod nordvest er der monteret en lille elradiator. Rummet er ikke medtaget som opvarmet i beregningen, da rummet alene vurderes kortvarigt opvarmet."
    Output: {{
        "Pieces": 1,
        "Manufacturer": "WPH Teknik",
        "SubType": SL70TL-1-90CC,
        "HxType": "Isoleret varmeveksler",
        "NominalEffectEach": null,
        "Year": 2008
    }}

    Example 4:
    Input: "Fjernvarme med isoleret veksler (indirekte anlæg) - efter 1980. Bygningen opvarmes med fjernvarme.
Anlægget er udført med isoleret varmeveksler og indirekte centralvarmevand i fordelingsnettet.
I teknikrum er opstillet 2 stk. varmevekslere - 1 stk. fabr. Sondex type ST 15-ST (radiatorer) à 200 kW og 1 stk. Sondex type ST 20-ST (ventilationsvarmeflader) à 180 kW.
Vekslere er præisolerede."
    Output: {{
        "Pieces": 1,
        "Manufacturer": "Sondex Teknik",
        "SubType": "ST 15-ST",
        "HxType": "Isoleret varmeveksler",
        "NominalEffectEach": "200 kW",
        "Year": "After 1980"
    }}

    You are provided a sentence, and you have to extract the following values:
        - pieces: the quantity of heating systems.
        - manufacturer: if it is not specified it is Unknown
        - SubType: must be id to identify the model, e.g. null, SL3323TLX, SKR, APVB,
        - HxType: Heat Exchanger Type, e.g. Isoleret varmeveksler, Fjernvarmeveksler, Varmeveksler
        - NominalEffectEach, e.g. 1100 kW, 150 kW, 400 kW
        - Year: e.g. After 1980, 2017, 2000-2009
    Return a JSON with the values.
    Input: "{text}"
    Output:
    """


GRAMMAR = r"""
root ::= "{" p "," m "," s "," h "," n "," y "}"
p ::= "\"Pieces\": " (number | "null")
number ::= [0-9]+ ("." [0-9]+)?
m ::= "\"Manufacturer\": " ("\"Unknown\"" | "\"" [a-zA-Z0-9æøåÆØÅ ]+ "\"")
s ::= "\"SubType\": " ("null" | "\"" [a-zA-Z0-9æøåÆØÅ ]+ "\"")
h ::= "\"HxType\": " ("null" | "\"" [a-zA-Z0-9æøåÆØÅ ]+ "\"")
n ::= "\"NominalEffectEach\": " ("null" | "\"" + number + " " ("kW" | "MW" | "W") + "\"")
y ::= "\"Year\": " ("null" | "\"" [a-zA-Z0-9æøåÆØÅ ]+ "\"" | [0-9]{4})
"""


In [6]:
chatbot = chatbot_extracter(MODEL_PATH, N_CTX, N_THREADS, build_prompt, GRAMMAR)

llama_context: n_ctx_per_seq (2048) < n_ctx_train (40960) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_set_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_c4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f16                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64 

In [7]:
i = 21
line = str(df.loc[i, "S_text"]) + ". " + str(df.loc[i, "L_text"])
print(df.iloc[i])

S_text                   Fjernvarme med isoleret veksler (indirekte anl...
L_text                   Ejendommen opvarmes med fjernvarme fra HOFOR.\...
Pieces                                                                 1.0
Manufacturer                                                       Unknown
SubType                                                                NaN
HxType                                                    Isoleret veksler
NominalEffectEach                                                      NaN
Year                                                                   NaN
Manufacturer_idx                                                         0
SubType_idx                                                              0
HxType_idx                                                               3
NominalEffectEach_idx                                                    0
Year_idx                                                                 0
Name: 21, dtype: object


In [8]:
class Query:
    def __init__(self, text):
        self.text = text

query = Query(line)
prediction = await chatbot.extract_heating_data(query)

In [9]:
import json

obj = prediction.body
print(obj)

b'{"Pieces":1,"Manufacturer":"HOFOR","SubType":null,"HxType":"Fjernvarmeveksler","NominalEffectEach":null,"Year":null}'


In [10]:
pred = json.loads(obj)
print(pred)
print(type(pred))

{'Pieces': 1, 'Manufacturer': 'HOFOR', 'SubType': None, 'HxType': 'Fjernvarmeveksler', 'NominalEffectEach': None, 'Year': None}
<class 'dict'>


In [11]:
dd = pd.DataFrame([pred])
print(dd)

   Pieces Manufacturer SubType             HxType NominalEffectEach  Year
0       1        HOFOR    None  Fjernvarmeveksler              None  None


In [12]:
from tqdm import trange

In [13]:
res = []
errors = []
for i in trange(len(df)):
    line = str(df.loc[i, "S_text"]) + ". " + str(df.loc[i, "L_text"])
    query = Query(line)
    try:
        prediction = await chatbot.extract_heating_data(query)
        obj = prediction.body
        pred = json.loads(obj)
        pred["S_text"] = df.loc[i, "S_text"]
        pred["L_text"] = df.loc[i, "L_text"]
        res.append(pred)
    except:
        print("error with entry: ", i)
        errors.append(i)

res_dataframe = pd.DataFrame(res)
res_dataframe.head()

  2%|▏         | 36/2272 [01:46<2:52:13,  4.62s/it]

error with entry:  35


  3%|▎         | 74/2272 [04:05<2:56:38,  4.82s/it]

error with entry:  73


 10%|█         | 233/2272 [09:54<1:35:08,  2.80s/it]

error with entry:  232


 11%|█         | 249/2272 [10:28<1:37:17,  2.89s/it]

error with entry:  248


 18%|█▊        | 415/2272 [15:53<1:24:44,  2.74s/it]

error with entry:  414


 19%|█▉        | 432/2272 [16:27<1:30:02,  2.94s/it]

error with entry:  431


 19%|█▉        | 433/2272 [16:32<1:47:11,  3.50s/it]

error with entry:  432


 34%|███▎      | 762/2272 [27:20<1:20:14,  3.19s/it]

error with entry:  761


 34%|███▎      | 763/2272 [27:25<1:36:12,  3.83s/it]

error with entry:  762


 34%|███▎      | 764/2272 [27:31<1:47:20,  4.27s/it]

error with entry:  763


 35%|███▌      | 802/2272 [29:01<1:24:54,  3.47s/it]

error with entry:  801


 36%|███▋      | 824/2272 [30:02<1:21:52,  3.39s/it]

error with entry:  823


 39%|███▉      | 888/2272 [32:28<1:11:59,  3.12s/it]

error with entry:  887


 39%|███▉      | 893/2272 [32:44<1:25:18,  3.71s/it]

error with entry:  892


 43%|████▎     | 980/2272 [36:29<1:21:58,  3.81s/it]

error with entry:  979


 48%|████▊     | 1088/2272 [40:47<1:08:13,  3.46s/it]

error with entry:  1087


 51%|█████     | 1160/2272 [43:41<59:56,  3.23s/it]  

error with entry:  1159


 53%|█████▎    | 1197/2272 [45:11<56:36,  3.16s/it]

error with entry:  1196


 62%|██████▏   | 1398/2272 [52:22<51:33,  3.54s/it]

error with entry:  1397


 66%|██████▋   | 1508/2272 [56:17<41:11,  3.23s/it]

error with entry:  1507


 69%|██████▉   | 1575/2272 [58:39<35:29,  3.05s/it]

error with entry:  1574


 70%|██████▉   | 1588/2272 [59:14<39:57,  3.50s/it]

error with entry:  1587


 70%|███████   | 1591/2272 [59:25<43:03,  3.79s/it]

error with entry:  1590


 72%|███████▏  | 1643/2272 [1:01:29<33:30,  3.20s/it]

error with entry:  1642


 74%|███████▍  | 1685/2272 [1:03:15<25:25,  2.60s/it]

error with entry:  1685


 74%|███████▍  | 1690/2272 [1:03:31<34:32,  3.56s/it]

error with entry:  1689


 75%|███████▍  | 1703/2272 [1:04:04<32:25,  3.42s/it]

error with entry:  1702


 76%|███████▌  | 1728/2272 [1:05:10<29:42,  3.28s/it]

error with entry:  1727


 78%|███████▊  | 1763/2272 [1:06:32<27:19,  3.22s/it]

error with entry:  1762


 78%|███████▊  | 1782/2272 [1:07:16<27:54,  3.42s/it]

error with entry:  1781


 80%|████████  | 1824/2272 [1:09:16<25:14,  3.38s/it]

error with entry:  1823


 87%|████████▋ | 1987/2272 [1:16:17<15:12,  3.20s/it]

error with entry:  1986


 89%|████████▊ | 2016/2272 [1:17:29<13:56,  3.27s/it]

error with entry:  2015


 89%|████████▉ | 2017/2272 [1:17:35<16:28,  3.88s/it]

error with entry:  2016


 89%|████████▉ | 2023/2272 [1:17:52<14:47,  3.56s/it]

error with entry:  2022


 93%|█████████▎| 2119/2272 [1:21:51<10:01,  3.93s/it]

error with entry:  2118


 94%|█████████▍| 2134/2272 [1:22:35<07:29,  3.26s/it]

error with entry:  2133


 96%|█████████▌| 2183/2272 [1:24:40<05:15,  3.55s/it]

error with entry:  2182


 97%|█████████▋| 2195/2272 [1:25:12<04:18,  3.36s/it]

error with entry:  2194


 97%|█████████▋| 2205/2272 [1:25:40<03:57,  3.55s/it]

error with entry:  2204


100%|██████████| 2272/2272 [1:28:16<00:00,  2.33s/it]


Unnamed: 0,Pieces,Manufacturer,SubType,HxType,NominalEffectEach,Year,S_text,L_text
0,1,Unknown,,Isoleret varmeveksler,,,Fjernvarme med isoleret veksler (indirekte anl...,Bygningen opvarmes med fjernvarme. Anlægget er...
1,1,Unknown,,Eksisterende fjernvarme,,,Eksisterende fjernvarme,
2,1,Unknown,,Fjernvarmeveksler,,After 1980,Fjernvarme med uisoleret veksler (indirekte an...,Bygningen opvarmes med fjernvarme. Anlægget er...
3,1,Danfoss Redan,,Isoleret varmeveksler,,,Fjernvarme med isoleret veksler (indirekte anl...,Ejendommen opvarmes med fjernvarme fra HOFOR.\...
4,1,Unknown,,Isoleret varmeveksler,,,Fjernvarme med isoleret veksler (indirekte anl...,Bygningen opvarmes med fjernvarme. Anlægget er...


In [15]:
import sys

sys.path.append("../scripts")
from data_from_xlsx import to_csv

to_csv(res_dataframe, "chatbot_predictions")


In [16]:
print(len(errors))

40
