In [None]:
# scripts/make_id_maps.py
import json, pandas as pd, pathlib

# Build the entity -> id and relation -> id lookup tables from the KG triples.
# Everything this step reads or writes lives in one Drive folder, so the
# folder is named once instead of repeating the absolute path per file.
DATA_DIR = pathlib.Path('/content/drive/MyDrive/THESIS/LTN')

# The CSV's own header row is skipped and replaced by the fixed names h/r/t.
triples = pd.read_csv(DATA_DIR / 'KG-triples.csv', sep=',',
                      names=['h','r','t'], skiprows=1)

# An empty cell is read as float NaN.  Mixed with string names it would make
# sorted() below fail with an opaque "'<' not supported" TypeError, so stop
# early with a message that points at the data instead.
n_incomplete = int(triples.isna().any(axis=1).sum())
if n_incomplete:
    raise ValueError(
        f'KG-triples.csv has {n_incomplete} row(s) with a missing h, r or t value'
    )

# Sorting makes the id assignment deterministic for a given CSV.
entities = sorted(set(triples['h']) | set(triples['t']))
relations = sorted(triples['r'].unique())

# NOTE(review): nothing in this cell writes into ./build; the directory is
# still created so the cell's side effects stay exactly as before.
pathlib.Path('build').mkdir(exist_ok=True)

# Each map is a JSON object {name: index}, with indices following sort order.
for filename, names in (('entities.json', entities),
                        ('relations.json', relations)):
    with open(DATA_DIR / filename, 'w', encoding='utf-8') as f:
        json.dump({name: idx for idx, name in enumerate(names)}, f, indent=2)

print(f'entities={len(entities)}  relations={len(relations)}')


entities=2441  relations=16


In [None]:
# scripts/split_triples.py

import json
import random
import pandas as pd
from pathlib import Path

# Shuffle the KG triples once and cut them into train / valid / test files.
DATA_DIR   = Path('/content/drive/MyDrive/THESIS/LTN')
SEED       = 42    # single seed shared by every source of randomness here
TRAIN_FRAC = 0.8
VALID_FRAC = 0.1   # whatever remains after train + valid becomes the test split

# 1) load & shuffle
# The CSV's own header row is skipped and replaced by the fixed names h/r/t.
triples = pd.read_csv(DATA_DIR / 'KG-triples.csv', sep=',',
                      names=['h','r','t'], skiprows=1)

# The shuffle itself is driven by random_state below.  The stdlib generator is
# seeded too, so anything executed after this cell sees the same global
# `random` state as it did before this rewrite.
random.seed(SEED)
triples = triples.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Exact duplicate rows can end up on both sides of a split boundary, which
# would leak evaluation facts into training.  Report it; do not alter the data.
n_dupes = int(triples.duplicated().sum())
if n_dupes:
    print(f"warning: {n_dupes} duplicated triples - the same fact may appear in more than one split")

# 2) compute split sizes (int() truncates; the remainder goes to test)
n_total = len(triples)
n_train = int(TRAIN_FRAC * n_total)
n_valid = int(VALID_FRAC * n_total)
n_test  = n_total - n_train - n_valid

# 3) slice by position
train = triples.iloc[:n_train]
valid = triples.iloc[n_train:n_train + n_valid]
test  = triples.iloc[n_train + n_valid:]
assert len(train) + len(valid) + len(test) == n_total, "splits must partition the data"

# 4) write out: each file is a JSON array of [h, r, t] lists
for name, subset in [('train', train),
                     ('valid', valid),
                     ('test',  test)]:
    out_path = DATA_DIR / f'{name}_triples.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(subset.values.tolist(), f, indent=2)
    print(f"{name:5s}: {len(subset)} triples")

print(f"total: {n_total}, train={n_train}, valid={n_valid}, test={n_test}")


train: 3280 triples
valid: 410 triples
test : 411 triples
total: 4101, train=3280, valid=410, test=411


In [None]:
# scripts/compile_empirical_rules.py
import json, yaml, numpy as np, csv, pathlib, math

# Derive one weight from the AMIE rule file and stamp it onto every empirical
# rule: each parsed value c becomes -log(1 - c + eps), and the weight is twice
# the median of those.
LTN_DIR    = pathlib.Path('/content/drive/MyDrive/THESIS/LTN')
PCA_COLUMN = 3      # zero-based index of the CSV field parsed into pca_vals
LOG_EPS    = 1e-2   # added inside the log so a value of exactly 1 avoids log(0)

# 1) median λ from AMIE
pca_vals = []
with open(LTN_DIR / 'AMIE-rules.csv', newline='', encoding='utf-8') as f:
    rdr = csv.reader(f, delimiter=',')
    next(rdr, None)  # header (tolerates a completely empty file)
    for row in rdr:
        if not row:  # csv.reader yields [] for blank lines
            continue
        pca_vals.append(float(row[PCA_COLUMN]))

# The median of an empty list is NaN, which json.dump would write as the
# non-standard token NaN; refuse to produce such a file.
if not pca_vals:
    raise ValueError('AMIE-rules.csv contains no rule rows; cannot derive a rule weight')

lam_median = np.median([-math.log(1 - c + LOG_EPS) for c in pca_vals])
# Plain Python float: same value, but serialisation no longer depends on
# numpy's scalar type being accepted by json.
lam_exp = float(2 * lam_median)

# 2) overwrite YAML weights
with open(LTN_DIR / 'empirical_rules.yaml', encoding='utf-8') as f:
    rules = yaml.safe_load(f)
if not isinstance(rules, list):
    raise TypeError('empirical_rules.yaml must hold a top-level list of rule mappings')
for r in rules:
    r['weight'] = lam_exp

# NOTE(review): nothing in this cell writes into ./build; the directory is
# still created so the cell's side effects stay exactly as before.
pathlib.Path('build').mkdir(exist_ok=True)

# Context manager guarantees the JSON is flushed and the handle closed.
with open(LTN_DIR / 'empirical_rules_compiled.json', 'w', encoding='utf-8') as f:
    json.dump(rules, f, indent=2)
print(f'saved {len(rules)} empirical rules (λ={lam_exp:.2f})')


saved 9 empirical rules (λ=3.62)


In [None]:
# build_patient_facts.py
#
# Reads patient-week records from CSV and writes a JSON-Lines file
# with:
#   • Numeric literal triples ("num::${value}")
#   • Categorical HAS_PROP flags
#   • Explicit HAS_WEEK and OF_PATIENT relations
# so that LTNs can learn from both empirical data and structural semantics.
# --------------------------------------------------------------------
import pandas as pd
import json
import math
from pathlib import Path

# ── Configuration ──────────────────────────────────────────────────────
# Input table with one row per patient and study week.
CSV: str = "/content/drive/MyDrive/THESIS/LTN/patient-data.csv"
# Destination JSON-Lines file; it is opened in "w" mode, so each run replaces it.
OUT: str = "/content/drive/MyDrive/THESIS/LTN/patient_facts_with_week.jsonl"
# Prefix that marks a tail entity as a numeric literal.
VAL_NS: str = "num::"
# Number of decimals kept when a number is turned into a literal.
PREC: int = 3

# ── Helper Functions ───────────────────────────────────────────────────

def lit(val: float) -> str:
    """Return the canonical literal node for a number.

    The value is coerced to ``float``, rounded to ``PREC`` decimals and
    prefixed with ``VAL_NS``, so numbers that agree after rounding share one
    entity string.  With ``PREC = 3`` and ``VAL_NS = "num::"``,
    ``lit(1.23456)`` gives ``"num::1.235"``.

    Args:
        val: Anything ``float()`` accepts (number or numeric string).

    Returns:
        ``VAL_NS`` followed by the text of the rounded float.  Integral
        values keep a trailing ``.0`` (``lit(5)`` -> ``"num::5.0"``).

    Raises:
        TypeError, ValueError: Propagated from ``float(val)`` when the input
            cannot be converted.
    """
    # Adding 0.0 turns IEEE negative zero into positive zero and leaves every
    # other value untouched.  Without it a small negative input that rounds
    # to zero would print as "-0.0" and split zero into two distinct entities.
    rounded = round(float(val), PREC) + 0.0
    return f"{VAL_NS}{rounded}"


def write_triple(out_f, h: str, r: str, t: str):
    """Append one triple to ``out_f`` as a single JSON-Lines record.

    The record is a JSON object with the keys ``h``, ``r`` and ``t`` (in that
    order), followed by a newline.  ``out_f`` only needs a ``write`` method
    that accepts text.
    """
    record = {"h": h, "r": r, "t": t}
    out_f.write(f"{json.dumps(record)}\n")


def write_num(out_f, h: str, rel: str, value):
    """Emit ``(h, rel, <numeric literal>)`` unless ``value`` is missing.

    "Missing" is whatever ``pd.isna`` reports as NA; such values are skipped
    without writing anything.  A present value is converted with ``lit`` and
    written through ``write_triple``.
    """
    if not pd.isna(value):
        write_triple(out_f, h, rel, lit(value))


def write_has_prop(out_f, h: str, prop: str, cond: bool):
    """Emit ``(h, HAS_PROP, prop)`` when ``cond`` is truthy; otherwise write nothing."""
    if not cond:
        return
    write_triple(out_f, h, "HAS_PROP", prop)

# ── Relations & Baselines ──────────────────────────────────────────────
# Relation name used in the emitted triples -> CSV column that supplies its
# value.  Insertion order is the order in which a row's numeric triples are
# written.
NUMERIC_RELATIONS = {
    # glucose statistics, after/before meal
    "glucose_median_after": "Glucose_median_After meal",
    "glucose_median_before": "Glucose_median_Before meal",
    "glucose_min_after": "Glucose_min_After meal",
    "glucose_min_before": "Glucose_min_Before meal",
    "glucose_max_after": "Glucose_max_After meal",
    "glucose_max_before": "Glucose_max_Before meal",
    # sleep, steps and intensity columns
    "sleep_mean": "Sleep_mean",
    "steps_mean": "Steps_mean",
    "intensity_sum": "Intensity_sum",
    # weight and BMI columns
    "weight_mean": "Weight_mean",
    "bmi_mean": "BMI_mean",
    # remaining columns
    "med_diet_score": "MedDietScore",
    "hyperglycemia": "Hyperglycemia",
    "HbA1c": "HbA1c",
    "LDL": "LDL",
}

# Week-0 reference values per patient, used later by the StepsPlus500 and
# BMIOnTrack flags.  A patient only gets an entry when the week-0 value is
# present; if a patient has several week-0 rows, the last one wins.
df = pd.read_csv(CSV)

baseline_steps, baseline_bmi = {}, {}
week0_rows = df[df["StudyWeek"] == 0]
for _, rec in week0_rows.iterrows():
    patient = rec["ShortId"]
    for column, store in (("Steps_mean", baseline_steps),
                          ("BMI_mean",   baseline_bmi)):
        value = rec.get(column)  # None when the column is absent
        if pd.notna(value):
            store[patient] = float(value)

# ── Write JSON-Lines ───────────────────────────────────────────────────
# Thresholds behind the categorical flags, named once so each rule below
# can be read without decoding bare numbers.
STEPS_GAIN_MIN         = 500   # StepsPlus500: Steps_mean minus the week-0 value must exceed this
GLUCOSE_MAX_AFTER_CAP  = 180   # GlucoseReg: upper bound for "Glucose_max_After meal"
GLUCOSE_MAX_BEFORE_CAP = 130   # GlucoseReg: upper bound for "Glucose_max_Before meal"
GLUCOSE_MIN_FLOOR      = 70    # GlucoseReg: lower bound for both "Glucose_min_*" columns
BMI_DROP_FRACTION      = 0.05  # BMIOnTrack: fraction of the week-0 BMI ...
BMI_WEEK_SPAN          = 12    # ... spread linearly over this many weeks
BMI_WEEK_OFFSET        = 0.5   # BMIOnTrack: subtracted from the week index
SMBG_SPREAD_MIN        = 60    # SMBGSpread: after-meal max minus min must exceed this
SMBG_DAYS_MIN          = 3     # SMBGDays3Plus: minimum value of SMBG_days

with open(OUT, "w", encoding="utf-8") as out:
    for _, row in df.iterrows():
        pid      = row["ShortId"]
        week_idx = int(row["StudyWeek"])
        node     = f"{pid}_w{week_idx}"   # one node per patient-week

        # 1) Structural semantics: the week literal, then the owning patient.
        # NOTE(review): the week tail is formatted directly rather than via
        # lit(), so week 5 becomes "num::5" while a measured 5 becomes
        # "num::5.0" - confirm these are meant to be distinct entities.
        write_triple(out, node, "HAS_WEEK", f"{VAL_NS}{week_idx}")
        write_triple(out, node, "OF_PATIENT", pid)

        # 2) Numeric data: one literal triple per mapped column that exists
        #    in this row; write_num drops missing values.
        for rel, col in NUMERIC_RELATIONS.items():
            if col not in row:
                continue
            write_num(out, node, rel, row[col])

        # 3) Categorical flags.  A HAS_PROP triple is written only when the
        #    rule holds; a rule that fails or cannot be evaluated writes nothing.

        # (i) StepsPlus500
        steps = row.get("Steps_mean")
        if pid in baseline_steps and pd.notna(steps):
            write_has_prop(out, node, "StepsPlus500",
                           steps - baseline_steps[pid] > STEPS_GAIN_MIN)

        # (ii) GlucoseReg: all four extremes present and inside their bounds
        after_max  = row.get("Glucose_max_After meal")
        before_max = row.get("Glucose_max_Before meal")
        after_min  = row.get("Glucose_min_After meal")
        before_min = row.get("Glucose_min_Before meal")
        if all(pd.notna(v) for v in (after_max, before_max, after_min, before_min)):
            in_range = (after_max <= GLUCOSE_MAX_AFTER_CAP
                        and before_max <= GLUCOSE_MAX_BEFORE_CAP
                        and after_min >= GLUCOSE_MIN_FLOOR
                        and before_min >= GLUCOSE_MIN_FLOOR)
            write_has_prop(out, node, "GlucoseReg", in_range)

        # (iii) BMIOnTrack
        # NOTE(review): at week_idx 0 the factor is slightly above 1, so the
        # target sits just over the baseline itself - confirm that is intended.
        bmi = row.get("BMI_mean")
        if pid in baseline_bmi and pd.notna(bmi):
            target = baseline_bmi[pid] * (
                1 - BMI_DROP_FRACTION * (week_idx - BMI_WEEK_OFFSET) / BMI_WEEK_SPAN
            )
            write_has_prop(out, node, "BMIOnTrack", bmi <= target)

        # (iv) WeeklyWeigh: a weight value exists for this week
        write_has_prop(out, node, "WeeklyWeigh", pd.notna(row.get("Weight_mean")))

        # (v) SMBGSpread: range of the after-meal readings
        if all(pd.notna(v) for v in (after_max, after_min)):
            write_has_prop(out, node, "SMBGSpread",
                           after_max - after_min > SMBG_SPREAD_MIN)

        # (vi) SMBGDays3Plus
        days = row.get("SMBG_days")
        write_has_prop(out, node, "SMBGDays3Plus",
                       pd.notna(days) and int(days) >= SMBG_DAYS_MIN)

print(f"Wrote triples to {OUT}")


Wrote triples to /content/drive/MyDrive/THESIS/LTN/patient_facts_with_week.jsonl
