In [3]:
import os
import json
import pandas as pd

# ========================
# Helper Functions
# ========================

def extract_id(ref):
    if ref.startswith("Patient/"):
        return ref.split("/")[-1]
    elif ref.startswith("RelatedPerson/"):
        return ref.split("/")[-1]
    elif ref.startswith("urn:uuid:"):
        return ref.split(":")[-1]
    else:
        return ref

def load_json(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)

def save_as_csv(data, output_filepath):
    df = pd.DataFrame(data)
    df.to_csv(output_filepath, index=False)
    print(f"✅ CSV disimpan di {output_filepath}")

def save_as_json(data, output_filepath):
    with open(output_filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    print(f"✅ JSON disimpan di {output_filepath}")

# ========================
# Ekstraksi Minimal untuk Setiap Resource
# ========================

def extract_patients(data):
    """Ekstrak informasi dasar pasien: id, birthDate, gender dan (opsional) name."""
    extracted = []
    for p in data:
        extracted.append({
            "patient_id": p.get("id", ""),
            "birthDate": p.get("birthDate", ""),
            "gender": p.get("gender", ""),
            # Jika terdapat informasi name, ambil salah satu field (misalnya family name)
            "name": p.get("name", [{"family": "Unknown"}])[0].get("family", "Unknown")
        })
    return extracted

def extract_family_member_history(data):
    """Ekstrak informasi FamilyMemberHistory: id, patient_id, relationship, dan kondisi."""
    extracted = []
    for rec in data:
        patient_ref = rec.get("patient", {}).get("reference", "")
        patient_id = extract_id(patient_ref)
        # Ambil relationship (jika ada)
        relationship = ""
        if "relationship" in rec and "coding" in rec["relationship"] and rec["relationship"]["coding"]:
            relationship = rec["relationship"]["coding"][0].get("display", "")
        # Ekstrak kondisi (jika ada)
        conditions = []
        if "condition" in rec:
            for cond in rec["condition"]:
                cond_text = cond.get("code", {}).get("text", "")
                if cond_text:
                    conditions.append(cond_text)
        extracted.append({
            "family_member_history_id": rec.get("id", ""),
            "patient_id": patient_id,
            "relationship": relationship,
            "conditions": "; ".join(conditions)  # Gabungkan kondisi jika lebih dari satu
        })
    return extracted

def extract_related_person(data):
    """Ekstrak informasi RelatedPerson: id, patient_id, relationship, nama, gender, dan birthDate."""
    extracted = []
    for rec in data:
        patient_ref = rec.get("patient", {}).get("reference", "")
        patient_id = extract_id(patient_ref)
        # Gabungkan teks relationship jika lebih dari satu
        relationship = ", ".join([r.get("text", "") for r in rec.get("relationship", [])])
        rp_name = ""
        if "name" in rec and isinstance(rec["name"], list) and len(rec["name"]) > 0:
            rp_name = rec["name"][0].get("family", "")
        extracted.append({
            "related_person_id": extract_id(rec.get("id", "")),
            "patient_id": patient_id,
            "relationship": relationship,
            "rp_name": rp_name,
            "gender": rec.get("gender", ""),
            "birthDate": rec.get("birthDate", "")
        })
    return extracted

def extract_conditions(data):
    """Ekstrak informasi Condition: id, patient_id (dari subject), dan nama penyakit (code.text)."""
    extracted = []
    for rec in data:
        subject_ref = rec.get("subject", {}).get("reference", "")
        patient_id = extract_id(subject_ref)
        disease = rec.get("code", {}).get("text", "")
        extracted.append({
            "condition_id": rec.get("id", ""),
            "patient_id": patient_id,
            "disease": disease
        })
    return extracted



In [15]:
import os
import json
import pandas as pd

# Folder hasil proses
processed_folder = "synthea/output/processed/"

# Resource yang dianggap berguna untuk model prediktif
resources = ["Patient", "FamilyMemberHistory", "RelatedPerson", "Condition"]

# Mapping resource ke fungsi ekstraksi yang sudah didefinisikan
extraction_functions = {
    "Patient": extract_patients,
    "FamilyMemberHistory": extract_family_member_history,
    "RelatedPerson": extract_related_person,
    "Condition": extract_conditions
}

# Dictionary untuk menyimpan DataFrame hasil ekstraksi
dataframes = {}

for res in resources:
    filepath = os.path.join(processed_folder, f"{res}.json")
    if os.path.exists(filepath):
        print(f"📂 Memproses {res} dari {filepath}")
        data = load_json(filepath)
        # Jika data berbentuk dictionary, ubah menjadi list
        if isinstance(data, dict):
            data = [data]
        extracted_data = extraction_functions[res](data)
        df = pd.DataFrame(extracted_data)
        dataframes[res] = df
        print(f"✅ Dataframe {res} memiliki shape: {df.shape}")
    else:
        print(f"⚠️ File {res}.json tidak ditemukan di {processed_folder}")

# Tampilkan beberapa baris contoh untuk setiap DataFrame
for res, df in dataframes.items():
    print(f"\nContoh DataFrame untuk {res}:")
    display(df.head())


📂 Memproses Patient dari synthea/output/processed/Patient.json
✅ Dataframe Patient memiliki shape: (108, 4)
📂 Memproses FamilyMemberHistory dari synthea/output/processed/FamilyMemberHistory.json
✅ Dataframe FamilyMemberHistory memiliki shape: (193, 4)
📂 Memproses RelatedPerson dari synthea/output/processed/RelatedPerson.json
✅ Dataframe RelatedPerson memiliki shape: (261, 6)
📂 Memproses Condition dari synthea/output/processed/Condition.json
✅ Dataframe Condition memiliki shape: (4093, 3)

Contoh DataFrame untuk Patient:


Unnamed: 0,patient_id,birthDate,gender,name
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267



Contoh DataFrame untuk FamilyMemberHistory:


Unnamed: 0,family_member_history_id,patient_id,relationship,conditions
0,family-3a644dcd-672c-9579-cdeb-65ce6783da97-,7da148be-b73e-73e3-ed5c-67d7c712a253,Father,Asthma
1,family-8463087b-be64-1139-b779-97d09881e034-,7da148be-b73e-73e3-ed5c-67d7c712a253,Sister,Hypertension; Heart Disease
2,family-00a4d481-551d-9741-dd8f-fa88fe29ab79-,d4f1d88b-aecc-493e-2977-44a72e0de2d9,Father,Hypertension
3,family-8c97920a-fc41-8150-f54e-9dcfc1f48fef-,d4f1d88b-aecc-493e-2977-44a72e0de2d9,Mother,Diabetes; Hypertension
4,family-2b27a9c6-3b32-83fe-c4eb-ff271de3536b-,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,Father,Cancer



Contoh DataFrame untuk RelatedPerson:


Unnamed: 0,related_person_id,patient_id,relationship,rp_name,gender,birthDate
0,3a644dcd-672c-9579-cdeb-65ce6783da97,7da148be-b73e-73e3-ed5c-67d7c712a253,Father,Barton704,female,1975-09-20
1,67ed8fab-19a2-40c5-e56c-3dfdab2c9805,7da148be-b73e-73e3-ed5c-67d7c712a253,Mother,Schowalter414,male,1989-02-23
2,8463087b-be64-1139-b779-97d09881e034,7da148be-b73e-73e3-ed5c-67d7c712a253,Sister,Boyle917,male,1965-09-03
3,00a4d481-551d-9741-dd8f-fa88fe29ab79,d4f1d88b-aecc-493e-2977-44a72e0de2d9,Father,Bernhard322,female,1968-08-26
4,8c97920a-fc41-8150-f54e-9dcfc1f48fef,d4f1d88b-aecc-493e-2977-44a72e0de2d9,Mother,Jerde200,female,2003-10-06



Contoh DataFrame untuk Condition:


Unnamed: 0,condition_id,patient_id,disease
0,ded1426d-62e2-77ad-0c8b-5b34075c89a9,7da148be-b73e-73e3-ed5c-67d7c712a253,Medication review due (situation)
1,40f951b4-d966-312a-c6b2-2b9b89ca5f30,7da148be-b73e-73e3-ed5c-67d7c712a253,Medication review due (situation)
2,aa6822b6-d4e2-bbe9-d4fa-574aac5c27ca,7da148be-b73e-73e3-ed5c-67d7c712a253,Gingivitis (disorder)
3,35aac9a9-f1fe-8563-79d0-d279208f9098,7da148be-b73e-73e3-ed5c-67d7c712a253,Medication review due (situation)
4,63b4c19c-4488-56fe-e4eb-f5dd262aa4b2,7da148be-b73e-73e3-ed5c-67d7c712a253,Medication review due (situation)


In [16]:
df_patient = dataframes["Patient"]
df_fmh = dataframes["FamilyMemberHistory"]
df_rp = dataframes["RelatedPerson"]
df_condition = dataframes["Condition"]

In [17]:
# === 1. Merge Patient dengan FamilyMemberHistory berdasarkan patient_id (dengan pivot untuk unique patient_id) ===
df_fmh_grouped = (
    df_fmh.groupby(["patient_id", "relationship"])["conditions"]
    .apply(lambda x: "; ".join(x.dropna().unique()))
    .reset_index()
)

# Pivot agar setiap patient_id unik dan masing-masing relationship menjadi kolom
df_fmh_pivot = df_fmh_grouped.pivot(index="patient_id", columns="relationship", values="conditions").reset_index()

# Ubah nama kolom relationship menjadi lower-case + "_condition"
df_fmh_pivot = df_fmh_pivot.rename(columns=lambda x: x.lower() + "_condition" if x != "patient_id" else x)

# Merge dengan df_patient
df_patient_fmh = pd.merge(df_patient, df_fmh_pivot, on="patient_id", how="left")
print("Merged Patient dan FamilyMemberHistory (unique) shape:", df_patient_fmh.shape)
display(df_patient_fmh.head())

# === 2. Group Condition berdasarkan patient_id dan gabungkan nama penyakit ===
df_condition_grouped = (
    df_condition.groupby("patient_id")["disease"]
    .apply(lambda x: " ".join(x))
    .reset_index()
    .rename(columns={"disease": "patient_conditions_text"})
)
print("Grouped Condition shape:", df_condition_grouped.shape)
display(df_condition_grouped.head())

# === 3. Gabungkan data pasien dengan kondisi mereka ===
df_training = pd.merge(df_patient_fmh, df_condition_grouped, on="patient_id", how="left")
print("Training DataFrame shape:", df_training.shape)
display(df_training.head())

# === 4. Proses RelatedPerson dengan cara yang sama seperti FamilyMemberHistory ===
# a. Gabungkan RelatedPerson dengan Condition (berdasarkan related_person_id == patient_id di Condition)
df_rp_condition = pd.merge(
    df_rp, df_condition_grouped, 
    left_on="related_person_id", right_on="patient_id", 
    how="left"
).rename(columns={"patient_conditions_text": "related_conditions"})

# b. Hapus kolom 'patient_id_y' yang redundant (karena sudah ada dari df_rp)
df_rp_condition.drop(columns=["patient_id_y"], inplace=True)

# c. Group berdasarkan patient_id dan relationship, lalu gabungkan kondisi dalam satu baris per patient_id
df_rp_grouped = (
    df_rp_condition.groupby(["patient_id_x", "relationship"])["related_conditions"]
    .apply(lambda x: "; ".join(x.dropna().unique()))
    .reset_index()
)

# d. Pivot agar setiap patient_id unik dan masing-masing relationship menjadi kolom
df_rp_pivot = df_rp_grouped.pivot(index="patient_id_x", columns="relationship", values="related_conditions").reset_index()

# e. Ubah nama kolom relationship menjadi lower-case + "_related_condition"
df_rp_pivot = df_rp_pivot.rename(columns=lambda x: x.lower() + "_related_condition" if x != "patient_id_x" else x)

# f. Merge df_rp_pivot ke df_training berdasarkan patient_id
df_training = pd.merge(df_training, df_rp_pivot, left_on="patient_id", right_on="patient_id_x", how="left")

# g. Hapus kolom redundant
df_training.drop(columns=["patient_id_x"], inplace=True)

# h. Jika ada nilai NaN pada kolom related_conditions, ganti dengan string kosong
df_training.fillna("", inplace=True)

print("Final Training DataFrame shape (after fixing RelatedPerson logic):", df_training.shape)
display(df_training.head())


Merged Patient dan FamilyMemberHistory (unique) shape: (108, 8)


Unnamed: 0,patient_id,birthDate,gender,name,brother_condition,father_condition,mother_condition,sister_condition
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,,Asthma,,Hypertension; Heart Disease
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,,Hypertension,Diabetes; Hypertension,
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,Diabetes,Cancer,,Asthma
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817,Cancer,Heart Disease,,
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267,,,,


Grouped Condition shape: (108, 2)


Unnamed: 0,patient_id,patient_conditions_text
0,00a4d481-551d-9741-dd8f-fa88fe29ab79,Housing unsatisfactory (finding) Received high...
1,047b9787-9e0b-6cf4-7b40-1cd2ab9422c1,Received higher education (finding) Transport ...
2,06671679-d2c8-8426-da09-7017cc0bda53,Received higher education (finding) Lack of ac...
3,0d1b91dc-9b9e-11cd-e150-6837c9cb3e54,Medication review due (situation) Medication r...
4,18b84736-db6e-baac-84e2-62d3e8dacd0f,Chronic sinusitis (disorder) Received higher e...


Training DataFrame shape: (108, 9)


Unnamed: 0,patient_id,birthDate,gender,name,brother_condition,father_condition,mother_condition,sister_condition,patient_conditions_text
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,,Asthma,,Hypertension; Heart Disease,Medication review due (situation) Medication r...
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,,Hypertension,Diabetes; Hypertension,,Medication review due (situation) Gingivitis (...
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,Diabetes,Cancer,,Asthma,Childhood asthma (disorder) Perennial allergic...
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817,Cancer,Heart Disease,,,Medication review due (situation) Primary dent...
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267,,,,,Impacted molars (disorder) Chronic pain (findi...


Final Training DataFrame shape (after fixing RelatedPerson logic): (108, 13)


Unnamed: 0,patient_id,birthDate,gender,name,brother_condition,father_condition,mother_condition,sister_condition,patient_conditions_text,brother_related_condition,father_related_condition,mother_related_condition,sister_related_condition
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,,Asthma,,Hypertension; Heart Disease,Medication review due (situation) Medication r...,,Recurrent urinary tract infection (disorder) E...,Loss of teeth (disorder) Received higher educa...,Risk activity involvement (finding) Received h...
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,,Hypertension,Diabetes; Hypertension,,Medication review due (situation) Gingivitis (...,,Housing unsatisfactory (finding) Received high...,Medication review due (situation) Risk activit...,
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,Diabetes,Cancer,,Asthma,Childhood asthma (disorder) Perennial allergic...,Received higher education (finding) Past pregn...,Risk activity involvement (finding) Received h...,,Received higher education (finding) Loss of te...
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817,Cancer,Heart Disease,,,Medication review due (situation) Primary dent...,Medication review due (situation) Perennial al...,Received higher education (finding) Past pregn...,Risk activity involvement (finding) Only recei...,
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267,,,,,Impacted molars (disorder) Chronic pain (findi...,,,Educated to high school level (finding) Predia...,


In [18]:
# === 1. Daftar penyakit target ===
target_diseases = ["Diabetes", "Hypertension", "Cancer", "Heart Disease", "Alzheimer", "Asthma"]

# === 2. Membuat kolom binary untuk setiap penyakit berdasarkan patient_conditions_text ===
for disease in target_diseases:
    df_training[disease] = df_training["patient_conditions_text"].apply(lambda x: 1 if disease.lower() in x.lower() else 0)
# === 3. Tampilkan hasil ===
print("Training DataFrame with Multi-Label Encoding:")
display(df_training.head())

# === 4. Cek distribusi label untuk tiap penyakit ===
print("Distribusi Label Multi-Label:")
display(df_training[target_diseases].sum())


Training DataFrame with Multi-Label Encoding:


Unnamed: 0,patient_id,birthDate,gender,name,brother_condition,father_condition,mother_condition,sister_condition,patient_conditions_text,brother_related_condition,father_related_condition,mother_related_condition,sister_related_condition,Diabetes,Hypertension,Cancer,Heart Disease,Alzheimer,Asthma
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,,Asthma,,Hypertension; Heart Disease,Medication review due (situation) Medication r...,,Recurrent urinary tract infection (disorder) E...,Loss of teeth (disorder) Received higher educa...,Risk activity involvement (finding) Received h...,1,1,0,0,0,1
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,,Hypertension,Diabetes; Hypertension,,Medication review due (situation) Gingivitis (...,,Housing unsatisfactory (finding) Received high...,Medication review due (situation) Risk activit...,,0,1,1,0,0,0
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,Diabetes,Cancer,,Asthma,Childhood asthma (disorder) Perennial allergic...,Received higher education (finding) Past pregn...,Risk activity involvement (finding) Received h...,,Received higher education (finding) Loss of te...,1,1,1,1,0,1
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817,Cancer,Heart Disease,,,Medication review due (situation) Primary dent...,Medication review due (situation) Perennial al...,Received higher education (finding) Past pregn...,Risk activity involvement (finding) Only recei...,,0,1,0,0,0,0
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267,,,,,Impacted molars (disorder) Chronic pain (findi...,,,Educated to high school level (finding) Predia...,,1,1,0,1,0,1


Distribusi Label Multi-Label:


Diabetes         65
Hypertension     82
Cancer           29
Heart Disease    56
Alzheimer        11
Asthma           31
dtype: int64

In [19]:
import re

def remove_target_diseases(text, target_diseases):
    if pd.isna(text):  # Jika NaN, langsung return ""
        return ""

    # Konversi text ke lowercase untuk mencocokkan semua bentuk penyakit
    text = text.lower()

    # Looping untuk hapus semua keyword yang mengandung target penyakit
    for disease in target_diseases:
        if disease.lower() in text:  # Jika ada disease sebagai substring
            pattern = r"\b\w*" + re.escape(disease.lower()) + r"\w*\b"  # Hapus kata yang mengandung target
            text = re.sub(pattern, "", text).strip()  # Hapus dan bersihkan spasi

    # Hapus double spaces yang muncul setelah penghapusan kata
    text = re.sub(r"\s+", " ", text)

    return text

# === 1. Hilangkan target penyakit dari patient_conditions_text (case-insensitive) ===
df_training["patient_conditions_text_cleaned"] = df_training["patient_conditions_text"].apply(
    lambda x: remove_target_diseases(x, target_diseases)
)

# === 2. Tampilkan beberapa contoh untuk verifikasi ===
print("Training DataFrame after removing target diseases from patient_conditions_text:")
display(df_training[["patient_conditions_text", "patient_conditions_text_cleaned"]].head())


Training DataFrame after removing target diseases from patient_conditions_text:


Unnamed: 0,patient_conditions_text,patient_conditions_text_cleaned
0,Medication review due (situation) Medication r...,medication review due (situation) medication r...
1,Medication review due (situation) Gingivitis (...,medication review due (situation) gingivitis (...
2,Childhood asthma (disorder) Perennial allergic...,childhood (disorder) perennial allergic rhinit...
3,Medication review due (situation) Primary dent...,medication review due (situation) primary dent...
4,Impacted molars (disorder) Chronic pain (findi...,impacted molars (disorder) chronic pain (findi...


In [20]:
# === 1. Gabungkan FamilyMemberHistory dengan RelatedPerson berdasarkan hubungan ===
# Loop untuk setiap tipe relasi (father, mother, brother, sister)
for rel in ["father", "mother", "brother", "sister"]:
    fmh_col = f"{rel}_condition"
    rp_col = f"{rel}_related_condition"
    
    # Gabungkan kondisi dari FamilyMemberHistory dan RelatedPerson jika keduanya ada
    df_training[f"{rel}_merged_condition"] = df_training[[fmh_col, rp_col]].apply(
        lambda x: "; ".join(filter(None, x)), axis=1
    )

# === 2. Tampilkan hasil untuk verifikasi ===
print("Training DataFrame after merging FamilyMemberHistory & RelatedPerson conditions:")
display(df_training[["father_merged_condition", "mother_merged_condition", "brother_merged_condition", "sister_merged_condition"]].head())


Training DataFrame after merging FamilyMemberHistory & RelatedPerson conditions:


Unnamed: 0,father_merged_condition,mother_merged_condition,brother_merged_condition,sister_merged_condition
0,Asthma; Recurrent urinary tract infection (dis...,Loss of teeth (disorder) Received higher educa...,,Hypertension; Heart Disease; Risk activity inv...
1,Hypertension; Housing unsatisfactory (finding)...,Diabetes; Hypertension; Medication review due ...,,
2,Cancer; Risk activity involvement (finding) Re...,,Diabetes; Received higher education (finding) ...,Asthma; Received higher education (finding) Lo...
3,Heart Disease; Received higher education (find...,Risk activity involvement (finding) Only recei...,Cancer; Medication review due (situation) Pere...,
4,,Educated to high school level (finding) Predia...,,


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# === 3. Bag-of-Words (BoW) untuk setiap relationship secara terpisah ===
vectorizers = {}  # Simpan vectorizer untuk setiap relasi
embedded_features = []  # Simpan DataFrame embedding

for rel in ["father", "mother", "brother", "sister"]:
    col_name = f"{rel}_merged_condition"
    
    # Buat vectorizer untuk tiap kategori hubungan
    vectorizer = CountVectorizer()
    vectorized_data = vectorizer.fit_transform(df_training[col_name].fillna(""))  # Isi NaN dengan string kosong
    
    # Simpan hasil vektorisasi dalam DataFrame
    feature_names = [f"{rel}_bow_{word}" for word in vectorizer.get_feature_names_out()]
    df_vec = pd.DataFrame(vectorized_data.toarray(), columns=feature_names)

    # Simpan hasil dan vectorizer
    embedded_features.append(df_vec)
    vectorizers[rel] = vectorizer  # Simpan vectorizer untuk penggunaan nanti

# Gabungkan semua fitur embedded ke df_training
df_training_bow = pd.concat([df_training] + embedded_features, axis=1)

print("Training DataFrame after adding BoW embeddings:")
display(df_training_bow.head())


Training DataFrame after adding BoW embeddings:


Unnamed: 0,patient_id,birthDate,gender,name,brother_condition,father_condition,mother_condition,sister_condition,patient_conditions_text,brother_related_condition,...,sister_bow_valve,sister_bow_variation,sister_bow_victim,sister_bow_violence,sister_bow_viral,sister_bow_virus,sister_bow_wheezing,sister_bow_whiplash,sister_bow_with,sister_bow_wrist
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,,Asthma,,Hypertension; Heart Disease,Medication review due (situation) Medication r...,,...,0,0,1,1,2,0,0,0,0,0
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,,Hypertension,Diabetes; Hypertension,,Medication review due (situation) Gingivitis (...,,...,0,0,0,0,0,0,0,0,0,0
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,Diabetes,Cancer,,Asthma,Childhood asthma (disorder) Perennial allergic...,Received higher education (finding) Past pregn...,...,0,0,2,0,0,0,0,0,0,0
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817,Cancer,Heart Disease,,,Medication review due (situation) Primary dent...,Medication review due (situation) Perennial al...,...,0,0,0,0,0,0,0,0,0,0
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267,,,,,Impacted molars (disorder) Chronic pain (findi...,,...,0,0,0,0,0,0,0,0,0,0


In [32]:
X_cleaned['father_merged_condition'].iloc[0]

'asthma recurrent urinary tract infection disorder educated to high school level finding history of tubal ligation situation chronic pain finding chronic low back pain finding chronic neck pain finding fulltime employment finding victim of intimate partner abuse finding viral sinusitis disorder medication review due situation prediabetes finding infection of tooth disorder gingival disease disorder loss of teeth disorder anemia disorder not in labor force finding stress finding injury of neck disorder whiplash injury to neck disorder medication review due situation fulltime employment finding gingivitis disorder gingival disease disorder sinusitis disorder chronic sinusitis disorder victim of intimate partner abuse finding medication review due situation medication review due situation parttime employment finding gingivitis disorder primary dental caries disorder not in labor force finding viral sinusitis disorder essential hypertension disorder medication review due situation fulltime

In [33]:
import pandas as pd
import numpy as np
import re
import string
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier
from scipy.sparse import hstack
from catboost import CatBoostClassifier
import lightgbm as lgb

# === 1. Pilih Fitur dan Label ===
text_columns = [
    "patient_conditions_text_cleaned",
    "father_merged_condition",
    "mother_merged_condition",
    "brother_merged_condition",
    "sister_merged_condition"
]
label_columns = ["Diabetes", "Hypertension", "Cancer", "Heart Disease", "Alzheimer", "Asthma"]  # Label multi-label

X_texts = df_training[text_columns]  # Fitur teks
Y = df_training[label_columns].values  # Label multi-label dalam bentuk numpy array

# === 2. Preprocessing Teks ===
def clean_text(text):
    if pd.isna(text) or text.strip() == "":
        return ""  # Handle NaN atau teks kosong
    text = text.lower()  # Lowercase
    text = re.sub(r"\d+", "", text)  # Hapus angka
    text = re.sub(r"[^\w\s]", "", text)  # Hapus tanda baca
    text = re.sub(r"\s+", " ", text).strip()  # Hapus spasi berlebih
    return text

# Terapkan preprocessing ke semua fitur teks
X_cleaned = X_texts.applymap(clean_text)

# === 3. Pilihan Vectorizer: BoW dan TF-IDF ===
vectorizer_types = {
    "BoW": CountVectorizer,
    "TF-IDF": TfidfVectorizer
}

all_results = {}

# Looping untuk BoW dan TF-IDF
for vec_type, VectorizerClass in vectorizer_types.items():
    print(f"\n===== Running with {vec_type} =====")

    vectorizers = {}  # Dictionary untuk menyimpan vectorizer setiap fitur
    X_features = []  # List untuk menyimpan hasil transformasi

    for col in text_columns:
        vectorizer = VectorizerClass()
        X_transformed = vectorizer.fit_transform(X_cleaned[col])  # Transform fitur teks
        X_features.append(X_transformed)
        vectorizers[col] = vectorizer  # Simpan vectorizer untuk digunakan nanti

    # Gabungkan semua fitur menjadi satu matriks sparse
    X_combined = hstack(X_features)

    # Split Data dengan Iterative Stratification
    X_train, y_train, X_test, y_test = iterative_train_test_split(X_combined, Y, test_size=0.2)

    # === 4. Model Definitions (Tanpa OneVsRest untuk CatBoost & LightGBM) ===
    models = {
        "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=1000)),  # Logistic Regression tetap pakai OneVsRest
        "CatBoost": CatBoostClassifier(iterations=500, loss_function='MultiLogloss', verbose=100),
    }

    results = {}

    for model_name, model in models.items():
        print(f"\nTraining {model_name} with {vec_type}...")
        
        # Untuk CatBoost & LightGBM, gunakan array numpy agar kompatibel
        if model_name in ["CatBoost", "LightGBM"]:
            model.fit(X_train.toarray(), y_train)  # Convert sparse matrix ke array
            y_pred = model.predict(X_test.toarray())
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        # Evaluate
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="macro")
        recall = recall_score(y_test, y_pred, average="macro")
        f1 = f1_score(y_test, y_pred, average="macro")

        results[model_name] = {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1}
        
        print(f"Results for {model_name} with {vec_type}:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

    # Simpan hasil untuk setiap vectorizer
    all_results[vec_type] = results

# === 5. Display Comparison Results ===
for vec_type, results in all_results.items():
    print(f"\n===== Final Model Comparison with {vec_type} =====")
    df_results = pd.DataFrame(results).T
    display(df_results)


  X_cleaned = X_texts.applymap(clean_text)



===== Running with BoW =====

Training Logistic Regression with BoW...
Results for Logistic Regression with BoW:
Accuracy: 0.0000
Precision: 0.3700
Recall: 0.3925
F1 Score: 0.3796

Training CatBoost with BoW...
Learning rate set to 0.006824
0:	learn: 0.6903718	total: 23.2ms	remaining: 11.6s
100:	learn: 0.5205551	total: 1.97s	remaining: 7.8s
200:	learn: 0.4273533	total: 3.94s	remaining: 5.86s
300:	learn: 0.3550720	total: 5.8s	remaining: 3.83s
400:	learn: 0.3034204	total: 7.73s	remaining: 1.91s
499:	learn: 0.2581247	total: 9.62s	remaining: 0us
Results for CatBoost with BoW:
Accuracy: 0.0909
Precision: 0.3191
Recall: 0.4138
F1 Score: 0.3601

===== Running with TF-IDF =====

Training Logistic Regression with TF-IDF...
Results for Logistic Regression with TF-IDF:
Accuracy: 0.0455
Precision: 0.3038
Recall: 0.4114
F1 Score: 0.3494

Training CatBoost with TF-IDF...
Learning rate set to 0.006824


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0:	learn: 0.6904290	total: 60.4ms	remaining: 30.1s
100:	learn: 0.4967897	total: 3.41s	remaining: 13.5s
200:	learn: 0.3880670	total: 6.79s	remaining: 10.1s
300:	learn: 0.3203153	total: 10.1s	remaining: 6.7s
400:	learn: 0.2658375	total: 13.5s	remaining: 3.33s
499:	learn: 0.2249554	total: 16.7s	remaining: 0us
Results for CatBoost with TF-IDF:
Accuracy: 0.0909
Precision: 0.3188
Recall: 0.4138
F1 Score: 0.3598

===== Final Model Comparison with BoW =====


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.0,0.370014,0.392531,0.379624
CatBoost,0.090909,0.319129,0.413753,0.360121



===== Final Model Comparison with TF-IDF =====


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Logistic Regression,0.045455,0.303752,0.411422,0.349383
CatBoost,0.090909,0.318799,0.413753,0.359795


In [35]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from scipy.sparse import hstack

# === 1. Preprocessing Teks ===
def clean_text(text):
    if pd.isna(text) or text.strip() == "":
        return ""
    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Terapkan preprocessing ke semua fitur teks
X_cleaned = df_training[text_columns].applymap(clean_text)

# === 2. Fit TF-IDF Vectorizer Sekali (Shared Vocabulary) ===
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(df_training["all_text_cleaned"])  # Fit hanya sekali dengan vocabulary global

# === 3. Transform setiap fitur teks dengan vectorizer yang sama ===
X_tfidf_features = [tfidf_vectorizer.transform(X_cleaned[col]) for col in text_columns]

# === 4. Gabungkan embedding dari setiap fitur teks ===
X_combined = hstack(X_tfidf_features)  # Gabungkan hasil embedding

# === 5. Train Logistic Regression ===
logreg = OneVsRestClassifier(LogisticRegression(max_iter=1000))
logreg.fit(X_combined, df_training[label_columns].values)

# === 6. Simpan Model Logistic Regression dan TF-IDF Vectorizer ===
with open("logreg.pkl", "wb") as f:
    pickle.dump(logreg, f)
print("✅ Logistic Regression model saved as logreg.pkl")

with open("tfidf.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)
print("✅ Shared TF-IDF vectorizer saved as tfidf.pkl")


✅ Logistic Regression model saved as logreg.pkl
✅ Shared TF-IDF vectorizer saved as tfidf.pkl


  X_cleaned = df_training[text_columns].applymap(clean_text)
