<a href="https://colab.research.google.com/github/davidarvai/DIPLOMADOLGOZAT-/blob/main/Xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import math
import numpy as np
import nibabel as nib
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# -----------------------------
# Konfúziós mátrix és metrikák függvényei
# -----------------------------
def get_custom_confusion_matrix(tumor_type, tn, fp, fn, tp):
    if tumor_type == "Whole Tumor":
        custom_matrix = np.array([[tn, fp, fp, fp],
                                  [fn, tp, tp, tp],
                                  [fn, tp, tp, tp],
                                  [fn, tp, tp, tp]])
    elif tumor_type == "Edema":
        custom_matrix = np.array([[tn, tn, fp, tn],
                                  [tn, tn, fp, tn],
                                  [fn, fn, tp, fn],
                                  [tn, tn, fp, tn]])
    elif tumor_type == "Tumor Core":
        custom_matrix = np.array([[tn, fp, tn, fp],
                                  [fn, tp, fn, tp],
                                  [tn, fp, tn, fp],
                                  [fn, tp, fn, tp]])
    elif tumor_type == "Enhancing Core":
        custom_matrix = np.array([[tn, tn, tn, fp],
                                  [tn, tn, tn, fp],
                                  [tn, tn, tn, fp],
                                  [fn, fn, fn, tp]])
    else:
        custom_matrix = None
    return custom_matrix

def compute_confusion(gt_mask, pred_mask):
    tn = np.sum((gt_mask==False) & (pred_mask==False))
    tp = np.sum((gt_mask==True)  & (pred_mask==True))
    fp = np.sum((gt_mask==False) & (pred_mask==True))
    fn = np.sum((gt_mask==True)  & (pred_mask==False))
    return tn, fp, fn, tp

def compute_metrics(tn, fp, fn, tp):
    TPR = tp / (tp + fn) if (tp+fn) > 0 else 0
    TNR = tn / (tn + fp) if (tn+fp) > 0 else 0
    PPV = tp / (tp + fp) if (tp+fp) > 0 else 0
    NPV = tn / (tn + fn) if (tn+fn) > 0 else 0
    ACC = (tp + tn) / (tp + tn + fp + fn) if (tp+tn+fp+fn)>0 else 0
    DS  = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn)>0 else 0
    return TPR, TNR, PPV, NPV, ACC, DS

# -----------------------------
# Adat betöltés és előfeldolgozás
# -----------------------------
def remap_segmentation(seg):
    seg_new = np.copy(seg)
    # Az eredeti adatokban a label 4 az enhancing core, amit átmappolunk 3-ra
    seg_new[seg == 4] = 3
    return seg_new

def load_subject_data(subject_path):
    files = os.listdir(subject_path)
    subject_data = {}
    for file in files:
        if file.endswith('.nii') or file.endswith('.nii.gz'):
            lower = file.lower()
            if 'seg' in lower:
                subject_data['seg'] = os.path.join(subject_path, file)
            else:
                for mod in ['t1', 't1ce', 't2', 'flair']:
                    if mod in lower:
                        subject_data[mod] = os.path.join(subject_path, file)
    return subject_data

def load_data_from_dir(data_dir):
    X_list = []  # 3D volume (H,W,D,4)
    Y_list = []  # 3D segmentation (H,W,D)
    subject_names = []
    subject_dirs = [os.path.join(data_dir, d) for d in os.listdir(data_dir)
                    if os.path.isdir(os.path.join(data_dir, d))]
    subject_dirs.sort()
    for subject_path in subject_dirs:
        data_files = load_subject_data(subject_path)
        if all(mod in data_files for mod in ['t1', 't1ce', 't2', 'flair']) and 'seg' in data_files:
            modality_imgs = []
            for mod in ['t1', 't1ce', 't2', 'flair']:
                img = nib.load(data_files[mod]).get_fdata()
                modality_imgs.append(img)
            X = np.stack(modality_imgs, axis=-1)  # shape: (H,W,D,4)
            seg = nib.load(data_files['seg']).get_fdata()
            seg = remap_segmentation(seg)
            X_list.append(X)
            Y_list.append(seg)
            subject_names.append(os.path.basename(subject_path))
        else:
            print("Hiányos adatok:", subject_path)
    return X_list, Y_list, subject_names

def normalize_volume(vol):
    vol = vol.astype(np.float32)
    vol = (vol - np.min(vol)) / (np.max(vol) - np.min(vol) + 1e-8)
    return vol

# -----------------------------
# Szeletek kinyerése (2D) – hasonló logika, mint a CNN esetében
# -----------------------------
def extract_slices(volume, seg, slice_axis=2, include_bg_ratio=0.3):
    slices_x = []
    slices_y = []
    D = volume.shape[slice_axis]
    for i in range(D):
        img_slice = volume[:, :, i, :]  # (H, W, 4)
        seg_slice = seg[:, :, i]         # (H, W)
        # Ha a szeletben van tumor, vagy véletlenszerűen választunk háttér szeletet
        if np.sum(seg_slice > 0) > 0.01 * (seg_slice.shape[0] * seg_slice.shape[1]):
            slices_x.append(img_slice)
            slices_y.append(seg_slice)
        else:
            if np.random.rand() < include_bg_ratio:
                slices_x.append(img_slice)
                slices_y.append(seg_slice)
    return slices_x, slices_y

# -----------------------------
# Pixel mintavételezés egy szeletből
# -----------------------------
def sample_pixels_from_slice(img_slice, seg_slice, num_samples=1000, tumor_ratio=0.5):
    H, W, C = img_slice.shape  # C=4
    # Lekérjük az indexeket
    tumor_idx = np.argwhere(seg_slice > 0)
    bg_idx = np.argwhere(seg_slice == 0)

    n_tumor = int(num_samples * tumor_ratio)
    n_bg = num_samples - n_tumor

    # Ha túl kevés tumor pixel van, vegyük az összeset
    if len(tumor_idx) < n_tumor:
        tumor_sample = tumor_idx
        n_bg = num_samples - len(tumor_sample)
    else:
        indices = np.random.choice(len(tumor_idx), n_tumor, replace=False)
        tumor_sample = tumor_idx[indices]

    if len(bg_idx) < n_bg:
        bg_sample = bg_idx
    else:
        indices = np.random.choice(len(bg_idx), n_bg, replace=False)
        bg_sample = bg_idx[indices]

    # Egyesítjük a mintákat
    all_idx = np.concatenate([tumor_sample, bg_sample], axis=0)
    # Véletlen sorrendbe keverjük az indexeket
    np.random.shuffle(all_idx)

    # Kinyerjük a jellemzőket és a címkéket
    features = []
    labels = []
    for idx in all_idx:
        i, j = idx
        features.append(img_slice[i, j, :])
        labels.append(int(seg_slice[i, j]))
    return np.array(features), np.array(labels)

# -----------------------------
# Tréning adathalmaz előállítása pixel szinten
# -----------------------------
def create_training_dataset(volumes, segmentations, num_samples_per_slice=1000):
    X_samples = []
    y_samples = []
    for vol, seg in zip(volumes, segmentations):
        slices_img, slices_seg = extract_slices(vol, seg, slice_axis=2, include_bg_ratio=0.3)
        for img_slice, seg_slice in zip(slices_img, slices_seg):
            # Ha a szelet túl kicsi, kihagyjuk
            if img_slice.shape[0] < 10 or img_slice.shape[1] < 10:
                continue
            Xp, yp = sample_pixels_from_slice(img_slice, seg_slice, num_samples=num_samples_per_slice, tumor_ratio=0.5)
            X_samples.append(Xp)
            y_samples.append(yp)
    if len(X_samples) == 0:
        raise ValueError("Nincsenek érvényes szelet minták!")
    X_all = np.concatenate(X_samples, axis=0)
    y_all = np.concatenate(y_samples, axis=0)
    return X_all, y_all

# -----------------------------
# Teszt adatoknál a teljes szelet előrejelzésének függvénye
# -----------------------------
def predict_volume(model, volume):
    H, W, D, _ = volume.shape
    pred_vol = np.zeros((H, W, D), dtype=np.int32)
    for i in range(D):
        x_slice = volume[:, :, i, :]  # (H, W, 4)
        # Flatten a szeletet: (H*W, 4)
        x_flat = x_slice.reshape(-1, x_slice.shape[-1])
        # Model előrejelzés
        pred_flat = model.predict(x_flat)
        pred_slice = pred_flat.reshape(H, W)
        pred_vol[:, :, i] = pred_slice
    return pred_vol

# -----------------------------
# A kapott 3D eredmény bináris maszkra konvertálása a tumor típus alapján
# -----------------------------
def get_binary_mask_3d(segmentation, tumor_type):
    if tumor_type == "Whole Tumor":
        return np.isin(segmentation, [1,2,3])
    elif tumor_type == "Edema":
        return (segmentation == 2)
    elif tumor_type == "Tumor Core":
        return np.isin(segmentation, [1,3])
    elif tumor_type == "Enhancing Core":
        return (segmentation == 3)
    else:
        raise ValueError("Ismeretlen tumor típus!")

# -----------------------------
# Fő program
# -----------------------------
if __name__ == "__main__":
    # Állítsd be az útvonalakat
    train_dir = "/content/drive/My Drive/Allamvizsga/Data/Teszt/Train"
    test_dir  = "/content/drive/My Drive/Allamvizsga/Data/Teszt/Teszt"

    print("Train adatok betöltése...")
    X_train_vols, Y_train_vols, train_subject_names = load_data_from_dir(train_dir)
    if len(X_train_vols) == 0:
        raise ValueError("Nincsenek betöltött train adatok!")
    X_train_vols = [normalize_volume(vol) for vol in X_train_vols]

    print("Tréning pixel adatok előállítása...")
    X_pixels, y_pixels = create_training_dataset(X_train_vols, Y_train_vols, num_samples_per_slice=1000)
    print("Összes minta:", X_pixels.shape, y_pixels.shape)

    # Train/Validation split (pixel szinten)
    X_train, X_val, y_train, y_val = train_test_split(X_pixels, y_pixels, test_size=0.2, random_state=42)

    # XGBoost osztályozó – 4 osztály (0,1,2,3)
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=4,
        use_label_encoder=False,
        eval_metric='mlogloss',
        verbosity=1,
        random_state=42
    )

    print("XGBoost tréning...")
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

    # -----------------------------
    # Tesztelés: A teljes 3D térkép előrejelzése szeletenként
    # -----------------------------
    print("Teszt adatok betöltése...")
    X_test_vols, Y_test_vols, test_subject_names = load_data_from_dir(test_dir)
    if len(X_test_vols) == 0:
        raise ValueError("Nincsenek betöltött teszt adatok!")
    X_test_vols = [normalize_volume(vol) for vol in X_test_vols]

    metrics_rows = []
    output_txt_lines = []
    tumor_types = ["Whole Tumor", "Edema", "Tumor Core", "Enhancing Core"]

    for vol, seg, subj_name in zip(X_test_vols, Y_test_vols, test_subject_names):
        print("Előrejelzés a", subj_name, "subjectön...")
        pred_vol = predict_volume(model, vol)

        # Metrikák számítása minden tumor típusra
        for tumor in tumor_types:
            gt_mask = get_binary_mask_3d(seg, tumor)
            pred_mask = get_binary_mask_3d(pred_vol, tumor)
            tn, fp, fn, tp = compute_confusion(gt_mask, pred_mask)
            TPR, TNR, PPV, NPV, ACC, DS = compute_metrics(tn, fp, fn, tp)
            cm = get_custom_confusion_matrix(tumor, tn, fp, fn, tp)

            metrics_rows.append({
                "Name": subj_name,
                "TumorType": tumor,
                "TP": tp,
                "TN": tn,
                "FP": fp,
                "FN": fn,
                "TPR": round(TPR, 3),
                "TNR": round(TNR, 3),
                "PPV": round(PPV, 3),
                "NPV": round(NPV, 3),
                "ACC": round(ACC, 3),
                "DS": round(DS, 3)
            })

            txt_block = f"Mapa neve: {subj_name}\nTumor típus: {tumor}\nKonfúziós mátrix:\n{cm}\n"
            txt_block += f"True Positive Rate (TPR): {round(TPR,3)}\n"
            txt_block += f"True Negative Rate (TNR): {round(TNR,3)}\n"
            txt_block += f"Positive Predictive Value (PPV): {round(PPV,3)}\n"
            txt_block += f"Negative Predictive Value (NPV): {round(NPV,3)}\n"
            txt_block += f"Accuracy (ACC): {round(ACC,3)}\n"
            txt_block += f"Dice Score (DS): {round(DS,3)}\n\n"
            output_txt_lines.append(txt_block)

    # Eredmények mentése CSV-be
    metrics_df = pd.DataFrame(metrics_rows, columns=["Name", "TumorType", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV", "ACC", "DS"])
    metrics_df.to_csv("metrics_output.csv", index=False)
    print("A metrics_output.csv fájl elmentve.")

    # Eredmények mentése TXT-be
    with open("output.txt", "w") as f:
        f.write("".join(output_txt_lines))
    print("Az output.txt fájl elmentve.")


Train adatok betöltése...
Tréning pixel adatok előállítása...
Összes minta: (771000, 4) (771000,)
XGBoost tréning...
[0]	validation_0-mlogloss:0.94814


Parameters: { "use_label_encoder" } are not used.



[1]	validation_0-mlogloss:0.72045
[2]	validation_0-mlogloss:0.57577
[3]	validation_0-mlogloss:0.47606
[4]	validation_0-mlogloss:0.40509
[5]	validation_0-mlogloss:0.35455
[6]	validation_0-mlogloss:0.31755
[7]	validation_0-mlogloss:0.28996
[8]	validation_0-mlogloss:0.27001
[9]	validation_0-mlogloss:0.25491
[10]	validation_0-mlogloss:0.24277
[11]	validation_0-mlogloss:0.23371
[12]	validation_0-mlogloss:0.22641
[13]	validation_0-mlogloss:0.22126
[14]	validation_0-mlogloss:0.21702
[15]	validation_0-mlogloss:0.21367
[16]	validation_0-mlogloss:0.21127
[17]	validation_0-mlogloss:0.20902
[18]	validation_0-mlogloss:0.20702
[19]	validation_0-mlogloss:0.20506
[20]	validation_0-mlogloss:0.20324
[21]	validation_0-mlogloss:0.20238
[22]	validation_0-mlogloss:0.20159
[23]	validation_0-mlogloss:0.20097
[24]	validation_0-mlogloss:0.19978
[25]	validation_0-mlogloss:0.19910
[26]	validation_0-mlogloss:0.19797
[27]	validation_0-mlogloss:0.19708
[28]	validation_0-mlogloss:0.19644
[29]	validation_0-mlogloss:0.