In [1]:
import openslide
import pandas as pd
import numpy as np

# Load the cleaned slide manifest; the loop below reads its "slide_path" column.
df = pd.read_csv(
    "/rds/general/user/dla24/home/thesis/TGCA_dataset/df_clean.csv"
)

# Helper to classify slides
def classify_mpp(mpp):
    """Map a microns-per-pixel value onto a resolution class label.

    Args:
        mpp: Numeric microns-per-pixel value.

    Returns:
        "0.25" when ``mpp`` is within the ``np.isclose`` tolerance
        (``atol=0.02``) of 0.25 or of 0.233, "0.5" when it is within that
        tolerance of 0.5, and "other" for anything else.
    """
    # Anchors are tested in this order; 0.233 is folded into the "0.25"
    # label because it is treated as 40x.
    anchors = (
        (0.25, "0.25"),
        (0.5, "0.5"),
        (0.233, "0.25"),
    )
    for target, label in anchors:
        if np.isclose(mpp, target, atol=0.02):
            return label
    return "other"

# Directory holding the dataset CSVs. The split files are written here so they
# sit next to df_clean.csv regardless of the notebook's working directory
# (the train/test split cell reads slides_mpp025.csv from this directory).
DATASET_DIR = "/rds/general/user/dla24/home/thesis/TGCA_dataset"

# One entry per row of df, in row order: the raw mpp-x value (None when it
# could not be read) and the class label assigned to it.
mpp_list = []
class_list = []

for slide_path in df["slide_path"]:
    try:
        # The context manager closes the slide handle as soon as the property
        # has been read, instead of leaving one open handle per slide.
        with openslide.OpenSlide(slide_path) as slide:
            raw_mpp = slide.properties.get("openslide.mpp-x")
        if raw_mpp is None:
            # Do not guess a resolution: a slide without mpp metadata is
            # reported and labelled "unknown" rather than counted as 0.5.
            raise ValueError("missing openslide.mpp-x property")
        mpp_x = float(raw_mpp)
        mpp_class = classify_mpp(mpp_x)
    except Exception as e:
        print(f"{slide_path}: ERROR {e}")
        mpp_x, mpp_class = None, "unknown"
    # Exactly one append per slide and per list, so both lists always keep the
    # same length as df even when an error occurs part-way through.
    mpp_list.append(mpp_x)
    class_list.append(mpp_class)

df["mpp_x"] = mpp_list
df["mpp_class"] = class_list

# Save split CSVs.
# Rows labelled "unknown" match none of the three filters below, so they are
# not written to any of the output files.
df_025 = df[df["mpp_class"] == "0.25"].reset_index(drop=True)
df_05  = df[df["mpp_class"] == "0.5"].reset_index(drop=True)
df_other = df[df["mpp_class"] == "other"].reset_index(drop=True)

df_025.to_csv(f"{DATASET_DIR}/slides_mpp025.csv", index=False)
df_05.to_csv(f"{DATASET_DIR}/slides_mpp05.csv", index=False)
df_other.to_csv(f"{DATASET_DIR}/slides_mpp_other.csv", index=False)

In [2]:
# Report how many slides ended up in each resolution group.
for caption, group in (
    ("40x group (mpp 0.25):", df_025),
    ("20x group (mpp 0.5):", df_05),
    ("Other group:", df_other),
):
    print(caption, len(group))

40x group (mpp 0.25): 396
20x group (mpp 0.5): 15
Other group: 0


In [3]:
# List the distinct mpp_x values in each group, in ascending order.
for caption, group in (
    ("Unique mpp_x in 40x group:", df_025),
    ("Unique mpp_x in 20x group:", df_05),
    ("Unique mpp_x in other group:", df_other),
):
    distinct_mpp = group["mpp_x"].unique()
    print(caption, sorted(distinct_mpp))

Unique mpp_x in 40x group: [0.2325, 0.2457, 0.2462, 0.2465, 0.2471, 0.2472, 0.248, 0.2498, 0.2519, 0.252, 0.2526, 0.2527]
Unique mpp_x in 20x group: [0.4942, 0.5014]
Unique mpp_x in other group: []


In [4]:
# Show the first rows (path and mpp_x only) of each group; the leading "\n"
# in the later headings separates consecutive groups with a blank line.
preview_cols = ["slide_path", "mpp_x"]
for heading, group in (
    ("Sample 40x group:", df_025),
    ("\nSample 20x group:", df_05),
    ("\nSample other group:", df_other),
):
    print(heading)
    print(group[preview_cols].head())

Sample 40x group:
                                          slide_path   mpp_x
0  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.2520
1  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.2520
2  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.2527
3  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.2520
4  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.2465

Sample 20x group:
                                          slide_path   mpp_x
0  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.5014
1  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.4942
2  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.5014
3  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.5014
4  /rds/general/user/dla24/home/thesis/TGCA_datas...  0.5014

Sample other group:
Empty DataFrame
Columns: [slide_path, mpp_x]
Index: []


In [1]:
# train test split
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("/rds/general/user/dla24/home/thesis/TGCA_dataset/slides_mpp025.csv")

# Split on unique slide IDs: 30% of the IDs are held out first, then that
# held-out set is halved into validation and test (roughly 70/15/15 by ID).
# NOTE(review): to split by patient instead, replace 'slide_id' with
# 'patient_id' throughout this cell — confirm that column exists in the CSV.
unique_slides = df["slide_id"].unique()
train_slides, temp_slides = train_test_split(
    unique_slides, random_state=42, test_size=0.3
)
val_slides, test_slides = train_test_split(
    temp_slides, random_state=42, test_size=0.5
)


def _rows_for(slide_ids):
    """Return the rows of ``df`` whose slide_id is in ``slide_ids``, re-indexed from 0."""
    in_split = df["slide_id"].isin(slide_ids)
    return df.loc[in_split].reset_index(drop=True)


train_df = _rows_for(train_slides)
val_df = _rows_for(val_slides)
test_df = _rows_for(test_slides)

# Write each split next to the source CSV, without the index column.
for split_df, out_path in (
    (train_df, "/rds/general/user/dla24/home/thesis/TGCA_dataset/train_40x.csv"),
    (val_df, "/rds/general/user/dla24/home/thesis/TGCA_dataset/val_40x.csv"),
    (test_df, "/rds/general/user/dla24/home/thesis/TGCA_dataset/test_40x.csv"),
):
    split_df.to_csv(out_path, index=False)