In [1]:
import pandas as pd
import numpy as np

# Seed für Reproduzierbarkeit
np.random.seed(42)

# CSV einlesen
df = pd.read_csv('C:/Users/norad/LetItShine_Dev/whole_19_patients.csv')

In [2]:
df

Unnamed: 0,Slide,Patch,Diagnosis
0,1,0,1
1,1,1,1
2,1,2,1
3,1,3,1
4,1,4,1
...,...,...,...
766560,19,8802,1
766561,19,8806,1
766562,19,8913,1
766563,19,8916,1


In [3]:
unique_slides = df["Slide"].unique()
unique_slides

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])

In [13]:
slide_counts_df = df["Slide"].value_counts().reset_index()
slide_counts_df.columns = ["Slide", "Anzahl"]
print(slide_counts_df)


    Slide  Anzahl
0       6   99320
1       7   87290
2       8   79518
3      14   73005
4      13   70997
5      10   68251
6      18   42523
7      16   35303
8      15   34644
9       1   33707
10     12   30860
11      3   26196
12      9   22980
13     11   19616
14     17   16327
15      2   12686
16      4    7057
17      5    4302
18     19    1983


In [14]:
percentage=100
# Berechne die erwartete Gesamtlänge der Stichproben
total_rows = len(df)
expected_sample_size = total_rows * percentage / 100
print(f"Originallänge des DataFrames: {total_rows} Zeilen")
print(f"Erwartete Stichprobengröße ({percentage}%): {expected_sample_size:.2f} Zeilen")

Originallänge des DataFrames: 766565 Zeilen
Erwartete Stichprobengröße (100%): 766565.00 Zeilen


In [15]:
np.random.seed(42) 
# Initialisierung der Stichproben-DataFrames
sample1 = pd.DataFrame(columns=df.columns)
sample2 = pd.DataFrame(columns=df.columns)
sample3 = pd.DataFrame(columns=df.columns)

# Für jede eindeutige Slide drei separate Stichproben erstellen

if percentage == 100:
    print("100% ausgewählt – verwende vollständige Daten ohne Sampling.")
    sample1 = df.copy()
    sample2 = df.copy()
    sample3 = df.copy()

else:
    for slide in unique_slides:
        slide_data = df[df['Slide'] == slide]
        slide_size = len(slide_data)
        sample_size = max(1, int(slide_size * percentage / 100))

        print(f"Slide {slide}: Gesamtanzahl = {slide_size}, Stichprobengröße = {sample_size}")

        if sample_size * 3 > slide_size:
            print(f"Warnung: Slide {slide} hat nicht genügend Daten für drei nicht-überlappende "
                    f"Stichproben von {percentage}%. Verwende kleinere Stichproben.")
            sample_size = slide_size // 3

        # Indizes mischen
        indices = slide_data.index.tolist()
        np.random.shuffle(indices)

        # Drei separate, nicht-überlappende Stichproben erstellen
        indices1 = indices[:sample_size]
        indices2 = indices[sample_size:2*sample_size]
        indices3 = indices[2*sample_size:3*sample_size]
        
        # Stichproben zu den jeweiligen DataFrames hinzufügen
        sample1 = pd.concat([sample1, df.loc[indices1]])
        sample2 = pd.concat([sample2, df.loc[indices2]])
        sample3 = pd.concat([sample3, df.loc[indices3]])

print(f"Stichprobe 1: {len(sample1)} Zeilen")
print(f"Stichprobe 2: {len(sample2)} Zeilen")
print(f"Stichprobe 3: {len(sample3)} Zeilen")


100% ausgewählt – verwende vollständige Daten ohne Sampling.
Stichprobe 1: 766565 Zeilen
Stichprobe 2: 766565 Zeilen
Stichprobe 3: 766565 Zeilen


In [16]:
sample1, sample2, sample3

(        Slide  Patch  Diagnosis
 0           1      0          1
 1           1      1          1
 2           1      2          1
 3           1      3          1
 4           1      4          1
 ...       ...    ...        ...
 766560     19   8802          1
 766561     19   8806          1
 766562     19   8913          1
 766563     19   8916          1
 766564     19   9303          1
 
 [766565 rows x 3 columns],
         Slide  Patch  Diagnosis
 0           1      0          1
 1           1      1          1
 2           1      2          1
 3           1      3          1
 4           1      4          1
 ...       ...    ...        ...
 766560     19   8802          1
 766561     19   8806          1
 766562     19   8913          1
 766563     19   8916          1
 766564     19   9303          1
 
 [766565 rows x 3 columns],
         Slide  Patch  Diagnosis
 0           1      0          1
 1           1      1          1
 2           1      2          1
 3           1  

In [17]:
# Überprüfung auf Überlappungen zwischen den Stichproben
overlap_1_2 = set(sample1.index).intersection(set(sample2.index))
overlap_1_3 = set(sample1.index).intersection(set(sample3.index))
overlap_2_3 = set(sample2.index).intersection(set(sample3.index))

if overlap_1_2 or overlap_1_3 or overlap_2_3:
    print("\nWARNUNG: Es wurden Überlappungen zwischen den Stichproben gefunden!")
    print(f"Überlappung zwischen Stichprobe 1 und 2: {len(overlap_1_2)} Einträge")
    print(f"Überlappung zwischen Stichprobe 1 und 3: {len(overlap_1_3)} Einträge")
    print(f"Überlappung zwischen Stichprobe 2 und 3: {len(overlap_2_3)} Einträge")
else:
    print("\nKeine Überlappungen zwischen den Stichproben gefunden. ✓")


WARNUNG: Es wurden Überlappungen zwischen den Stichproben gefunden!
Überlappung zwischen Stichprobe 1 und 2: 766565 Einträge
Überlappung zwischen Stichprobe 1 und 3: 766565 Einträge
Überlappung zwischen Stichprobe 2 und 3: 766565 Einträge


In [18]:
wenyi_fold1_train = ["02", "03", "08", "10", "13", "14", "17", "18", "19"]
wenyi_fold1_test = ["05", "07", "09", "15", "16"]
wenyi_fold2_train = ["02", "05", "07", "09", "13", "14", "15", "16", "18", "19"]
wenyi_fold2_test = ["03", "08", "10", "17"]
wenyi_fold3_train = ["03", "05", "07", "08", "09", "10", "15", "16", "17"]
wenyi_fold3_test = ["02", "13", "14", "18", "19"]
wenyi_val = ["01", "04", "06","11", "12"]

partition1 = ["01", "02", "04", "06", "08", "12", "19"]
partition2 = ["05", "07", "13", "15", "17", "18"]
partition3 = ["03", "09", "10", "11", "14", "16"]

partition4 = ["01", "02", "05", "07", "09", "14", "17"]
partition5 = ["04", "08", "10", "13", "18", "19"]
partition6 = ["03", "06", "11", "12", "15", "16"]

cv5_partition1 = ["01", "02", "04", "06"]
cv5_partition2 = ["08", "13", "16", "18"]
cv5_partition1 = ["03", "07", "09", "17"]
cv5_partition2 = ["10", "11", "15", "19"]
cv5_partition1 = ["05", "12", "14"]

In [10]:
# output_dir = "C:/Users/norad/LetItShine_Dev/FINAL_FOLDS/wenyi_20"
# fold1_train = wenyi_fold1_train
# fold1_val = wenyi_val
# fold1_test = wenyi_fold1_test

# fold2_train = wenyi_fold2_train
# fold2_val = wenyi_val
# fold2_test = wenyi_fold2_test

# fold3_train = wenyi_fold3_train
# fold3_val = wenyi_val
# fold3_test = wenyi_fold3_test

In [19]:
output_dir = "C:/Users/norad/LetItShine_Dev/FINAL_FOLDS/p123"
fold1_train = partition3
fold1_val = partition2
fold1_test = partition1

fold2_train = partition1
fold2_val = partition3
fold2_test = partition2

fold3_train = partition2
fold3_val = partition1
fold3_test = partition3

In [None]:
# output_dir = "C:/Users/norad/LetItShine_Dev/FINAL_FOLDS/p456_5"
# fold1_train = partition6
# fold1_val = partition5
# fold1_test = partition4

# fold2_train = partition4
# fold2_val = partition6
# fold2_test = partition5

# fold3_train = partition5
# fold3_val = partition4
# fold3_test = partition6

In [20]:
import os
# Funktion zum Filtern der Daten nach Slide-Nummern und Speichern als CSV
def filter_and_save(sample_df, slide_numbers, output_path):
    # Debugging: Überprüfen der unique Werte in der Slide-Spalte
    #print(f"Unique Werte in der Slide-Spalte: {sample_df['Slide'].unique()}")
    #print(f"Gesuchte Slide-Nummern: {slide_numbers}")
    
    # Umwandlung der zu suchenden Slide-Nummern in eine Liste von Strings
    slide_numbers_int = [int(s) for s in slide_numbers]
    
    # Filtern nach den angegebenen Slide-Nummern
    filtered_data = sample_df[sample_df['Slide'].isin(slide_numbers_int)]
    
    # Ausgabe zur Überprüfung
    #print(f"Anzahl der gefilterten Zeilen: {len(filtered_data)}")
    
    # Sicherstellen, dass das Ausgabeverzeichnis existiert
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    # Als CSV speichern, nur wenn Daten vorhanden sind
    if len(filtered_data) > 0:
        filtered_data.to_csv(output_path, index=False)
        print(f"Gespeichert: {output_path} mit {len(filtered_data)} Zeilen")
    else:
        print(f"WARNUNG: Keine Daten zum Speichern für {output_path}")
    
    return filtered_data

In [21]:
# Fold 1: sample1 als Basis
train_file = os.path.join(output_dir, "fold_1/train.csv")
val_file = os.path.join(output_dir, "fold_1/val.csv")
test_file = os.path.join(output_dir, "fold_1/test.csv")

train_data = filter_and_save(sample1, fold1_train, train_file)
val_data = filter_and_save(sample1, fold1_val, val_file)
test_data = filter_and_save(sample1, fold1_test, test_file)

print("\nStatistiken Fold 1:")
print(f"Training: {len(train_data)} Zeilen, Slides: {fold1_train}")
print(f"Validierung: {len(val_data)} Zeilen, Slides: {fold1_val}")
print(f"Test: {len(test_data)} Zeilen, Slides: {fold1_test}")

# Fold 2: sample2 als Basis
train_file = os.path.join(output_dir, "fold_2/train.csv")
val_file = os.path.join(output_dir, "fold_2/val.csv")
test_file = os.path.join(output_dir, "fold_2/test.csv")

train_data = filter_and_save(sample2, fold2_train, train_file)
val_data = filter_and_save(sample2, fold2_val, val_file)
test_data = filter_and_save(sample2, fold2_test, test_file)

print("\nStatistiken Fold 2:")
print(f"Training: {len(train_data)} Zeilen, Slides: {fold2_train}")
print(f"Validierung: {len(val_data)} Zeilen, Slides: {fold2_val}")
print(f"Test: {len(test_data)} Zeilen, Slides: {fold2_test}")

# Fold 3: sample3 als Basis
train_file = os.path.join(output_dir, "fold_3/train.csv")
val_file = os.path.join(output_dir, "fold_3/val.csv")
test_file = os.path.join(output_dir, "fold_3/test.csv")

train_data = filter_and_save(sample3, fold3_train, train_file)
val_data = filter_and_save(sample3, fold3_val, val_file)
test_data = filter_and_save(sample3, fold3_test, test_file)

print("\nStatistiken Fold 3:")
print(f"Training: {len(train_data)} Zeilen, Slides: {fold3_train}")
print(f"Validierung: {len(val_data)} Zeilen, Slides: {fold3_val}")
print(f"Test: {len(test_data)} Zeilen, Slides: {fold3_test}")

Gespeichert: C:/Users/norad/LetItShine_Dev/FINAL_FOLDS/p123\fold_1/train.csv mit 245351 Zeilen
Gespeichert: C:/Users/norad/LetItShine_Dev/FINAL_FOLDS/p123\fold_1/val.csv mit 256083 Zeilen
Gespeichert: C:/Users/norad/LetItShine_Dev/FINAL_FOLDS/p123\fold_1/test.csv mit 265131 Zeilen

Statistiken Fold 1:
Training: 245351 Zeilen, Slides: ['03', '09', '10', '11', '14', '16']
Validierung: 256083 Zeilen, Slides: ['05', '07', '13', '15', '17', '18']
Test: 265131 Zeilen, Slides: ['01', '02', '04', '06', '08', '12', '19']
Gespeichert: C:/Users/norad/LetItShine_Dev/FINAL_FOLDS/p123\fold_2/train.csv mit 265131 Zeilen
Gespeichert: C:/Users/norad/LetItShine_Dev/FINAL_FOLDS/p123\fold_2/val.csv mit 245351 Zeilen
Gespeichert: C:/Users/norad/LetItShine_Dev/FINAL_FOLDS/p123\fold_2/test.csv mit 256083 Zeilen

Statistiken Fold 2:
Training: 265131 Zeilen, Slides: ['01', '02', '04', '06', '08', '12', '19']
Validierung: 245351 Zeilen, Slides: ['03', '09', '10', '11', '14', '16']
Test: 256083 Zeilen, Slides: [

In [26]:
sample3

Unnamed: 0,Slide,Patch,Diagnosis
29842,1,30749,1
24408,1,25104,1
11967,1,12322,1
14467,1,14887,1
24929,1,25637,1
...,...,...,...
766003,19,3990,1
764863,19,978,1
764640,19,319,1
765187,19,2097,1


In [None]:
wenyi_val

['01', '04', '06', '11', '12']

In [49]:
wenyi_val_int = [int(slide_num) for slide_num in wenyi_val]
wenyi_val_int

[1, 4, 6, 11, 12]

In [48]:
unique_slides = sample1["Slide"].unique()
unique_slides

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
      dtype=object)

In [50]:
filtered_data = sample1[sample1['Slide'].isin(wenyi_val_int)]
filtered_data

Unnamed: 0,Slide,Patch,Diagnosis
22105,1,22735,1
14731,1,15153,1
17366,1,17854,1
11903,1,12251,1
31909,1,32889,1
...,...,...,...
474009,12,13260,0
486810,12,26326,0
485159,12,24648,0
486323,12,25834,0
