In [15]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing import image

In [None]:
# Directory holding the batch-1 CSV files; defined once so both reads stay in sync.
BATCH1_DIR = 'datasets/thyroid_ultrasonic_image/batch1_image'

# Both tables are joined on 'patient_name' further down the notebook.
path_df = pd.read_csv(os.path.join(BATCH1_DIR, 'batch1_image.csv'))
image_label = pd.read_csv(os.path.join(BATCH1_DIR, 'batch1_image_label.csv'))

In [16]:
# ImageNet-pretrained EfficientNetB0 used as a fixed feature extractor:
# the classification head is dropped and global average pooling is applied.
cnn_model = EfficientNetB0(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(224, 224, 3),
)
def preprocess_image(img_path, target_size=(224, 224)):
    """Load an image from disk as a float array suitable for ``cnn_model``.

    Keras EfficientNet models include their own input rescaling layer and
    expect raw pixel values in the [0, 255] range, so the pixels are
    deliberately NOT divided by 255 here (doing so would scale the input
    twice).

    Args:
        img_path: Path of the image file to load.
        target_size: ``(height, width)`` the image is resized to on load.
            Must match the spatial part of the model's ``input_shape``.

    Returns:
        The resized image as an array produced by ``image.img_to_array``,
        with pixel values left in the [0, 255] range.
    """
    img = image.load_img(img_path, target_size=target_size)
    return image.img_to_array(img)

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [33]:
def extract_patient_features(df, batch_size=32):
    """Build one CNN feature vector per patient by mean-pooling image embeddings.

    Rows of ``df`` are grouped by ``patient_name``. Every image of a patient
    that exists on disk and loads successfully is preprocessed with
    ``preprocess_image``; all of the patient's images are then embedded by
    ``cnn_model`` in a single batched ``predict`` call (instead of one call
    per image) and the embeddings are averaged into one patient vector.

    Args:
        df: DataFrame with the columns ``patient_name``, ``image_path`` and
            ``label``. The label of a patient is taken from the first row of
            its group.
        batch_size: Maximum number of images sent through the network per
            forward pass.

    Returns:
        A tuple ``(features, labels)`` of NumPy arrays with one entry per
        patient that had at least one usable image, in the order the groups
        are produced by ``groupby``.
    """
    patient_features = []
    patient_labels = []

    for patient, group in df.groupby('patient_name'):
        label = group['label'].iloc[0]

        # Collect every image that can actually be read for this patient.
        loaded_images = []
        for img_path in group['image_path']:
            if not os.path.exists(img_path):
                print(f"[SKIPPED] Missing file: {img_path}")
                continue

            try:
                loaded_images.append(preprocess_image(img_path))
            except Exception as e:
                # Best effort: a single unreadable image must not abort the run.
                print(f"[ERROR] Problem with {img_path}: {e}")

        if not loaded_images:
            print(f"[WARNING] No valid images for patient: {patient}, skipping")
            continue

        # One batched forward pass per patient; calling predict() once per
        # image pays the per-call overhead for every single file.
        try:
            image_vecs = cnn_model.predict(
                np.array(loaded_images), batch_size=batch_size, verbose=0
            )
        except Exception as e:
            print(f"[ERROR] Feature extraction failed for patient {patient}: {e}")
            continue

        # Aggregate all image vectors into one patient vector (mean pooling).
        patient_features.append(np.mean(image_vecs, axis=0))
        patient_labels.append(label)

    return np.array(patient_features), np.array(patient_labels)

In [34]:
# Attach each image row to its patient's label; the inner join keeps only
# patients present in both tables.
merged_df = path_df.merge(image_label, how='inner', on='patient_name')

# Keep the needed columns, give them their final names, and prefix the
# dataset directory onto every image path.
final_df = (
    merged_df[['path', 'patient_name', 'histo_label']]
    .rename(columns={'path': 'image_path', 'histo_label': 'label'})
    .assign(
        image_path=lambda frame: (
            'datasets/thyroid_ultrasonic_image/batch1_image/dataset/' + frame['image_path']
        )
    )
)
print(final_df.head())

                                          image_path  patient_name  label
0  datasets/thyroid_ultrasonic_image/batch1_image...           341      0
1  datasets/thyroid_ultrasonic_image/batch1_image...           341      0
2  datasets/thyroid_ultrasonic_image/batch1_image...           341      0
3  datasets/thyroid_ultrasonic_image/batch1_image...           341      0
4  datasets/thyroid_ultrasonic_image/batch1_image...           341      0


In [None]:
# Per-patient feature matrix (X) and the matching labels (y).
X, y = extract_patient_features(df=final_df)


[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/0_002.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/2_008.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/2_009.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/3_007.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/3_008.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/4_029.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/4_030.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/4_031.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/4_032.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/4_033.Jpg
[SKIPPED] Missing file: datasets/thyroid_ultrasonic_image/batch1_image/dataset/4_034.Jpg
[SKIPPED] Missing fil