In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from PIL import Image
import os
from tqdm import tqdm
import sys

import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Paths
data_dir = r'D:\archive (2)\HAM10000_images_part_1'
metadata_file = r'D:\archive (2)\HAM10000_metadata.csv'

# Load metadata
df = pd.read_csv(metadata_file)

# Create a set of expected image filenames from metadata
expected_images = set(df['image_id'] + '.jpg')

# Get the set of actual image files in the directory
actual_images = set(os.listdir(data_dir))

# Find images in metadata but not in directory
missing_images = expected_images - actual_images

# Find images in directory but not in metadata
extra_images = actual_images - expected_images

print(f"Total images in metadata: {len(expected_images)}")
print(f"Total images in directory: {len(actual_images)}")
print(f"Images in metadata but missing from directory: {len(missing_images)}")
print(f"Images in directory but not in metadata: {len(extra_images)}")

# Optional: Save results to files
with open('missing_images.txt', 'w') as f:
    for img in missing_images:
        f.write(f"{img}\n")

with open('extra_images.txt', 'w') as f:
    for img in extra_images:
        f.write(f"{img}\n")

print("\nResults have been saved to 'missing_images.txt' and 'extra_images.txt'")

# Check if there are any mismatches
if missing_images or extra_images:
    print("\nThere are mismatches between the metadata and image files.")
    print("The script will continue, but please note that some images may be excluded from training.")
    
    # Remove missing images from the dataframe
    df = df[~df['image_id'].isin([img[:-4] for img in missing_images])]

# Prepare data for model
df['path'] = df['image_id'] + '.jpg'
df['path'] = df['path'].apply(lambda x: os.path.join(data_dir, x))

# Verify all paths exist
df = df[df['path'].apply(os.path.exists)]

print(f"\nProceeding with model training using {len(df)} images.")

# Split the data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Image data generators
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    validation_split=0.2
)

test_datagen = ImageDataGenerator(rescale=1./255)

# Set up generators
train_generator = train_datagen.flow_from_dataframe(
    train_df,
    x_col='path',
    y_col='dx',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='training'
)

validation_generator = train_datagen.flow_from_dataframe(
    train_df,
    x_col='path',
    y_col='dx',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='validation'
)

test_generator = test_datagen.flow_from_dataframe(
    test_df,
    x_col='path',
    y_col='dx',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    shuffle=False
)

# Create the model
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(7, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=output)

# Freeze the base model layers
for layer in base_model.layers:
    layer.trainable = False

# Compile the model
model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=10,
    validation_data=validation_generator,
    validation_steps=len(validation_generator)
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(test_generator, steps=len(test_generator))
print(f"Test accuracy: {test_accuracy:.4f}")

# Generate predictions
predictions = model.predict(test_generator, steps=len(test_generator))
predicted_classes = np.argmax(predictions, axis=1)
true_classes = test_generator.classes

# Print classification report
class_labels = list(train_generator.class_indices.keys())
print(classification_report(true_classes, predicted_classes, target_names=class_labels))

# Plot confusion matrix
cm = confusion_matrix(true_classes, predicted_classes)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Plot training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

Total images in metadata: 10015
Total images in directory: 5000
Images in metadata but missing from directory: 5015
Images in directory but not in metadata: 0

Results have been saved to 'missing_images.txt' and 'extra_images.txt'

There are mismatches between the metadata and image files.
The script will continue, but please note that some images may be excluded from training.

Proceeding with model training using 5000 images.
Found 3200 validated image filenames belonging to 7 classes.
Found 800 validated image filenames belonging to 7 classes.
Found 1000 validated image filenames belonging to 7 classes.


ValueError: Argument(s) not recognized: {'lr': 0.0001}