# Getting the data from Kaggle

In [2]:
!pip install kaggle



In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; a later cell copies kaggle.json from there.
drive.mount('/content/drive')

Mounted at /content/drive


In [49]:
# Install the Kaggle API token: copy kaggle.json from the mounted Drive into
# ~/.kaggle (created first if it does not exist).
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [50]:
!kaggle competitions download -c isic-2024-challenge

!unzip -q isic-2024-challenge.zip

isic-2024-challenge.zip: Skipping, found more recently modified local copy (use --force to force download)


Logistic regression


In [66]:
# Sanity check of /content. The cells below read the competition files from
# /content/isic-2024-challenge; the directory creation is kept commented out
# because the folder already exists.
#!mkdir -p /content/isic-2024-challenge
!ls /content/
# NOTE: all competition files were moved into isic-2024-challenge.

drive  isic-2024-challenge


In [67]:
# List the competition folder to confirm the metadata CSVs and HDF5 image files are present.
!ls /content/isic-2024-challenge/

isic-2024-challenge.zip  test-image.hdf5    train-image       train-metadata.csv
sample_submission.csv	 test-metadata.csv  train-image.hdf5


# Logistic Regression Model

In [68]:
import h5py
import pandas as pd
import numpy as np
from PIL import Image
import io
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Location of the training metadata CSV inside the competition folder.
metadata_path = '/content/isic-2024-challenge/train-metadata.csv'
# low_memory=False makes pandas parse the file in a single pass, so column
# dtypes are inferred from the whole column rather than chunk by chunk.
metadata = pd.read_csv(metadata_path, low_memory=False)

In [69]:
# Build the working subset: up to 10,000 rows with target == 0 and up to 393
# rows with target == 1. Capping n at the number of available rows keeps the
# cell from raising if the CSV holds fewer rows of either class.
negatives = metadata[metadata['target'] == 0]
positives = metadata[metadata['target'] == 1]
target_0 = negatives.sample(n=min(10000, len(negatives)), random_state=42)
target_1 = positives.sample(n=min(393, len(positives)), random_state=42)
balanced_metadata = pd.concat([target_0, target_1]).reset_index(drop=True)

# The subset is still heavily skewed towards target == 0, so stratify on the
# label: both splits then keep the same class ratio instead of leaving the
# number of target == 1 rows in the validation set to chance.
train_metadata, val_metadata = train_test_split(
    balanced_metadata,
    test_size=0.2,
    random_state=42,
    stratify=balanced_metadata['target'],
)

In [70]:
def load_and_flatten_images(metadata, hdf5_path, size=(64, 64)):
    """Load the images listed in ``metadata`` and return them as flat vectors.

    Args:
        metadata: DataFrame with an ``isic_id`` column (keys into the HDF5
            file) and a ``target`` column (labels).
        hdf5_path: Path to the HDF5 file holding one encoded image per
            ``isic_id`` key.
        size: ``(width, height)`` every image is resized to before
            flattening. Defaults to 64x64 to keep processing fast.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays. ``features`` has one
        row per metadata row with ``size[0] * size[1] * 3`` values;
        ``labels`` holds the matching ``target`` values in the same order.
    """
    features = []
    with h5py.File(hdf5_path, 'r') as hdf5_file:
        # Iterate over the id column directly; iterrows() would build a full
        # Series per row just to read two fields.
        for isic_id in metadata['isic_id']:
            encoded = hdf5_file[isic_id][()]
            with Image.open(io.BytesIO(encoded)) as image:
                # Force three channels so every flattened vector has the same
                # length; a grayscale or RGBA image would otherwise produce a
                # ragged feature array.
                resized = image.convert('RGB').resize(size)
            features.append(np.asarray(resized).flatten())
    labels = np.array(metadata['target'])
    return np.array(features), labels

# Both splits come from the training HDF5 file; only the metadata rows differ.
train_image_path = '/content/isic-2024-challenge/train-image.hdf5'
X_train, y_train = load_and_flatten_images(train_metadata, train_image_path)
X_val, y_val = load_and_flatten_images(val_metadata, train_image_path)

# Report the array shapes as a quick check on the split sizes and feature length.
for description, array_shape in (
    ("Training Features Shape:", X_train.shape),
    ("Training Labels Shape:", y_train.shape),
    ("Validation Features Shape:", X_val.shape),
    ("Validation Labels Shape:", y_val.shape),
):
    print(description, array_shape)

Training Features Shape: (8314, 12288)
Training Labels Shape: (8314,)
Validation Features Shape: (2079, 12288)
Validation Labels Shape: (2079,)


In [71]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no validation statistics leak into it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
# Train the logistic regression model.
# The sampled data holds far fewer target == 1 rows (393) than target == 0
# rows (10000). class_weight='balanced' reweights samples inversely to class
# frequency so the minority class is not ignored by the fit.
logistic_regression_model = LogisticRegression(max_iter=1000, class_weight='balanced')
logistic_regression_model.fit(X_train, y_train)

In [None]:
# Evaluate on the validation split: predict labels, then print the per-class
# precision / recall / F1 table.
y_pred = logistic_regression_model.predict(X_val)
report_text = classification_report(y_val, y_pred)
print("Classification Report:", report_text, sep="\n")

In [None]:
def plot_sample_images(metadata, hdf5_path, y_true, y_pred, num_images=10, seed=None):
    """Show a random selection of images with their true and predicted labels.

    Args:
        metadata: DataFrame with an ``isic_id`` column; row order must match
            ``y_true`` and ``y_pred``, which are indexed by row position.
        hdf5_path: Path to the HDF5 file holding one encoded image per
            ``isic_id`` key.
        y_true: Sequence of true labels, indexable by row position.
        y_pred: Sequence of predicted labels, indexable by row position.
        num_images: Number of images to draw. Capped at ``len(metadata)``.
        seed: Optional seed for a reproducible selection. ``None`` keeps the
            original behaviour of drawing from NumPy's global random state.
    """
    # Sampling without replacement cannot return more items than exist.
    num_images = min(num_images, len(metadata))
    if num_images <= 0:
        return

    rng = np.random if seed is None else np.random.RandomState(seed)
    indices = rng.choice(len(metadata), num_images, replace=False)

    # Two rows as before, but round the column count up so an odd num_images
    # does not silently drop an image, and a single image gets a 1x1 grid.
    nrows = 2 if num_images > 1 else 1
    ncols = (num_images + nrows - 1) // nrows

    with h5py.File(hdf5_path, 'r') as hdf5_file:
        # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid.
        fig, axes = plt.subplots(nrows, ncols, figsize=(15, 6), squeeze=False)
        axes = axes.flatten()
        for idx, ax in zip(indices, axes):
            isic_id = metadata.iloc[idx]['isic_id']
            image = Image.open(io.BytesIO(hdf5_file[isic_id][()]))
            image = image.resize((64, 64))
            ax.imshow(image)
            ax.set_title(f'True: {y_true[idx]}\nPred: {y_pred[idx]}')
            ax.axis('off')
        # Hide any grid cell left over when num_images is odd.
        for ax in axes[num_images:]:
            ax.axis('off')
        plt.tight_layout()
        plt.show()

# Plot predictions for some validation images
plot_sample_images(val_metadata, train_image_path, y_val, y_pred, num_images=10)