<a href="https://colab.research.google.com/github/enVives/TFG/blob/main/tasca1/AlexNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [98]:
import torch
import cv2
import random
import numpy as np
import os


from google.colab import files
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from skimage import io
import kagglehub
import pandas as pd
from glob import glob


from torchvision import transforms,models
from torch import nn
from torch.utils.data import Dataset

# !rm -rf /content/sample_data/*

# !kaggle datasets download -d "kmader/skin-cancer-mnist-ham10000"

# !unzip -o skin-cancer-mnist-ham10000.zip -d /content/sample_data/

!rm -rf /content/sample_data/*

!kaggle datasets download -d "kmader/skin-cancer-mnist-ham10000"

!unzip -o skin-cancer-mnist-ham10000.zip -d /content/sample_data/

In [99]:
metadates = pd.read_csv('/content/sample_data/HAM10000_metadata.csv')
metadates = metadates.sort_values(by='image_id')
metadates.head()
print(metadates['dx'].value_counts())
print()
print(metadates['dx'].value_counts() / sum(metadates['dx'].value_counts()))

dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64

dx
nv       0.669496
mel      0.111133
bkl      0.109735
bcc      0.051323
akiec    0.032651
vasc     0.014179
df       0.011483
Name: count, dtype: float64


nv: melanocytic nevi

vasc: vascular lesions

mel:melanoma

df: dermatofibroma

bkl: benign keratosis-like lesions

bcc: basal cell carcinoma

akiec: Actinic keratoses and intraepithelial carcinoma / Bowen's disease

In [100]:
class Formes(Dataset):
  #Classe on gestionarem les imatges dels fitxers

  def __init__(self, images, labels, transform):
        super().__init__()
        self.paths = images
        self.labels = labels
        self.len = len(self.paths)
        self.transform = transform
        #Per defecte pens que el color pot extreure característiques importants, per tant en primer lloc
        #entrenarem les imatges de color
        self.greyscale = False

  def __len__(self):
        return self.len

  def __setgreyscale__(self,mode):
    self.greyscale = mode

  def __getitem__(self, index):
      path = self.paths[index]
      label = self.labels[index]

      image = cv2.imread(path, cv2.IMREAD_GRAYSCALE if self.greyscale else cv2.IMREAD_COLOR)  # Depén de vosaltres
      image = self.transform(image)

      return image, label

In [101]:
def calcula_mitjana_desviacio(img_files_path):
  mitjana = np.array([0.0, 0.0, 0.0], dtype=np.float64)
  desviacio = np.array([0.0, 0.0, 0.0], dtype=np.float64)

  x,y = 224,224
  pixels_totals_canal = len(img_files_path) * x * y

  for i in range(len(img_files_path)):
    imatge = cv2.imread(img_files_path[i])
    imatge = cv2.resize(imatge,(x,y))

    canal_b, canal_g, canal_r = cv2.split(imatge)

    mitjana += np.array([canal_r.sum(), canal_g.sum(), canal_b.sum()])
    desviacio += np.array([(canal_r**2).sum(), (canal_g**2).sum(), (canal_b**2).sum()])

  mitjana = mitjana / pixels_totals_canal
  variance = (desviacio/pixels_totals_canal) - mitjana**2
  print(desviacio)
  print(variance)
  desviacio = np.sqrt(np.maximum(0,variance))

  return mitjana,desviacio

In [102]:
TRAINING = 0.64
VALIDATION = 0.16
TESTING = 0.20

transform = transforms.Compose([
    transforms.ToTensor(),
])

illnes_dictionary = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses and intraepithelial carcinoma / Bowens disease',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

img_files_1 = sorted(glob('/content/sample_data/HAM10000_images_part_1/*'))
img_files_2 = sorted(glob('/content/sample_data/HAM10000_images_part_2/*'))
img_files = img_files_1 + img_files_2

img_files = np.array(img_files)

imgid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x for x in img_files}

metadates['path'] = metadates['image_id'].map(imgid_path_dict.get)
metadates['illness'] = metadates['dx'].map(illnes_dictionary.get)
metadates['illness_code'] = metadates['dx'].map({'nv': 0, 'mel': 1, 'bkl': 2, 'bcc': 3, 'akiec': 4, 'vasc': 5, 'df': 6})

#Aquest illness_code s'utilitzarà com a label de la enfermetat

img_number = len(img_files)


X = metadates.drop('illness_code',axis= 1)
y = metadates['illness_code']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TESTING, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VALIDATION/(TRAINING+VALIDATION), random_state=42, stratify=y_train)

#Una bona idea seria aplicar data augmentation al conjunt d'entrenament ja que les classes estan molt desbalancejades
#Una altra bona idea seria emplear una funció de pèrdua que tengui en compte les classes desbalancejades.

#Antes de guardar els datasets en classes Formes, hauriem de caluclar la mitjana i desviació típica de les imatges
train_data = Formes(X_train['path'].to_numpy(),y_train.to_numpy(),transform)
test_data = Formes(X_test['path'].to_numpy(),y_test.to_numpy(),transform)
validation_data = Formes(X_val['path'].to_numpy(),y_val.to_numpy(),transform)


mitjana,desviacio = calcula_mitjana_desviacio(img_files)
print(mitjana)
print(desviacio)


[-37745.27740474 -19255.809691   -21022.728044  ]
[194.57463374 139.13953272 145.36132088]
[0. 0. 0.]


In [103]:
#Estaria bé millorar la distribució de les classes en el dataset original



# labels = numpy.array(metadates['dx'
# labels = metadates['dx'].to_numpy()

# idx_list = list(range(img_number))
# random.shuffle(idx_list)

# training_imgs = img_files[idx_list[:int(img_number*TRAINING)]]
# training_labels = labels[idx_list[:int(img_number*TRAINING)]]

# train_data = Formes(training_imgs,training_labels,transform)

# validation_imgs = img_files[idx_list[int(img_number*TRAINING):int(img_number*(TRAINING+VALIDATION))]]
# validation_labels = labels[idx_list[int(img_number*TRAINING):int(img_number*(TRAINING+VALIDATION))]]

# validation_data = Formes(validation_imgs,validation_labels,transform)

# testing_imgs = img_files[idx_list[int(img_number*(TRAINING+VALIDATION)):]]
# testing_labels = labels[idx_list[int(img_number*(TRAINING+VALIDATION)):]]

# testing_data = Formes(testing_imgs,testing_labels,transform)


# print(len(training_imgs))
# print(len(validation_imgs))
# print(len(testing_imgs))
# metadates.to_csv('metadatesnou.csv')

##ENTRENAMENT

In [104]:
alexnetbinary = models.alexnet(pretrained=False)
alexnetmulticlass = models.alexnet(pretrained=False)

alexnetbinary.classifier = nn.Sequential(
    torch.nn.Linear(9216, 1024),
    nn.ReLU(),
    torch.nn.Linear(1024, 1024),
    nn.ReLU(),
    torch.nn.Linear(1024, 512),
    nn.ReLU(),
    torch.nn.Linear(512, 1),
    nn.Sigmoid()
)
alexnetmulticlass.classifier = nn.Sequential(
    torch.nn.Linear(9216, 1024),
    nn.ReLU(),
    torch.nn.Linear(1024, 1024),
    nn.ReLU(),
    torch.nn.Linear(1024, 512),
    nn.ReLU(),
    torch.nn.Linear(512, 7),  # Ja que tenim 7 classes.
    nn.Softmax(dim=1)
)

