# Breast cancer image classification
**INSTITUTO FEDERAL DE MINAS GERAIS**
*Departamento de Engenharia e Computação*

**Professor:** Natalia C. do Carmo

**Alunos:** Antonio Ambrosio e Euler Gomes


# 1. Environment setup

In [9]:
from IPython.display import display, HTML
from babel.util import missing

# Stretch the notebook container to the full browser width.
# The CSS property was misspelled ("widht"), so the rule was silently ignored.
display(HTML("<style>.container {width: 100% !important;}</style>"))

## 1.1. Import packages

In [10]:
import subprocess
import sys

import os
import shutil
import random
from pathlib import Path

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader, random_split

## 1.2. GPU check

In [11]:
# Choose the compute device name: 'cuda' when torch reports a usable GPU,
# otherwise 'cpu'. The GPU branch also reports the cuDNN version and the
# name of device 0.
if not torch.cuda.is_available():
    print("CUDA is not available.")
    DEVICE = 'cpu'
else:
    print("__CUDNN VERSION:", torch.backends.cudnn.version())
    print("Device Name:", torch.cuda.get_device_name(0))
    DEVICE = 'cuda'

# Trailing expression so the notebook displays the resulting device object.
torch.device(DEVICE)

__CUDNN VERSION: 91002
Device Name: NVIDIA GeForce RTX 5070


device(type='cuda')

## 1.3. Export requirements.txt

In [12]:
def export_requirements(output_path='requirements.txt'):
    """Save the output of ``pip freeze`` for the running interpreter to a file.

    ``pip`` is invoked through ``sys.executable`` so the listing reflects the
    environment this notebook is running in.

    Args:
        output_path: File that receives the frozen package list. Defaults to
            ``requirements.txt`` in the current working directory.

    Returns:
        None. A confirmation is printed on success. If ``pip`` exits with a
        non-zero status the error is printed and no file is written.
    """
    # Keep the try body limited to the call that can raise CalledProcessError.
    try:
        result = subprocess.run(
            [sys.executable, "-m", "pip", "freeze"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print('error:', e)
        return

    with open(output_path, 'w') as f:
        f.write(result.stdout)
    print(f'{output_path} file generated sucessfully.')


export_requirements()

requirements.txt file generated sucessfully.


# 2. Load dataset

## 2.1. Split dataset

In [15]:
# Set to True to regenerate the train/val/test folders; False skips the split.
new_split = False

# Folder holding the original images and folder that receives the split copy.
data_og, data_split = "data_og", 'data_split'

# Share of the images assigned to each subset.
# train + val + test == 1.0
train_ratio, val_ratio, test_ratio = 0.7, 0.2, 0.1

Classe 'benign': 1764 train | 504 val | 253 test
Classe 'maglinant': 3578 train | 1022 val | 512 test


In [None]:
if new_split:
    # Copy every class folder of `data_og` into
    # `data_split/{train,val,test}/<class>` using `train_ratio` and
    # `val_ratio`; the test subset receives whatever is left over.

    # Fixed seed so re-running the cell reproduces the same partition.
    split_seed = 42
    rng = random.Random(split_seed)

    source_root = Path(data_og)
    split_root = Path(data_split)

    # Only real, non-hidden directories are treated as classes. Stray files
    # or hidden folders (e.g. notebook checkpoints) would otherwise crash the
    # listing below or show up as bogus classes.
    class_dirs = sorted(
        entry for entry in source_root.iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    )

    #split per class
    for class_dir in class_dirs:
        cls = class_dir.name

        # Directory listing order is not guaranteed, so sort first to make
        # the seeded shuffle deterministic.
        images = sorted(
            entry.name for entry in class_dir.iterdir()
            if entry.is_file() and not entry.name.startswith(".")
        )
        rng.shuffle(images)

        n_total = len(images)
        n_train = int(n_total * train_ratio)
        n_val = int(n_total * val_ratio)

        train_images = images[:n_train]
        val_images = images[n_train:n_train + n_val]
        test_images = images[n_train + n_val:]

        partitions = (
            ("train", train_images),
            ("val", val_images),
            ("test", test_images),
        )
        for split_name, img_list in partitions:
            target_dir = split_root / split_name / cls
            # Remove files left by a previous split: keeping them would let
            # the same image end up in more than one subset.
            if target_dir.exists():
                shutil.rmtree(target_dir)
            target_dir.mkdir(parents=True)

            for img in img_list:
                shutil.copy2(class_dir / img, target_dir / img)

        print(f"Classe '{cls}': {n_train} train | {n_val} val | {len(test_images)} test")


## 2.1. Pre-processing

In [16]:
# Evaluation pipeline (validation and test): deterministic steps only.
# Resize to 224x224, convert to a tensor, then normalize with mean 0.5 and
# std 0.5.
# NOTE(review): a single-value mean/std is given; confirm it is meant to be
# applied to every channel of the loaded images.
transform_test = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# Training pipeline: same resize/tensor/normalize steps, with a random
# horizontal flip and a random rotation of up to 10 degrees applied before
# the tensor conversion as data augmentation.
transform_train = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=10),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

## 2.2. Load dataset to pytorch

In [17]:
# Samples per batch, shared by the three loaders.
BATCH_SIZE = 32

# Read each subset from the folder configured in `data_split`, so changing
# that setting cannot leave the loaders pointing at a stale directory.
# Training images use the augmenting pipeline; validation and test images
# use the deterministic one.
train_dataset = ImageFolder(os.path.join(data_split, "train"), transform=transform_train)
val_dataset   = ImageFolder(os.path.join(data_split, "val"), transform=transform_test)
test_dataset  = ImageFolder(os.path.join(data_split, "test"), transform=transform_test)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE)