### Etapas:
1. Examinar e entender os dados (Pré processamento)
2. **Criar um pipeline de entrada (Extraçao de características)**
3. Criar o modelo de classificação
4. Treine o modelo
5. Avaliar modelo

In [1]:
from PIL import Image

import numpy as np
import pandas as pd

import torch
import torchvision.models as models
import torchvision.transforms as transforms

#### Pre-Processamento

In [2]:
# Pre Processing

df = pd.read_csv('/kaggle/input/breakhis/Folds.csv')
df["class"] = df["filename"].str.split('/').str[3]
df["filename"] = '/kaggle/input/breakhis/BreaKHis_v1/' + df["filename"]

# print(df)

#### Inicializando VGG16

In [3]:
# Carregar o VGG16

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vgg16 = models.vgg16().features.to(device)
# vgg16 = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features
# vgg16 = models.vgg16().features

vgg16_avgpool = torch.nn.AdaptiveAvgPool2d((1, 1))
vgg16.eval()


def extract_features(filename):
    img = Image.open(filename).convert('RGB')
    
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    img_tensor = transform(img).to(device)
    img_tensor = img_tensor.unsqueeze(0)
    
    with torch.no_grad():
        features = vgg16(img_tensor)
        features = vgg16_avgpool(features)
        features = torch.flatten(features)
        
    return features.cpu().numpy()

#### Extraindo Caracteristicas

In [4]:
base = df.query("fold == 1").query("mag == 400")

X_train, y_train, X_test, y_test = list(), list(), list(), list()

for index, row in base.iterrows():    
    features = extract_features(row["filename"])
    if row['grp'] == 'train':
        X_train.append(features)
        y_train.append(row["class"])
    else:
        X_test.append(features)
        y_test.append(row["class"])
#     if index%50 == 0: break

X_train, y_train, X_test, y_test = np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

print('Treino:', X_train.shape, y_train.shape)
print('Teste:', X_test.shape, y_test.shape)

Treino: (1165, 512) (1165,)
Teste: (655, 512) (655,)


#### Guardando X_train e X_test

In [5]:
np.save('./vgg_X_train.npy', X_train)
np.save('./vgg_y_train.npy', y_train)
np.save('./vgg_X_test.npy', X_test)
np.save('./vgg_y_test.npy', y_test)

## Teste Prévio

In [6]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Normalizando
ss = StandardScaler()
ss.fit(X_train)

X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

# Testando com SVM
clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

      benign       0.81      0.66      0.73       237
   malignant       0.83      0.91      0.87       418

    accuracy                           0.82       655
   macro avg       0.82      0.79      0.80       655
weighted avg       0.82      0.82      0.82       655

