### Cargar el dataset
El dataset carregat amb pandas es un objecte de tipus Dataframe

In [1]:
import pandas as pd

# Cargar csv
csv = pd.read_csv("healthcare-dataset-stroke-data.csv")
csv.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


#### Eliminar lo que no necesitamos
- Eliminar la columna ID.
- Eliminar las filas que contengan algún valor Nan o null en las columnas.

In [2]:
csv.drop('id', axis=1, inplace=True) #ID
csv = csv.dropna() #Valor null
csv.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


#### Preprocesado
- **Gender**: Male=0, Female=1 
- **Married**: No=0, Yes=1
- **Work type**: Private=0, Self employed=1, Other=2 
- **Residense**: Urban=0, Rural=1 
- **Fumador**: No=0, Yes=1, Previously=2, Unknown=3 
- **Heart disease**: No=0, Yes=1

In [3]:
csv['gender'].replace({'Male': 0, 'Female': 1, 'Other': 2}, inplace=True)
csv['ever_married'].replace({'No': 0, 'Yes': 1}, inplace=True)
csv['work_type'].replace({'Private': 0, 'Self-employed': 1}, inplace=True)
# r: indica que es un raw string | ^: inicio de línea | ?!: indica que la expresión no debe coincidir | $: final de línea
csv['work_type'].replace(to_replace=r'^(?!Private|Self-employed$).*$', value=2, regex=True, inplace=True)
csv['Residence_type'].replace({'Urban': 0, 'Rural': 1}, inplace=True)
csv['smoking_status'].replace({'never smoked': 0, 'smokes': 1, 'formerly smoked': 2, 'Unknown': 3}, inplace=True)
csv.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,0,0,228.69,36.6,2,1
2,0,80.0,0,1,1,0,1,105.92,32.5,0,1
3,1,49.0,0,0,1,0,0,171.23,34.4,1,1
4,1,79.0,1,0,1,1,1,174.12,24.0,0,1
5,0,81.0,0,0,1,0,0,186.21,29.0,2,1


### Diferenciar les dades
#### Partir les X Y
Indicar al Dataframe csv quines son les dades y quins son els resultats

In [4]:
# Diferenciar les X de les Y
X = csv.values[:, :-1]
Y = csv.values[:, -1]

# Indicar quines son les Y
csv['stroke'] = Y

### Convertir les X Y en tensor

In [5]:
import torch
X = torch.tensor(X.astype(float), dtype=torch.float32)
Y = torch.tensor(Y.astype(int))

### Clase Dataset

In [7]:
from torch.utils.data import Dataset

class myDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

### Partir la X i la Y en train i test 

In [None]:
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)

#### Crear el propi dataset
Passar al train_dataloader i test_dataloader un objecte dataset, nosaltres hem de crear aquet dataset extenent de la clase Dataset.

In [8]:
from torch.utils.data import DataLoader

train_dataset = myDataset(trainX, trainY)
test_dataset = myDataset(testX, testY)

train_dataloader = DataLoader(train_dataset, batch_size=64)
test_dataloader = DataLoader(test_dataset, batch_size=64)

### Crear la red neuronal
1. Crear el dispositivo
2. Definir la clase *Module* con la función forward
3. Crear el modelo y pasarlo a la GPU

In [9]:
from torch import nn

# Ya están implementadas las clases de las capas para hacer el forward
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(10, 1000), 
            nn.ReLU(),
            nn.Linear(1000, 100),
            nn.ReLU(),
            nn.Linear(100, 2),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x) # Crea las capas
        return logits
    
# Device
#if torch.backends.mps.is_available():
    #device = "mps"
#else:
device = "cpu"

model = NeuralNetwork().to(device) # Otiene los valores predichos

### Train y test
Definimos las funciones para train y test.

In [20]:
batch_size=64

def train_loop(train_dataloader, model, loss_fn, optimizer):
    size = len(train_dataloader.dataset)
    
    for batch, (X, Y) in enumerate(train_dataloader):
        X=X.to(device)
        Y=Y.to(device)
        
        pred = model(X) # Forward, ya ha calculado todos los gradientes
        loss = loss_fn(pred, Y) # Crear la función de costo: error

        loss.backward() # Le pasa el error al gradiente
        optimizer.step() # Actualiza los valores
        optimizer.zero_grad() # Pone el gradiente a 0

        if batch % 100 == 0:
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test_loop(test_dataloader, model, loss_fn):
    size = len(test_dataloader.dataset)
    num_batches = len(test_dataloader)
    
    test_loss, correct = 0, 0

    # No calcula el gradiente automaticamente
    with torch.no_grad():
        for X, Y in test_dataloader:
            X=X.to(device)
            Y=Y.to(device)
            
            pred = model(X) # Forward
            test_loss += loss_fn(pred, Y).item() # Error
            correct += (pred.argmax(1) == Y).type(torch.float).sum().item() # Accuracy

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

### Llamar a train y test
En cada epoca hacer un train y un test

In [21]:
learning_rate = 0.01
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.247701  [   64/ 3436]
Test Error: 
 Accuracy: 96.1%, Avg loss: 0.151062 

Epoch 2
-------------------------------
loss: 0.247029  [   64/ 3436]
Test Error: 
 Accuracy: 96.1%, Avg loss: 0.150647 

Epoch 3
-------------------------------
loss: 0.250855  [   64/ 3436]
Test Error: 
 Accuracy: 96.1%, Avg loss: 0.150001 

Epoch 4
-------------------------------
loss: 0.249095  [   64/ 3436]
Test Error: 
 Accuracy: 96.1%, Avg loss: 0.149843 

Epoch 5
-------------------------------
loss: 0.248374  [   64/ 3436]
Test Error: 
 Accuracy: 96.1%, Avg loss: 0.149722 

Epoch 6
-------------------------------
loss: 0.248081  [   64/ 3436]
Test Error: 
 Accuracy: 96.1%, Avg loss: 0.150238 

Epoch 7
-------------------------------
loss: 0.244209  [   64/ 3436]
Test Error: 
 Accuracy: 96.1%, Avg loss: 0.150606 

Epoch 8
-------------------------------
loss: 0.242104  [   64/ 3436]
Test Error: 
 Accuracy: 96.1%, Avg loss: 0.149861 

Epoch 9
----------------