In [41]:
import os

import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim


if not os.path.exists('data'):
    os.makedirs('data')

DATASET_OPTIONS = ["steel", "fetal_health"]
DATASET = "fetal_health"

In [42]:
if DATASET == "steel":
    if not os.path.exists('data/steel'):
        os.makedirs('data/steel')
    if not os.path.exists('data/steel/steel.train'):
        # fetch dataset
        steel_plates_faults = fetch_ucirepo(id=198)

        # data (as pandas dataframes)
        X = steel_plates_faults.data.features
        y = steel_plates_faults.data.targets

        # preprocess the targets
        # put the class as the index of the 1 in the row
        y = y.idxmax(axis=1)

        # convert to numbers
        y = y.astype('category').cat.codes

        # split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # saving the data
        X_train.to_csv('data/steel/steel.train', index=False)
        X_test.to_csv('data/steel/steel.test', index=False)
        y_train.to_csv('data/steel/steel.train.target', index=False)
        y_test.to_csv('data/steel/steel.test.target', index=False)

        print('Data saved in data/steel/steel.train, data/steel/steel.test, data/steel/steel.train.target, data/steel/steel.test.target')
    else:
        print('Data already exists in data/steel/steel.train, data/steel/steel.test, data/steel/steel.train.target, data/steel/steel.test.target')

    # load the data
    X_train = pd.read_csv('data/steel/steel.train')
    X_test = pd.read_csv('data/steel/steel.test')
    y_train = pd.read_csv('data/steel/steel.train.target')
    y_test = pd.read_csv('data/steel/steel.test.target')

In [43]:
if DATASET == "fetal_health":
    if not os.path.exists('data/fetal_health'):
        os.makedirs('data/fetal_health')

    if not os.path.exists('data/fetal_health/fetal_health.train'):
        # this data is meant to have been downloaded from kaggle
        fetal_health = pd.read_csv('data/fetal_health/fetal_health.csv')

        # split data
        X = fetal_health.drop(columns='fetal_health')
        y = fetal_health['fetal_health']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # saving the data
        X_train.to_csv('data/fetal_health/fetal_health.train', index=False)
        X_test.to_csv('data/fetal_health/fetal_health.test', index=False)
        y_train.to_csv('data/fetal_health/fetal_health.train.target', index=False)
        y_test.to_csv('data/fetal_health/fetal_health.test.target', index=False)

        print('Data saved in data/fetal_health/fetal_health.train, data/fetal_health/fetal_health.test, data/fetal_health/fetal_health.train.target, data/fetal_health/fetal_health.test.target')
    else:
        print('Data already exists in data/fetal_health/fetal_health.train, data/fetal_health/fetal_health.test, data/fetal_health/fetal_health.train.target, data/fetal_health/fetal_health.test.target')

    # load the data
    X_train = pd.read_csv('data/fetal_health/fetal_health.train')
    X_test = pd.read_csv('data/fetal_health/fetal_health.test')
    y_train = pd.read_csv('data/fetal_health/fetal_health.train.target')
    y_test = pd.read_csv('data/fetal_health/fetal_health.test.target')


Data already exists in data/fetal_health/fetal_health.train, data/fetal_health/fetal_health.test, data/fetal_health/fetal_health.train.target, data/fetal_health/fetal_health.test.target


In [44]:
print(f"Using dataset: {DATASET}")

Using dataset: fetal_health


In [45]:
# convert to numpy arrays
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [46]:
# normalize the data
def normalize(X):
    return (X - X.mean(axis=0)) / X.std(axis=0)

X_train = normalize(X_train)
X_test = normalize(X_test)

# creating the tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.int64)
y_test = torch.tensor(y_test, dtype=torch.int64)

# create the dataset
BATCH_SIZE = 32
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [47]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {DEVICE} device')

Using cpu device


In [48]:
# Define the model

class MLP(nn.Module):
    def __init__(self, input_size, output_size):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_size)
        )
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.model(x)
        return self.softmax(x)

In [49]:
# hyperparameters
INPUT_SIZE = X_train.shape[1]
OUTPUT_SIZE = y_train.max() + 1
LEARNING_RATE = 0.001
EPOCHS = 1000

In [50]:
# training the model
model = MLP(INPUT_SIZE, OUTPUT_SIZE).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(1, EPOCHS + 1):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    if epoch % 100 == 0:
        model.eval()
        with torch.no_grad():
            train_loss = criterion(model(X_train), y_train)
            test_loss = criterion(model(X_test), y_test)
            print(f'Epoch {epoch}/{EPOCHS} Train loss: {train_loss:.4f} Test loss: {test_loss:.4f}')

Epoch 100/1000 Train loss: 0.7712 Test loss: 0.8309
Epoch 200/1000 Train loss: 0.7655 Test loss: 0.8363
Epoch 300/1000 Train loss: 0.7642 Test loss: 0.8283
Epoch 400/1000 Train loss: 0.7637 Test loss: 0.8332
Epoch 500/1000 Train loss: 0.7637 Test loss: 0.8290
Epoch 600/1000 Train loss: 0.7636 Test loss: 0.8327
Epoch 700/1000 Train loss: 0.7643 Test loss: 0.8361
Epoch 800/1000 Train loss: 0.7631 Test loss: 0.8353
Epoch 900/1000 Train loss: 0.7619 Test loss: 0.8411
Epoch 1000/1000 Train loss: 0.7654 Test loss: 0.8388


In [51]:
from sklearn.metrics import accuracy_score, classification_report

model.eval()
with torch.no_grad():
    y_pred = model(X_test).argmax(dim=1).cpu().numpy()
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.4f}')
    print(classification_report(y_test, y_pred))

Accuracy: 0.9038
              precision    recall  f1-score   support

           1       0.97      0.94      0.95       340
           2       0.64      0.81      0.72        53
           3       0.79      0.70      0.74        33

    accuracy                           0.90       426
   macro avg       0.80      0.82      0.80       426
weighted avg       0.91      0.90      0.91       426



In [53]:
# baseline dummy model
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy="uniform")
dummy.fit(X_train, y_train)
y_pred = dummy.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Baseline accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred))

Baseline accuracy: 0.3380
              precision    recall  f1-score   support

           1       0.78      0.35      0.48       340
           2       0.12      0.32      0.17        53
           3       0.06      0.24      0.10        33

    accuracy                           0.34       426
   macro avg       0.32      0.30      0.25       426
weighted avg       0.64      0.34      0.42       426



In [54]:
# baseline logistic regression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic regression accuracy: {accuracy:.4f}')
print(classification_report(y_test, y_pred))

Logistic regression accuracy: 0.8920
              precision    recall  f1-score   support

           1       0.96      0.94      0.95       340
           2       0.61      0.64      0.62        53
           3       0.73      0.82      0.77        33

    accuracy                           0.89       426
   macro avg       0.76      0.80      0.78       426
weighted avg       0.90      0.89      0.89       426

