## import statements

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import optim, nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

## hyper-parameter definition

In [2]:
DATA_PATH = '/Users/yueyangwu/Desktop/CS6140/final_proj/data/wdbc.data'
TRAINED_MODEL_PATH = 'results/model.pth'
N_EPOCHS = 5
BATCH_SIZE_TRAIN = 5
BATCH_SIZE_TEST = 5
LEARNING_RATE = 0.01

## Custom Dataset definition

In [3]:
class BreastCancerDataset(Dataset):
    """Breast Cancer Dataset"""
    
    def __init__(self, dataframe):
        """
        @parameter dataframe (pd.dataframe): dataframe contains the features and ground truth
        """
        self.dataframe = dataframe
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        features = []
        for i in range(30):
            features.append(self.dataframe.iloc[idx, i])
        diagnosis_code = self.dataframe.iloc[idx, 30]
        
        features_tensor = torch.Tensor(features)
        
        return [features_tensor, diagnosis_code]

## Neural Network Definition

In [4]:
class Network(nn.Module):
    # initialize the model
    def __init__(self):
        super(Network, self).__init__()
        self.fc1 = nn.Linear(30, 20)
        self.fc2 = nn.Linear(20, 2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

## helper functions

In [5]:
diagnosis_to_code = {'M' : 1, 'B' : 0}  # {malignant : 1, benign : 0}


def build_dataframe_with_code(csv_path):
    dataframe = pd.read_csv(DATA_PATH)
    dataframe['code'] = [diagnosis_to_code[x] for x in dataframe.diagnosis]
    dataframe = dataframe.drop(['id', 'diagnosis'], axis=1)
    return dataframe


def train(train_loader, test_loader, model, loss_fn, optimizer, n_epochs=N_EPOCHS):
    for epoch in range(n_epochs):
        size = len(train_loader.dataset)
        test(test_loader=test_loader, model=model)
        for batch, (X, y) in enumerate(train_loader):
            # compute prediction and loss
            pred = model(X)
            loss = loss_fn(pred, y)

            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 100 == 0:
                loss, current = loss.item(), batch * len(X)
                print(f"loss: {loss:>7f}, epoch = {epoch + 1}")

        # for each epoch, save a model version
        filename = 'results/model' + str(epoch) + '.pth'
        torch.save(model.state_dict(), filename)
        
        
def test(test_loader, model):
    size = len(test_loader.dataset)
    correct = 0

    with torch.no_grad():
        for X, y in test_loader:
            pred = model(X)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%\n")

## Experiment Setup
Download the wdbc.data from https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/ \
Load the data using pd.read_csv, then split dataset into training data(70%) and testing data(30%)

In [6]:
dataframe = build_dataframe_with_code(DATA_PATH)
# print(dataframe)

train_df, test_df = np.split(dataframe.sample(frac=1, random_state=1729), [int(0.7 * len(dataframe))])
print(train_df.shape, test_df.shape)

# normalize features
train_df_scaled = train_df.copy()
test_df_scaled = test_df.copy()
for column in train_df_scaled.columns:
    if column == 'code':
        continue
    train_df_scaled[column] = (train_df[column] - train_df[column].mean()) / train_df[column].std()
    test_df_scaled[column] = (test_df[column] - train_df[column].mean()) / train_df_scaled[column].std()

(398, 31) (171, 31)


In [7]:
# make the code repeatable
torch.manual_seed(1)
torch.backends.cudnn.enabled = False

# get train and test dataloader
train_dataset = BreastCancerDataset(train_df_scaled)
test_dataset = BreastCancerDataset(test_df_scaled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST, shuffle=True)

# initialize a model
model = Network()
model.train()

# initialize the loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# train the model
train(train_loader=train_loader, test_loader=test_loader, model=model, loss_fn=loss_fn, optimizer=optimizer, n_epochs=N_EPOCHS)

# save the model
torch.save(model, TRAINED_MODEL_PATH)

Test Error: 
 Accuracy: 67.8%

loss: 0.617053, epoch = 1
Test Error: 
 Accuracy: 91.2%

loss: 0.023505, epoch = 2
Test Error: 
 Accuracy: 91.2%

loss: 0.001617, epoch = 3
Test Error: 
 Accuracy: 91.2%

loss: 0.014798, epoch = 4
Test Error: 
 Accuracy: 91.2%

loss: 0.000149, epoch = 5


## Test the model

In [8]:
trained_model = torch.load(TRAINED_MODEL_PATH)
model.eval()
test_dataset = BreastCancerDataset(test_df_scaled)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST, shuffle=True)
test(test_loader=test_loader, model=trained_model)

Test Error: 
 Accuracy: 91.2%

