## import statements

In [76]:
import os
import ssl

import pandas as pd
import numpy as np
import torch
from PIL import Image
from matplotlib import pyplot as plt
from torch import optim, nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

## hyper-parameter definition

In [139]:
DATA_PATH = '/Users/yueyangwu/Desktop/CS6140/final_proj/data/wdbc.data'
N_EPOCHS = 10
BATCH_SIZE_TRAIN = 64
BATCH_SIZE_TEST = 64
LEARNING_RATE = 0.001
LOG_INTERVAL = 10

## helper functions

In [110]:
diagnosis_to_code = {'M' : 1, 'B' : 0}  # {malignant : 1, benign : 0}


def build_dataframe_with_code(csv_path):
    dataframe = pd.read_csv(DATA_PATH)
    dataframe['code'] = [diagnosis_to_code[x] for x in dataframe.diagnosis]
    return dataframe


def train(train_loader, test_loader, model, loss_fn, optimizer, n_epochs=N_EPOCHS):
    for epoch in range(n_epochs):
        size = len(train_loader.dataset)
        for batch, (X, y) in enumerate(train_loader):
            # compute prediction and loss
            pred = model(X)
            loss = loss_fn(pred, y)

            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 100 == 0:
                loss, current = loss.item(), batch * len(X)
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

        # for each epoch, save a model version
        filename = 'results/model' + str(epoch) + '.pth'
        torch.save(model.state_dict(), filename)

        test(test_loader=test_loader, model=model, loss_fn=loss_fn)
        
        
def test(test_loader, model, loss_fn=None):
    size = len(test_loader.dataset)
    num_batches = len(test_loader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in test_loader:
            pred = model(X)
            if loss_fn:
                test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

## Experiment Setup
Download the wdbc.data from https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/ \
Load the data using pd.read_csv, then split dataset into training data(70%) and testing data(30%)

In [111]:
dataframe = build_dataframe_with_code(DATA_PATH)
print(dataframe)

train_df, test_df = np.split(dataframe.sample(frac=1, random_state=1729), [int(0.7 * len(dataframe))])
print(train_df.shape, test_df.shape)

           id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         17.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compa

## Custom Dataset definition

In [128]:
class BreastCancerDataset(Dataset):
    """Breast Cancer Dataset"""
    
    def __init__(self, dataframe):
        """
        @parameter dataframe (pd.dataframe): dataframe contains the features and ground truth
        """
        self.dataframe = dataframe
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        features = []
        for i in range(2, 32):
            features.append(self.dataframe.iloc[idx, i])
        diagnosis_code = self.dataframe.iloc[idx, 32]
        
        features_tensor = torch.Tensor(features)
        
        return [features_tensor, diagnosis_code]

## Neural Network Definition

In [137]:
class Network(nn.Module):
    # initialize the model
    def __init__(self):
        super(Network, self).__init__()
        self.fc1 = nn.Linear(30, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 2)
        self.fc = nn.Linear(30, 2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return x
#         return F.relu(self.fc(x))

## Main Function

In [124]:
def main():
    # make the code repeatable
    torch.manual_seed(1)
    torch.backends.cudnn.enabled = False
    
    # get train and test dataloader
    train_dataset = BreastCancerDataset(train_df)
    test_dataset = BreastCancerDataset(test_df)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST, shuffle=True)
    
    # initialize a model
    model = Network()
    
    # initialize the loss function and optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    
    # train the model
    train(train_loader=train_loader, test_loader=test_loader, model=model, loss_fn=loss_fn, optimizer=optimizer)
    
    # save the model
    torch.save(model.state_dict(), 'results/model.pth')

In [140]:
main()

loss: 1.186537  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

loss: 0.693147  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

loss: 0.693147  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

loss: 0.693147  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

loss: 0.693147  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

loss: 0.693147  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

loss: 0.693147  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

loss: 0.693147  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

loss: 0.693147  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

loss: 0.693147  [    0/  398]
Test Error: 
 Accuracy: 66.1%, Avg loss: 0.693147 

