In [1]:
tr_path = 'deal_train.csv'  # path to training data
tt_path = 'deal_test.csv'   # path to testing data
se_path = 'season.csv'      # NOTE(review): not referenced by any cell shown here - confirm it is still needed

In [2]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For data preprocess
import numpy as np
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Fixed seed shared by every random number generator used below
myseed = 42069

# Seed NumPy and PyTorch (CPU, plus every CUDA device when one is present)
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
def get_device():
    ''' Return the device name to run on: 'cuda' when CUDA is available, else 'cpu' '''
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

def plot_learning_curve(loss_record, title=''):
    ''' Plot learning curve of your DNN (train & dev loss)

    Args:
        loss_record: dict with keys 'train' and 'dev', each a sequence of
            loss values.
        title: text appended to the figure title.

    The dev curve is spread over the training axis by sampling every
    ``len(train) // len(dev)``-th training step. An empty dev record is
    skipped, and the x/y pairs are trimmed to a common length, so records
    whose lengths do not divide evenly no longer make ``plt.plot`` fail.
    '''
    train_loss = loss_record['train']
    dev_loss = loss_record['dev']
    total_steps = len(train_loss)
    x_1 = range(total_steps)
    figure(figsize=(6, 4))
    plt.plot(x_1, train_loss, c='tab:red', label='train')
    if len(dev_loss) > 0:
        # A stride of at least 1 avoids "slice step cannot be zero" when the
        # dev record is longer than the train record.
        stride = max(1, total_steps // len(dev_loss))
        x_2 = x_1[::stride][:len(dev_loss)]
        # Trim y as well so both sequences always have the same length.
        plt.plot(x_2, dev_loss[:len(x_2)], c='tab:cyan', label='dev')
    plt.xlabel('Training steps')
    plt.ylim(0.0, 1.)   # fixed y-range: losses above 1 fall outside the visible area
    plt.ylabel('loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()


In [4]:
tr_sz = 0  # size of the most recently constructed 'train' split (set by empDataset)
dv_sz = 0  # size of the most recently constructed 'dev' split (set by empDataset)
class empDataset(Dataset):
    ''' Dataset that reads a CSV file and exposes it as PyTorch tensors.

    The first CSV row is skipped as a header and every remaining cell is
    parsed as a float. Column 2 is kept as the sample id, column 3 is the
    target (train/dev only) and columns 4..45 are the features.

    Args:
        path: path of the CSV file (decoded as Big5).
        mode: 'train' keeps rows whose index modulo 10 is below 7, 'dev'
            keeps the remaining rows, 'test' keeps every row and has no target.
        target_only: accepted for backward compatibility; both settings
            select the same feature columns.

    Raises:
        ValueError: if ``mode`` is not 'train', 'dev' or 'test'.

    Side effect: constructing a 'train' / 'dev' dataset stores its length in
    the module-level ``tr_sz`` / ``dv_sz``.
    '''

    MODES = ('train', 'dev', 'test')

    def __init__(self,
                 path,
                 mode='train',
                 target_only=False):
        # Fail early with a clear message; previously an unknown mode only
        # surfaced later as an unbound local variable.
        if mode not in self.MODES:
            raise ValueError(
                "mode must be one of {}, got {!r}".format(self.MODES, mode))
        self.mode = mode
        global tr_sz
        global dv_sz
        # Read data into numpy arrays
        with open(path, 'r', encoding="Big5") as fp:
            rows = list(csv.reader(fp))
        data = np.array([list(map(float, row)) for row in rows[1:]]).astype(float)
        ids = data[:, 2]
        # Both values of target_only used the same columns in the original code.
        feats = list(range(4, 46))
        if mode == 'test':
            self.idx = ids
            self.data = torch.FloatTensor(data[:, feats])
        else:
            target = data[:, 3]
            features = data[:, feats]

            # Splitting training data into train & dev sets (7:3 by row index)
            row_mod = np.arange(len(data)) % 10
            keep = row_mod < 7 if mode == 'train' else row_mod >= 7
            # Keep the ids aligned with the rows that were actually selected
            self.idx = ids[keep]
            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(features[keep])
            self.target = torch.LongTensor(target[keep])

        self.dim = self.data.shape[1]
        if mode == 'train':
            tr_sz = len(self.data)
        elif mode == 'dev':
            dv_sz = len(self.data)
        print('Finished reading the {} set of Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        # Returns one sample at a time
        if self.mode == 'test':
            # For testing (no target): features plus the sample id
            return self.data[index], self.idx[index]
        # For training / dev: features plus the class label
        return self.data[index], self.target[index]

    def __len__(self):
        # Returns the size of the dataset
        return len(self.data)

In [5]:
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False):
    ''' Build an empDataset for the given mode and wrap it in a DataLoader.

    Only the 'train' mode is shuffled; no batch is dropped and pinned
    memory is always requested. `n_jobs` is the number of loader workers.
    '''
    loader_options = {
        'shuffle': mode == 'train',
        'drop_last': False,
        'num_workers': n_jobs,
        'pin_memory': True,
    }
    return DataLoader(
        empDataset(path, mode=mode, target_only=target_only),
        batch_size,
        **loader_options)

In [6]:
def init_weights(m):
    ''' Initialise one module; meant to be used with ``Module.apply``.

    Modules whose type is exactly ``nn.Linear`` get Xavier-normal weights
    (gain 1.0) and a constant bias of 0.01. Every other module is left
    untouched. A linear layer built with ``bias=False`` is supported: only
    its weight is initialised.
    '''
    if type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight, gain=1.0)
        # m.bias is None for nn.Linear(..., bias=False)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0.01)

class Classifier(nn.Module):
    ''' Fully-connected network that maps input_dim features to 2 class scores '''
    def __init__(self, input_dim):
        super().__init__()

        # Hidden stack: input_dim -> 256 -> 256 -> 256 -> 64, each followed by ReLU
        widths = [input_dim, 256, 256, 256, 64]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        # Output layer: one score per class, no activation
        layers.append(nn.Linear(64, 2))

        self.net = nn.Sequential(*layers)
        self.net.apply(init_weights)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        ''' Run the network on x of size (batch_size x input_dim) and return its output '''
        scores = self.net(x)
        # squeeze(1) only drops dim 1 when its size is 1; the last layer has 2 outputs
        return scores.squeeze(1)

    def cal_loss(self, pred, target):
        ''' Cross-entropy loss between the predicted scores and the target labels '''
        return self.criterion(pred, target)

In [7]:
best_acc = 0.0  # best dev correct-count seen so far; module-level, so it persists across train() calls
def train(tr_set, dv_set, model, config, device, tr_sz, dv_sz):
    ''' DNN training

    Args:
        tr_set: iterable of (inputs, labels) batches used for optimisation.
        dv_set: iterable of (inputs, labels) batches used for evaluation.
        model: module providing ``cal_loss(pred, target)``; its forward pass
            must return per-class scores with classes on dim 1.
        config: dict with keys
            'n_epochs'     - maximum number of epochs,
            'optimizer'    - name of a class in ``torch.optim``,
            'optim_hparas' - keyword arguments for that optimizer,
            'save_path'    - (optional) checkpoint file, default './model.ckpt',
            'early_stop'   - (optional) stop after this many consecutive
                             epochs without a new best dev accuracy; a
                             missing or falsy value disables it.
        device: device the batches are moved to.
        tr_sz, dv_sz: sample counts used as denominators for the printed
            train / dev accuracy.

    Returns:
        (val_loss, loss_record): ``val_loss`` is the sum of the dev batch
        losses of the last epoch that ran (0.0 if no epoch ran) and
        ``loss_record`` holds the per-epoch mean batch loss under the keys
        'train' and 'dev'.

    A checkpoint (``model.state_dict()``) is written whenever the dev
    correct-count beats the module-level ``best_acc``. That global is not
    reset here, so a later call only saves once it beats earlier runs.
    '''

    n_epochs = config['n_epochs']  # Maximum number of epochs
    early_stop = config.get('early_stop')
    # Honour the configured checkpoint location instead of a hard-coded path
    model_path = config.get('save_path', './model.ckpt')
    ckpt_dir = os.path.dirname(model_path)
    if ckpt_dir:
        os.makedirs(ckpt_dir, exist_ok=True)

    # Setup optimizer
    optimizer = getattr(torch.optim, config['optimizer'])(
        model.parameters(), **config['optim_hparas'])
    global best_acc
    epoch = 0
    stale_epochs = 0   # consecutive epochs without a new best dev accuracy
    val_loss = 0.0     # defined up front so n_epochs == 0 still returns cleanly
    loss_record = {"train":[], "dev":[]}
    while epoch < n_epochs:
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0
        model.train()                           # set model to training mode
        for inputs, labels in tr_set:                     # iterate through the dataloader
            inputs, labels = inputs.to(device), labels.to(device)   # move data to device (cpu/cuda)
            outputs = model(inputs)                     # forward pass (compute output)
            batch_loss = model.cal_loss(outputs, labels)  # compute loss
            train_pred = outputs.argmax(dim=1)    # index of the highest-scoring class
            batch_loss.backward()                 # compute gradient (backpropagation)
            optimizer.step()                    # update model with optimizer
            optimizer.zero_grad()               # set gradient to zero
            train_acc += (train_pred == labels).sum().item()
            train_loss += batch_loss.item()
        loss_record["train"].append(train_loss/len(tr_set))
        model.eval() # set the model to evaluation mode
        with torch.no_grad():
            for inputs, labels in dv_set:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                batch_loss = model.cal_loss(outputs, labels)
                val_pred = outputs.argmax(dim=1)  # index of the highest-scoring class

                val_acc += (val_pred == labels).sum().item()
                val_loss += batch_loss.item()
        loss_record["dev"].append(val_loss/len(dv_set))
        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} | Val Acc: {:3.6f} loss: {:3.6f}'.format(
            epoch + 1, n_epochs, train_acc/tr_sz, train_loss/len(tr_set), val_acc/dv_sz, val_loss/len(dv_set)
        ))
        # if the model improves, save a checkpoint at this epoch
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), model_path)
            print('saving model with acc {:.3f}'.format(best_acc/dv_sz))
            stale_epochs = 0
        else:
            stale_epochs += 1

        epoch += 1
        if early_stop and stale_epochs >= early_stop:
            print('Early stopping: no improvement for {} epochs'.format(stale_epochs))
            break
    print('Finished training after {} epochs'.format(epoch))
    return val_loss, loss_record

In [8]:
device = get_device()                 # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)  # make sure the directory named in config['save_path'] exists
target_only = True                 # forwarded to empDataset; both of its feature branches select columns 4-45, so this flag does not change the features

# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'n_epochs': 300,                # maximum number of epochs
    'batch_size': 256,               # mini-batch size for dataloader
    'optimizer': 'SGD',              # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 0.0001,                 # learning rate of SGD
#         'weight_decay':0.005,
        'momentum': 0.9              # momentum for SGD
    },
    'early_stop': 1000,               # early stopping epochs (the number epochs since your model's last improvement); larger than n_epochs here
    'save_path': 'models/model.pth'  # your model will be saved here
}

In [9]:
# The train and dev loaders read the same training CSV; empDataset splits it by row index.
tr_set = prep_dataloader(tr_path, 'train', config['batch_size'], target_only=target_only)
dv_set = prep_dataloader(tr_path, 'dev', config['batch_size'], target_only=target_only)
# No separate 'valid' split is built: empDataset only implements 'train', 'dev' and 'test'.
tt_set = prep_dataloader(tt_path, 'test', config['batch_size'], target_only=target_only)

Finished reading the train set of Dataset (10075 samples found, each dim = 42)
Finished reading the dev set of Dataset (4317 samples found, each dim = 42)
Finished reading the test set of Dataset (3739 samples found, each dim = 42)


In [10]:
model = Classifier(tr_set.dataset.dim).to(device)  # Build the classifier with the training set's feature count as input width, then move it to device

In [11]:
# Run training. tr_sz / dv_sz are the module-level split sizes set when the datasets were built.
# NOTE(review): the recorded run below ends in KeyboardInterrupt, so in that run this
# assignment never completed and model_loss / model_loss_record were not defined.
model_loss, model_loss_record = train(tr_set, dv_set, model, config, device, tr_sz, dv_sz)

[001/300] Train Acc: 0.561886 Loss: 793.600150 | Val Acc: 0.940468 loss: 7.717760
saving model with acc 0.940
[002/300] Train Acc: 0.939851 Loss: 4.158265 | Val Acc: 0.940468 loss: 5.025876
[003/300] Train Acc: 0.939851 Loss: 2.077791 | Val Acc: 0.940468 loss: 1.813776
[004/300] Train Acc: 0.940149 Loss: 0.638429 | Val Acc: 0.946491 loss: 0.210444
saving model with acc 0.946
[005/300] Train Acc: 0.943921 Loss: 0.216811 | Val Acc: 0.946491 loss: 0.211237
[006/300] Train Acc: 0.943921 Loss: 0.216050 | Val Acc: 0.946491 loss: 0.211321
[007/300] Train Acc: 0.943921 Loss: 0.214927 | Val Acc: 0.946491 loss: 0.211125
[008/300] Train Acc: 0.943921 Loss: 0.214986 | Val Acc: 0.946491 loss: 0.210986
[009/300] Train Acc: 0.943921 Loss: 0.217066 | Val Acc: 0.946491 loss: 0.211348
[010/300] Train Acc: 0.943921 Loss: 0.216796 | Val Acc: 0.946491 loss: 0.211412
[011/300] Train Acc: 0.943921 Loss: 0.215520 | Val Acc: 0.946491 loss: 0.210997
[012/300] Train Acc: 0.943921 Loss: 0.216685 | Val Acc: 0.9464

KeyboardInterrupt: 

In [None]:
# Needs model_loss_record, which only exists once the train() call above has returned.
plot_learning_curve(model_loss_record, title='deep model')

In [43]:
def save_pred(preds, file):
    ''' Write (id, prediction) pairs to a CSV file under the header PerNo,PerStatus '''
    print('Saving results to {}'.format(file))
    header = ['PerNo', 'PerStatus']
    with open(file, 'w', newline='') as out_fp:
        out = csv.writer(out_fp)
        out.writerow(header)
        # Each entry must unpack into exactly two values: id and predicted label
        out.writerows([per_no, status] for per_no, status in preds)
# Predict the test set with the weights currently held by `model`
# (no checkpoint is loaded here).
model.eval() # set the model to evaluation mode
preds = []
with torch.no_grad():
    for inputs, idx in tt_set:
        inputs = inputs.to(device)
        outputs = model(inputs)
        pred = torch.max(outputs, 1)[1]   # index of the highest-scoring class per sample
        # Pair every sample id with its predicted class as plain ints
        preds.extend([int(per_no), int(label)] for per_no, label in zip(idx, pred))
save_pred(preds, 'pred.csv')         # save prediction file to pred.csv

Saving results to pred.csv
