In [135]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import classification_report

In [136]:
class IVFDataset(Dataset):
    '''
    Tensor-backed dataset of feature rows with optional integer targets.

    x: Features, stored as a FloatTensor.
    y: Targets, stored as a LongTensor; pass None for prediction-only data.
    '''
    def __init__(self, x, y=None):
        self.x = torch.FloatTensor(x)
        self.y = None if y is None else torch.LongTensor(y)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # Prediction-only datasets yield just the feature row.
        if self.y is None:
            return self.x[idx]
        return self.x[idx], self.y[idx]

In [137]:
class DNN_Regression(nn.Module):
    '''
    Small fully connected network: input_dim -> 16 -> 8 -> num_classes.

    Despite the class name, the network returns one raw score per class
    (trainer() feeds the output straight into nn.CrossEntropyLoss).

    input_dim: Number of input features.
    num_classes: Width of the output layer (default 2).
    '''
    def __init__(self, input_dim, num_classes=2):
        super(DNN_Regression, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, num_classes)
        )

    def forward(self, x):
        '''Return the per-class scores, shape (..., num_classes).'''
        # No squeeze here: the class dimension has size num_classes, so the old
        # squeeze(1) never removed anything and raised IndexError on a single
        # unbatched sample.
        return self.layers(x)

In [138]:
train_loss = []  # Mean training loss of each epoch, appended by trainer().
val_loss = []    # Mean validation loss of each epoch, appended by trainer().

def trainer(train_loader, valid_loader, model, config, device):
    '''
    Train `model` with class-weighted cross-entropy, saving the state_dict
    whenever the validation loss improves and stopping early when it does not.

    train_loader, valid_loader: Iterables of (x, y) batches.
    model: Module mapping a batch x to per-class scores.
    config: Dict read for 'learning_rate', 'n_epochs', 'early_stop' and
        'save_path'. An optional 'class_weights' entry replaces the default
        [0.1, 0.9] loss weights.
    device: Device the batches and the loss weights are moved to.

    Side effects: appends one value per epoch to the module-level
    `train_loss` / `val_loss` lists, logs both losses through a
    SummaryWriter, and writes the best state_dict to config['save_path'].
    '''
    # The weight tensor must be on the same device as the model output,
    # otherwise the loss raises as soon as `device` is not the CPU.
    class_weights = torch.tensor(config.get('class_weights', [0.1, 0.9]),
                                 dtype=torch.float32, device=device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], betas=(0.9, 0.999), eps=1e-08)

    # Create the checkpoint directory from the configured path so that
    # torch.save below cannot fail on a missing (possibly nested) directory.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    writer = SummaryWriter()
    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    try:
        for epoch in range(n_epochs):
            model.train() # Set your model to train mode.
            loss_record = []
            train_acc = []

            # tqdm is a package to visualize your training progress.
            train_pbar = tqdm(train_loader, position=0, leave=True)
            for x, y in train_pbar:
                optimizer.zero_grad()               # Set gradient to zero.
                x, y = x.to(device), y.to(device)   # Move your data to device.
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()                     # Compute gradient(backpropagation).
                optimizer.step()                    # Update parameters.
                step += 1

                batch_loss = loss.item()
                loss_record.append(batch_loss)
                # Keep plain floats so no tensors are held across the epoch.
                train_acc.append((pred.argmax(dim=-1) == y).float().mean().item())

                # Display current epoch number and loss on tqdm progress bar.
                train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})

            print(f"Train Acc: {sum(train_acc) / len(train_acc)}")
            mean_train_loss = sum(loss_record)/len(loss_record)
            train_loss.append(mean_train_loss)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            model.eval() # Set your model to evaluation mode.
            loss_record = []
            val_acc = []
            with torch.no_grad():
                for x, y in valid_loader:
                    x, y = x.to(device), y.to(device)
                    pred = model(x)
                    loss_record.append(criterion(pred, y).item())
                    val_acc.append((pred.argmax(dim=-1) == y).float().mean().item())

            mean_valid_loss = sum(loss_record)/len(loss_record)
            val_loss.append(mean_valid_loss)
            print(f"Val Acc: {sum(val_acc) / len(val_acc)}")
            print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path']) # Save your best model
                print('Saving model with loss {:.3f}...'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        # Flush and release the event file on every exit path.
        writer.close()

In [139]:
device = 'cpu'  # Device the model and every batch are moved to.

# Hyper-parameters and paths read by the cells below.
config = dict(
    seed=1322,            # Seed handed to same_seed().
    select_all=True,      # Whether to use all features.
    valid_ratio=0.2,      # validation_size = train_size * valid_ratio
    n_epochs=500,         # Maximum number of training epochs.
    batch_size=256,       # Samples per DataLoader batch.
    learning_rate=1e-4,   # Learning rate given to the optimizer.
    early_stop=20,        # Halt after this many consecutive epochs without improvement.
    # The best model's state_dict is written to (and later reloaded from) this file.
    save_path='/mnt/hdd18.2t/sea120424/exchange/ML/datasets/project/DNN_Regression/models/model_classification_Adam_WC.ckpt',
)

In [140]:
def same_seed(seed):
    '''Seed NumPy and PyTorch (plus every CUDA device when available) and set the cuDNN flags for repeatable runs.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Turn off cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def train_valid_split(data_set, valid_ratio, seed):
    '''
    Randomly partition `data_set` into a training part and a validation part.

    The validation part holds int(valid_ratio * len(data_set)) items and the
    training part holds the rest; `seed` fixes the shuffle. Both parts are
    returned as NumPy arrays, in the order (train, valid).
    '''
    n_total = len(data_set)
    n_valid = int(valid_ratio * n_total)
    rng = torch.Generator().manual_seed(seed)
    train_part, valid_part = random_split(data_set, [n_total - n_valid, n_valid], generator=rng)
    return np.array(train_part), np.array(valid_part)

def predict(test_loader, model, device):
    '''
    Run `model` in evaluation mode over every batch of `test_loader`.

    Each batch is moved to `device`; the model outputs are gathered on the CPU
    and returned as one NumPy array concatenated along the first dimension.
    '''
    model.eval() # Set your model to evaluation mode.
    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            scores = model(batch.to(device))
            batch_outputs.append(scores.detach().cpu())
    return torch.cat(batch_outputs, dim=0).numpy()

In [141]:
# Seed the NumPy / PyTorch generators before any loader or model is created.
same_seed(config['seed'])

In [142]:
# Directory holding the four feature (x) / label (y) CSV files. Set the
# IVF_DATA_DIR environment variable to read them from another location.
DATA_DIR = os.environ.get('IVF_DATA_DIR', '/mnt/hdd18.2t/sea120424/exchange/ML/datasets/project')

train_x = pd.read_csv(os.path.join(DATA_DIR, 'train_binary_x.csv'))
train_y = pd.read_csv(os.path.join(DATA_DIR, 'train_binary_y.csv'))
test_x = pd.read_csv(os.path.join(DATA_DIR, 'test_binary_x.csv'))
test_y = pd.read_csv(os.path.join(DATA_DIR, 'test_binary_y.csv'))

In [143]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the loaded training rows as a validation set; random_state
# pins the shuffle so the split is repeatable.
# NOTE(review): train_x / train_y are overwritten, so re-running this cell
# without reloading the CSVs splits the already-reduced training set again.
# NOTE(review): the literals 0.10 and 42 are used instead of
# config['valid_ratio'] and config['seed'] — confirm this is intended.
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.10, random_state=42)

In [144]:
# One line per split: feature-matrix shape followed by label shape.
print('\n'.join(
    f'{features.shape} {labels.shape}'
    for features, labels in ((train_x, train_y), (valid_x, valid_y), (test_x, test_y))
))

(112338, 21) (112338, 1)
(12482, 21) (12482, 1)
(31205, 21) (31205, 1)


In [145]:
# Convert every DataFrame to a NumPy array; the label frames are additionally
# flattened to 1-D with ravel().
train_x, valid_x, test_x = (frame.to_numpy() for frame in (train_x, valid_x, test_x))
train_y, valid_y, test_y = (frame.to_numpy().ravel() for frame in (train_y, valid_y, test_y))

In [146]:
# Labelled datasets for training and validation; the test dataset carries
# features only, so it yields bare feature rows for predict().
train_dataset = IVFDataset(x=train_x, y=train_y)
valid_dataset = IVFDataset(x=valid_x, y=valid_y)
test_dataset = IVFDataset(x=test_x)

In [147]:
# Pinned host memory only speeds up host-to-GPU copies, so request it only
# when the run actually targets a CUDA device.
PIN_MEMORY = str(device).startswith('cuda')

# Only the training data is shuffled. Validation and test batches keep a fixed
# order so the per-epoch mean of batch losses used for early stopping does not
# depend on which samples land in the short final batch.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=PIN_MEMORY)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=PIN_MEMORY)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=PIN_MEMORY)

In [148]:
# Start from empty loss histories and a freshly initialised network whose
# input width equals the number of feature columns.
train_loss, val_loss = [], []
model = DNN_Regression(input_dim=train_x.shape[1])
model = model.to(device)

In [None]:
# Run training: fills train_loss / val_loss and saves the best state_dict to
# config['save_path'].
trainer(train_loader, valid_loader, model, config, device)

Epoch [1/500]: 100%|███████████████████████████████████████████████████████| 439/439 [00:01<00:00, 246.89it/s, loss=0.668]


Train Acc: 0.5512064695358276
Val Acc: 0.11978865414857864
Epoch [1/500]: Train loss: 0.6959, Valid loss: 0.6767
Saving model with loss 0.677...


Epoch [2/500]: 100%|███████████████████████████████████████████████████████| 439/439 [00:01<00:00, 267.40it/s, loss=0.643]


Train Acc: 0.12156432867050171
Val Acc: 0.11950840055942535
Epoch [2/500]: Train loss: 0.6688, Valid loss: 0.6623
Saving model with loss 0.662...


Epoch [3/500]: 100%|███████████████████████████████████████████████████████| 439/439 [00:01<00:00, 262.12it/s, loss=0.615]


Train Acc: 0.13287825882434845
Val Acc: 0.19155828654766083
Epoch [3/500]: Train loss: 0.6468, Valid loss: 0.6298
Saving model with loss 0.630...


Epoch [4/500]: 100%|████████████████████████████████████████████████████████| 439/439 [00:01<00:00, 256.83it/s, loss=0.57]


Train Acc: 0.509315013885498
Val Acc: 0.6886464953422546
Epoch [4/500]: Train loss: 0.5985, Valid loss: 0.5655
Saving model with loss 0.565...


Epoch [5/500]: 100%|███████████████████████████████████████████████████████| 439/439 [00:01<00:00, 247.78it/s, loss=0.511]


Train Acc: 0.7448898553848267
Val Acc: 0.7765925526618958
Epoch [5/500]: Train loss: 0.5257, Valid loss: 0.4896
Saving model with loss 0.490...


Epoch [6/500]: 100%|███████████████████████████████████████████████████████| 439/439 [00:01<00:00, 270.95it/s, loss=0.415]


Train Acc: 0.795215368270874
Val Acc: 0.8056358695030212
Epoch [6/500]: Train loss: 0.4567, Valid loss: 0.4298
Saving model with loss 0.430...


Epoch [7/500]: 100%|███████████████████████████████████████████████████████| 439/439 [00:01<00:00, 262.51it/s, loss=0.414]


Train Acc: 0.8130142688751221
Val Acc: 0.8258681893348694
Epoch [7/500]: Train loss: 0.4077, Valid loss: 0.3895
Saving model with loss 0.390...


Epoch [8/500]: 100%|███████████████████████████████████████████████████████| 439/439 [00:01<00:00, 240.38it/s, loss=0.334]


Train Acc: 0.8290719985961914
Val Acc: 0.8344080448150635
Epoch [8/500]: Train loss: 0.3722, Valid loss: 0.3574
Saving model with loss 0.357...


Epoch [9/500]: 100%|███████████████████████████████████████████████████████| 439/439 [00:01<00:00, 228.78it/s, loss=0.256]


Train Acc: 0.8429913520812988
Val Acc: 0.8530591130256653
Epoch [9/500]: Train loss: 0.3413, Valid loss: 0.3268
Saving model with loss 0.327...


Epoch [10/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 243.04it/s, loss=0.321]


Train Acc: 0.8590448498725891
Val Acc: 0.8720832467079163
Epoch [10/500]: Train loss: 0.3097, Valid loss: 0.2935
Saving model with loss 0.294...


Epoch [11/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 277.53it/s, loss=0.249]


Train Acc: 0.8800990581512451
Val Acc: 0.8925456404685974
Epoch [11/500]: Train loss: 0.2757, Valid loss: 0.2596
Saving model with loss 0.260...


Epoch [12/500]: 100%|███████████████████████████████████████████████████████| 439/439 [00:01<00:00, 273.33it/s, loss=0.26]


Train Acc: 0.9018036723136902
Val Acc: 0.912886381149292
Epoch [12/500]: Train loss: 0.2405, Valid loss: 0.2244
Saving model with loss 0.224...


Epoch [13/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 239.15it/s, loss=0.199]


Train Acc: 0.9223915934562683
Val Acc: 0.9331285953521729
Epoch [13/500]: Train loss: 0.2058, Valid loss: 0.1913
Saving model with loss 0.191...


Epoch [14/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 267.42it/s, loss=0.183]


Train Acc: 0.9399316310882568
Val Acc: 0.9469997882843018
Epoch [14/500]: Train loss: 0.1742, Valid loss: 0.1618
Saving model with loss 0.162...


Epoch [15/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 232.86it/s, loss=0.154]


Train Acc: 0.9531806111335754
Val Acc: 0.9570411443710327
Epoch [15/500]: Train loss: 0.1481, Valid loss: 0.1392
Saving model with loss 0.139...


Epoch [16/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 261.59it/s, loss=0.142]


Train Acc: 0.9608340859413147
Val Acc: 0.9604723453521729
Epoch [16/500]: Train loss: 0.1284, Valid loss: 0.1231
Saving model with loss 0.123...


Epoch [17/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 281.85it/s, loss=0.107]


Train Acc: 0.9622408151626587
Val Acc: 0.9609695672988892
Epoch [17/500]: Train loss: 0.1143, Valid loss: 0.1112
Saving model with loss 0.111...


Epoch [18/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 237.20it/s, loss=0.0837]


Train Acc: 0.962402880191803
Val Acc: 0.9612374305725098
Epoch [18/500]: Train loss: 0.1048, Valid loss: 0.1036
Saving model with loss 0.104...


Epoch [19/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 251.74it/s, loss=0.122]


Train Acc: 0.9624287486076355
Val Acc: 0.9614191055297852
Epoch [19/500]: Train loss: 0.0985, Valid loss: 0.0975
Saving model with loss 0.097...


Epoch [20/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 261.28it/s, loss=0.0934]


Train Acc: 0.962443470954895
Val Acc: 0.961266279220581
Epoch [20/500]: Train loss: 0.0941, Valid loss: 0.0950
Saving model with loss 0.095...


Epoch [21/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 258.09it/s, loss=0.0927]


Train Acc: 0.962443470954895
Val Acc: 0.9611898064613342
Epoch [21/500]: Train loss: 0.0915, Valid loss: 0.0925
Saving model with loss 0.093...


Epoch [22/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 264.42it/s, loss=0.115]


Train Acc: 0.9624395966529846
Val Acc: 0.9612407088279724
Epoch [22/500]: Train loss: 0.0894, Valid loss: 0.0909
Saving model with loss 0.091...


Epoch [23/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 229.55it/s, loss=0.125]


Train Acc: 0.962437629699707
Val Acc: 0.9612407088279724
Epoch [23/500]: Train loss: 0.0880, Valid loss: 0.0902
Saving model with loss 0.090...


Epoch [24/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 272.10it/s, loss=0.108]


Train Acc: 0.9624395966529846
Val Acc: 0.9611898064613342
Epoch [24/500]: Train loss: 0.0871, Valid loss: 0.0892
Saving model with loss 0.089...


Epoch [25/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 232.27it/s, loss=0.0532]


Train Acc: 0.9624512791633606
Val Acc: 0.9613936543464661
Epoch [25/500]: Train loss: 0.0864, Valid loss: 0.0889
Saving model with loss 0.089...


Epoch [26/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 259.64it/s, loss=0.0663]


Train Acc: 0.9624474048614502
Val Acc: 0.961266279220581
Epoch [26/500]: Train loss: 0.0860, Valid loss: 0.0884
Saving model with loss 0.088...


Epoch [27/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 260.87it/s, loss=0.046]


Train Acc: 0.9624532461166382
Val Acc: 0.9613681435585022
Epoch [27/500]: Train loss: 0.0856, Valid loss: 0.0876
Saving model with loss 0.088...


Epoch [28/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 236.35it/s, loss=0.0803]


Train Acc: 0.9624454379081726
Val Acc: 0.9611898064613342
Epoch [28/500]: Train loss: 0.0854, Valid loss: 0.0884


Epoch [29/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 263.53it/s, loss=0.0696]


Train Acc: 0.9624474048614502
Val Acc: 0.9613681435585022
Epoch [29/500]: Train loss: 0.0852, Valid loss: 0.0873
Saving model with loss 0.087...


Epoch [30/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 253.30it/s, loss=0.064]


Train Acc: 0.962449312210083
Val Acc: 0.9612152576446533
Epoch [30/500]: Train loss: 0.0849, Valid loss: 0.0874


Epoch [31/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 238.82it/s, loss=0.114]


Train Acc: 0.962437629699707
Val Acc: 0.9612152576446533
Epoch [31/500]: Train loss: 0.0850, Valid loss: 0.0868
Saving model with loss 0.087...


Epoch [32/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 245.76it/s, loss=0.0888]


Train Acc: 0.962443470954895
Val Acc: 0.9613426327705383
Epoch [32/500]: Train loss: 0.0847, Valid loss: 0.0869


Epoch [33/500]: 100%|██████████████████████████████████████████████████████| 439/439 [00:01<00:00, 246.90it/s, loss=0.104]


Train Acc: 0.9624395966529846
Val Acc: 0.9613426327705383
Epoch [33/500]: Train loss: 0.0845, Valid loss: 0.0867
Saving model with loss 0.087...


Epoch [34/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 253.49it/s, loss=0.0623]


Train Acc: 0.962449312210083
Val Acc: 0.9613171815872192
Epoch [34/500]: Train loss: 0.0845, Valid loss: 0.0866
Saving model with loss 0.087...


Epoch [35/500]: 100%|█████████████████████████████████████████████████████| 439/439 [00:01<00:00, 256.23it/s, loss=0.0943]


Train Acc: 0.9624395966529846
Val Acc: 0.9612917304039001
Epoch [35/500]: Train loss: 0.0845, Valid loss: 0.0868


Epoch [36/500]:  94%|█████████████████████████████████████████████████▉   | 414/439 [00:02<00:00, 238.35it/s, loss=0.0685]

In [None]:
from matplotlib import pyplot

# Learning curves: one point per completed epoch, as recorded by trainer().
pyplot.plot(train_loss, label='train')
pyplot.plot(val_loss, label='valid')
pyplot.xlabel('Epoch')
pyplot.ylabel('Mean cross-entropy loss')
pyplot.title('Training vs. validation loss')
pyplot.legend()
pyplot.show()

In [None]:
# Rebuild the architecture and restore the best weights saved by trainer().
model = DNN_Regression(input_dim=train_x.shape[1]).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on another device still loads here.
model.load_state_dict(torch.load(config['save_path'], map_location=device))
preds = predict(test_loader, model, device)

In [None]:
from sklearn.decomposition import PCA

# Fit a 1-component PCA on the training features and project the test features
# onto it, giving a single x-coordinate per test sample for plotting.
pca = PCA(n_components=1)
pca_train_x = pca.fit_transform(train_x)

pca_test_x = pca.transform(test_x)

# preds holds one column of raw scores per class, which cannot be assigned to a
# single DataFrame column. Store the softmax probability of class 1 instead.
positive_prob = torch.softmax(torch.from_numpy(preds), dim=1)[:, 1].numpy()

df = pd.DataFrame(pca_test_x)
df['y'] = positive_prob
df['groundTrue'] = test_y
df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(16,10))
# Request exactly one colour per distinct ground-truth label; a fixed
# 10-colour palette is longer than the hue needs.
ax = sns.scatterplot(
    x=0, y='y',
    hue="groundTrue",
    palette=sns.color_palette("hls", df['groundTrue'].nunique()),
    data=df,
    legend="full",
    alpha=1
)
ax.set_xlabel('First principal component of the test features')
ax.set_ylabel('Model prediction (y)')
plt.show()

In [None]:
# preds has one row of raw class scores per test sample (the network's last
# layer has one output per class). The predicted label is the index of the
# largest score; rounding the scores does not produce class labels.
hard_result = preds.argmax(axis=1)

print(classification_report(test_y, hard_result))