# Dataset and Dataloader

In [1]:
import numpy as np
import pandas as pd
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import *
import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import *

In [2]:
class CancerDataset(Dataset):
    
    # constructor
    def __init__(self, df_X, df_y):
        
        
        self.data_list = torch.FloatTensor(df_X.values)
        
        if df_y is not None:
            assert len(df_X) == len(df_y)
            self.target_list = torch.LongTensor(df_y.values)
        else:
            df_y = np.zeros(len(df_X)) * (-1)
            self.target_list = torch.LongTensor(df_y)
        
        assert(len(self.data_list) == len(self.target_list))
      
    # return the length of dataset
    def __len__(self):
        return len(self.data_list)
    
    # return the key-th element of dataset
    def __getitem__(self, key):
        
        return self.data_list[key], self.target_list[key]
    

In [3]:
class MulticlassClassification(nn.Module):
    def __init__(self, num_feature, hidden_size, num_class):
        super(MulticlassClassification, self).__init__()
        
        self.layer_1 = nn.Linear(num_feature, hidden_size)
#         self.layer_2 = nn.Linear(hidden_size, 10)
# #         self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(hidden_size, num_class) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
#         self.batchnorm2 = nn.BatchNorm1d(10)
#         self.batchnorm3 = nn.BatchNorm1d(64)
        self.soft_max = nn.Softmax(dim=-1)
        
    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        
#         x = self.layer_2(x)
#         x = self.batchnorm2(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        
#         x = self.layer_3(x)
#         x = self.batchnorm3(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        
        x = self.layer_out(x)
        
        return x
    
    def predict_prob(self, inputs):
        """
        call self.forward, then calculate softmax to get the probability
        """
        out = self.forward(inputs)
        out = self.soft_max(out)
        
        return out
        
    def predict(self, inputs):
        """
        hard predict, call predict_prob, then get the max index
        """
        out = self.predict_prob(inputs)
        pred = torch.argmax(out, dim=1)
        
        return pred


In [4]:
class CancerPredictor(nn.Module):
    
    # constructor
    def __init__(self, input_size, hidden_size, n_class):
        super().__init__()
        
        self.input_layer = nn.Linear(input_size, hidden_size)
        self.input_relu = nn.ReLU()
        self.hidden_linear = nn.Linear(hidden_size, n_class)
        
        # do not use in forward, but use in predict
        self.soft_max = nn.Softmax(dim=-1)
        

    
    def forward(self, inputs):
        out = None
        out = self.input_layer(inputs)
        out = self.input_relu(out)
        out = self.hidden_linear(out)
        out = self.input_relu(out)
        
        ## We do not compute soft max directly
        ## But combine it with loss. Because F.cross_entropy 
        ## Will compute soft max
        # out = self.soft_max(out)
        
        return out
    
    def predict_prob(self, inputs):
        """
        call self.forward, then calculate softmax to get the probability
        """
        out = self.forward(inputs)
        out = self.soft_max(out)
        
        return out
        
    def predict(self, inputs):
        """
        hard predict, call predict_prob, then get the max index
        """
        out = self.predict_prob(inputs)
        pred = torch.argmax(out, dim=1)
        
        return pred


In [5]:
class Learner():
    # constructor
    def __init__(self,  model, criterion, optimizer):
        
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
       
        
    def train(self, train_loader, valid_loader, device, num_epochs, early_stop_patience=5, print_log=True):
        self.early_stop_patience = early_stop_patience
        no_improve = 0
        
        train_loss_history = []
        valid_loss_history = []
        
        min_loss = np.Infinity
        for epoch in tqdm.notebook.tqdm(range(num_epochs)):
            
            train_loss_epoch = []
            valid_loss_epoch = []
            
            self.model.train()

            for i, (data_batch, target_batch) in enumerate(train_loader):
                preds = self.model(data_batch.to(device))
                train_loss = self.criterion(preds, target_batch.to(device))
                train_loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                train_loss_epoch.append(train_loss.item())
            
            batch_min_loss = np.Infinity
            self.model.eval()
            for i, (data_batch, target_batch) in enumerate(valid_loader):
                preds = self.model(data_batch.to(device))
                valid_loss = criterion(preds, target_batch.to(device))
                optimizer.zero_grad()
                valid_loss_epoch.append(valid_loss.item())
                    
                if valid_loss < batch_min_loss:
                    batch_min_loss = valid_loss
                    
                
            
            mean_train_loss_epoch = np.mean(train_loss_epoch)
            mean_valid_loss_epoch = np.mean(valid_loss_epoch)
            
            train_loss_history.append(mean_train_loss_epoch)
            valid_loss_history.append(mean_valid_loss_epoch)
            
            if batch_min_loss < min_loss:
                no_improve = 0
                min_loss = batch_min_loss
                torch.save(model, 'best_model.pt')
            else:
                no_improve += 1
                if no_improve > self.early_stop_patience:
                    print(f"Early Stop at epoch {epoch}")
                    break
                
            if print_log:
                print(f"Epoch: {epoch}, train_loss: {mean_train_loss_epoch},\
                        valid_loss: {mean_valid_loss_epoch}")
        
        return train_loss_history, valid_loss_history
    
    def do_test(self, test_loader, device, best_model=None):
        if best_model == None:
            best_model = self.model
            
        real_data = []
        pred_data = []
        best_model.eval()
        
        for i, (data_batch, target_batch) in enumerate(test_loader):
            preds = best_model.predict(data_batch.to(device))
            # test_loss = self.criterion(preds, target_batch.to(device))
            optimizer.zero_grad()

            for i in range(len(target_batch)):
                
                real_data.append(target_batch[i].item())
                pred_data.append(preds[i].item())
        
        return np.array(real_data), np.array(pred_data)

In [6]:
df = pd.read_csv('train_ml2_2021.csv')
X = df.drop(columns=['target', 'problem_id'])
y = df.target
df_test = pd.read_csv('test0.csv', index_col='obs_id').drop(columns=['problem_id'])

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, stratify=y, random_state=1)

In [8]:
X_test, y_test = df_test.drop(columns='target'), df_test.target

In [11]:
def make_weights(dataset):
    X = dataset.data_list
    Y = dataset.target_list
    nclasses = Y.unique(return_counts=True)[1] / Y.shape[0]
    
    return torch.DoubleTensor([nclasses[y] for y in Y])

train_dataset = CancerDataset(X_train, y_train)
valid_dataset = CancerDataset(X_val, y_val)
test_dataset = CancerDataset(df_test.drop(columns='target'), df_test.target)

weights = make_weights(train_dataset)
sampler = WeightedRandomSampler(weights, len(weights))                     

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
# train_loader = DataLoader(train_dataset, batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [12]:
if torch.cuda.is_available():
    device = torch.device("cuda: 5")
else:
    device = torch.device("cpu")

In [13]:
# fig, ax = plt.subplots(figsize=(20, 10))
# ax.plot(train_log, label="train_loss")
# ax.plot(valid_log, label="valid_loss")
# ax.legend()
# plt.show()

In [15]:
n, input_size = X.shape
# hidden_size = 100
n_class = len(y.unique())
criterion = F.cross_entropy
lr = 0.001
nb_epoch = 100

In [22]:
hidden_sizes = [180, 200, 220]
# hidden_sizes = [21]

In [23]:
for hidden_size in hidden_sizes:
#     model = CancerPredictor(input_size, hidden_size, n_class)
    model = MulticlassClassification(input_size, hidden_size, n_class)
    
    optimizer = optim.Adam(model.parameters(), lr=lr)

    learner = Learner(model, criterion, optimizer)
    
    train_log, valid_log = learner.train(train_loader, valid_loader, \
                                         device, nb_epoch, early_stop_patience=20, \
                                         print_log=False)
    
    best_model = torch.load("best_model.pt")
    y_real_train, y_pred_train = learner.do_test(train_loader, device, best_model=best_model)
    y_real_valid, y_pred_valid = learner.do_test(valid_loader, device, best_model=best_model)
    # y_real_test, y_pred_test = learner.do_test(test_loader, device, best_model=best_model)
    
    train_accuracy = accuracy_score(y_real_train, y_pred_train)
    valid_accuracy = accuracy_score(y_real_valid, y_pred_valid)
    # test_accuracy = accuracy_score(y_real_test, y_pred_test)
    

#     train_accuracy = balanced_accuracy_score(y_real_train, y_pred_train)
#     valid_accuracy = balanced_accuracy_score(y_real_valid, y_pred_valid)
    
    print(f"hidden size: {hidden_size}, train accuracy: {train_accuracy}, valid accuracy: {valid_accuracy}")

  0%|          | 0/100 [00:00<?, ?it/s]

Early Stop at epoch 26
hidden size: 180, train accuracy: 0.9430808613160668, valid accuracy: 0.6682721252257676


  0%|          | 0/100 [00:00<?, ?it/s]

Early Stop at epoch 26
hidden size: 200, train accuracy: 0.8944436078903779, valid accuracy: 0.6435881998795906


  0%|          | 0/100 [00:00<?, ?it/s]

Early Stop at epoch 29
hidden size: 220, train accuracy: 0.9591928926366511, valid accuracy: 0.6604455147501506


In [24]:
valid_accuracy = 0

In [25]:
while valid_accuracy < 0.66:
    hidden_size = 200
#     model = CancerPredictor(input_size, hidden_size, n_class)
    model = MulticlassClassification(input_size, hidden_size, n_class)
    
    optimizer = optim.Adam(model.parameters(), lr=lr)

    learner = Learner(model, criterion, optimizer)
    
    train_log, valid_log = learner.train(train_loader, valid_loader, \
                                         device, nb_epoch, early_stop_patience=20, \
                                         print_log=False)
    
    best_model = torch.load("best_model.pt")
    y_real_train, y_pred_train = learner.do_test(train_loader, device, best_model=best_model)
    y_real_valid, y_pred_valid = learner.do_test(valid_loader, device, best_model=best_model)
    # y_real_test, y_pred_test = learner.do_test(test_loader, device, best_model=best_model)
    
    train_accuracy = accuracy_score(y_real_train, y_pred_train)
    valid_accuracy = accuracy_score(y_real_valid, y_pred_valid)
    # test_accuracy = accuracy_score(y_real_test, y_pred_test)
    

#     train_accuracy = balanced_accuracy_score(y_real_train, y_pred_train)
#     valid_accuracy = balanced_accuracy_score(y_real_valid, y_pred_valid)
    
    print(f"hidden size: {hidden_size}, train accuracy: {train_accuracy}, valid accuracy: {valid_accuracy}")
    
    if valid_accuracy >= 0.66:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

Early Stop at epoch 29
hidden size: 200, train accuracy: 0.9727450685137781, valid accuracy: 0.679108970499699


NameError: name 'test_accuracy' is not defined

In [35]:
best_model = torch.load("best_model_save.pt")
y_real_train, y_pred_train = learner.do_test(train_loader, device, best_model=best_model)
y_real_valid, y_pred_valid = learner.do_test(valid_loader, device, best_model=best_model)
y_real_test, y_pred_test = learner.do_test(test_loader, device, best_model=best_model)

train_accuracy = accuracy_score(y_real_train, y_pred_train)
valid_accuracy = accuracy_score(y_real_valid, y_pred_valid)
# test_accuracy = accuracy_score(y_real_test, y_pred_test)


In [36]:
valid_accuracy

0.679108970499699

In [37]:
submission = pd.read_csv("sample_submission.csv")

In [38]:
submission.head()

Unnamed: 0,obs_id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [39]:
len(submission)

2041

In [40]:
y_test.head()

obs_id
0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [41]:
len(y_test)

2041

In [42]:
submission["target"] = y_pred_test

In [43]:
submission.head(20)

Unnamed: 0,obs_id,target
0,0,1
1,1,0
2,2,0
3,3,0
4,4,1
5,5,0
6,6,0
7,7,1
8,8,1
9,9,1


In [46]:
submission.to_csv("submission.csv", index=False)

In [44]:
set(y_pred_train)

{0, 1, 2}

In [45]:
set(y_pred_valid)

{0, 1, 2}