In [1]:
########### importing necessary libraries
import torch.nn as nn
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from torch.utils.tensorboard import SummaryWriter
import yaml

In [2]:
######## Defining factory class for model creation
class BaseModel(nn.Module):
    '''
        This is the Facotry class of base learner
    '''
    def __init__(self, input_dim,n_layers, n_hidden_units, ouput_dim):
        super(BaseModel, self).__init__()
        '''
            input_dim: no of input features
            n_layers: no of hidden layers for this base learner except the output layer
            n_hidden_units: array of size n_layers containing hidden units for each n_layers[i]
            ouput_dim: dimension of output units
        '''
        
        layers = []
        for i in range(n_layers):
            layers.append(nn.Linear(input_dim, n_hidden_units[i]))
            layers.append(nn.BatchNorm1d(num_features=n_hidden_units[i]))
            layers.append(nn.LeakyReLU(0.1))
            input_dim = n_hidden_units[i]

        layers.append(nn.Linear(n_hidden_units[-1], ouput_dim))
        layers.append(nn.Softmax(1))

        self.layers = layers
        self.n_layers = n_layers+1
        self.net = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.net(x)

In [3]:
class EnsembleClassifier:
    '''
        This class is supposed to train and validate ensemble models
    '''
    
    def __init__(self, model_config):
        '''
            model_config: yaml config file for the base models
        '''

        # Device configuration
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        ########### creating base learners
        with open(model_config, "r") as stream:
            try:
                self.model_configs = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
                exit()

        models = []
        for i in range(len(self.model_configs)):
            model_cfg = self.model_configs['model_'+str(i)]
            model = BaseModel(input_dim=393, n_layers = model_cfg['n_layers'], 
                        n_hidden_units = model_cfg['n_hidden_units'], ouput_dim = 10)
            model.to(self.device)
            models.append((model, model_cfg['learning_rate']))
        self.models = models
        
        
        
    def load_data(self, csv_file):
        ''' 
            loads data from csv file, normalize and return X, y denoting feature and target variables
            csv_file: csv file path of training data
        '''
        ########## load train data
        train_df = pd.read_csv(csv_file, index_col=False)
        # copy the data
        self.traindf_scaled = train_df.copy()
        y = self.traindf_scaled.pop('Y').to_frame()
        X = self.traindf_scaled
        # apply normalization techniques
        for column in X.columns:
            X[column] = (X[column] - X[column].min()) / (X[column].max() - X[column].min())

        y = y.to_numpy().reshape(-1)
        X = X.to_numpy()
        #train_df.groupby('Y').size()
        return X, y
    
    def load_test_data(self, csv_file):
        '''
            loads test data from csv file, normalize using train config and return X_test, y_test denoting feature and target variables
            csv_file: csv file path of training data
        '''
        test_df = pd.read_csv(csv_file)
        X = self.traindf_scaled
        
        # copy the data
        test_df_scaled = test_df.copy()
        y_test = test_df_scaled.pop('Y').to_frame()
        X_test = test_df_scaled
        # apply normalization techniques
        for column in X.columns:
            X_test[column] = (X_test[column] - X[column].min()) / (X[column].max() - X[column].min())

        X_test, y_test = torch.FloatTensor(X_test.to_numpy()).to(self.device ), y_test.to_numpy()
        return X_test, y_test

    def train_base_learners(self, epochs, batch_size, X, y):
        '''
            main loop of training and evaluationg all base learners
            epochs : no of epochs to train each base learner
            batch_size : batch size to stack for batch grad descent during training
            X, y : feature and target varables
        '''
        
        ############# training of base learners with stratified k-fold cross validation
        skf = StratifiedKFold(n_splits = len(self.model_configs), random_state=42, shuffle = True)
        self.epochs = epochs
        self.batch_size = batch_size
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.writer = SummaryWriter()
        
        ################ training base learners -
        for fold_idx, (train_index, valid_index) in enumerate(skf.split(X, y)):

            print("TRAIN:", train_index, "VALID:", valid_index)
            X_train, X_valid = torch.FloatTensor(X[train_index]).to(self.device), torch.FloatTensor(X[valid_index]).to(self.device)
            y_train, y_valid = torch.LongTensor(y[train_index]).to(self.device), torch.LongTensor(y[valid_index]).to(self.device)

            model,lr = self.models[fold_idx]
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            model_name = list(self.model_configs.keys())[fold_idx]
            log = 'started training---{}'.format(model_name)
            print(log )

            best_acc = 0.0
            for epoch in range(epochs):
                train_loss, acc = self.training_loop(model, X_train, y_train, optimizer)
                self.writer.add_scalar('train/loss/model_{}'.format(str(fold_idx)), train_loss, epoch)
                self.writer.add_scalar('train/acc/model_{}'.format(str(fold_idx)), acc, epoch)
                log = 'Training epoch:{} loss:{} accuracy:{}'.format(epoch, train_loss, acc)
                print(log)

                valid_loss, acc = self.validation_loop(model, X_valid, y_valid)
                self.writer.add_scalar('valid/loss/model_{}'.format(str(fold_idx)), valid_loss, epoch)
                self.writer.add_scalar('valid/acc/model_{}'.format(str(fold_idx)), acc, epoch)
                log = 'Validation epoch:{} loss:{} accuracy:{}'.format(epoch, valid_loss, acc)
                print(log)
                #----------save best valid acc model
                if acc > best_acc:
                    torch.save(model.state_dict(), '{}.pt'.format(model_name))
                    best_acc = acc
            self.writer.close()

    def training_loop(self, model, X_train, y_train, optimizer):
        '''
            generatlized training loop 
            model : the model to train
            X_train : training feature data 
            y_train : training target data
            optimizer : optimizer to be used to update network weights
        '''
        #------------- training ---------------
        train_loss = 0
        model.train()
        y_pred_all = []
        for batch_idx in range(0,len(X_train),self.batch_size):
            optimizer.zero_grad()
            if batch_idx+self.batch_size < len(X_train):
                X_batch_train, y_batch_train = X_train[batch_idx : batch_idx+self.batch_size], y_train[batch_idx : batch_idx+self.batch_size]
            else:
                X_batch_train, y_batch_train = X_train[batch_idx : ], y_train[batch_idx :]

            y_pred = model(X_batch_train)
            loss = self.loss_fn(y_pred, y_batch_train)
            loss.backward()
            optimizer.step()
            y_pred_all.extend(y_pred.argmax(1).detach().cpu().numpy())
            train_loss += loss.item()

        iteration = len(X_train)//self.batch_size
        train_loss /= iteration
        acc = accuracy_score(y_train.cpu().numpy(), y_pred_all)
        return train_loss, acc
        
        
    def validation_loop(self, model, X_valid, y_valid):
        '''
            generatlized validation loop 
            model : the model to validate
            X_train : validation feature data 
            y_train : validation target data
        '''
        #-------validation---------
        valid_loss = 0
        model.eval()
        y_pred_all = []
        for batch_idx in range(0,len(X_valid), self.batch_size):
            if batch_idx+self.batch_size < len(X_valid):
                #print(batch_idx+batch_size)
                X_batch_valid, y_batch_valid = X_valid[batch_idx : batch_idx+self.batch_size], y_valid[batch_idx : batch_idx+self.batch_size]
            else:
                X_batch_valid, y_batch_valid = X_valid[batch_idx : ], y_valid[batch_idx :]

            y_pred = model(X_batch_valid)
            loss = self.loss_fn(y_pred, y_batch_valid)
            valid_loss += loss.item()
            y_pred_all.extend(y_pred.argmax(1).detach().cpu().numpy())

        iteration = len(X_valid)//self.batch_size
        valid_loss /= iteration
        
        acc = accuracy_score(y_valid.cpu().numpy(), y_pred_all)
        return valid_loss, acc
        
        
    def voting(self, batch_size, X_test, y_test):
        '''
            voting ensembling of all base learners during testing 
             batch_size: size for batch testing
             X_test : testing feature data 
             y_test : testing target data to evaluate accuracy
        '''
        ############ ensemble voting of base learners
        for i, (model,_) in enumerate(self.models):
            model_name = list(self.model_configs.keys())[i]
            model.load_state_dict(torch.load('{}.pt'.format(model_name)))

        model_pred, final_pred = [[] for _ in range(len(self.models))], []
        for batch_idx in range(0,len(X_test),batch_size):
            data = X_test[batch_idx : batch_idx+batch_size]
            labels = y_test[batch_idx : batch_idx+batch_size]
            y_pred = np.zeros((len(labels),10), dtype='float')

            with torch.no_grad():
                for i, (model,_) in enumerate(self.models):
                    y_batch_pred = model(data)
                    model_pred[i].extend(y_batch_pred.argmax(1).detach().cpu().numpy())
                    y_pred += y_batch_pred.detach().cpu().numpy()

            y_pred /= len(self.models)
            y_pred = np.argmax(y_pred, axis=1) 
            final_pred.extend(y_pred)

        #print(model_pred)
        for i in range(len(model_pred)):
            acc = accuracy_score(y_test, model_pred[i])
            print('model{} accuracy:'.format(str(i)),acc)

        acc = accuracy_score(y_test, final_pred)
        print('final accuracy: ',acc)
        cm = confusion_matrix(y_test, final_pred, labels=np.unique(y_test))
        print('confusion matrix:\n',cm)


In [4]:
########## driver code
ensembleClf = EnsembleClassifier('model_config.yaml')
X, y = ensembleClf.load_data('data/train.csv')
ensembleClf.train_base_learners(epochs=50, batch_size=32, X=X, y=y)

TRAIN: [    0     1     2 ... 29996 29997 29998] VALID: [    5     6    19 ... 29976 29995 29999]
started training---model_0
Training epoch:0 loss:1.8830627824465433 accuracy:0.5788333333333333
Validation epoch:0 loss:1.7698186604096928 accuracy:0.699
Training epoch:1 loss:1.7512275536855062 accuracy:0.7100833333333333
Validation epoch:1 loss:1.7085592396119063 accuracy:0.7615
Training epoch:2 loss:1.711018678665161 accuracy:0.7490833333333333
Validation epoch:2 loss:1.6787542346964546 accuracy:0.7896666666666666
Training epoch:3 loss:1.6845776974360147 accuracy:0.7759166666666667
Validation epoch:3 loss:1.6773433608804795 accuracy:0.7918333333333333
Training epoch:4 loss:1.6761287589073182 accuracy:0.7845416666666667
Validation epoch:4 loss:1.6719418394374337 accuracy:0.7985
Training epoch:5 loss:1.6642075061798096 accuracy:0.7965833333333333
Validation epoch:5 loss:1.6568539129858986 accuracy:0.8125
Training epoch:6 loss:1.6527144813537598 accuracy:0.8075416666666667
Validation epoch

Training epoch:9 loss:1.63538169892629 accuracy:0.8245
Validation epoch:9 loss:1.643346113317153 accuracy:0.8256666666666667
Training epoch:10 loss:1.6349751261075338 accuracy:0.8255
Validation epoch:10 loss:1.6294303730847364 accuracy:0.839
Training epoch:11 loss:1.6328817892074585 accuracy:0.827875
Validation epoch:11 loss:1.6425818969859158 accuracy:0.8268333333333333
Training epoch:12 loss:1.6200587882995605 accuracy:0.84075
Validation epoch:12 loss:1.6355126605314367 accuracy:0.8323333333333334
Training epoch:13 loss:1.6180562221209207 accuracy:0.842125
Validation epoch:13 loss:1.6380407676339788 accuracy:0.83
Training epoch:14 loss:1.6139552424748738 accuracy:0.8467083333333333
Validation epoch:14 loss:1.6338036092207393 accuracy:0.8351666666666666
Training epoch:15 loss:1.6133153017361959 accuracy:0.846875
Validation epoch:15 loss:1.6358694615848561 accuracy:0.8333333333333334
Training epoch:16 loss:1.6069015588760376 accuracy:0.8537916666666666
Validation epoch:16 loss:1.612008

Validation epoch:19 loss:1.576282474446424 accuracy:0.8923333333333333
Training epoch:20 loss:1.5347510101000468 accuracy:0.9275833333333333
Validation epoch:20 loss:1.5766076372269002 accuracy:0.893
Training epoch:21 loss:1.5327365431785585 accuracy:0.9294166666666667
Validation epoch:21 loss:1.5778882669255059 accuracy:0.8923333333333333
Training epoch:22 loss:1.5307012004852294 accuracy:0.932125
Validation epoch:22 loss:1.572532213307957 accuracy:0.8968333333333334
Training epoch:23 loss:1.528396781762441 accuracy:0.9342916666666666
Validation epoch:23 loss:1.5724828950861558 accuracy:0.8976666666666666
Training epoch:24 loss:1.5273261756896972 accuracy:0.935125
Validation epoch:24 loss:1.5758032231407368 accuracy:0.8935
Training epoch:25 loss:1.5269241387049357 accuracy:0.935375
Validation epoch:25 loss:1.5784701217304578 accuracy:0.8915
Training epoch:26 loss:1.521365151087443 accuracy:0.9405833333333333
Validation epoch:26 loss:1.5676236994126265 accuracy:0.9023333333333333
Train

Training epoch:31 loss:1.5140252318382263 accuracy:0.9469583333333333
Validation epoch:31 loss:1.5827059484420614 accuracy:0.8866666666666667
Training epoch:32 loss:1.5112292963663736 accuracy:0.95025
Validation epoch:32 loss:1.579824213038154 accuracy:0.889
Training epoch:33 loss:1.5115814739863078 accuracy:0.9500416666666667
Validation epoch:33 loss:1.578539771829697 accuracy:0.891
Training epoch:34 loss:1.5097097641626993 accuracy:0.9521666666666667
Validation epoch:34 loss:1.574046079487724 accuracy:0.8945
Training epoch:35 loss:1.5090852646827697 accuracy:0.95275
Validation epoch:35 loss:1.5783763856173836 accuracy:0.8908333333333334
Training epoch:36 loss:1.5093497438430785 accuracy:0.9520833333333333
Validation epoch:36 loss:1.577007035520625 accuracy:0.893
Training epoch:37 loss:1.5079608246485392 accuracy:0.95425
Validation epoch:37 loss:1.5767255384016803 accuracy:0.8926666666666667
Training epoch:38 loss:1.506980905532837 accuracy:0.9543333333333334
Validation epoch:38 loss:

Validation epoch:41 loss:1.57523016241145 accuracy:0.894
Training epoch:42 loss:1.4902285091082255 accuracy:0.9718333333333333
Validation epoch:42 loss:1.5672700041755636 accuracy:0.9033333333333333
Training epoch:43 loss:1.4918453200658162 accuracy:0.9701666666666666
Validation epoch:43 loss:1.5774652639174844 accuracy:0.8911666666666667
Training epoch:44 loss:1.4920439443588256 accuracy:0.9701666666666666
Validation epoch:44 loss:1.5732927449884262 accuracy:0.897
Training epoch:45 loss:1.4910016326904296 accuracy:0.971125
Validation epoch:45 loss:1.5756757176496128 accuracy:0.8953333333333333
Training epoch:46 loss:1.489454359849294 accuracy:0.972875
Validation epoch:46 loss:1.5761609746810588 accuracy:0.8938333333333334
Training epoch:47 loss:1.4910243471463522 accuracy:0.9710833333333333
Validation epoch:47 loss:1.575101390241939 accuracy:0.8955
Training epoch:48 loss:1.4901563186645508 accuracy:0.9719583333333334
Validation epoch:48 loss:1.5725544308596116 accuracy:0.8968333333333

In [6]:
# %load_ext tensorboard
%reload_ext tensorboard
%tensorboard --logdir runs --port=6008

Reusing TensorBoard on port 6008 (pid 104697), started 9:27:31 ago. (Use '!kill 104697' to kill it.)

In [7]:
########### evaluate on test data by soft voting
X_test, y_test = ensembleClf.load_test_data('data/test.csv')
ensembleClf.voting(batch_size=32, X_test=X_test, y_test=y_test)

model0 accuracy: 0.7454
model1 accuracy: 0.7375
model2 accuracy: 0.7778
model3 accuracy: 0.7942
model4 accuracy: 0.768
final accuracy:  0.7963
confusion matrix:
 [[ 914    0    8    9    1    0    5    3   16    7]
 [   1 1108   11    5    5    1    0    4    5    0]
 [  25   10  756   25   66    2    3   65   40    3]
 [  10    5   34  783    7   27   36   10   91    5]
 [   5    5   92    2  736   18   12   69   18   26]
 [  13   10    3   37   23  707   51   17   19   15]
 [  12   10    4   28   22   42  717   56   13   96]
 [   7   10   91   11   72    1   46  754    6   20]
 [  10   10   55   58   24    9   12    6  799    2]
 [  24    2   18   17   48   27  126   42   20  689]]
