In [1]:
########### importing necessary libraries
import torch.nn as nn
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from torch.utils.tensorboard import SummaryWriter
import yaml

In [2]:
######## Defining factory class for model creation
class BaseModel(nn.Module):
    '''
    This is the Facotry class of base learner
    '''
    def __init__(self, input_dim,n_layers, n_hidden_units, ouput_dim):
        super(BaseModel, self).__init__()
        '''
        input_dim: no of input features
        n_layers: no of hidden layers for this base learner except the output layer
        n_hidden_units: array of size n_layers containing hidden units for each n_layers[i]
        ouput_dim: dimension of output units
        '''
        
        layers = []
        for i in range(n_layers):
            layers.append(nn.Linear(input_dim, n_hidden_units[i]))
            layers.append(nn.BatchNorm1d(num_features=n_hidden_units[i]))
            layers.append(nn.LeakyReLU(0.1))
            input_dim = n_hidden_units[i]

        layers.append(nn.Linear(n_hidden_units[-1], ouput_dim))
        layers.append(nn.Softmax(1))

        self.layers = layers
        self.n_layers = n_layers+1
        self.net = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.net(x)

In [16]:
class EnsembleClassifier:
    
    def __init__(self, model_config):

        # Device configuration
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        ########### creating base learners
        with open(model_config, "r") as stream:
            try:
                self.model_configs = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
                exit()

        models = []
        for i in range(len(self.model_configs)):
            model_cfg = self.model_configs['model_'+str(i)]
            model = BaseModel(input_dim=393, n_layers = model_cfg['n_layers'], 
                        n_hidden_units = model_cfg['n_hidden_units'], ouput_dim = 10)
            model.to(self.device)
            models.append((model, model_cfg['learning_rate']))
        self.models = models
        
        
        
    def load_data(self, csv_file):
        ########## load train data
        train_df = pd.read_csv(csv_file, index_col=False)
        # copy the data
        self.traindf_scaled = train_df.copy()
        y = self.traindf_scaled.pop('Y').to_frame()
        X = self.traindf_scaled
        # apply normalization techniques
        for column in X.columns:
            X[column] = (X[column] - X[column].min()) / (X[column].max() - X[column].min())

        y = y.to_numpy().reshape(-1)
        X = X.to_numpy()
        #train_df.groupby('Y').size()
        return X, y
    
    def load_test_data(self, csv_file):
        test_df = pd.read_csv(csv_file)
        X = self.traindf_scaled
        
        # copy the data
        test_df_scaled = test_df.copy()
        y_test = test_df_scaled.pop('Y').to_frame()
        X_test = test_df_scaled
        # apply normalization techniques
        for column in X.columns:
            X_test[column] = (X_test[column] - X[column].min()) / (X[column].max() - X[column].min())

        X_test, y_test = torch.FloatTensor(X_test.to_numpy()).to(self.device ), y_test.to_numpy()
        return X_test, y_test

    def train_base_learners(self, epochs, batch_size, X, y):
        
        ############# training of base learners with stratified k-fold cross validation
        skf = StratifiedKFold(n_splits = len(self.model_configs), random_state=42, shuffle = True)
        self.epochs = epochs
        self.batch_size = batch_size
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.writer = SummaryWriter()
        
        ################ training base learners -
        for fold_idx, (train_index, valid_index) in enumerate(skf.split(X, y)):

            print("TRAIN:", train_index, "VALID:", valid_index)
            X_train, X_valid = torch.FloatTensor(X[train_index]).to(self.device), torch.FloatTensor(X[valid_index]).to(self.device)
            y_train, y_valid = torch.LongTensor(y[train_index]).to(self.device), torch.LongTensor(y[valid_index]).to(self.device)

            model,lr = self.models[fold_idx]
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            model_name = list(self.model_configs.keys())[fold_idx]
            log = 'started training---{}'.format(model_name)
            print(log )

            best_acc = 0.0
            for epoch in range(epochs):
                train_loss, acc = self.training_loop(model, X_train, y_train, optimizer)
                self.writer.add_scalar('train/loss/model_{}'.format(str(fold_idx)), train_loss, epoch)
                self.writer.add_scalar('train/acc/model_{}'.format(str(fold_idx)), acc, epoch)
                log = 'Training epoch:{} loss:{} accuracy:{}'.format(epoch, train_loss, acc)
                print(log)

                valid_loss, acc = self.validation_loop(model, X_valid, y_valid)
                self.writer.add_scalar('valid/loss/model_{}'.format(str(fold_idx)), valid_loss, epoch)
                self.writer.add_scalar('valid/acc/model_{}'.format(str(fold_idx)), acc, epoch)
                log = 'Validation epoch:{} loss:{} accuracy:{}'.format(epoch, valid_loss, acc)
                print(log)
                #----------save best valid acc model
                if acc > best_acc:
                    torch.save(model.state_dict(), '{}.pt'.format(model_name))
                    best_acc = acc
            self.writer.close()

    def training_loop(self, model, X_train, y_train, optimizer):
        #------------- training ---------------
        train_loss = 0
        model.train()
        y_pred_all = []
        for batch_idx in range(0,len(X_train),self.batch_size):
            optimizer.zero_grad()
            if batch_idx+self.batch_size < len(X_train):
                X_batch_train, y_batch_train = X_train[batch_idx : batch_idx+self.batch_size], y_train[batch_idx : batch_idx+self.batch_size]
            else:
                X_batch_train, y_batch_train = X_train[batch_idx : ], y_train[batch_idx :]

            y_pred = model(X_batch_train)
            loss = self.loss_fn(y_pred, y_batch_train)
            loss.backward()
            optimizer.step()
            y_pred_all.extend(y_pred.argmax(1).detach().cpu().numpy())
            train_loss += loss.item()

        iteration = len(X_train)//self.batch_size
        train_loss /= iteration
        acc = accuracy_score(y_train.cpu().numpy(), y_pred_all)
        return train_loss, acc
        
        
    def validation_loop(self, model, X_valid, y_valid):
        #-------validation---------
        valid_loss = 0
        model.eval()
        y_pred_all = []
        for batch_idx in range(0,len(X_valid), self.batch_size):
            if batch_idx+self.batch_size < len(X_valid):
                #print(batch_idx+batch_size)
                X_batch_valid, y_batch_valid = X_valid[batch_idx : batch_idx+self.batch_size], y_valid[batch_idx : batch_idx+self.batch_size]
            else:
                X_batch_valid, y_batch_valid = X_valid[batch_idx : ], y_valid[batch_idx :]

            y_pred = model(X_batch_valid)
            loss = self.loss_fn(y_pred, y_batch_valid)
            valid_loss += loss.item()
            y_pred_all.extend(y_pred.argmax(1).detach().cpu().numpy())

        iteration = len(X_valid)//self.batch_size
        valid_loss /= iteration
        
        acc = accuracy_score(y_valid.cpu().numpy(), y_pred_all)
        return valid_loss, acc
        
        
    def voting(self, batch_size, X_test, y_test):
        ############ ensemble voting of base learners
        for i, (model,_) in enumerate(self.models):
            model_name = list(self.model_configs.keys())[i]
            model.load_state_dict(torch.load('{}.pt'.format(model_name)))

        model_pred, final_pred = [[] for _ in range(len(self.models))], []
        for batch_idx in range(0,len(X_test),batch_size):
            data = X_test[batch_idx : batch_idx+batch_size]
            labels = y_test[batch_idx : batch_idx+batch_size]
            y_pred = np.zeros((len(labels),10), dtype='float')

            with torch.no_grad():
                for i, (model,_) in enumerate(self.models):
                    y_batch_pred = model(data)
                    model_pred[i].extend(y_batch_pred.argmax(1).detach().cpu().numpy())
                    y_pred += y_batch_pred.detach().cpu().numpy()

            y_pred /= len(self.models)
            y_pred = np.argmax(y_pred, axis=1) 
            final_pred.extend(y_pred)

        #print(model_pred)
        for i in range(len(model_pred)):
            acc = accuracy_score(y_test, model_pred[i])
            print('model{} accuracy:'.format(str(i)),acc)

        acc = accuracy_score(y_test, final_pred)
        print('final accuracy: ',acc)
        cm = confusion_matrix(y_test, final_pred, labels=np.unique(y_test))
        print('confusion matrix:\n',cm)


In [17]:
########## driver code
ensembleClf = EnsembleClassifier('model_config.yaml')
X, y = ensembleClf.load_data('data/train.csv')
ensembleClf.train_base_learners(epochs=50, batch_size=32, X=X, y=y)

TRAIN: [    0     1     2 ... 29996 29997 29998] VALID: [    5     6    19 ... 29976 29995 29999]
started training---model_0
Training epoch:0 loss:1.8966492172876994 accuracy:0.5647916666666667
Validation epoch:0 loss:1.7814447777794007 accuracy:0.6896666666666667
Training epoch:1 loss:1.7589945891698202 accuracy:0.7026666666666667
Validation epoch:1 loss:1.7175252526839149 accuracy:0.7513333333333333
Training epoch:2 loss:1.713661168575287 accuracy:0.747625
Validation epoch:2 loss:1.6830464021407348 accuracy:0.7878333333333334
Training epoch:3 loss:1.6925141867001852 accuracy:0.7682083333333334
Validation epoch:3 loss:1.6892018381924552 accuracy:0.781
Training epoch:4 loss:1.6741395177841187 accuracy:0.7864583333333334
Validation epoch:4 loss:1.6550396387589807 accuracy:0.8126666666666666
Training epoch:5 loss:1.6631477071444194 accuracy:0.7973333333333333
Validation epoch:5 loss:1.6486811300012516 accuracy:0.8205
Training epoch:6 loss:1.652392089207967 accuracy:0.8080833333333334
Val

Validation epoch:9 loss:1.6568469070495768 accuracy:0.8135
Training epoch:10 loss:1.6263337370554607 accuracy:0.8339583333333334
Validation epoch:10 loss:1.6371736124875074 accuracy:0.8321666666666667
Training epoch:11 loss:1.624139455318451 accuracy:0.8356666666666667
Validation epoch:11 loss:1.6291348577183198 accuracy:0.8405
Training epoch:12 loss:1.62255322488149 accuracy:0.8385833333333333
Validation epoch:12 loss:1.6396620165218005 accuracy:0.8283333333333334
Training epoch:13 loss:1.615576304912567 accuracy:0.844875
Validation epoch:13 loss:1.6369003739586487 accuracy:0.832
Training epoch:14 loss:1.6114202195803324 accuracy:0.8489583333333334
Validation epoch:14 loss:1.628655464891444 accuracy:0.841
Training epoch:15 loss:1.6126750280062359 accuracy:0.8477916666666667
Validation epoch:15 loss:1.6233954340378869 accuracy:0.8446666666666667
Training epoch:16 loss:1.6078768026034038 accuracy:0.8520416666666667
Validation epoch:16 loss:1.6418177259159599 accuracy:0.8275
Training epo

Validation epoch:20 loss:1.5868342968231854 accuracy:0.8821666666666667
Training epoch:21 loss:1.533428811868032 accuracy:0.9291666666666667
Validation epoch:21 loss:1.5775669315919518 accuracy:0.8925
Training epoch:22 loss:1.5315784459114075 accuracy:0.9301666666666667
Validation epoch:22 loss:1.5737180410221936 accuracy:0.8948333333333334
Training epoch:23 loss:1.5314910504023234 accuracy:0.9305833333333333
Validation epoch:23 loss:1.5769612489537121 accuracy:0.892
Training epoch:24 loss:1.5296067363421122 accuracy:0.9325416666666667
Validation epoch:24 loss:1.5764847120499228 accuracy:0.8923333333333333
Training epoch:25 loss:1.525617691675822 accuracy:0.9364583333333333
Validation epoch:25 loss:1.5742754477230623 accuracy:0.8946666666666667
Training epoch:26 loss:1.5265405050913492 accuracy:0.9355
Validation epoch:26 loss:1.5751445478296535 accuracy:0.8948333333333334
Training epoch:27 loss:1.5221656374931336 accuracy:0.9403333333333334
Validation epoch:27 loss:1.5791923693794618 a

Training epoch:31 loss:1.5119996361732484 accuracy:0.9499583333333333
Validation epoch:31 loss:1.5741404314092136 accuracy:0.8955
Training epoch:32 loss:1.5124935364723207 accuracy:0.94925
Validation epoch:32 loss:1.5779353182583569 accuracy:0.8921666666666667
Training epoch:33 loss:1.5125999363263447 accuracy:0.948875
Validation epoch:33 loss:1.5762130991022854 accuracy:0.8938333333333334
Training epoch:34 loss:1.5130522292455038 accuracy:0.94875
Validation epoch:34 loss:1.5766667758717257 accuracy:0.8941666666666667
Training epoch:35 loss:1.50801078303655 accuracy:0.9535
Validation epoch:35 loss:1.574478503854517 accuracy:0.8961666666666667
Training epoch:36 loss:1.5098735901514688 accuracy:0.95125
Validation epoch:36 loss:1.5780876619930573 accuracy:0.8908333333333334
Training epoch:37 loss:1.5089330155054728 accuracy:0.952625
Validation epoch:37 loss:1.5844156410604875 accuracy:0.8846666666666667
Training epoch:38 loss:1.508041572411855 accuracy:0.9535
Validation epoch:38 loss:1.57

Training epoch:42 loss:1.4940524794260661 accuracy:0.968
Validation epoch:42 loss:1.5828788764974013 accuracy:0.885
Training epoch:43 loss:1.4926717487970989 accuracy:0.969625
Validation epoch:43 loss:1.5764006573886158 accuracy:0.8908333333333334
Training epoch:44 loss:1.4936343679428101 accuracy:0.9685833333333334
Validation epoch:44 loss:1.5796372036245419 accuracy:0.8913333333333333
Training epoch:45 loss:1.4934260501861572 accuracy:0.9683333333333334
Validation epoch:45 loss:1.5813994433153122 accuracy:0.887
Training epoch:46 loss:1.4906353815396627 accuracy:0.9714583333333333
Validation epoch:46 loss:1.5775639118357776 accuracy:0.8921666666666667
Training epoch:47 loss:1.4929668243726095 accuracy:0.9693333333333334
Validation epoch:47 loss:1.580699037103092 accuracy:0.8885
Training epoch:48 loss:1.4922637751897176 accuracy:0.9699166666666666
Validation epoch:48 loss:1.577054049241989 accuracy:0.8925
Training epoch:49 loss:1.4922667209307352 accuracy:0.969625
Validation epoch:49 l

In [15]:
# %load_ext tensorboard
%reload_ext tensorboard
%tensorboard --logdir runs --port=6008

In [19]:
########### evaluate on test data by soft voting
X_test, y_test = ensembleClf.load_test_data('data/test.csv')
ensembleClf.voting(batch_size=32, X_test=X_test, y_test=y_test)

model0 accuracy: 0.7526
model1 accuracy: 0.7303
model2 accuracy: 0.7807
model3 accuracy: 0.7837
model4 accuracy: 0.778
final accuracy:  0.7964
confusion matrix:
 [[ 918    0    5    5    1    5    7    0   12   10]
 [   0 1097   16    4   12    1    0    5    4    1]
 [  19    3  766   14   84    4    5   56   41    3]
 [  10    3   38  778   11   39   29   11   80    9]
 [   5    8   83    1  744   36   12   58   11   25]
 [  15    4    2   33   32  726   45   16   12   10]
 [   7    3    4   41   16   28  741   42    9  109]
 [   8   10   87   11   80    5   64  715    6   32]
 [   5    5   46   48   47   20   22    7  779    6]
 [  25    1   11   13   45   24  150   31   13  700]]
