In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict

In [2]:
# Metric helpers, currently disabled because nothing in this notebook uses them:
# from sklearn.metrics import classification_report, confusion_matrix
# Print the installed scikit-learn version so the environment of this run is
# recorded in the notebook output.
import sklearn
print(sklearn.__version__)

0.24.2


In [3]:
class Combination:
    """Combine the predictions of several ensemble members into one prediction.

    The members are described by parallel lists: ``file_list`` (prediction
    files), ``model_weights`` (one weight per member) and ``model_outputs``
    (the loaded predictions, one entry per member).

    Two kinds of member output are used by the methods below:

    * ``majority_voting_str`` builds a DataFrame from ``model_outputs`` and
      takes the row-wise mode, which fits the 1-D label arrays that
      ``read_preds`` / ``read_model_outputs`` load.
    * ``majority_voting`` and ``weighted_voting`` access ``.shape``, ``.iloc``,
      ``.index`` and ``.columns`` on the outputs, i.e. they need DataFrames,
      which must be supplied through the constructor or ``set_model_outputs``.
    """

    def __init__(self, num_model=0, file_list=None, model_weights=None, model_outputs=None):
        """Create a combiner.

        Args:
            num_model: Number of ensemble members.
            file_list: File names of the members' prediction files.
            model_weights: One weight per member.
            model_outputs: Already-loaded member outputs.

        Each list argument defaults to a *new* empty list per instance. The
        defaults must not be list literals in the signature: a default list
        is created once and shared by every instance, and
        ``read_model_outputs`` appends to ``self.model_outputs``, so outputs
        of earlier instances would leak into later ones.
        """
        # Number of members
        self.num_model = num_model
        # The list of the file names of the members
        self.file_list = [] if file_list is None else file_list
        # The list of dataframes of model outputs
        self.model_outputs = [] if model_outputs is None else model_outputs
        # The model weights
        self.model_weights = [] if model_weights is None else model_weights

    def get_config(self, config_name):
        """Load member file names and weights from a header-less CSV file.

        The file is read as two columns, ``model`` and ``weight``;
        ``file_list``, ``model_weights`` and ``num_model`` are replaced
        accordingly.
        """
        df_config = pd.read_csv(config_name, header=None, names=['model', 'weight'])
        self.file_list = df_config['model'].tolist()
        self.model_weights = df_config['weight'].tolist()
        self.num_model = len(self.file_list)

    def read_preds(self, file_name):
        """Return the ``prediction`` column of a tab-delimited file as an array.

        The file is read with its first row as header and its first column as
        index.
        """
        return pd.read_csv(file_name, header=0, index_col=0, delimiter='\t').prediction.values

    def read_model_outputs(self):
        """Read every file in ``file_list`` and append it to ``model_outputs``.

        Existing entries of ``model_outputs`` are kept, so calling this twice
        on the same instance loads every member twice.
        """
        for name in self.file_list:
            self.model_outputs.append(self.read_preds(name))

    def set_model_outputs(self, model_outputs):
        """Replace the list of member outputs."""
        self.model_outputs = model_outputs

    def set_model_weights(self, model_weights):
        """Replace the list of member weights."""
        self.model_weights = model_weights

    def set_num_model(self, num_model):
        """Replace the number of members."""
        self.num_model = num_model

    def set_file_list(self, file_list):
        """Replace the list of member prediction files."""
        self.file_list = file_list

    def get_file_list(self):
        """Return the list of member prediction files."""
        return self.file_list

    def get_model_outputs(self):
        """Return the list of member outputs."""
        return self.model_outputs

    def get_model_weights(self):
        """Return the list of member weights."""
        return self.model_weights

    def majority_voting_str(self):
        """Return, for every sample, the most frequent label across members.

        ``model_outputs`` is turned into a DataFrame with one row per member
        and transposed so that each row holds one sample's votes. When several
        labels tie, ``DataFrame.mode`` lists them in sorted order and the
        first one (column 0) is returned.

        Returns:
            A Series with one label per sample.
        """
        df_preds = pd.DataFrame(self.model_outputs).transpose()
        final = df_preds.mode(axis='columns')[0]
        return final

    def majority_voting(self):
        """Hard-vote over DataFrame outputs and return a one-hot DataFrame.

        Every member votes for the column position of its row-wise maximum;
        the column with the most votes gets a 1 in the result, all others 0.
        Ties are resolved by ``argmax``, i.e. the first such column wins.
        The result reuses the index and columns of the first member.
        """
        shape = self.model_outputs[0].shape
        votes = np.zeros(shape, dtype=int)
        final = np.zeros_like(votes)
        for i in range(shape[0]):
            for j in range(self.num_model):
                # Column position of member j's highest value for sample i.
                c = int(np.argmax(self.model_outputs[j].iloc[i]))
                votes[i][c] += 1
        # One-hot encode the winning column of every row.
        final[np.arange(len(votes)), votes.argmax(1)] = 1
        final = pd.DataFrame(data=final, index=self.model_outputs[0].index, columns=self.model_outputs[0].columns)
        return final

    def weighted_voting(self):
        """Like ``majority_voting``, but each vote counts ``model_weights[j]``.

        Returns a one-hot integer DataFrame with the index and columns of the
        first member.
        """
        shape = self.model_outputs[0].shape
        votes = np.zeros(shape, dtype=float)
        final = np.zeros_like(votes, dtype=int)
        for i in range(shape[0]):
            for j in range(self.num_model):
                # Column position of member j's highest value for sample i.
                c = int(np.argmax(self.model_outputs[j].iloc[i]))
                votes[i][c] += self.model_weights[j]
        # One-hot encode the column with the largest accumulated weight.
        final[np.arange(len(votes)), votes.argmax(1)] = 1
        final = pd.DataFrame(data=final, index=self.model_outputs[0].index, columns=self.model_outputs[0].columns)
        return final

    def averaging(self):
        """Return the element-wise mean of the first ``num_model`` outputs."""
        shape = self.model_outputs[0].shape
        final = np.zeros(shape, dtype=float)
        for i in range(self.num_model):
            final += self.model_outputs[i]
        final /= self.num_model
        return final

    def weighted_averaging(self):
        """Return the weighted sum of the outputs divided by ``num_model``.

        NOTE(review): the divisor is the number of members, not the sum of
        the weights, so this is a true weighted mean only when the weights
        sum to ``num_model``. The scaling does not change which entry is
        largest in a row; confirm whether callers rely on the magnitude.
        """
        shape = self.model_outputs[0].shape
        final = np.zeros(shape, dtype=float)
        for i in range(self.num_model):
            final += self.model_outputs[i] * self.model_weights[i]
        final /= self.num_model
        return final

In [None]:
# Get ensemble components from log files.
# For every run, <run>/ensemble/prune_log.csv is read without a header row
# using the column names in `names`; each row contributes one prediction file
# (located by its seed and checkpoint step) and one weight (its val_acc).
model_weights = []
file_list = []
names = ['log_idx', 'val_acc', 'checkpoint_step', 'checkpoint_epoch', 'seed']
rundir = "/home/lijing/My_Work/Projects/Thyme/thyme_model/output/EnsTraining/setting-13"
runs = [f"{rundir}/thyme_PubmedBERTbase-MimicBig-EntityBERT_lr.4e-5_cosine_with_restarts_epoch.15_seed.42",
        f"{rundir}/thyme_PubmedBERTbase-MimicBig-EntityBERT_lr.4e-5_cosine_with_restarts_epoch.15_seed.52",
        f"{rundir}/thyme_PubmedBERTbase-MimicBig-EntityBERT_lr.4e-5_cosine_with_restarts_epoch.15_seed.62",
        ]
# Use a dedicated loop variable so `rundir` keeps naming the setting directory
# after the loop instead of being rebound to the last run.
for run in runs:
    log = f"{run}/ensemble/prune_log.csv"
    df = pd.read_csv(log, header=None, names=names)
    seeds, steps, weights = df.seed.values, df.checkpoint_step.values, df.val_acc.values
    # Walk the three columns in lockstep rather than indexing by position.
    for seed, step, weight in zip(seeds, steps, weights):
        file_list.append(f"{run}/ensemble/predict_results_None_{seed}_{step}.txt")
        model_weights.append(weight)
        # model_weights.append(1) # majority voting with all weights=1
print(file_list)
num_model = len(file_list)

In [4]:
# List ensemble components from scratch
# The member directories differ only in their `split_r<N>` suffix, so the
# paths are generated from one template instead of being copy-pasted.
ens_dir = '/home/lijing/My_Work/Projects/Thyme/thyme_model/output/EnsTraining/setting-13'
split_ids = [42, 52, 62, 72, 82]
file_list = [
    f'{ens_dir}/thyme_PubmedBERTbase-MimicBig-EntityBERT_lr.4e-5_linear_epoch.3_seed.42_split_r{split_id}/predict_results_None.txt'
    for split_id in split_ids
]
num_model = len(file_list)
# Equal weights: every member counts the same.
model_weights = [1] * num_model


In [5]:
# Majority voting: reset every member's weight to 1.
model_weights = [1] * num_model
# Hand the constructor a fresh `model_outputs` list so the predictions read
# below never accumulate onto a list left over from an earlier instance when
# this cell is re-run.
combination = Combination(
    num_model=num_model,
    file_list=file_list,
    model_weights=model_weights,
    model_outputs=[],
)
combination.read_model_outputs()
# Series of per-sample majority labels, then a one-column frame for export.
majority_labels = combination.majority_voting_str()
final = majority_labels.to_frame('prediction')

In [6]:
# Write the ensemble prediction as a tab-separated file, keeping both the
# index and the header row.
final.to_csv(
    '/home/lijing/My_Work/Projects/Thyme/thyme_model/output/EnsTraining/setting-13/ensemble/majority_vote.csv',
    sep='\t',
    header=True,
    index=True,
)