In [1]:
import os
from glob import glob
import numpy as np
from sklearn.metrics import classification_report
import math


def softmax(x, T = 1):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp((x - np.max(x)) / T)

    return e_x / e_x.sum()

In [2]:
class fileScore:
    
    def __init__(self, file_name, option="common"):
        
        self.file_name = file_name
        self.option = option
        
        self.score = float('.'.join(file_name.split('-')[-1].split('.')[:-1]))
        self.filescore = self.make_score_list()

    def make_score_list(self):
        
        fp = open(self.file_name, 'r')
        line = fp.readline()
            
        index, score = line.split(',')
        
        if index.strip() != "index" or score.strip() != "score":
            print(f"format error {self.file_name}")
            return []
            
        filescore = []
        if self.option == "common":
            
            line = fp.readline()
            while line:
                sc = float(line.split(',')[-1])
                filescore.append(sc)
                line = fp.readline()
                
        elif self.option == "temperature":
            T = 3
            line = fp.readline()
            while line:
                sc_1 = float(line.split(',')[-1])
                sc_0 = 1 - sc_1
                new_sc_1 = sc_1 + 5
                new_sc_0 = sc_0 + 5
                [new_sc_0, new_sc_1] = softmax([sc_0, sc_1], T)
                # print(sc_1, new_sc_1)
                
                filescore.append(new_sc_1)
                line = fp.readline()
            
                
        
        return filescore
    
    def get_train_acc(self):
        return self.score
    
    def get_score_list(self):
        return self.filescore
    
    def get_score_list_idx(self, idx):
        return self.filescore[idx]
    
            
    

In [3]:
def get_file_list(option = "common"):
    file_list = glob("*.txt")
    
    file_class = []
    for i in file_list:
        fc = fileScore(i, option)
        file_class.append(fc)
    
    return file_class
       
    

In [4]:
def get_ground_true_list():
    fp = open("../ML_storage/test_binary_y.csv", "r")
    line = fp.readline()
    if line.strip() != "LiveBirth":
        print(line)
        print("format error")
    
    ground_true = []
    line = fp.readline()
    
    while line:
        ground_true.append(int(line))
        line = fp.readline()
    return ground_true

In [None]:
## ---------------------------- I am a line ------------------------------------- ##

In [25]:
from matplotlib import pyplot as plt
import matplotlib
import bob.measure
from matplotlib import image as mpimg

def generate_det_curve(p_scores, n_scores):
        # matplotlib.use('TkAgg')

        plt.switch_backend('agg')
        bob.measure.plot.det(n_scores, p_scores, 1000, color = (0,0,0), linestyle = '-')
        bob.measure.plot.det_axis([0.01, 99, 0.01, 99])
        threshold = bob.measure.eer_threshold(n_scores, p_scores)
        far, frr = bob.measure.farfrr(n_scores, p_scores, threshold)
        
        x = range(99)
        ax = plt.gca()
        #ax.plot(x)
        ax.set_aspect('equal', adjustable='box')
        plt.plot([100, -10], [100, -10], linestyle='--', label=f"Equal error rate = {max(far, frr)* 100}%")
        print("##########")
        print(max(far, frr) * 100)
        print("##########")
        plt.xlabel('FAR (%)')
        plt.ylabel('FRR (%)')
        plt.xticks(rotation=45)
        plt.grid(True)
        plt.legend(loc="lower right")
        
        plt.savefig('det_tmp.png')
        plt.cla()
        plt.clf()
        
        return max(far, frr) * 100

In [26]:
# one original result
file_class = get_file_list("common")
ground_true_list = get_ground_true_list()
mix_prediction = []
for idx in range(len(ground_true_list)):
    mix_prediction.append(c.get_score_list_idx(idx))
    
hard_prediction = []
acc = 0
for i in mix_prediction:
    if i >= 0.5:
        hard_prediction.append(1)
    else:
        hard_prediction.append(0)

for idx in range(len(hard_prediction)):
    if ground_true_list[idx] == hard_prediction[idx]:
        acc += 1

print(classification_report(ground_true_list, hard_prediction))
print(acc / len(ground_true_list))

ground_true_list = np.array(ground_true_list)
hard_prediction = np.array(hard_prediction)
mix_prediction = np.array(mix_prediction)

p_scores, n_scores = mix_prediction[np.where(ground_true_list == 1)].astype(np.double), mix_prediction[np.where(ground_true_list == 0)[0]].astype(np.double)

generate_det_curve(p_scores, n_scores)
image = mpimg.imread("det_tmp.png")
plt.imshow(image)
plt.show()

              precision    recall  f1-score   support

           0       0.99      0.96      0.98     27432
           1       0.77      0.92      0.84      3773

    accuracy                           0.96     31205
   macro avg       0.88      0.94      0.91     31205
weighted avg       0.96      0.96      0.96     31205

0.9575709020990226
##########
4.002120328650941
##########


In [19]:
# simple model ensemble

file_class = get_file_list("common")
ground_true_list = get_ground_true_list()

mix_prediction = []

for idx in range(len(ground_true_list)):
    pred = []
    # print(idx)
    for c in file_class:
        pred.append(c.get_score_list_idx(idx))
    
    mix_prediction.append(np.mean(pred))

hard_prediction = []
acc = 0
for i in mix_prediction:
    if i >= 0.5:
        hard_prediction.append(1)
    else:
        hard_prediction.append(0)

for idx in range(len(hard_prediction)):
    if ground_true_list[idx] == hard_prediction[idx]:
        acc += 1

print(classification_report(ground_true_list, hard_prediction))
print(acc / len(ground_true_list))

ground_true_list = np.array(ground_true_list)
hard_prediction = np.array(hard_prediction)
mix_prediction = np.array(mix_prediction)

p_scores, n_scores = mix_prediction[np.where(ground_true_list == 1)].astype(np.double), mix_prediction[np.where(ground_true_list == 0)[0]].astype(np.double)

generate_det_curve(p_scores, n_scores)
image = mpimg.imread("det_tmp.png")
plt.imshow(image)
plt.show()

              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27432
           1       0.76      1.00      0.86      3773

    accuracy                           0.96     31205
   macro avg       0.88      0.98      0.92     31205
weighted avg       0.97      0.96      0.96     31205

0.9622496394808524
0.04024496937882765 0.04028624436787702
##########
4.028624436787702
##########


  plt.show()


In [22]:
# temperature model ensemble

file_class = get_file_list("temperature")
ground_true_list = get_ground_true_list()


mix_prediction = []

for idx in range(len(ground_true_list)):
    pred = []
    # print(idx)
    for c in file_class:
        pred.append(c.get_score_list_idx(idx))
    
    mix_prediction.append(np.mean(pred))
    # print(pred)


hard_prediction = []
acc = 0
for i in mix_prediction:
    if i >= 0.5:
        hard_prediction.append(1)
    else:
        hard_prediction.append(0)

for idx in range(len(hard_prediction)):
    if ground_true_list[idx] == hard_prediction[idx]:
        acc += 1
    else:
        pass
        # print(mix_prediction[idx], ground_true_list[idx])

print(classification_report(ground_true_list, hard_prediction))
print(acc / len(ground_true_list))

# simple model ensemble

file_class = get_file_list("common")
ground_true_list = get_ground_true_list()

mix_prediction = []

for idx in range(len(ground_true_list)):
    pred = []
    # print(idx)
    for c in file_class:
        pred.append(c.get_score_list_idx(idx))
    
    mix_prediction.append(np.mean(pred))

hard_prediction = []
acc = 0
for i in mix_prediction:
    if i >= 0.5:
        hard_prediction.append(1)
    else:
        hard_prediction.append(0)

for idx in range(len(hard_prediction)):
    if ground_true_list[idx] == hard_prediction[idx]:
        acc += 1

print(classification_report(ground_true_list, hard_prediction))
print(acc / len(ground_true_list))

ground_true_list = np.array(ground_true_list)
hard_prediction = np.array(hard_prediction)
mix_prediction = np.array(mix_prediction)

p_scores, n_scores = mix_prediction[np.where(ground_true_list == 1)].astype(np.double), mix_prediction[np.where(ground_true_list == 0)[0]].astype(np.double)

generate_det_curve(p_scores, n_scores)
image = mpimg.imread("det_tmp.png")
plt.imshow(image)
plt.show()

              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27432
           1       0.76      1.00      0.86      3773

    accuracy                           0.96     31205
   macro avg       0.88      0.98      0.92     31205
weighted avg       0.97      0.96      0.96     31205

0.9622496394808524
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27432
           1       0.76      1.00      0.86      3773

    accuracy                           0.96     31205
   macro avg       0.88      0.98      0.92     31205
weighted avg       0.97      0.96      0.96     31205

0.9622496394808524
0.04024496937882765 0.04028624436787702
##########
4.028624436787702
##########


  plt.show()


In [23]:
# Linear model ensemble
file_class = get_file_list("common")
ground_true_list = get_ground_true_list()

s = 0
for c in file_class:
    s += c.get_train_acc()

mix_prediction = []

for idx in range(len(ground_true_list)):
    mix = 0
    # print(idx)
    for c in file_class:
        mix += (c.get_train_acc()/s) * c.get_score_list_idx(idx) 
    
    mix_prediction.append(mix)
    # print(pred)


hard_prediction = []
acc = 0
for i in mix_prediction:
    if i >= 0.5:
        hard_prediction.append(1)
    else:
        hard_prediction.append(0)

for idx in range(len(hard_prediction)):
    if ground_true_list[idx] == hard_prediction[idx]:
        acc += 1
    else:
        pass
        # print(mix_prediction[idx], ground_true_list[idx])

print(classification_report(ground_true_list, hard_prediction))
print(acc / len(ground_true_list))    

# simple model ensemble

file_class = get_file_list("common")
ground_true_list = get_ground_true_list()

mix_prediction = []

for idx in range(len(ground_true_list)):
    pred = []
    # print(idx)
    for c in file_class:
        pred.append(c.get_score_list_idx(idx))
    
    mix_prediction.append(np.mean(pred))

hard_prediction = []
acc = 0
for i in mix_prediction:
    if i >= 0.5:
        hard_prediction.append(1)
    else:
        hard_prediction.append(0)

for idx in range(len(hard_prediction)):
    if ground_true_list[idx] == hard_prediction[idx]:
        acc += 1

print(classification_report(ground_true_list, hard_prediction))
print(acc / len(ground_true_list))

ground_true_list = np.array(ground_true_list)
hard_prediction = np.array(hard_prediction)
mix_prediction = np.array(mix_prediction)

p_scores, n_scores = mix_prediction[np.where(ground_true_list == 1)].astype(np.double), mix_prediction[np.where(ground_true_list == 0)[0]].astype(np.double)

generate_det_curve(p_scores, n_scores)
image = mpimg.imread("det_tmp.png")
plt.imshow(image)
plt.show()

              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27432
           1       0.76      1.00      0.86      3773

    accuracy                           0.96     31205
   macro avg       0.88      0.98      0.92     31205
weighted avg       0.97      0.96      0.96     31205

0.9622496394808524
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27432
           1       0.76      1.00      0.86      3773

    accuracy                           0.96     31205
   macro avg       0.88      0.98      0.92     31205
weighted avg       0.97      0.96      0.96     31205

0.9622496394808524
0.04024496937882765 0.04028624436787702
##########
4.028624436787702
##########


  plt.show()


In [24]:
# Log model ensemble
file_class = get_file_list("temperature")
ground_true_list = get_ground_true_list()

s = 0
for c in file_class:
    s += math.log(c.get_train_acc(), 2)

mix_prediction = []

for idx in range(len(ground_true_list)):
    mix = 0
    # print(idx)
    for c in file_class:
        mix += (math.log(c.get_train_acc(), 2)/s) * c.get_score_list_idx(idx) 
    
    mix_prediction.append(mix)
    # print(pred)


hard_prediction = []
acc = 0
for i in mix_prediction:
    if i >= 0.5:
        hard_prediction.append(1)
    else:
        hard_prediction.append(0)

for idx in range(len(hard_prediction)):
    if ground_true_list[idx] == hard_prediction[idx]:
        acc += 1
    else:
        pass
        # print(mix_prediction[idx], ground_true_list[idx])

print(classification_report(ground_true_list, hard_prediction))
print(acc / len(ground_true_list))

# simple model ensemble

file_class = get_file_list("common")
ground_true_list = get_ground_true_list()

mix_prediction = []

for idx in range(len(ground_true_list)):
    pred = []
    # print(idx)
    for c in file_class:
        pred.append(c.get_score_list_idx(idx))
    
    mix_prediction.append(np.mean(pred))

hard_prediction = []
acc = 0
for i in mix_prediction:
    if i >= 0.5:
        hard_prediction.append(1)
    else:
        hard_prediction.append(0)

for idx in range(len(hard_prediction)):
    if ground_true_list[idx] == hard_prediction[idx]:
        acc += 1

print(classification_report(ground_true_list, hard_prediction))
print(acc / len(ground_true_list))

ground_true_list = np.array(ground_true_list)
hard_prediction = np.array(hard_prediction)
mix_prediction = np.array(mix_prediction)

p_scores, n_scores = mix_prediction[np.where(ground_true_list == 1)].astype(np.double), mix_prediction[np.where(ground_true_list == 0)[0]].astype(np.double)

generate_det_curve(p_scores, n_scores)
image = mpimg.imread("det_tmp.png")
plt.imshow(image)
plt.show()

              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27432
           1       0.76      0.99      0.86      3773

    accuracy                           0.96     31205
   macro avg       0.88      0.97      0.92     31205
weighted avg       0.97      0.96      0.96     31205

0.9619932703092453
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     27432
           1       0.76      1.00      0.86      3773

    accuracy                           0.96     31205
   macro avg       0.88      0.98      0.92     31205
weighted avg       0.97      0.96      0.96     31205

0.9622496394808524
0.04024496937882765 0.04028624436787702
##########
4.028624436787702
##########


  plt.show()
