In [84]:
import pandas as pd
import json
import collections
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib
from functools import reduce

In [4]:
def accuracy(truth_json, predicted_path):
    """Print how many predicted labels match the ground-truth labels.

    Rows are compared by position: the i-th record of ``truth_json`` is
    checked against the i-th row of the predictions file.

    Args:
        truth_json: Path to a JSON-lines file; every record has a ``label`` key.
        predicted_path: Path to a CSV file with a ``label`` column.

    Raises:
        ValueError: If the predictions file has fewer rows than the truth file.

    Prints the number of matches followed by the number of truth records,
    then the distribution of ground-truth labels. Returns ``None``.
    """
    predicted_labels = pd.read_csv(predicted_path)['label'].tolist()

    with open(truth_json, 'r') as json_file:
        # Blank lines (e.g. a trailing empty line) are not valid JSON; skip them
        # instead of letting json.loads fail on them.
        truth_labels = [json.loads(line)['label'] for line in json_file if line.strip()]

    if len(predicted_labels) < len(truth_labels):
        raise ValueError(
            "predictions file has %d rows but the truth file has %d records"
            % (len(predicted_labels), len(truth_labels))
        )

    # Counter keeps first-seen order, so the printed distribution is stable.
    truth_dict = collections.Counter(truth_labels)
    correct = sum(
        1 for truth, guess in zip(truth_labels, predicted_labels) if truth == guess
    )

    print(correct, len(truth_labels))
    print(truth_dict.items())


In [70]:
def json_to_csv(truth_json):
    """Load a JSON-lines annotation file into a DataFrame.

    Args:
        truth_json: Path to a JSON-lines file whose records carry the keys
            ``label``, ``id`` and ``text``.

    Returns:
        A DataFrame with the columns ``label``, ``id`` and ``text`` (in that
        order), one row per record. Despite the function name nothing is
        written to disk; the caller decides where to save the frame.
    """
    # A plain dict (not named ``dict``) so the builtin is not shadowed and the
    # column set is fixed even when the file contains no records.
    columns = {'label': [], 'id': [], 'text': []}

    with open(truth_json, 'r') as json_file:
        for line in json_file:
            # Blank lines (e.g. a trailing empty line) are not valid JSON.
            if not line.strip():
                continue
            record = json.loads(line)
            for key in columns:
                columns[key].append(record[key])

    return pd.DataFrame(data=columns)

def log_to_proba(file, res_file):
    """Exponentiate the ``proba`` column of a CSV file and save the result.

    Args:
        file: Path of the input CSV; it must contain a ``proba`` column.
        res_file: Path the converted CSV is written to (without the index).

    Returns:
        The converted DataFrame.

    Note:
        The conversion is not idempotent: running it again on an already
        converted file exponentiates the values a second time.
    """
    df = pd.read_csv(file)
    # np.exp is a ufunc, so apply it to the whole column at once instead of
    # calling a Python lambda per row.
    df['proba'] = np.exp(df['proba'])

    df.to_csv(res_file, index=False)
    return df
        
        
    

In [80]:
#log_to_proba("./vilio/Ensemble_files/U36_test_seen.csv", "./vilio/Ensemble_files/U36_test_seen_converted.csv")
#log_to_proba("./vilio/data/O36/O36_test_seen_SA.csv", "./vilio/Ensemble_files/U36_test_seen_converted.csv")

In [61]:
# Export the ground-truth annotations (label, id, text) to CSV.
test_ground_truth = json_to_csv('./vilio/data/test_unseen.jsonl')
# index=False keeps the positional index out of the file (otherwise it comes
# back as an extra unnamed column on read), matching how log_to_proba writes
# its CSV output.
test_ground_truth.to_csv('./vilio/Ensemble_files/test.csv', index=False)

In [5]:
# Score the stored O36 predictions against the ground-truth labels.
# NOTE(review): the truth file is named "test_unseen" while the predictions
# file is named "test_seen" — confirm both refer to the same split, since the
# comparison is done row by row.
accuracy(
    truth_json="./vilio/data/test_unseen.jsonl",
    predicted_path="./vilio/data/O36/O36_test_seen_SA.csv",
)

589 1000
dict_items([(0, 753), (1, 247)])


In [196]:
class Ensemble:
    """Combine per-model prediction files and score simple ensembling strategies.

    Every prediction CSV must provide an ``id`` column (the merge key) and a
    ``proba`` column; the ground-truth CSV must provide a ``label`` column.
    All scoring is positional: row ``i`` of the combined predictions is
    compared with row ``i`` of the ground truth.

    Args:
        ground_truth: Path to the ground-truth CSV.
        uniter: Path to the UNITER predictions CSV (columns get suffix ``_u``).
        oscar: Path to the OSCAR predictions CSV (suffix ``_x``).
        visual_bert: Optional path to the VisualBERT predictions CSV (``_y``).
        vilbert: Optional path to the ViLBERT predictions CSV (``_z``).
    """

    # Probability columns that ``combine`` can produce, in model order.
    _PROBA_COLUMNS = ('proba_u', 'proba_x', 'proba_y', 'proba_z')
    # Largest allowed sum of the integer weights tried by ``weighted_average``.
    _WEIGHT_BUDGET = 9
    # Scores at or above this value are classified as label 1.
    _THRESHOLD = 0.5

    def __init__(self, ground_truth, uniter, oscar, visual_bert=None, vilbert=None):
        self.ground_truth = pd.read_csv(ground_truth)
        self.uniter = pd.read_csv(uniter)
        self.oscar = pd.read_csv(oscar)
        # The last two models are optional; only read the files that were given.
        self.visual_bert = None if visual_bert is None else pd.read_csv(visual_bert)
        self.vilbert = None if vilbert is None else pd.read_csv(vilbert)

        self.combined = self.combine()

    def _proba_columns(self):
        """Return the probability columns actually present in ``self.combined``."""
        return [col for col in self._PROBA_COLUMNS if col in self.combined.columns]

    def _count_correct(self, predicted_labels):
        """Return ``(matches, total)`` for labels compared positionally with the truth."""
        predicted = np.asarray(predicted_labels)
        truth = self.ground_truth['label'].to_numpy()
        total = len(predicted)
        if total > len(truth):
            raise ValueError(
                "got %d predictions but only %d ground-truth rows" % (total, len(truth))
            )
        return int(np.sum(predicted == truth[:total])), total

    @classmethod
    def _weight_grid(cls, n_models, budget):
        """Yield every list of ``n_models`` positive integer weights with sum <= ``budget``."""
        if n_models == 0:
            yield []
            return
        for weight in range(1, budget + 1):
            for rest in cls._weight_grid(n_models - 1, budget - weight):
                yield [weight] + rest

    def compare(self, predict, array=False):
        """Return the fraction of rows whose ``result`` equals the ground-truth label.

        Args:
            predict: DataFrame with a ``result`` column of predicted labels.
            array: Unused; kept so existing calls keep working.

        Returns:
            Accuracy as a float in ``[0, 1]`` (``0.0`` for an empty frame).
        """
        correct, total = self._count_correct(predict['result'].to_numpy())
        return correct / total if total else 0.0

    def combine(self):
        """Merge all model predictions on ``id`` into one wide DataFrame.

        UNITER and OSCAR are inner-joined; the optional models are then
        left-joined, so ids they do not cover end up with NaN in their columns.
        """
        combined = pd.merge(self.uniter, self.oscar, on='id', suffixes=('_u', '_x'))

        if self.visual_bert is not None and self.vilbert is not None:
            extra = pd.merge(self.visual_bert, self.vilbert, on='id', suffixes=('_y', '_z'))
        elif self.visual_bert is not None or self.vilbert is not None:
            # Only one optional model: pd.merge would not suffix its columns,
            # so rename them explicitly to keep the proba_y / proba_z scheme.
            frame, suffix = (
                (self.visual_bert, '_y') if self.visual_bert is not None else (self.vilbert, '_z')
            )
            extra = frame.rename(
                columns={col: col + suffix for col in frame.columns if col != 'id'}
            )
        else:
            return combined

        return pd.merge(combined, extra, how='left', on='id')

    def simple_average(self):
        """Add the unweighted mean probability and its thresholded label.

        The columns ``avg`` and ``result`` are written to ``self.combined``
        in place. Missing probabilities (NaN) are ignored by the mean.

        Returns:
            ``self.combined`` with the two new columns.
        """
        combined = self.combined
        combined['avg'] = combined[self._proba_columns()].mean(axis=1)
        combined['result'] = (combined['avg'] >= self._THRESHOLD).astype(int)
        return combined

    def weighted_average(self):
        """Grid-search positive integer model weights for the best accuracy.

        Every weight is at least 1 and the weights sum to at most 9. For each
        configuration the weighted mean of the available probabilities is
        thresholded at 0.5 and scored against the ground truth. Missing
        probabilities (NaN) are left out of a row's weighted mean, mirroring
        ``simple_average``. ``self.combined`` is not modified.

        Returns:
            ``(max_acc, best_config)`` where ``best_config`` lists one weight
            per available probability column, in column order.
        """
        columns = self._proba_columns()
        max_acc = 0
        best_config = [0] * len(columns)
        if not columns:
            return max_acc, best_config

        probs = self.combined[columns].to_numpy(dtype=float)
        present = (~np.isnan(probs)).astype(float)
        filled = np.where(np.isnan(probs), 0.0, probs)

        for weights in self._weight_grid(len(columns), self._WEIGHT_BUDGET):
            weight_vec = np.asarray(weights, dtype=float)
            # Rows without any probability give 0/0 = NaN, which compares
            # False against the threshold and is therefore labelled 0.
            with np.errstate(divide='ignore', invalid='ignore'):
                avg = (filled @ weight_vec) / (present @ weight_vec)
                labels = (avg >= self._THRESHOLD).astype(int)

            correct, total = self._count_correct(labels)
            acc = correct / total if total else 0.0
            # Strict comparison keeps the first configuration that reaches the best score.
            if acc > max_acc:
                max_acc = acc
                best_config = list(weights)

        return max_acc, best_config

    def random_forest_train(self, train_data, model_path="./random_forest.joblib"):
        """Fit a random-forest regressor on ``proba`` -> ``label`` and save it.

        Args:
            train_data: DataFrame with ``proba`` and ``label`` columns.
            model_path: Where the fitted model is dumped with joblib.

        NOTE(review): ``random_forest_predict`` feeds the ensemble average into
        this model, so ``proba`` should be on the same scale as that average.
        """
        features = train_data['proba'].to_numpy().reshape(-1, 1)
        targets = train_data['label'].to_numpy()

        rf = RandomForestRegressor(max_depth=2, random_state=0)
        rf.fit(features, targets)
        joblib.dump(rf, model_path)
        print("completed")

    def random_forest_predict(self, model):
        """Score a saved random-forest model on the simple-average probabilities.

        Args:
            model: Path to a model file written by ``random_forest_train``.

        Prints the number of correct predictions, the total and the accuracy.
        """
        # joblib.load unpickles the file: only load models from trusted sources.
        loaded_rf = joblib.load(model)
        combined = self.simple_average()
        features = combined['avg'].to_numpy().reshape(-1, 1)
        scores = loaded_rf.predict(features)

        # The regressor returns continuous scores; threshold them before the
        # comparison, since a float score almost never equals an integer label.
        labels = (scores >= self._THRESHOLD).astype(int)
        correct, total = self._count_correct(labels)
        print(correct, '/', total, "Accuracy:", correct / total if total else 0.0)
        
            
            
        
        
        

In [197]:
# Keyword arguments make the model/file pairing explicit. Passed positionally,
# the ViLBERT file landed in the ``visual_bert`` slot and the VisualBERT file in
# the ``vilbert`` slot, because ``visual_bert`` comes first in the signature.
ensemble = Ensemble(
    ground_truth="./uniter-oscar/vilio/Ensemble_files/test.csv",
    uniter="./uniter-oscar/vilio/Ensemble_files/U36_test_seen.csv",
    oscar="./uniter-oscar/vilio/data/O36/O36_test_seen_SA.csv",
    visual_bert="./visual_bert/output/test.csv",
    vilbert="./vilbert/output/test.csv",
)


In [198]:
# Average the models' probability columns row by row and threshold the mean at
# 0.5; the returned frame (with the new `avg` and `result` columns) is the
# cell's displayed output.
ensemble.simple_average()


Unnamed: 0,id,proba_u,label_u,proba_x,label_x,proba_y,label_y,proba_z,label_z,avg,result
0,16395,0.999917,1,0.969714,1,,,,,0.984816,1
1,37405,0.007725,0,0.031993,1,,,,,0.019859,0
2,94180,0.014005,0,0.030658,0,,,,,0.022332,0
3,54321,0.528871,1,0.339809,0,,,,,0.434340,0
4,97015,0.999642,1,0.995357,1,,,,,0.997499,1
...,...,...,...,...,...,...,...,...,...,...,...
995,3869,0.000160,0,0.000435,0,,,,,0.000297,0
996,23817,0.000503,0,0.002383,0,,,,,0.001443,0
997,56280,0.996595,1,0.186905,0,,,,,0.591750,1
998,29384,0.000172,0,0.000658,0,,,,,0.000415,0


In [199]:
# Fit the random-forest regressor on the `proba` -> `label` columns of the U36
# training predictions; the fitted model is saved to ./random_forest.joblib.
# NOTE(review): random_forest_predict feeds the ensemble `avg` column into this
# model, so `proba` in this file should be on the same scale (probabilities,
# not log-probabilities) — confirm.
train_data = pd.read_csv("./uniter-oscar/vilio/Ensemble_files/U36_train.csv")
ensemble.random_forest_train(train_data)

completed


In [200]:
# Evaluate the saved random-forest model on the ensemble's averaged
# probabilities; the method prints the match count and the accuracy.
ensemble.random_forest_predict("./random_forest.joblib")

247 / 1000 Accuracy: 0.247


In [None]:
# Grid-search the integer model weights; returns (best accuracy, best weights).
ensemble.weighted_average()

In [9]:
# Load the raw U36 test predictions for inspection.
# NOTE(review): the `proba` values shown below are all negative, which looks
# like log-probabilities (what log_to_proba exponentiates) — confirm before
# feeding this file to the ensemble.
data = pd.read_csv("./vilio/Ensemble_files/U36_test_seen.csv")

        id     proba  label
0    16395 -0.000083      1
1    37405 -4.863254      0
2    94180 -4.268325      0
3    54321 -0.637010      1
4    97015 -0.000358      1
..     ...       ...    ...
995   3869 -8.740327      0
996  23817 -7.594582      0
997  56280 -0.003411      1
998  29384 -8.667371      0
999  34127 -0.130345      1

[1000 rows x 3 columns]
