In [208]:
import pandas as pd
import json
import collections
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib
from functools import reduce

In [2]:
def accuracy(truth_json, predicted_path):
    
    predicted = pd.read_csv(predicted_path)
    correct = 0
    truth_dict = collections.Counter()
    
    with open(truth_json, 'r') as json_file:
        json_list = list(json_file)

    for i, json_str in enumerate(json_list):
        result = json.loads(json_str)
        truth_dict[result['label']] += 1
        if result['label'] == predicted['label'][i]:
            correct += 1
    print(correct, len(json_list))
    print(truth_dict.items())

    return


In [214]:
def json_to_csv(truth_json):
    dict = collections.defaultdict(list)
    with open(truth_json, 'r') as json_file:
        json_list = list(json_file)
    for i, json_str in enumerate(json_list):
        result = json.loads(json_str)
        dict['label'].append(result['label'])
        dict['id'].append(result['id'])
        dict['text'].append(result['text'])
    df = pd.DataFrame(data=dict)
    return df

def log_to_proba(file, res_file):
    df = pd.read_csv(file)
    df['proba'] = df['proba'].apply(lambda x: np.exp(x))
    
    df.to_csv(res_file, index=False)
    return df
        
        
    

In [227]:
log_to_proba("./uniter-oscar/vilio/Ensemble_files/U36_test_unseen.csv", "./uniter-oscar/vilio/Ensemble_files/U36_test_unseen_converted.csv")
log_to_proba("./uniter-oscar/vilio/data/O36/O36_test_unseen_SA.csv", "./uniter-oscar/vilio/Ensemble_files/O36_test_unseen_converted.csv")

Unnamed: 0,id,proba,label
0,42953,0.026402,0
1,23058,0.002381,0
2,13894,0.003865,0
3,37408,0.007473,0
4,82403,0.238715,0
...,...,...,...
995,23705,0.012218,0
996,49806,0.009403,0
997,40813,0.009714,0
998,1468,0.125314,0


In [61]:
test_ground_truth = json_to_csv('./vilio/data/test_unseen.jsonl')
test_ground_truth.to_csv('./vilio/Ensemble_files/test.csv')

In [210]:
accuracy("/home/jupyter/.cache/torch/mmf/data/datasets/hateful_memes/defaults/annotations/test.jsonl", "./vilbert/output/test.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/home/jupyter/.cache/torch/mmf/data/datasets/hateful_memes/defaults/annotations/test.jsonl'

In [282]:
class Ensemble:
    def __init__(self, ground_truth, uniter, oscar, visual_bert=None, vilbert=None):
        self.ground_truth = pd.read_csv(ground_truth)
        self.uniter = pd.read_csv(uniter)
        self.oscar = pd.read_csv(oscar)
        self.visual_bert = self.mmf_read_csv(visual_bert)
        self.vilbert = self.mmf_read_csv(vilbert)
        
        
        self.combined = self.combine()
        
    def mmf_read_csv(self, path):
        
        df = pd.read_csv(path)
        df['proba'] = df['scores'].apply(lambda x: float(x[1:-1].split(", ")[1]))
        df = df.drop(columns=['scores'])
        df['label'] = df['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        return df
        
    
    def compare(self, predict, array=False):
        correct = 0
        all = len(predict)
        for i in range(all):
            if predict['result'][i] == self.ground_truth['label'][i]:
                correct += 1
        #print(correct, '/', all, "Accuracy:", correct/all)
        return correct/all
    
    def combine(self):
        combine_a = pd.merge(self.uniter, self.oscar, on='id', suffixes=('_u', '_x'), left_index=False, right_index=False)
        combine_b = pd.merge(self.visual_bert, self.vilbert, on='id', suffixes=('_y', '_z'), left_index=False, right_index=False)
        combined = pd.merge(combine_a, combine_b, how='left', on='id')
        return combined
        
    
    def simple_average(self):
        cols = ['proba_u','proba_x','proba_y','proba_z']
        
        combined = self.combined
        
        combined['avg'] = combined[cols].mean(axis=1)
        combined['result'] = combined['avg'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(combined)
        print("Simple Average Accuracy:", acc)
        return combined
    
    def single_model_acc(self):
        self.uniter['result'] = self.uniter['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(self.uniter)
        print("Uniter Accuracy :", acc)
        
        self.oscar['result'] = self.oscar['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(self.oscar)
        print("Oscar Accuracy :", acc)
        
        self.visual_bert['result'] = self.visual_bert['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(self.visual_bert)
        print("Visual BERT Accuracy :", acc)
        
        self.vilbert['result'] = self.vilbert['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(self.vilbert)
        print("VilBERT Accuracy :", acc)
        
    
    def weighted_average(self):
        combined = self.combined
        max_acc = 0
        best_config = [0, 0, 0, 0]
        
        for i in range(1, 10):
            for j in range(1, 10):
                for k in range(1, 10):
                    for l in range(1, 10):
                        combined['avg'] = (combined['proba_u']*i + combined['proba_x']*j + combined['proba_y']*k + combined['proba_z']*l)/(i+j+k+l)
                        combined['result'] = combined['avg'].apply(lambda x: 1 if x >= 0.5 else 0)
                        acc = self.compare(combined)
                        if acc > max_acc:
                            max_acc = acc
                            best_config = [i, j, k, l]
        
        return max_acc, best_config
    
    def random_forest_train(self, train_data):
        X = train_data['proba'].to_numpy()
        X = [[i] for i in X]
        y = train_data['label'].to_numpy()

        rf = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=10)
        rf.fit(X, y)
        joblib.dump(rf, "./random_forest.joblib")
        print("completed")
    
    def random_forest_predict(self, model):
        loaded_rf = joblib.load(model)
        combined = self.simple_average()
        X = combined['avg'].to_numpy()
        X = X.reshape(-1, 1)
        result = loaded_rf.predict(X)
        
        correct = 0
        all = len(result)
        for i in range(all):
            if result[i] == self.ground_truth['label'][i]:
                correct += 1
        print(correct, '/', all, "Accuracy:", correct/all)
        
        
        

In [283]:
ensemble = Ensemble("./uniter-oscar/vilio/Ensemble_files/test.csv",
                    "./uniter-oscar/vilio/Ensemble_files/U36_test_unseen_converted.csv",
                    "./uniter-oscar/vilio/Ensemble_files/O36_test_unseen_converted.csv", 
                    "./vilbert/output/test.csv",
                    "./visual_bert/output/test.csv")


In [284]:
ensemble.single_model_acc()

Uniter Accuracy : 0.909
Oscar Accuracy : 0.902
Visual BERT Accuracy : 0.604
VilBERT Accuracy : 0.584


In [267]:
ensemble.simple_average()


Simple Average Accuracy: 0.853


Unnamed: 0,id,proba_u,label_u,proba_x,label_x,proba_y,label_y,proba_z,label_z,avg,result
0,42953,0.000067,0,0.026402,0,0.004799,0.0,0.003127,0.0,0.008599,0
1,23058,0.000073,0,0.002381,0,0.000394,0.0,0.002611,0.0,0.001365,0
2,13894,0.000044,0,0.003865,0,0.004470,0.0,0.002874,0.0,0.002814,0
3,37408,0.000052,0,0.007473,0,0.000793,0.0,0.003317,0.0,0.002909,0
4,82403,0.000048,0,0.238715,0,0.329488,0.0,0.772131,1.0,0.335096,0
...,...,...,...,...,...,...,...,...,...,...,...
995,23705,0.000113,0,0.012218,0,,,,,0.006165,0
996,49806,0.000049,0,0.009403,0,,,,,0.004726,0
997,40813,0.000048,0,0.009714,0,,,,,0.004881,0
998,1468,0.000047,0,0.125314,0,,,,,0.062681,0


In [268]:
train_data = pd.read_csv("./uniter-oscar/vilio/Ensemble_files/U36_train.csv")
ensemble.random_forest_train(train_data)

completed


In [269]:
ensemble.random_forest_predict("./random_forest.joblib")

Simple Average Accuracy: 0.853
247 / 1000 Accuracy: 0.247


In [270]:
ensemble.weighted_average()

(0.921, [4, 8, 1, 1])

In [9]:
data = pd.read_csv("./vilio/Ensemble_files/U36_test_seen.csv")

        id     proba  label
0    16395 -0.000083      1
1    37405 -4.863254      0
2    94180 -4.268325      0
3    54321 -0.637010      1
4    97015 -0.000358      1
..     ...       ...    ...
995   3869 -8.740327      0
996  23817 -7.594582      0
997  56280 -0.003411      1
998  29384 -8.667371      0
999  34127 -0.130345      1

[1000 rows x 3 columns]
