In [1]:
import pandas as pd
import json
import collections
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import joblib
from functools import reduce

In [2]:
def accuracy(truth_json, predicted_path):
    
    predicted = pd.read_csv(predicted_path)
    correct = 0
    truth_dict = collections.Counter()
    
    with open(truth_json, 'r') as json_file:
        json_list = list(json_file)

    for i, json_str in enumerate(json_list):
        result = json.loads(json_str)
        truth_dict[result['label']] += 1
        if result['label'] == predicted['label'][i]:
            correct += 1
    print(correct, len(json_list))
    print(truth_dict.items())

    return


In [3]:
def json_to_csv(truth_json):
    dict = collections.defaultdict(list)
    with open(truth_json, 'r') as json_file:
        json_list = list(json_file)
    for i, json_str in enumerate(json_list):
        result = json.loads(json_str)
        dict['label'].append(result['label'])
        dict['id'].append(result['id'])
        dict['text'].append(result['text'])
    df = pd.DataFrame(data=dict)
    return df

def log_to_proba(file, res_file):
    df = pd.read_csv(file)
    df['proba'] = df['proba'].apply(lambda x: np.exp(x))
    
    df.to_csv(res_file, index=False)
    return df
        
        
    

In [4]:
log_to_proba("./uniter-oscar/vilio/Ensemble_files/U36_test_unseen.csv", "./uniter-oscar/vilio/Ensemble_files/U36_test_unseen_converted.csv")
log_to_proba("./uniter-oscar/vilio/data/O36/O36_test_unseen_SA.csv", "./uniter-oscar/vilio/Ensemble_files/O36_test_unseen_converted.csv")

FileNotFoundError: [Errno 2] No such file or directory: './uniter-oscar/vilio/data/O36/O36_test_unseen_SA.csv'

In [61]:
test_ground_truth = json_to_csv('./vilio/data/test_unseen.jsonl')
test_ground_truth.to_csv('./vilio/Ensemble_files/test.csv')

In [210]:
accuracy("/home/jupyter/.cache/torch/mmf/data/datasets/hateful_memes/defaults/annotations/test.jsonl", "./vilbert/output/test.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/home/jupyter/.cache/torch/mmf/data/datasets/hateful_memes/defaults/annotations/test.jsonl'

In [205]:
class Ensemble:
    def __init__(self, ground_truth, uniter, oscar, visual_bert, vilbert, ground_truth_train, uniter_train, visual_bert_train, vilbert_train):
        self.ground_truth = self.jsonl_to_df(ground_truth)
        self.uniter = pd.read_csv(uniter)
        self.oscar = pd.read_csv(oscar)
        self.visual_bert = self.mmf_read_csv(visual_bert)
        self.vilbert = self.mmf_read_csv(vilbert)
        
        self.ground_truth_train = self.jsonl_to_df(ground_truth_train)
        self.uniter_train = self.log_to_proba(uniter_train)
        self.visual_bert_train = self.mmf_read_csv(visual_bert_train)
        self.vilbert_train = self.mmf_read_csv(vilbert_train)
        
        self.combined = self.combine()
        
    def jsonl_to_df(self, jsonl_path):
        dict = collections.defaultdict(list)
        with open(jsonl_path, 'r') as json_file:
            json_list = list(json_file)
        for i, json_str in enumerate(json_list):
            result = json.loads(json_str)
            dict['label'].append(result['label'])
            dict['id'].append(result['id'])
            dict['text'].append(result['text'])
        df = pd.DataFrame(data=dict)
        return df
        
    def log_to_proba(self, file):
        df = pd.read_csv(file)
        df['proba'] = df['proba'].apply(lambda x: np.exp(x))
        return df
        
    def mmf_read_csv(self, path):
        
        df = pd.read_csv(path)
        df['proba'] = df['scores'].apply(lambda x: float(x[1:-1].split(", ")[1]))
        df = df.drop(columns=['scores'])
        df['label'] = df['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        return df
        
    
    def compare(self, predict, array=False):
        correct = 0
        all = len(predict)
        for i in range(all):
            if predict['result'][i] == self.ground_truth['label'][i]:
                correct += 1
        #print(correct, '/', all, "Accuracy:", correct/all)
        return correct/all
    
    def combine(self):
        combine_a = pd.merge(self.uniter, self.oscar, on='id', suffixes=('_u', '_x'), left_index=False, right_index=False)
        combine_b = pd.merge(self.visual_bert, self.vilbert, on='id', suffixes=('_y', '_z'), left_index=False, right_index=False)
        combined = pd.merge(combine_a, combine_b, how='left', on='id')
        return combined
        
    
    def simple_average(self):
        cols = ['proba_u','proba_x','proba_y','proba_z']
        
        combined = self.combined
        
        combined['avg'] = combined[cols].mean(axis=1)
        combined['result'] = combined['avg'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(combined)
        print("Simple Average Accuracy:", acc)
        return combined
    
    def single_model_acc(self):
        self.uniter['result'] = self.uniter['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(self.uniter)
        print("Uniter Accuracy :", acc)
        
        self.oscar['result'] = self.oscar['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(self.oscar)
        print("Oscar Accuracy :", acc)
        
        self.visual_bert['result'] = self.visual_bert['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(self.visual_bert)
        print("Visual BERT Accuracy :", acc)
        
        self.vilbert['result'] = self.vilbert['proba'].apply(lambda x: 1 if x >= 0.5 else 0)
        acc = self.compare(self.vilbert)
        print("VilBERT Accuracy :", acc)
        
    
    def weighted_average(self):
        combined = self.combined
        max_acc = 0
        best_config = [0, 0, 0, 0]
        
        for i in range(1, 10):
            for j in range(1, 10):
                for k in range(1, 10):
                    for l in range(1, 10):
                        combined['avg'] = (combined['proba_u']*i + combined['proba_x']*j + combined['proba_y']*k + combined['proba_z']*l)/(i+j+k+l)
                        combined['result'] = combined['avg'].apply(lambda x: 1 if x >= 0.5 else 0)
                        acc = self.compare(combined)
                        if acc > max_acc:
                            max_acc = acc
                            best_config = [i, j, k, l]
        
        return max_acc, best_config
        
    
    def random_forest_train(self):
        
        combine_a = pd.merge(self.uniter_train, self.visual_bert_train, on='id', suffixes=('_u', '_y'), left_index=False, right_index=False)
        combined = pd.merge(combine_a, self.vilbert_train, on='id', left_index=False, right_index=False)
        self.X_train = combined[['proba_u','proba_y','proba']].replace(np.nan, 0)
        self.y_train = self.ground_truth_train['label']
        
        # X = train_data['proba'].to_numpy()
        # X = [[i] for i in X]
        # y = train_data['label'].to_numpy()

        rf = RandomForestRegressor()
        rf.fit(self.X_train, self.y_train)
        joblib.dump(rf, "./random_forest.joblib")
        print("completed")
    
    def random_forest_predict(self, model):
        
        self.X_test = self.combined[['proba_u','proba_y','proba_z']].replace(np.nan, 0)
        self.y_test = self.ground_truth['label'].replace(np.nan, 0)
        
        loaded_rf = joblib.load(model)
        result = loaded_rf.predict(self.X_test)
        result = np.where(result<0.5, 0, 1)
        acc = np.mean(np.equal(result, self.y_test.to_numpy()))
        print("Accuracy:", acc)
        
        
        # combined = self.simple_average()
        # X = combined['avg'].to_numpy()
        # X = X.reshape(-1, 1)
        # result = loaded_rf.predict(X)
        
        # correct = 0
        # all = len(result)
        # for i in range(all):
        #     if result[i] == self.ground_truth['label'][i]:
        #         correct += 1
        # print(correct, '/', all, "Accuracy:", correct/all)
        
        
        

In [206]:
ensemble = Ensemble("./test.jsonl",
                    "./uniter-oscar/vilio/Ensemble_files/U36_test_unseen_converted.csv",
                    "./uniter-oscar/vilio/Ensemble_files/O36_test_unseen_converted.csv", 
                    "./vilbert/output/test.csv",
                    "./visual_bert/output/test.csv",
                    "./train.jsonl",
                    "./uniter-oscar/vilio/Ensemble_files/U36_train.csv",
                    "./vilbert/output/train.csv",
                    "./visual_bert/output/train.csv"
                   )


In [207]:
ensemble.single_model_acc()

Uniter Accuracy : 0.65
Oscar Accuracy : 0.647
Visual BERT Accuracy : 0.749
VilBERT Accuracy : 0.751


In [208]:
ensemble.random_forest_train()

completed


In [209]:
ensemble.random_forest_predict("./random_forest.joblib",)

Accuracy: 0.626


Feature names unseen at fit time:
- proba_z
Feature names seen at fit time, yet now missing:
- proba



In [210]:
ensemble.X_train

Unnamed: 0,proba_u,proba_y,proba
0,0.000325,0.016702,0.003083
1,0.000332,0.002615,0.003664
2,0.000260,0.000546,0.003114
3,0.000475,0.000515,0.005600
4,0.000355,0.000345,0.002541
...,...,...,...
7495,0.999927,0.993523,0.996348
7496,0.999811,0.998505,0.995791
7497,0.999956,0.997529,0.994840
7498,0.999967,0.999054,0.995344


In [211]:
ensemble.simple_average()

Simple Average Accuracy: 0.648


Unnamed: 0,id,proba_u,label_u,proba_x,label_x,proba_y,label_y,proba_z,label_z,avg,result
0,42953,0.000067,0,0.026402,0,0.004799,0.0,0.003127,0.0,0.008599,0
1,23058,0.000073,0,0.002381,0,0.000394,0.0,0.002611,0.0,0.001365,0
2,13894,0.000044,0,0.003865,0,0.004470,0.0,0.002874,0.0,0.002814,0
3,37408,0.000052,0,0.007473,0,0.000793,0.0,0.003317,0.0,0.002909,0
4,82403,0.000048,0,0.238715,0,0.329488,0.0,0.772131,1.0,0.335096,0
...,...,...,...,...,...,...,...,...,...,...,...
995,23705,0.000113,0,0.012218,0,,,,,0.006165,0
996,49806,0.000049,0,0.009403,0,,,,,0.004726,0
997,40813,0.000048,0,0.009714,0,,,,,0.004881,0
998,1468,0.000047,0,0.125314,0,,,,,0.062681,0


In [29]:

def mmf_acc(df, dataset):
    
    dev_label = dict()

    acc_count = 0
    total_count = 0

    with open(f'/home/jupyter/.cache/torch/mmf/data/datasets/hateful_memes/defaults/annotations/{dataset}.jsonl', 'r') as f:
        for line in f:
            total_count += 1

            data = json.loads(line)
            [imid, url, label, text] = data['id'], data['img'], data['label'], data['text']
            dev_label[imid] = label

            if df.loc[df['id'] == int(imid)]['label'].item() == int(label):
                acc_count += 1

    print(acc_count, total_count, acc_count/total_count)

In [30]:
mmf_acc(ensemble.visual_bert, 'test')

749 1000 0.749


In [31]:
mmf_acc(ensemble.vilbert, 'test')

751 1000 0.751
