In [1]:
import glob
import numpy as np
import pandas as pd
import os
import subprocess
from tqdm import tqdm
from  scipy.stats import wilcoxon 
from collections import *
from itertools import combinations 
import matplotlib.pyplot as plt
import time
import json

In [2]:
# Suffix legend (presumably applies to every *_0 / *_1 name in this notebook):
#   0 = author belief, 1 = reader perception.
# Dataset-mean baseline, suffix _0: 20 values keyed 0-19.
# NOTE(review): what the keys index (runs / folds / seeds) and which metric the
# values hold is not recorded here — TODO confirm.
baseline_0 = {0: 0.01172136003409683,
 1: 0.011393162508572975,
 2: 0.012991479736112662,
 3: 0.011472596814037131,
 4: 0.013052989060847564,
 5: 0.011191171663513818,
 6: 0.011190326327897952,
 7: 0.010770187318900754,
 8: 0.013587015189733232,
 9: 0.013343154820454863,
 10: 0.014086093819935305,
 11: 0.012687008638000513,
 12: 0.011463242699349497,
 13: 0.01289961796034648,
 14: 0.011315056576575984,
 15: 0.012583584902803448,
 16: 0.011477805904252436,
 17: 0.011946150629143948,
 18: 0.010374673021724316,
 19: 0.012556488602115258}
# Dataset-mean baseline, suffix _1 (reader perception per the legend at the top
# of this cell): 20 values keyed 0-19.
baseline_1 = {0: 0.01036717877492403,
 1: 0.01209759007350398,
 2: 0.012478679800216452,
 3: 0.011572136779348012,
 4: 0.011637602921731799,
 5: 0.011432101360841676,
 6: 0.011883801987325018,
 7: 0.010627776313629808,
 8: 0.012105152035567468,
 9: 0.013703768476735144,
 10: 0.013725614881463044,
 11: 0.011825360132213972,
 12: 0.013722422776237152,
 13: 0.011892114028702865,
 14: 0.01102798606301675,
 15: 0.011613000320785416,
 16: 0.012230064471946887,
 17: 0.011316997485008275,
 18: 0.011548714344527892,
 19: 0.011862464984699672}

In [3]:
# CLS results: every cls_* dictionary in this cell holds 20 values keyed 0-19.
# NOTE(review): "CLS" presumably names the pooling/representation setting that
# the mean_* dictionaries are compared against — TODO confirm.

# CLS setting, "baseline" variant, suffix _0.
cls_baseline_0 = {0: 0.015076630748808384,
 1: 0.011021974496543407,
 2: 0.01928694173693657,
 3: 0.011901570484042168,
 4: 0.012104419060051441,
 5: 0.01304316334426403,
 6: 0.012016581371426582,
 7: 0.021323826164007187,
 8: 0.014588769525289536,
 9: 0.012867984361946583,
 10: 0.013279014267027378,
 11: 0.015219847671687603,
 12: 0.01269700936973095,
 13: 0.012873978354036808,
 14: 0.011254058219492435,
 15: 0.011802978813648224,
 16: 0.012902085669338703,
 17: 0.013524947687983513,
 18: 0.014083235524594784,
 19: 0.013549418188631535}

# CLS setting, "baseline" variant, suffix _1: 20 values keyed 0-19.
cls_baseline_1 = {0: 0.011687993071973324,
 1: 0.012818840332329273,
 2: 0.010826860554516315,
 3: 0.011899133212864399,
 4: 0.010485338978469372,
 5: 0.013663796707987785,
 6: 0.014323411509394646,
 7: 0.009847083128988743,
 8: 0.011885341256856918,
 9: 0.013058829121291637,
 10: 0.012822887860238552,
 11: 0.009758047759532928,
 12: 0.014955582097172737,
 13: 0.01023859716951847,
 14: 0.011705967597663403,
 15: 0.010860863141715527,
 16: 0.01201367937028408,
 17: 0.010263219475746155,
 18: 0.016624394804239273,
 19: 0.011730768717825413}


# CLS setting, "pretrain" variant, suffix _0: 20 values keyed 0-19.
# NOTE(review): not referenced by any comparison cell shown in this notebook.
cls_pretrain_0 = {0: 0.011772687546908855,
 1: 0.011663882993161678,
 2: 0.012046667747199535,
 3: 0.010502349585294724,
 4: 0.01126586738973856,
 5: 0.010797788389027119,
 6: 0.009528196416795254,
 7: 0.012486882507801056,
 8: 0.011860222555696964,
 9: 0.013049684464931488,
 10: 0.011996299028396606,
 11: 0.011349176988005638,
 12: 0.010714655742049217,
 13: 0.01160876452922821,
 14: 0.010803456418216228,
 15: 0.010940995067358017,
 16: 0.01171813253313303,
 17: 0.010957208462059498,
 18: 0.011648415587842464,
 19: 0.01142322737723589}


# CLS setting, "pretrain" variant, suffix _1: 20 values keyed 0-19.
# NOTE(review): not referenced by any comparison cell shown in this notebook.
cls_pretrain_1 = {0: 0.009826753288507462,
 1: 0.01171858236193657,
 2: 0.01269275788217783,
 3: 0.011869787238538265,
 4: 0.00865564402192831,
 5: 0.009982829913496971,
 6: 0.009456614963710308,
 7: 0.00972529873251915,
 8: 0.010288670659065247,
 9: 0.011499857529997826,
 10: 0.011565973050892353,
 11: 0.008836844004690647,
 12: 0.011288951151072979,
 13: 0.010178232565522194,
 14: 0.010461992584168911,
 15: 0.009325415827333927,
 16: 0.009630292654037476,
 17: 0.009059793315827847,
 18: 0.013054706156253815,
 19: 0.009014557115733624}


# CLS setting, "pretrain_token" variant, suffix _0: 20 values keyed 0-19.
cls_pretrain_token_0 = {0: 0.010958217084407806,
 1: 0.012663745321333408,
 2: 0.01364106498658657,
 3: 0.010398928076028824,
 4: 0.011044549755752087,
 5: 0.010212672874331474,
 6: 0.009952478110790253,
 7: 0.010292639024555683,
 8: 0.011206110939383507,
 9: 0.01133931428194046,
 10: 0.012384709902107716,
 11: 0.01153546292334795,
 12: 0.010479369200766087,
 13: 0.012149660848081112,
 14: 0.010966392233967781,
 15: 0.011027338914573193,
 16: 0.011458709836006165,
 17: 0.010489186272025108,
 18: 0.012499464675784111,
 19: 0.011900661513209343}


# CLS setting, "pretrain_token" variant, suffix _1: 20 values keyed 0-19.
cls_pretrain_token_1 = {0: 0.010187826119363308,
 1: 0.009284949861466885,
 2: 0.011204943060874939,
 3: 0.00975991040468216,
 4: 0.00895152147859335,
 5: 0.009684860706329346,
 6: 0.009337670169770718,
 7: 0.011540099047124386,
 8: 0.010375048033893108,
 9: 0.011429155245423317,
 10: 0.01108305063098669,
 11: 0.011541424319148064,
 12: 0.013212177902460098,
 13: 0.008555375039577484,
 14: 0.010322933085262775,
 15: 0.009173530153930187,
 16: 0.009265675209462643,
 17: 0.009923133999109268,
 18: 0.010472690686583519,
 19: 0.009371738880872726}

# CLS setting, "token" variant, suffix _0: 20 values keyed 0-19.
# NOTE(review): not referenced by any comparison cell shown in this notebook.
cls_token_0 = {0: 0.011851641349494457,
 1: 0.014552637934684753,
 2: 0.012243733741343021,
 3: 0.013025390915572643,
 4: 0.012282731011509895,
 5: 0.014993528835475445,
 6: 0.010731258429586887,
 7: 0.014323625713586807,
 8: 0.014467737637460232,
 9: 0.013181411661207676,
 10: 0.016441211104393005,
 11: 0.013400928117334843,
 12: 0.010852320119738579,
 13: 0.012433438561856747,
 14: 0.010618895292282104,
 15: 0.012067402713000774,
 16: 0.01128038763999939,
 17: 0.011528553441166878,
 18: 0.013170424848794937,
 19: 0.01362711749970913}



# CLS setting, "token" variant, suffix _1: 20 values keyed 0-19.
# NOTE(review): not referenced by any comparison cell shown in this notebook.
cls_token_1 = {0: 0.008733064867556095,
 1: 0.0150664784014225,
 2: 0.012439951300621033,
 3: 0.010115419514477253,
 4: 0.011370135471224785,
 5: 0.011775325052440166,
 6: 0.009588608518242836,
 7: 0.009756277315318584,
 8: 0.013968260958790779,
 9: 0.01315283216536045,
 10: 0.0188335869461298,
 11: 0.015840822830796242,
 12: 0.011678376235067844,
 13: 0.01047901064157486,
 14: 0.013545296154916286,
 15: 0.010109839960932732,
 16: 0.010027422569692135,
 17: 0.010481423698365688,
 18: 0.012841147370636463,
 19: 0.010704511776566505}

In [4]:
# Mean results: every mean_* dictionary in this cell holds 20 values keyed 0-19.
# NOTE(review): "Mean" presumably names the pooling/representation setting that
# the cls_* dictionaries are compared against — TODO confirm.

# Mean setting, "baseline" variant, suffix _0.
mean_baseline_0 = {0: 0.010912482626736164,
 1: 0.009446736425161362,
 2: 0.011114019900560379,
 3: 0.011512912809848785,
 4: 0.011424017138779163,
 5: 0.01125143188983202,
 6: 0.009530827403068542,
 7: 0.010746382176876068,
 8: 0.015334735624492168,
 9: 0.011673932895064354,
 10: 0.011761762201786041,
 11: 0.011442889459431171,
 12: 0.010391511023044586,
 13: 0.012286833487451077,
 14: 0.012907731346786022,
 15: 0.011062775738537312,
 16: 0.010419790633022785,
 17: 0.010654672980308533,
 18: 0.010152311064302921,
 19: 0.011911202222108841}


# Mean setting, "baseline" variant, suffix _1: 20 values keyed 0-19.
mean_baseline_1 = {0: 0.008975665085017681,
 1: 0.012871118262410164,
 2: 0.010828208178281784,
 3: 0.013823370449244976,
 4: 0.009051607921719551,
 5: 0.00975124817341566,
 6: 0.009832537733018398,
 7: 0.010135890915989876,
 8: 0.010720125399529934,
 9: 0.01189338881522417,
 10: 0.012871596030890942,
 11: 0.010032083839178085,
 12: 0.010887455195188522,
 13: 0.009292349219322205,
 14: 0.010650141164660454,
 15: 0.011393895372748375,
 16: 0.009873481467366219,
 17: 0.009167039766907692,
 18: 0.010148119181394577,
 19: 0.009745611809194088}

# Mean setting, "pretrain" variant, suffix _0: 20 values keyed 0-19.
mean_pretrain_0 = {0: 0.010555644519627094,
 1: 0.009298759512603283,
 2: 0.012931217439472675,
 3: 0.010526604019105434,
 4: 0.010234516113996506,
 5: 0.010758125223219395,
 6: 0.009859448298811913,
 7: 0.011328736320137978,
 8: 0.010987618006765842,
 9: 0.011236891150474548,
 10: 0.012399856001138687,
 11: 0.011608326807618141,
 12: 0.009888077154755592,
 13: 0.01166562084108591,
 14: 0.011282245628535748,
 15: 0.013453686609864235,
 16: 0.010721906088292599,
 17: 0.01870395429432392,
 18: 0.010741780512034893,
 19: 0.011953089386224747}

# Mean setting, "pretrain" variant, suffix _1: 20 values keyed 0-19.
mean_pretrain_1 =  {0: 0.009020980447530746,
 1: 0.01051703654229641,
 2: 0.011570064350962639,
 3: 0.01585499942302704,
 4: 0.009239102713763714,
 5: 0.009845034219324589,
 6: 0.010603810660541058,
 7: 0.00928241852670908,
 8: 0.010495041497051716,
 9: 0.012400380335748196,
 10: 0.010766462422907352,
 11: 0.009198974817991257,
 12: 0.012046100571751595,
 13: 0.009208966977894306,
 14: 0.010130263864994049,
 15: 0.011245718225836754,
 16: 0.009181786328554153,
 17: 0.019324488937854767,
 18: 0.011134417727589607,
 19: 0.010168075561523438}

# Mean setting, "pretrain_token" variant, suffix _0: 20 values keyed 0-19.
mean_pretrain_token_0 = {0: 0.010629464872181416,
 1: 0.008711942471563816,
 2: 0.011627019383013248,
 3: 0.0103102782741189,
 4: 0.009941129945218563,
 5: 0.01144125685095787,
 6: 0.009070545434951782,
 7: 0.011675920337438583,
 8: 0.010393081232905388,
 9: 0.010451609268784523,
 10: 0.012483809143304825,
 11: 0.013250203803181648,
 12: 0.01021381001919508,
 13: 0.011525669135153294,
 14: 0.011124390177428722,
 15: 0.010406434535980225,
 16: 0.010536178946495056,
 17: 0.009811390191316605,
 18: 0.010005353018641472,
 19: 0.010356627404689789}

# Mean setting, "pretrain_token" variant, suffix _1: 20 values keyed 0-19.
mean_pretrain_token_1 =  {0: 0.007998875342309475,
 1: 0.009115912951529026,
 2: 0.010501768440008163,
 3: 0.009103926829993725,
 4: 0.009008046239614487,
 5: 0.009604079648852348,
 6: 0.008502543903887272,
 7: 0.01006884966045618,
 8: 0.010045601986348629,
 9: 0.010097094811499119,
 10: 0.009851574897766113,
 11: 0.008672867901623249,
 12: 0.010319382883608341,
 13: 0.008220409974455833,
 14: 0.010264684446156025,
 15: 0.009321077726781368,
 16: 0.011766400188207626,
 17: 0.008433387614786625,
 18: 0.009731031954288483,
 19: 0.008625686168670654}

# Mean setting, "token" variant, suffix _0: 20 values keyed 0-19.
mean_token_0 = {0: 0.010859662666916847,
 1: 0.013358328491449356,
 2: 0.014206349849700928,
 3: 0.01020983699709177,
 4: 0.011261258274316788,
 5: 0.01040391344577074,
 6: 0.009236782789230347,
 7: 0.01078416220843792,
 8: 0.011540740728378296,
 9: 0.011419079266488552,
 10: 0.01232809666544199,
 11: 0.011511663906276226,
 12: 0.010567260906100273,
 13: 0.012348045594990253,
 14: 0.011959454044699669,
 15: 0.010821693576872349,
 16: 0.011016984470188618,
 17: 0.010680891573429108,
 18: 0.010018059983849525,
 19: 0.011027124710381031}

# Mean setting, "token" variant, suffix _1: 20 values keyed 0-19.
mean_token_1 = {0: 0.009245892986655235,
 1: 0.009451670572161674,
 2: 0.012048500590026379,
 3: 0.009786992333829403,
 4: 0.009239010512828827,
 5: 0.008905514143407345,
 6: 0.008432052098214626,
 7: 0.009385034441947937,
 8: 0.01059951912611723,
 9: 0.011577473022043705,
 10: 0.009742723777890205,
 11: 0.009026164188981056,
 12: 0.009933843277394772,
 13: 0.008880550041794777,
 14: 0.009770245291292667,
 15: 0.00922287255525589,
 16: 0.009117812849581242,
 17: 0.009355617687106133,
 18: 0.010069800540804863,
 19: 0.008805280551314354}

In [5]:
def eval_one(a):
    """Print the mean and standard deviation of one set of values.

    The line printed is ``<mean> ± <std>`` with both numbers rounded to four
    decimals. The standard deviation is NumPy's default population form
    (``ddof=0``).

    Args:
        a: Mapping from index to value (such as the result dictionaries in this
            notebook) or a plain sequence of values. Every entry is used; the
            number of entries is taken from ``a`` rather than assumed to be 20.

    Returns:
        None. The summary is printed.

    Raises:
        ValueError: If ``a`` holds no values.
    """
    # Mappings are walked in their own key order, sequences by position, so the
    # 0..19-keyed dictionaries are read exactly as before.
    indices = a.keys() if hasattr(a, 'keys') else range(len(a))
    values = [a[k] for k in indices]
    if not values:
        raise ValueError('eval_one() needs at least one value')
    print(np.round(np.mean(values), 4), '±', np.round(np.std(values), 4))
def eval_diff(a, b):
    """Print a paired, one-sided Wilcoxon comparison of two sets of values.

    Four lines are printed, followed by a blank line:

    1. the ``wilcoxon`` result for ``alternative='greater'``, i.e. the
       alternative hypothesis is that a's values are larger than b's values;
    2. ``A-B:`` the unrounded difference of the two means;
    3. ``A:`` mean ± standard deviation of ``a``, rounded to four decimals;
    4. ``B:`` the same for ``b``.

    Standard deviations use NumPy's default population form (``ddof=0``).

    Args:
        a: Mapping from index to value, or a plain sequence of values.
        b: Values paired with ``a``; it is read at the same indices as ``a``.
            The number of pairs is taken from ``a`` rather than assumed to
            be 20.

    Returns:
        None. Everything is printed.

    Raises:
        ValueError: If ``a`` is empty or ``a`` and ``b`` differ in size.
        KeyError / IndexError: If ``b`` lacks an index that ``a`` has.
    """
    # Pair by a's indices so both lists line up element for element.
    indices = list(a.keys()) if hasattr(a, 'keys') else list(range(len(a)))
    if not indices:
        raise ValueError('eval_diff() needs at least one pair of values')
    if len(b) != len(indices):
        raise ValueError(
            'eval_diff() needs paired inputs of equal size, got %d and %d'
            % (len(indices), len(b)))

    values_a = [a[k] for k in indices]
    values_b = [b[k] for k in indices]
    mean_a = np.mean(values_a)
    mean_b = np.mean(values_b)

    print(wilcoxon(values_a, values_b, alternative='greater')) # a's values are larger than b's values
    print('A-B:', mean_a - mean_b)
    print('A:', np.round(mean_a, 4), '±', np.round(np.std(values_a), 4))
    print('B:', np.round(mean_b, 4), '±', np.round(np.std(values_b), 4))
    print()

In [6]:
# One-sided Wilcoxon via eval_diff: are the dataset-mean baseline values larger
# than the CLS baseline values? First call uses the _0 dictionaries (author
# belief), second the _1 dictionaries (reader perception).
# NOTE(review): the calls are joined by a comma, so the cell's own value is a
# tuple of None; separate statements would avoid that stray output.
eval_diff(baseline_0, cls_baseline_0), eval_diff(baseline_1, cls_baseline_1)

WilcoxonResult(statistic=33.0, pvalue=0.9975700378417969)
A-B: -0.0016157634415536436
A: 0.0121 ± 0.001
B: 0.0137 ± 0.0025

WilcoxonResult(statistic=109.0, pvalue=0.4491586685180664)
A-B: -0.0001400053928091831
A: 0.0119 ± 0.0009
B: 0.0121 ± 0.0018



(None, None)

In [7]:
# One-sided Wilcoxon via eval_diff: are the dataset-mean baseline values larger
# than the Mean-setting baseline values? First call uses the _0 dictionaries
# (author belief), second the _1 dictionaries (reader perception).
# NOTE(review): the calls are joined by a comma, so the cell's own value is a
# tuple of None; separate statements would avoid that stray output.
eval_diff(baseline_0, mean_baseline_0), eval_diff(baseline_1, mean_baseline_1)

WilcoxonResult(statistic=175.0, pvalue=0.0036478042602539062)
A-B: 0.0008082103590608333
A: 0.0121 ± 0.001
B: 0.0113 ± 0.0012

WilcoxonResult(statistic=190.0, pvalue=0.00035381317138671875)
A-B: 0.0013362797015860975
A: 0.0119 ± 0.0009
B: 0.0106 ± 0.0013



(None, None)

In [8]:
# One-sided Wilcoxon via eval_diff: are the CLS baseline values larger than the
# Mean-setting baseline values? First call uses the _0 dictionaries (author
# belief), second the _1 dictionaries (reader perception).
# NOTE(review): the calls are joined by a comma, so the cell's own value is a
# tuple of None; separate statements would avoid that stray output.
eval_diff(cls_baseline_0, mean_baseline_0), eval_diff(cls_baseline_1, mean_baseline_1)

WilcoxonResult(statistic=195.0, pvalue=0.00013065338134765625)
A-B: 0.002423973800614477
A: 0.0137 ± 0.0025
B: 0.0113 ± 0.0012

WilcoxonResult(statistic=176.0, pvalue=0.0031948089599609375)
A-B: 0.0014762850943952806
A: 0.0121 ± 0.0018
B: 0.0106 ± 0.0013



(None, None)

In [9]:
# One-sided Wilcoxon via eval_diff: are the CLS "pretrain_token" values larger
# than the Mean "pretrain_token" values? First call uses the _0 dictionaries
# (author belief), second the _1 dictionaries (reader perception).
# NOTE(review): the calls are joined by a comma, so the cell's own value is a
# tuple of None; separate statements would avoid that stray output.
eval_diff(cls_pretrain_token_0, mean_pretrain_token_0), eval_diff(cls_pretrain_token_1, mean_pretrain_token_1)

WilcoxonResult(statistic=159.0, pvalue=0.022027015686035156)
A-B: 0.0006317281164228902
A: 0.0113 ± 0.0009
B: 0.0107 ± 0.0011

WilcoxonResult(statistic=187.0, pvalue=0.0006046295166015625)
A-B: 0.000771225523203611
A: 0.0102 ± 0.0011
B: 0.0095 ± 0.0009



(None, None)

In [10]:
# Suffix _0 (author belief): one-sided Wilcoxon via eval_diff, testing in turn
# whether the Mean "baseline", "pretrain" and "token" values are larger than the
# Mean "pretrain_token" values.
# NOTE(review): the calls are joined by commas, so the cell's own value is a
# tuple of None; separate statements would avoid that stray output.
eval_diff(mean_baseline_0, mean_pretrain_token_0), eval_diff(mean_pretrain_0, mean_pretrain_token_0), eval_diff(mean_token_0, mean_pretrain_token_0)

WilcoxonResult(statistic=157.0, pvalue=0.026584625244140625)
A-B: 0.0005986422300338738
A: 0.0113 ± 0.0012
B: 0.0107 ± 0.0011

WilcoxonResult(statistic=160.0, pvalue=0.019994735717773438)
A-B: 0.0008084994740784161
A: 0.0115 ± 0.0019
B: 0.0107 ± 0.0011

WilcoxonResult(statistic=159.0, pvalue=0.022027015686035156)
A-B: 0.000579663785174489
A: 0.0113 ± 0.0011
B: 0.0107 ± 0.0011



(None, None, None)

In [11]:
# Suffix _1 (reader perception): one-sided Wilcoxon via eval_diff, testing in
# turn whether the Mean "baseline", "pretrain" and "token" values are larger
# than the Mean "pretrain_token" values.
# NOTE(review): the calls are joined by commas, so the cell's own value is a
# tuple of None; separate statements would avoid that stray output.
eval_diff(mean_baseline_1, mean_pretrain_token_1), eval_diff(mean_pretrain_1, mean_pretrain_token_1), eval_diff(mean_token_1, mean_pretrain_token_1)

WilcoxonResult(statistic=194.0, pvalue=0.00016117095947265625)
A-B: 0.0011345865204930305
A: 0.0106 ± 0.0013
B: 0.0095 ± 0.0009

WilcoxonResult(statistic=185.0, pvalue=0.0008449554443359375)
A-B: 0.0015990460291504856
A: 0.0111 ± 0.0024
B: 0.0095 ± 0.0009

WilcoxonResult(statistic=136.0, pvalue=0.13054943084716797)
A-B: 0.00016716835089027916
A: 0.0096 ± 0.0009
B: 0.0095 ± 0.0009



(None, None, None)

In [12]:
# Suffix _0 (author belief): one-sided Wilcoxon via eval_diff, testing whether
# the Mean "baseline" values are larger than the Mean "pretrain" values, then
# whether they are larger than the Mean "token" values.
# NOTE(review): the calls are joined by a comma, so the cell's own value is a
# tuple of None; separate statements would avoid that stray output.
eval_diff(mean_baseline_0, mean_pretrain_0), eval_diff(mean_baseline_0, mean_token_0)

WilcoxonResult(statistic=108.0, pvalue=0.4636392593383789)
A-B: -0.0002098572440445423
A: 0.0113 ± 0.0012
B: 0.0115 ± 0.0019

WilcoxonResult(statistic=127.0, pvalue=0.21521663665771484)
A-B: 1.8978444859384797e-05
A: 0.0113 ± 0.0012
B: 0.0113 ± 0.0011



(None, None)

In [13]:
# Suffix _1 (reader perception): one-sided Wilcoxon via eval_diff, testing
# whether the Mean "baseline" values are larger than the Mean "pretrain" values,
# then whether they are larger than the Mean "token" values.
# NOTE(review): the calls are joined by a comma, so the cell's own value is a
# tuple of None; separate statements would avoid that stray output.
eval_diff(mean_baseline_1, mean_pretrain_1), eval_diff(mean_baseline_1, mean_token_1)

WilcoxonResult(statistic=95.0, pvalue=0.649409294128418)
A-B: -0.0004644595086574551
A: 0.0106 ± 0.0013
B: 0.0111 ± 0.0024

WilcoxonResult(statistic=183.0, pvalue=0.0011625289916992188)
A-B: 0.0009674181696027514
A: 0.0106 ± 0.0013
B: 0.0096 ± 0.0009



(None, None)