In [None]:
# Unpack the archive into the working directory; per the listing below it contains the
# analysis scripts, the utils*.py helper modules imported later, and the data/ CSV files.
!unzip unsup_qde_analysis_model_scores.zip

Archive:  unsup_qde_analysis_model_scores.zip
  inflating: mturk_script_ensembles.py  
  inflating: mturk_script_xlnet.py   
  inflating: script_bert_image_model_calibration.py  
  inflating: table_1_unsup_hm_script_count_correct_answers.py  
  inflating: table_1_unsup_hm_script_ensembles_QA_acc_eval.py  
  inflating: table_1_unsup_hm_script_ensembles_QA_acc_test.py  
  inflating: table_1_unsup_hm_test_ensembles.py  
  inflating: table_1_unsup_hm_test_single_models.py  
  inflating: utils.py                
  inflating: utils_constants.py      
  inflating: utils_data.py           
  inflating: utils_math.py           
  inflating: utils_mturk.py          
   creating: output/
   creating: output_figures/
   creating: output_mturk/
  inflating: mturk_script_bert.py    
  inflating: mturk_script_data_prep.py  
  inflating: mturk_script_distilbert.py  
   creating: data/
  inflating: data/output_bert_seed0_eval.csv  
  inflating: data/output_bert_seed0_test.csv  
  inflating: data/output

In [None]:
import pickle
import pandas as pd
import os
import glob
import tqdm
import json
from collections import Counter
import numpy as np
import scipy.spatial as spatial
import tensorflow_hub as hub
import itertools
from nltk import agreement
from sklearn.metrics import cohen_kappa_score
import tensorflow as tf
from nltk import agreement
from utils_mturk import get_race_lines, get_mturk_results_dataframe_raw_mturk_and_race_lines
from utils_math import softmax
from utils_constants import *
from utils_data import create_calibrated_df

In [None]:
# Colab-only cell: mounts Google Drive at /content/drive.
# NOTE(review): `auth` is imported but not used in the visible cells, and the paths below are
# relative ("data/..."), not under /content/drive — confirm the mount is still required.
from google.colab import auth, drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Notebook-wide configuration.
DATA_PATH = "data/race/"           # passed to get_race_lines() to read the raw RACE documents
DATA_DIR = "data/"                 # directory holding df_for_evaluation.csv
random_state = 42                  # seed for DataFrame.sample when pairing high/middle questions
split = 'test'                     # split name interpolated into the calibrated-output file name
neural_model_type = 'distilbert'   # model name interpolated into the df_test file name
random_seed = 1                    # training-seed id interpolated into the model output file names

In [None]:
def get_dists(elmo_opts, elmo_article):
    """Cosine distance between every option embedding and a reference embedding.

    Args:
        elmo_opts: iterable of embeddings, one per option. Each item may be a
            1-D vector or a single-row batch of shape (1, d).
        elmo_article: indexable whose first element (``elmo_article[0]``) is
            the reference vector.

    Returns:
        list with one cosine distance per item of ``elmo_opts``, in order.
    """
    # Flatten both operands: the embeddings cached per text in this notebook are
    # single-row batches, and recent SciPy releases reject non-1-D inputs to
    # distance.cosine instead of squeezing them.
    reference = np.asarray(elmo_article[0]).ravel()
    return [spatial.distance.cosine(np.asarray(x).ravel(), reference) for x in elmo_opts]
    
def get_score(elmo_opts, elmo_article):
    """Mean cosine distance between the option embeddings and ``elmo_article[0]``."""
    return np.mean(get_dists(elmo_opts, elmo_article))

In [None]:
# Load the ELMo (v3) module from TensorFlow Hub; the cells below obtain embeddings through
# elmo.signatures["default"](tf.constant([...]))["default"].
elmo = hub.load("https://tfhub.dev/google/elmo/3")

In [None]:
# Output CSV of the configured model/seed on the test split; rows containing any NaN are dropped.
# NOTE(review): the split is hard-coded to 'test' instead of using `split`, and df_test is not
# referenced by the visible cells below — confirm it is still needed.
df_test = pd.read_csv('data/output_{}_seed{}_{}.csv'.format(neural_model_type, random_seed, 'test')).dropna()

In [None]:
# Read the raw RACE documents. Each entry of `lines` is used below as a mapping with the keys
# 'race_id', 'id', 'questions', 'answers', 'options' and 'article'.
lines = get_race_lines(DATA_PATH)
# Combine the raw MTurk annotation file with the RACE documents and persist the result.
df_results_mturk = get_mturk_results_dataframe_raw_mturk_and_race_lines(
    'data/race_for_mturk_small_v2_with_analysis.csv',
    race_lines=lines
)
df_results_mturk.to_csv('data/df_results_mturk.csv', index=False)

read files: 100%|██████████| 1045/1045 [00:01<00:00, 933.72it/s]
read files: 100%|██████████| 362/362 [00:00<00:00, 1032.12it/s]


# high vs middle

In [None]:
# A slightly modified variant that keeps the article/options/question texts when generating the PairRace_HM dataset.
def prepare_dataset_for_high_vs_middle_prediction(df, max_len=2000, output_file=None, random_state=None):
    """Pair an equal-sized sample of high-level and middle-level questions.

    Args:
        df: frame with a LEVEL column, the PREDICTION_COLUMNS and the columns
            'instance_id', 'question', 'article', 'options', 'label'.
        max_len: upper bound on the number of questions sampled per level.
        output_file: optional writable text handle; when given, the summary
            lines are written to it instead of being printed.
        random_state: seed forwarded to ``DataFrame.sample`` for both levels.

    Returns:
        The cross join (merge on a constant 'key' column) of the two samples;
        high-level columns carry the suffix '_h', middle-level ones '_m'.
    """
    kept_columns = PREDICTION_COLUMNS + ['instance_id', 'question', 'article', 'options', 'label']
    is_high = df[LEVEL] == HIGH
    is_middle = df[LEVEL] == MIDDLE

    def report(message):
        # Console when no log file is supplied, otherwise one line in the file.
        if output_file is None:
            print(message)
        else:
            output_file.write(message + "\n")

    def level_frame(mask, suffix):
        # Keep only the needed columns, tag them with the level suffix and add
        # the constant join key used to build the cross product.
        return df.loc[mask, kept_columns].add_suffix(suffix).assign(key=1)

    report("Num. high questions " + str(len(df[is_high])))
    report("Num. middle questions " + str(len(df[is_middle])))

    high_part = level_frame(is_high, '_h')
    middle_part = level_frame(is_middle, '_m')

    length = min(len(high_part), len(middle_part), max_len)
    report("Considered %d items for each level" % length)

    sampled_high = high_part.sample(length, random_state=random_state)
    sampled_middle = middle_part.sample(length, random_state=random_state)
    return pd.merge(sampled_high, sampled_middle, on='key')

In [None]:
# Load the calibrated XLNet outputs and attach the raw RACE texts to every question row,
# then build the high-vs-middle pair dataset (its summary lines go to the output file).
df = create_calibrated_df(['output_xlnet_seed_%d_%s.csv' % (random_seed, split)])
df['instance_id'] = df['level'] + '/' + df['document_id']
for data_raw in lines:
    # race_id ends in ".../<level>/<document_id>".
    level, document_id = data_raw['race_id'].split('/')[-2:]
    idxs = df[(df.level == level) & (df.document_id == document_id)].index
    if len(idxs) == 0:
        # Document has no rows in the model output; nothing to annotate.
        continue
    # `.loc` (not `.at`, which is a scalar-only accessor) assigns one value per matching row.
    # Assumes the rows of a document appear in the same order as data_raw['questions'] — TODO confirm.
    df.loc[idxs, 'question'] = data_raw['questions']
    df.loc[idxs, 'article'] = [data_raw['article']] * len(data_raw['questions'])
    df.loc[idxs, 'options'] = [' OPTIONBREAK '.join(x) for x in data_raw['options']]
output_filename = 'output/xlnet_%d_test.txt' % random_seed
# The context manager closes the log even if pair generation raises.
with open(output_filename, "w") as output_file:
    df = prepare_dataset_for_high_vs_middle_prediction(df, output_file=output_file, random_state=random_state)

In [None]:
# The pair frame repeats every text many times, so each distinct text is embedded once
# and later looked up by its string.
def _embed_texts(texts):
    """Map each distinct text to its ELMo "default" output (one forward pass per distinct text)."""
    embed = elmo.signatures["default"]
    return {text: embed(tf.constant([text]))["default"] for text in dict.fromkeys(texts)}


all_questions = list(set(df['question_m'].values)) + list(set(df['question_h'].values))
all_questions_embeds = _embed_texts(all_questions)
# Split only the distinct joined option strings instead of every row of the pair frame.
# The separator is 'OPTIONBREAK' without the surrounding spaces used when joining, so the
# keys keep those spaces; the lookup code splits the same way and relies on that.
all_options = list({
    option
    for column in ('options_m', 'options_h')
    for joined in df[column].unique()
    for option in joined.split('OPTIONBREAK')
})
all_options_embeds = _embed_texts(all_options)
all_articles = list(set(df['article_m'].values)) + list(set(df['article_h'].values))
all_articles_embeds = _embed_texts(all_articles)

In [None]:
# Size of the question list built above (distinct middle questions plus distinct high questions).
len(all_questions)

2709

In [None]:
# For every (high, middle) question pair, compute three ELMo-based signals from the cached embeddings:
#   ELMO_qa - cosine distance of each option to its article (arg-min = predicted answer),
#   ELMO_k  - mean cosine distance of the distractors to the correct choice,
#   ELMO_c  - cosine distance of the question to its article.
# NOTE(review): `elmos` and `dists` grow by one entry per pair and are not read again in the visible cells.
elmos = []
dists = []
pred_elmok = []
pred_elmoc = []

for idx, row in df.iterrows():
    
    # Same separator (without surrounding spaces) as used for the keys of all_options_embeds.
    opts1 = row['options_h'].split('OPTIONBREAK')
    opts2 = row['options_m'].split('OPTIONBREAK')

    # options to article (ELMO_qa)
    elmo_opts1 = [all_options_embeds[x] for x in opts1]
    elmo_opts2 = [all_options_embeds[x] for x in opts2]
    elmo_article1 = all_articles_embeds[row['article_h']]
    elmo_article2 = all_articles_embeds[row['article_m']]
    elmos.append((elmo_opts1, elmo_opts2, elmo_article1, elmo_article2))
    dists1 = get_dists(elmo_opts1, elmo_article1)
    dists2 = get_dists(elmo_opts2, elmo_article2)
    dists.append((dists1, dists2))
    # NOTE(review): range(4) assumes every question has exactly four options.
    for i in range(4):
        df.at[idx, 'elmo_qa_scoreh_{}'.format(i)] = dists1[i]
        df.at[idx, 'elmo_qa_scorem_{}'.format(i)] = dists2[i]
    # Predicted answer = option closest to the article.
    df.at[idx, 'elmo_qa_h'] = np.argmin(dists1)
    df.at[idx, 'elmo_qa_m'] = np.argmin(dists2)

    # distractors to correct choice (ELMO_k)
    # label_h / label_m are used as positional indices of the correct option.
    distractors_1 = [opts1[i] for i in range(len(opts1)) if i != row['label_h']]
    correct_choice_1 = opts1[row['label_h']]
    distractors_2 = [opts2[i] for i in range(len(opts2)) if i != row['label_m']]
    correct_choice_2 = opts2[row['label_m']]
    elmo_distractors1 = [all_options_embeds[x] for x in distractors_1]
    elmo_distractors2 = [all_options_embeds[x] for x in distractors_2]
    elmo_correct_choice_1 = all_options_embeds[correct_choice_1]
    elmo_correct_choice_2 = all_options_embeds[correct_choice_2]
    scoreh = get_score(elmo_distractors1, elmo_correct_choice_1)
    scorem = get_score(elmo_distractors2, elmo_correct_choice_2)
    # NOTE(review): the crowdsourced-label loop further down predicts 1 for ELMO_k when
    # score1 < score2, i.e. the opposite comparison — confirm which direction is intended.
    if scoreh > scorem:
        pred_elmok.append(1)
    else:
        pred_elmok.append(2)
    df.at[idx, 'distqh'] = scoreh
    df.at[idx, 'distqm'] = scorem

    # question to article (ELMO_c)
    scoreh = spatial.distance.cosine(all_questions_embeds[row['question_h']], all_articles_embeds[row['article_h']])
    scorem = spatial.distance.cosine(all_questions_embeds[row['question_m']], all_articles_embeds[row['article_m']])
    # Prediction 1 = the high-level question has the larger distance, 2 otherwise.
    if scoreh > scorem:
        pred_elmoc.append(1)
    else:
        pred_elmoc.append(2)
    df.at[idx, 'distqh_elmoc'] = scoreh
    df.at[idx, 'distqm_elmoc'] = scorem
df['pred_elmok'] = pred_elmok
df['pred_elmoc'] = pred_elmoc

In [None]:
# Every pair gets gold label 1 (presumably "the high-level question is the harder one" — confirm).
df['label'] = 1

In [None]:
# Share of pairs where the ELMO_c prediction matches the label.
(df['label'] == df['pred_elmoc']).mean()

0.5693886220622124

In [None]:
# Share of pairs where the ELMO_k prediction matches the label.
(df['label'] == df['pred_elmok']).mean()

0.566989121747969

In [None]:
# Pairs for which the ELMO_qa arg-min option equals the gold label of both the middle and the high question.
# NOTE(review): this is a snapshot of df at this point; columns added to df in later cells are not reflected here.
correctly_answered_df = df[(df.label_m==df.elmo_qa_m)&(df.label_h==df.elmo_qa_h)]

In [None]:
# Number of pairs in which ELMO_qa picks the gold option for both questions.
len(correctly_answered_df)

181395

In [None]:
# Names of the per-option ELMO_qa distance columns for the middle (qm) and high (qh) question of a pair.
a2p_scores_qm = ['elmo_qa_scorem_{}'.format(option) for option in range(4)]
a2p_scores_qh = ['elmo_qa_scoreh_{}'.format(option) for option in range(4)]

In [None]:
# Spread statistics of the per-option ELMO_qa distances for both questions of a pair, the
# pairwise predictions derived from them, and their agreement with the label.
# NOTE(review): the label is switched from 1 to 2 for this family of predictions — confirm intent.
df['label'] = 2
# Compute the features for BOTH sides: the *_qh_elmo columns are read by the predictions below
# but were not produced by any visible cell, so a fresh run would fail without this.
for tag, score_cols in (('qm', a2p_scores_qm), ('qh', a2p_scores_qh)):
    scores_col = 'scores_elmo_' + tag
    df[scores_col] = df.apply(lambda r: [r[o] for o in score_cols], axis=1)
    # Largest distance minus the second largest.
    df['max_2nd_diff_%s_elmo' % tag] = df[scores_col].apply(lambda s: np.max(s) - np.sort(s)[-2])
    # Largest distance minus the mean of the remaining ones.
    df['max_others_diff_%s_elmo' % tag] = df[scores_col].apply(
        lambda s: np.max(s) - (np.sum(s) - np.max(s)) / (len(s) - 1.0))
    df['scores_var_%s_elmo' % tag] = df[scores_col].apply(lambda s: np.var(s))
    df['max_score_%s_elmo' % tag] = df[scores_col].apply(lambda s: np.max(s))

elmo_features = ('max_2nd_diff', 'max_others_diff', 'scores_var', 'max_score')
# Prediction 1 when the high-level value is smaller than the middle-level one, 2 otherwise.
for feature in elmo_features:
    df['prediction_%s_elmo' % feature] = np.where(
        df['%s_qh_elmo' % feature] < df['%s_qm_elmo' % feature], 1, 2)

prediction_names = ['prediction_%s_elmo' % feature for feature in elmo_features]
name_width = max(len(name) for name in prediction_names)
for name in prediction_names:
    print(name.ljust(name_width), (df['label'] == df[name]).mean())

prediction_max_2nd_diff_elmo    0.5565381049184907
prediction_max_others_diff_elmo 0.5600195141254335
prediction_scores_var_elmo      0.5546865907309844
prediction_max_score_elmo       0.41832872960327744


In [None]:
# Accuracy of the ELMO_qa-derived predictions on the pairs that ELMO_qa answers correctly.
# The subset is rebuilt here so it carries the prediction_* columns and the current label;
# the earlier snapshot was taken before those columns were added to df.
correctly_answered_df = df[(df.label_m == df.elmo_qa_m) & (df.label_h == df.elmo_qa_h)]
prediction_names = (
    'prediction_max_2nd_diff_elmo',
    'prediction_max_others_diff_elmo',
    'prediction_scores_var_elmo',
    'prediction_max_score_elmo',
)
name_width = max(len(name) for name in prediction_names)
for name in prediction_names:
    print(name.ljust(name_width), (correctly_answered_df['label'] == correctly_answered_df[name]).mean())

prediction_max_2nd_diff_elmo    0.5660244218418369
prediction_max_others_diff_elmo 0.5722318696766725
prediction_scores_var_elmo      0.5654951900548527
prediction_max_score_elmo       0.44174315719837925


# crowdsourced labels

In [None]:
# Crowdsourced evaluation set: per the preview below, one row per question pair from the same
# article, with the three rater votes (LB, EL, Turker), the aggregated label/agreement and
# per-option model scores for both questions.
df_for_evaluation = pd.read_csv(os.path.join(DATA_DIR, 'df_for_evaluation.csv'))
df_for_evaluation.head()

Unnamed: 0.1,Unnamed: 0,article,level,document_id,aggr_document_id,question_1,idx_q1,question_2,idx_q2,options_1,options_2,LB,EL,Turker,sum,label,agreement,id_q1,A_q1,B_q1,C_q1,D_q1,max_2nd_diff_q1,max_others_diff_q1,scores_var_q1,max_score_q1,id_q2,A_q2,B_q2,C_q2,D_q2,max_2nd_diff_q2,max_others_diff_q2,scores_var_q2,max_score_q2
0,0,"The literal meaning of philosophy is ""love of ...",high,10466.txt,high10466.txt,"According to the passage, which of the followi...",1,"From the passage, we can conclude _ .",2,"['Philosophy is an independent discipline.', '...",['not all the subjects have to do with philoso...,1,1,2,4,1,2,1,0.183446,0.529452,0.222285,0.064816,0.307166,0.372602,0.029396,0.529452,2,0.856832,0.011553,0.131322,0.000293,0.72551,0.80911,0.125385,0.856832
1,1,Every child has written their names on the bea...,high,1104.txt,high1104.txt,Why does Hamad have his seven Mercedes-Benz500...,1,Which of the following might be the best title...,2,['Perhaps he hoped his cars were stored in a g...,"[""The Rainbow Sheikh's name can be seen from s...",1,1,2,4,1,2,1,0.138998,0.676711,0.183617,0.000675,0.493094,0.568948,0.065243,0.676711,2,0.679574,0.038941,0.000685,0.2808,0.398774,0.572766,0.073047,0.679574
2,2,Tom appeared on the sidewalk with a bucket of ...,high,1171.txt,high1171.txt,Why did Tom take all his bits of toys out of h...,1,Which of the following is the most suitable ti...,5,['Because he is tired and wanted to play with ...,"['Tom And His Fellows', 'The Happy Whitewasher...",2,2,2,6,2,3,1,0.057767,0.139852,0.641606,0.160774,0.480833,0.522142,0.052601,0.641606,5,0.001933,0.600499,0.39652,0.001048,0.203978,0.467331,0.066958,0.600499
3,3,"In the decade of the 1970s, the United Nations...",high,11813.txt,high11813.txt,Good distribution means _ .,1,The best title of the passage should be _,2,['having things in the right place at the righ...,"['The World Being Destroyed', 'A Serious Probl...",1,2,1,4,1,2,1,0.976428,1.9e-05,0.000928,0.022624,0.953804,0.968571,0.175981,0.976428,2,0.001153,0.997014,0.001035,0.000798,0.99586,0.996018,0.18601,0.997014
4,4,"Instagram is a fast,beautiful and fun way to s...",high,11977.txt,high11977.txt,Instagram probably is _ .,0,"""The Picture House""encourages sharing photos o...",2,"['a restaurant free of chmge', 'a campaign of""...","['raise the price of frozen food', 'attract mo...",2,2,2,6,2,3,0,0.004948,0.042743,0.864247,0.088061,0.776186,0.818996,0.126632,0.864247,2,0.006496,0.979378,0.001695,0.01243,0.966948,0.972505,0.177345,0.979378


In [None]:
# source: https://stackoverflow.com/questions/11528150/inter-rater-agreement-in-python-cohens-kappa
rater1 = df_for_evaluation.LB.values
rater2 = df_for_evaluation.EL.values
rater3 = df_for_evaluation.Turker.values

# One [coder, item, label] triple per vote; item ids and labels are passed as strings.
taskdata = [
    [coder, str(item), str(vote)]
    for coder, votes in enumerate((rater1, rater2, rater3))
    for item, vote in enumerate(votes)
]
ratingtask = agreement.AnnotationTask(data=taskdata)
for prefix, statistic in (
    ("kappa ", ratingtask.kappa),
    ("fleiss ", ratingtask.multi_kappa),
    ("alpha ", ratingtask.alpha),
    ("scotts ", ratingtask.pi),
):
    print(prefix + str(statistic()))

kappa 0.21989042808107853
fleiss 0.21800409183905406
alpha 0.21573565323565325
scotts 0.21245421245421225


In [None]:
# Lookup tables keyed by document id, then by question text:
#   doc2answer -> 0-based index of the correct option, doc2option -> the question's options.
choice2num = {'A': 1, 'B': 2, 'C': 3, 'D': 4}
doc2answer = {}
doc2option = {}
for doc in lines:
    doc_id = doc['id']
    doc2answer[doc_id] = {
        question: choice2num[letter] - 1
        for question, letter in zip(doc['questions'], doc['answers'])
    }
    doc2option[doc_id] = dict(zip(doc['questions'], doc['options']))

In [None]:
# Attach the gold answer index and the option texts to both questions of every pair, looked up
# by document id (aggr_document_id) and question text.
# The options_* values from the CSV are overwritten with the entries stored in doc2option.
for idx, row in df_for_evaluation.iterrows():
    df_for_evaluation.at[idx, 'answer_1'] = doc2answer[row['aggr_document_id']][row['question_1']]
    df_for_evaluation.at[idx, 'answer_2'] = doc2answer[row['aggr_document_id']][row['question_2']]
    df_for_evaluation.at[idx, 'options_1'] = doc2option[row['aggr_document_id']][row['question_1']]
    df_for_evaluation.at[idx, 'options_2'] = doc2option[row['aggr_document_id']][row['question_2']]

In [None]:
# The answer columns are used as positional indices into the option lists, so store them as int.
for answer_col in ('answer_1', 'answer_2'):
    df_for_evaluation[answer_col] = df_for_evaluation[answer_col].astype(int)

In [None]:
def get_embeddings(row):
    """Embed both option lists and the article of one evaluation row with ELMo.

    Returns:
        (options_1 embeddings, options_2 embeddings, article embedding); the
        article is passed to ELMo as a one-element batch.
    """
    embed = elmo.signatures["default"]
    options_1_emb = embed(tf.constant(row['options_1']))["default"]
    options_2_emb = embed(tf.constant(row['options_2']))["default"]
    article_emb = embed(tf.constant([row['article']]))["default"]
    return options_1_emb, options_2_emb, article_emb
    
# question to article
def get_embeddings_questions(row):
    """Embed both questions and the article of one evaluation row, each as a one-element batch.

    Returns:
        (question_1 embedding, question_2 embedding, article embedding).
    """
    embed = elmo.signatures["default"]
    question_1_emb = embed(tf.constant([row['question_1']]))["default"]
    question_2_emb = embed(tf.constant([row['question_2']]))["default"]
    article_emb = embed(tf.constant([row['article']]))["default"]
    return question_1_emb, question_2_emb, article_emb

# options to article
def get_embeddings_dists(row):
    """Cosine distances of every option of question 1 and of question 2 to the row's article.

    Returns:
        (distances for options_1, distances for options_2), each a list.
    """
    options_1_emb, options_2_emb, article_emb = get_embeddings(row)
    dists_1 = get_dists(options_1_emb, article_emb)
    dists_2 = get_dists(options_2_emb, article_emb)
    return dists_1, dists_2

# correct choice to distractors
def get_embeddings_distractors(row):
    """Embed, for both questions of a row, the distractors and the correct choice separately.

    ``row['answer_1']`` / ``row['answer_2']`` are used as positional indices of
    the correct option inside ``row['options_1']`` / ``row['options_2']``.

    Returns:
        (distractor embeddings q1, distractor embeddings q2,
         correct-choice embedding q1, correct-choice embedding q2).
    """
    embed = elmo.signatures["default"]
    options_1, options_2 = row['options_1'], row['options_2']
    wrong_1 = [option for pos, option in enumerate(options_1) if pos != row['answer_1']]
    right_1 = options_1[row['answer_1']]
    wrong_2 = [option for pos, option in enumerate(options_2) if pos != row['answer_2']]
    right_2 = options_2[row['answer_2']]
    return (
        embed(tf.constant(wrong_1))["default"],
        embed(tf.constant(wrong_2))["default"],
        embed(tf.constant([right_1]))["default"],
        embed(tf.constant([right_2]))["default"],
    )

In [None]:
# For every crowdsourced question pair compute the ELMO_c, ELMO_k and ELMO_qa signals and store
# the raw distances as columns of df_for_evaluation.
preds_elmo_c = []
preds_elmo_k = []
preds = []
preds_qa_1 = []
preds_qa_2 = []
for idx, row in df_for_evaluation.iterrows():
    # question to article (ELMO_c)
    elmo_q1, elmo_q2, elmo_article = get_embeddings_questions(row)
    score1 = spatial.distance.cosine(elmo_q1, elmo_article[0])
    score2 = spatial.distance.cosine(elmo_q2, elmo_article[0])
    if score1 > score2:
        preds_elmo_c.append(1)
    else:
        preds_elmo_c.append(2)
    df_for_evaluation.at[idx, 'distq1'] = score1
    df_for_evaluation.at[idx, 'distq2'] = score2

    # correct choice to distractors (ELMO_k)
    elmo_opts1, elmo_opts2, elmo_correct_choice_1, elmo_correct_choice_2 = get_embeddings_distractors(row)
    dists1 = get_dists(elmo_opts1, elmo_correct_choice_1)
    dists2 = get_dists(elmo_opts2, elmo_correct_choice_2)
    for i, x in enumerate(dists1 + dists2):
        df_for_evaluation.at[idx, 'dist' + str(i)] = x
    # Mean distractor distance, taken from the distances already computed
    # (the same value get_score would return).
    score1 = np.mean(dists1)
    score2 = np.mean(dists2)
    # NOTE(review): ELMO_k predicts 1 on the SMALLER score here, whereas the high-vs-middle
    # loop predicts 1 on the larger one — confirm which direction is intended.
    if score1 < score2:
        preds_elmo_k.append(1)
    else:
        preds_elmo_k.append(2)

    # options to article (ELMO_qa)
    # One ELMo pass feeds both column families: elmo_score_* and dist_opt_* hold the same
    # option-to-article distances, so the embeddings are not recomputed for the second set.
    elmo_opts1, elmo_opts2, elmo_article = get_embeddings(row)
    dists1 = get_dists(elmo_opts1, elmo_article)
    dists2 = get_dists(elmo_opts2, elmo_article)
    for i, x in enumerate(dists1 + dists2):
        df_for_evaluation.at[idx, 'elmo_score_' + str(i)] = x
    for i, x in enumerate(dists1 + dists2):
        df_for_evaluation.at[idx, 'dist_opt_' + str(i)] = x
    # Predicted answer = option closest to the article.
    preds_qa_1.append(np.argmin(dists1))
    preds_qa_2.append(np.argmin(dists2))

In [None]:
# Store the per-pair predictions collected in the loop above as columns.
for column, values in (
    ('pred_elmo_c', preds_elmo_c),
    ('pred_elmo_k', preds_elmo_k),
    ('pred_elmo_qa_1', preds_qa_1),
    ('pred_elmo_qa_2', preds_qa_2),
):
    df_for_evaluation[column] = values

In [None]:
# Spread statistics of the per-option ELMO_qa distances for question 1 and question 2.
# The column-name lists are defined here because this cell runs before the cell that lists
# the feature names. elmo_score_0..3 / 4..7 are written from `dists1 + dists2`, i.e. question 1
# then question 2 (assuming four options per question — TODO confirm).
a2p_scores_q1 = ['elmo_score_0', 'elmo_score_1', 'elmo_score_2', 'elmo_score_3']
a2p_scores_q2 = ['elmo_score_4', 'elmo_score_5', 'elmo_score_6', 'elmo_score_7']
for tag, score_cols in (('q1', a2p_scores_q1), ('q2', a2p_scores_q2)):
    scores_col = 'scores_elmo_' + tag
    df_for_evaluation[scores_col] = df_for_evaluation.apply(lambda r: [r[o] for o in score_cols], axis=1)
    # Largest distance minus the second largest.
    df_for_evaluation['max_2nd_diff_%s_elmo' % tag] = df_for_evaluation[scores_col].apply(
        lambda s: np.max(s) - np.sort(s)[-2])
    # Largest distance minus the mean of the remaining ones.
    df_for_evaluation['max_others_diff_%s_elmo' % tag] = df_for_evaluation[scores_col].apply(
        lambda s: np.max(s) - (np.sum(s) - np.max(s)) / (len(s) - 1.0))
    df_for_evaluation['scores_var_%s_elmo' % tag] = df_for_evaluation[scores_col].apply(lambda s: np.var(s))
    df_for_evaluation['max_score_%s_elmo' % tag] = df_for_evaluation[scores_col].apply(lambda s: np.max(s))

In [None]:
# Column-name groups used by the crowdsourced-label analysis.
# Model-score features, question 1 first, then question 2.
score_features = [
    '{}_{}'.format(feature, question)
    for question in ('q1', 'q2')
    for feature in ('max_2nd_diff', 'max_others_diff', 'scores_var', 'max_score')
]
# Option-to-article distances (two families holding the same eight per-option slots).
a2p_dists = ['dist_opt_{}'.format(slot) for slot in range(8)]
a2p_scores = ['elmo_score_{}'.format(slot) for slot in range(8)]
# Distractor-to-correct-choice distances and question-to-article distances.
ca2ia_dists = ['dist{}'.format(slot) for slot in range(6)]
q2a_dists = ['distq{}'.format(question) for question in (1, 2)]
# First four option slots belong to question 1, the last four to question 2.
a2p_scores_q1 = a2p_scores[:4]
a2p_scores_q2 = a2p_scores[4:]

In [None]:
# Pairs on which all three raters agree (agreement == 3). Taken as an explicit copy so that
# columns added to df_agree later do not trigger pandas' SettingWithCopyWarning.
df_agree = df_for_evaluation[df_for_evaluation.agreement==3].copy()

In [None]:
# Accuracy against the crowdsourced label and the prediction distribution, all pairs.
for pred_col in ('pred_elmo_c', 'pred_elmo_k'):
    print(pred_col)
    matches = df_for_evaluation['label'] == df_for_evaluation[pred_col]
    print(matches.mean(), Counter(df_for_evaluation[pred_col]))

pred_elmo_c
0.65 Counter({2: 45, 1: 35})
pred_elmo_k
0.5 Counter({2: 43, 1: 37})


In [None]:
# Same as above, restricted to the pairs with full rater agreement.
for pred_col in ('pred_elmo_c', 'pred_elmo_k'):
    print(pred_col)
    matches = df_agree['label'] == df_agree[pred_col]
    print(matches.mean(), Counter(df_agree[pred_col]))

pred_elmo_c
0.7567567567567568 Counter({2: 23, 1: 14})
pred_elmo_k
0.5675675675675675 Counter({1: 19, 2: 18})


In [None]:
# Predictions from the model-score features already present in the CSV:
# 1 when question 1 has the smaller feature value, 2 otherwise.
score_feature_names = ('max_2nd_diff', 'max_others_diff', 'scores_var', 'max_score')
for feature in score_feature_names:
    df_for_evaluation['prediction_' + feature] = np.where(
        df_for_evaluation[feature + '_q1'] < df_for_evaluation[feature + '_q2'], 1, 2)

prediction_names = ['prediction_' + feature for feature in score_feature_names]
name_width = max(len(name) for name in prediction_names)
for name in prediction_names:
    print(name.ljust(name_width), (df_for_evaluation['label'] == df_for_evaluation[name]).mean())

prediction_max_2nd_diff    0.575
prediction_max_others_diff 0.575
prediction_scores_var      0.575
prediction_max_score       0.575


$ELMO_{qa}$

In [None]:
# Predictions from the ELMO_qa spread features: 1 when question 1 has the smaller value, 2 otherwise.
elmo_feature_names = ('max_2nd_diff', 'max_others_diff', 'scores_var', 'max_score')
for feature in elmo_feature_names:
    df_for_evaluation['prediction_%s_elmo' % feature] = np.where(
        df_for_evaluation['%s_q1_elmo' % feature] < df_for_evaluation['%s_q2_elmo' % feature], 1, 2)

prediction_names = ['prediction_%s_elmo' % feature for feature in elmo_feature_names]
name_width = max(len(name) for name in prediction_names)
for name in prediction_names:
    print(name.ljust(name_width), (df_for_evaluation['label'] == df_for_evaluation[name]).mean())

prediction_max_2nd_diff_elmo    0.4875
prediction_max_others_diff_elmo 0.525
prediction_scores_var_elmo      0.45
prediction_max_score_elmo       0.5125


In [None]:
# ELMO_qa predictions restricted to the pairs with full rater agreement.
# `assign` returns a new frame, so no column is written into a slice of df_for_evaluation
# (the source of the SettingWithCopyWarning recorded in this cell's output).
elmo_feature_names = ('max_2nd_diff', 'max_others_diff', 'scores_var', 'max_score')
df_agree = df_agree.assign(**{
    'prediction_%s_elmo' % feature: np.where(
        df_agree['%s_q1_elmo' % feature] < df_agree['%s_q2_elmo' % feature], 1, 2)
    for feature in elmo_feature_names
})

prediction_names = ['prediction_%s_elmo' % feature for feature in elmo_feature_names]
name_width = max(len(name) for name in prediction_names)
for name in prediction_names:
    print(name.ljust(name_width), (df_agree['label'] == df_agree[name]).mean())

prediction_max_2nd_diff_elmo    0.5405405405405406
prediction_max_others_diff_elmo 0.5135135135135135
prediction_scores_var_elmo      0.40540540540540543
prediction_max_score_elmo       0.6756756756756757


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [None]:
# ELMO_qa predictions restricted to the pairs whose two questions ELMO_qa answers correctly.
# The predicted option index is compared with the gold answer index (answer_1 / answer_2);
# idx_q1 / idx_q2 hold the question's index within its document, not the correct option.
# `.copy()` makes the subset independent so adding columns raises no SettingWithCopyWarning.
df_tmp = df_for_evaluation[
    (df_for_evaluation.pred_elmo_qa_2 == df_for_evaluation.answer_2)
    & (df_for_evaluation.pred_elmo_qa_1 == df_for_evaluation.answer_1)
].copy()
elmo_feature_names = ('max_2nd_diff', 'max_others_diff', 'scores_var', 'max_score')
for feature in elmo_feature_names:
    df_tmp['prediction_%s_elmo' % feature] = np.where(
        df_tmp['%s_q1_elmo' % feature] < df_tmp['%s_q2_elmo' % feature], 1, 2)

prediction_names = ['prediction_%s_elmo' % feature for feature in elmo_feature_names]
name_width = max(len(name) for name in prediction_names)
for name in prediction_names:
    print(name.ljust(name_width), (df_tmp['label'] == df_tmp[name]).mean())

prediction_max_2nd_diff_elmo    0.6666666666666666
prediction_max_others_diff_elmo 1.0
prediction_scores_var_elmo      1.0
prediction_max_score_elmo       0.6666666666666666


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [None]:
# Same analysis on the full-agreement pairs whose two questions ELMO_qa answers correctly.
# The predicted option index is compared with the gold answer index (answer_1 / answer_2);
# idx_q1 / idx_q2 hold the question's index within its document, not the correct option.
# `.copy()` makes the subset independent so adding columns raises no SettingWithCopyWarning.
df_tmp = df_agree[
    (df_agree.pred_elmo_qa_2 == df_agree.answer_2)
    & (df_agree.pred_elmo_qa_1 == df_agree.answer_1)
].copy()
elmo_feature_names = ('max_2nd_diff', 'max_others_diff', 'scores_var', 'max_score')
for feature in elmo_feature_names:
    df_tmp['prediction_%s_elmo' % feature] = np.where(
        df_tmp['%s_q1_elmo' % feature] < df_tmp['%s_q2_elmo' % feature], 1, 2)

prediction_names = ['prediction_%s_elmo' % feature for feature in elmo_feature_names]
name_width = max(len(name) for name in prediction_names)
for name in prediction_names:
    print(name.ljust(name_width), (df_tmp['label'] == df_tmp[name]).mean())

prediction_max_2nd_diff_elmo    0.5
prediction_max_others_diff_elmo 1.0
prediction_scores_var_elmo      1.0
prediction_max_score_elmo       1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_