# IR-Reproducibility with Transferred Relevance Judgments

In [1]:
import pandas as pd
import json
import numpy as np
import seaborn as sns
import random
import sys

sys.path.append('../../../simulate-label-transfer/transfer_simulation')

from repro_measures import effect_ratio, delta_relative_improvement, classify_repro_pairs

def regression_by_measure(df, measure: str):
    """Plot `measure` against the rank of each tag, one regression per corpus.

    Every tag is replaced by the index of its first appearance when `df` is
    sorted by `measure`, so the x-axis carries integers instead of tag
    strings. The replacement is applied to all selected columns, not only to
    "tag".

    Args:
        df: Frame with at least the columns "corpus", "tag" and `measure`.
        measure: Name of the score column plotted on the y-axis.

    Returns:
        The seaborn FacetGrid that holds the plot.
    """
    ranked_tags = df.sort_values(measure).tag.unique()
    tag_to_rank = {tag: rank for rank, tag in enumerate(ranked_tags)}

    plot_data = df.loc[:, ["corpus", "tag", measure]].sort_values(measure)
    plot_data = plot_data.replace(tag_to_rank)

    grid = sns.FacetGrid(plot_data, hue="corpus", height=6, aspect=1.2)
    grid.map(sns.regplot, "tag", measure, scatter_kws={'alpha': 0.4})
    grid.add_legend()

    return grid

def query_regression_by_measure(df, topic: int, measure: str):
    """Plot the per-corpus regression of `measure` for one topic only.

    The rows whose "topic" equals `topic` are handed to
    regression_by_measure; the resulting grid is not returned.
    """
    rows_of_topic = df[df['topic'] == topic]
    regression_by_measure(rows_of_topic, measure)

def extract_metadata_from_Tag(tag):
    """Split a run tag into a dict of its "key^value" components.

    The tag is a "-"-separated list whose first element is the collection
    name and whose remaining elements have the form "key^value". The
    collection name 'clueweb12-and-wayback12' contains the separator itself,
    so it is rewritten with underscores before splitting.

    Returns:
        A dict with the key "collection" plus one entry per "key^value" part;
        all values are strings.
    """
    normalized = tag.replace('clueweb12-and-wayback12', 'clueweb12_and_wayback12')
    parts = ("collection^" + normalized).split("-")
    return dict(part.split("^") for part in parts)

def sorted_df(df):
    """Return a copy of `df` ordered by its "tag" column, largest tag first."""
    ordered = df.sort_values(by="tag", ascending=False)
    return ordered

def import_df(file_name):
    """Load a JSON-lines result file and expand its run tag into columns.

    Each record's "tag" is parsed with extract_metadata_from_Tag and the
    resulting fields are joined to the record by row position. The original
    tag is dropped and replaced by a new "tag" that concatenates the string
    forms of the 'body_lang.en', 'title_lang.en' and 'meta_desc_lang.en'
    values.

    Returns:
        The expanded frame, ordered by sorted_df.
    """
    raw = pd.read_json(file_name, lines=True)

    # One metadata dict per record, in the same row order as `raw`.
    metadata = pd.DataFrame([extract_metadata_from_Tag(t) for t in raw.tag])

    # Join on the positional index that reset_index() exposes as a column.
    merged = pd.merge(metadata.reset_index(), raw.reset_index(), on="index")
    merged = merged.drop(["index", "tag"], axis=1)

    body = merged['body_lang.en'].astype(str)
    title = merged['title_lang.en'].astype(str)
    meta_desc = merged['meta_desc_lang.en'].astype(str)
    merged["tag"] = body + title + meta_desc

    return sorted_df(merged)

def build_topic_df(df, topics: list=None, topic_start: int=None, topic_end: int=None):
    """Average the per-query scores of every run over a set of topics.

    The topics are given either explicitly through `topics` or as the
    inclusive range `topic_start`..`topic_end`.

    Args:
        df: Per-query frame with the columns "topic", "collection",
            "body_lang.en", "title_lang.en", "meta_desc_lang.en", "corpus",
            "tag", "bpref", "pseudoNDCG@10" and "pseudoNDCG".
        topics: Topic numbers to keep.
        topic_start: First topic of the range (inclusive).
        topic_end: Last topic of the range (inclusive).

    Returns:
        One row per (collection, weights, corpus, tag) group holding the mean
        of each score, ordered by sorted_df. When a range was given, a
        "topics" column of the form "topics-<start>-<end>" is included.

    Raises:
        ValueError: If both `topics` and a range are given, or neither is.
    """
    # Compare against None so that a range starting at topic 0 is recognized.
    has_range = topic_start is not None and topic_end is not None

    if topics and has_range:
        raise ValueError('Pass either `topics` or `topic_start`/`topic_end`, not both.')
    if not topics:
        if not has_range:
            raise ValueError('Either `topics` or both `topic_start` and `topic_end` are required.')
        topics = list(range(topic_start, topic_end + 1))

    group_columns = ['collection', 'body_lang.en', 'title_lang.en', 'meta_desc_lang.en', 'corpus', 'tag']
    score_columns = ['bpref', 'pseudoNDCG@10', 'pseudoNDCG']

    df = df[df['topic'].astype(int).isin(topics)]
    # The string "mean" selects the same group-wise mean that np.mean was
    # mapped to, without triggering the pandas deprecation warning.
    df = df.groupby(group_columns).agg({column: 'mean' for column in score_columns}).reset_index()

    columns = ['collection', 'body_lang.en', 'title_lang.en', 'meta_desc_lang.en', 'corpus']
    if has_range:
        df['topics'] = 'topics-' + str(topic_start) + '-' + str(topic_end)
        columns.append('topics')
    columns += score_columns + ['tag']

    return sorted_df(df[columns])

def is_source_collection(r, target_collection):
    """Tell whether row `r` belongs to the source side of a label transfer.

    For the targets 'cw12' and 'cw12wb12' the source rows are the 'cw09' rows
    of topics up to 200. For 'cc15' the 'cw12' rows of topics above 200 count
    as source as well.

    Args:
        r: Mapping with the keys 'topic' and 'corpus'.
        target_collection: One of 'cw12', 'cw12wb12' or 'cc15'.

    Raises:
        ValueError: If `target_collection` is none of the supported targets.
    """
    if target_collection in ['cw12', 'cw12wb12']:
        return r['topic'] <= 200 and r['corpus'] == 'cw09'
    if target_collection == 'cc15':
        return (r['corpus'] == 'cw09' and r['topic'] <= 200) or (r['corpus'] == 'cw12' and r['topic'] > 200)
    # A bare `raise` here only produced "No active exception to reraise".
    raise ValueError('Unsupported target collection: ' + repr(target_collection))

def label_transfer_df(df, target_collection):
    """Reduce `df` to the rows relevant for a transfer to `target_collection`.

    Works on a copy of `df`. Rows are kept when is_source_collection marks
    them as source or when their corpus equals `target_collection`. On source
    rows both "corpus" and "collection" are overwritten with 'source'.
    Finally only rows whose topic passes is_shallow_topic for the target are
    returned; the helper columns "is_source_collection" and "keep" stay in
    the result.
    """
    result = df.copy()
    result['is_source_collection'] = result.apply(
        lambda row: is_source_collection(row, target_collection), axis=1
    )

    from_source = result['is_source_collection'] == True
    from_target = result['corpus'] == target_collection
    result = result[from_source | from_target]

    result['corpus'] = result.apply(
        lambda row: 'source' if row['is_source_collection'] else target_collection, axis=1
    )
    result['collection'] = result.apply(
        lambda row: 'source' if row['is_source_collection'] else row['collection'], axis=1
    )

    result['keep'] = result.apply(
        lambda row: is_shallow_topic(target_collection, row['topic']), axis=1
    )
    result = result[result['keep'] == True]

    return result

In [5]:
# Directory prefix for the evaluation data used in this notebook.
base_dir = '/mnt/ceph/storage/data-in-progress/kibi9872/sigir2021/'
# Judging by the file names, topic_df holds scores aggregated over topic ranges and
# query_df holds one score row per query — TODO confirm against the files.
topic_df = import_df(base_dir + 'data-26-10-2020/rankings/reproducibility-evaluation-zero-scores-removed.jsonl')
query_df = import_df(base_dir + 'data-26-10-2020/rankings/reproducibility-evaluation-per-query-zero-scores-removed.jsonl')

# Helpers from transferred-topics

In [3]:
import pandas as pd

def is_shallow_topic(topic):
    """Check the judgment counts of one transferred topic.

    True when the topic has at least one relevant and at least one irrelevant
    judgment and at least ten judgments in total.

    Args:
        topic: Mapping with the numeric keys 'relevant' and 'irrelevant'.
    """
    relevant, irrelevant = topic['relevant'], topic['irrelevant']
    has_both_labels = relevant > 0 and irrelevant > 0
    return has_both_labels and (relevant + irrelevant) >= 10
    
# Overview of the transferred topics, one JSON object per line.
transferred_topics = pd.read_json('../../test/resources/overview-of-transferred-topics.jsonl', lines=True)
# Uses the single-argument is_shallow_topic defined directly above; this line has to run
# before the two-argument redefinition further down replaces that name.
transferred_topics['shallowTopic'] = transferred_topics.apply(lambda i: is_shallow_topic(i), axis=1)

def is_shallow_topic(corpus, topic):
    """Look up the precomputed 'shallowTopic' flag of a topic for a target corpus.

    Searches `transferred_topics` for rows whose 'targetCorpus' equals
    `corpus` and whose 'topic' equals `topic`. Returns False when there is no
    such row, otherwise the 'shallowTopic' value of the first match.
    """
    same_corpus = transferred_topics['targetCorpus'] == corpus
    same_topic = transferred_topics['topic'] == topic
    matches = transferred_topics[same_corpus & same_topic]

    if len(matches) == 0:
        return False
    return matches.iloc[0]['shallowTopic']

# Precalculations

In [4]:
# Per-query results prepared by label_transfer_df, keyed by target corpus.
query_dfs = {
    target: label_transfer_df(query_df, target)
    for target in ('cc15', 'cw12', 'cw12wb12')
}

In [5]:
topic_df

Unnamed: 0,collection,body_lang.en,title_lang.en,meta_desc_lang.en,corpus,topics,bpref,pseudoNDCG@10,pseudoNDCG,tag
502,clueweb12_and_wayback12,1.0,1.0,1.0,cw12wb12,topics-1-50,0.445783,0.472292,0.604790,1.01.01.0
2807,webis_warc_clueweb12_011,1.0,1.0,1.0,cw12,topics-201-250,0.350840,0.341778,0.578638,1.01.01.0
1398,webis_warc_clueweb09_003,1.0,1.0,1.0,cw09,topics-101-150,0.275651,0.287052,0.468339,1.01.01.0
2446,clueweb12_and_wayback12,1.0,1.0,1.0,cw12wb12,topics-151-200,0.309870,0.371314,0.489354,1.01.01.0
3310,webis_warc_commoncrawl15_002,1.0,1.0,1.0,cc15,topics-51-100,0.364970,0.470172,0.526436,1.01.01.0
...,...,...,...,...,...,...,...,...,...,...
2691,webis_warc_clueweb12_011,0.0,0.0,0.0,cw12,topics-201-250,0.000000,0.000000,0.000000,0.00.00.0
1849,clueweb12_and_wayback12,0.0,0.0,0.0,cw12wb12,topics-101-150,0.000000,0.000000,0.000000,0.00.00.0
1201,clueweb12_and_wayback12,0.0,0.0,0.0,cw12wb12,topics-51-100,0.000000,0.000000,0.000000,0.00.00.0
2497,clueweb12_and_wayback12,0.0,0.0,0.0,cw12wb12,topics-151-200,0.000000,0.000000,0.000000,0.00.00.0


In [9]:
[i for i in query_dfs['cc15'][(query_dfs['cc15']['is_source_collection'] == True) & (query_dfs['cc15']['collection'] == 'source')].tag.unique()]

['1.01.01.0',
 '1.01.00.8',
 '1.01.00.6',
 '1.01.00.4',
 '1.01.00.2',
 '1.01.00.0',
 '1.00.81.0',
 '1.00.80.8',
 '1.00.80.6',
 '1.00.80.4',
 '1.00.80.2',
 '1.00.80.0',
 '1.00.61.0',
 '1.00.60.8',
 '1.00.60.6',
 '1.00.60.4',
 '1.00.60.2',
 '1.00.60.0',
 '1.00.41.0',
 '1.00.40.8',
 '1.00.40.6',
 '1.00.40.4',
 '1.00.40.2',
 '1.00.40.0',
 '1.00.21.0',
 '1.00.20.8',
 '1.00.20.6',
 '1.00.20.4',
 '1.00.20.2',
 '1.00.20.0',
 '1.00.01.0',
 '1.00.00.8',
 '1.00.00.6',
 '1.00.00.4',
 '1.00.00.2',
 '1.00.00.0',
 '0.81.01.0',
 '0.81.00.8',
 '0.81.00.6',
 '0.81.00.4',
 '0.81.00.2',
 '0.81.00.0',
 '0.80.81.0',
 '0.80.80.8',
 '0.80.80.6',
 '0.80.80.4',
 '0.80.80.2',
 '0.80.80.0',
 '0.80.61.0',
 '0.80.60.8',
 '0.80.60.6',
 '0.80.60.4',
 '0.80.60.2',
 '0.80.60.0',
 '0.80.41.0',
 '0.80.40.8',
 '0.80.40.6',
 '0.80.40.4',
 '0.80.40.2',
 '0.80.40.0',
 '0.80.21.0',
 '0.80.20.8',
 '0.80.20.6',
 '0.80.20.4',
 '0.80.20.2',
 '0.80.20.0',
 '0.80.01.0',
 '0.80.00.8',
 '0.80.00.6',
 '0.80.00.4',
 '0.80.00.2',
 '0.80

# Sample Pairs

- Load from file if it exists, or calculate new pairs

In [20]:
def position_pair(corpus, topics, measure, firstPos=None, secondPos=None):
    """Build the comparison record for two runs picked from the source ranking.

    The runs (tags) are ordered by their position in the ranking returned by
    performance_ranking_on_source. Two positions are either given or drawn
    uniformly at random; the tag with the smaller source position is always
    reported as the first one.

    Args:
        corpus: Key into `query_dfs`.
        topics: Topic numbers the ranking and the topic-level scores use.
        measure: Name of the score column.
        firstPos: Index into the ordered tag list, or None for a random one.
        secondPos: Index into the ordered tag list, or None for a random one.

    Returns:
        The dict produced by calculate_position_pair.

    Raises:
        ValueError: If fewer than two tags are available, so no pair exists.

    Note:
        When both positions select the same tag, both are re-drawn at random,
        even if they were passed explicitly.
    """
    ranking = performance_ranking_on_source(corpus, topics, measure)
    tags = sorted(ranking, key=lambda tag: ranking[tag]['pos'])

    # Without this guard the re-draw below could never find two distinct tags.
    if len(tags) < 2:
        raise ValueError('Need at least two tags to form a pair, got ' + str(len(tags)) + '.')

    if firstPos is None:
        firstPos = random.randrange(0, len(tags))

    if secondPos is None:
        secondPos = random.randrange(0, len(tags))

    # Re-draw in a loop instead of recursing, so the ranking is computed once
    # and a long run of collisions cannot exhaust the recursion limit.
    while tags[firstPos] == tags[secondPos]:
        firstPos = random.randrange(0, len(tags))
        secondPos = random.randrange(0, len(tags))

    firstTag = tags[firstPos]
    secondTag = tags[secondPos]

    first_topic_level_df = topics_to_performance_on_source_and_on_target_for_tag(corpus, firstTag, topics, measure)
    first_topic_level_df['tag'] = firstTag
    second_topic_level_df = topics_to_performance_on_source_and_on_target_for_tag(corpus, secondTag, topics, measure)
    second_topic_level_df['tag'] = secondTag

    topic_level_df = pd.concat([first_topic_level_df, second_topic_level_df])

    # Report the tag that ranks higher on the source first.
    if ranking[firstTag]['pos'] > ranking[secondTag]['pos']:
        firstTag, secondTag = secondTag, firstTag
    return calculate_position_pair(ranking, firstTag, secondTag, topic_level_df)

def calculate_position_pair(df, first, second, topic_level_df):
    """Assemble the result record for the pair (`first`, `second`).

    Args:
        df: Mapping from tag to a dict with the keys 'pos' and 'measure'.
        first: Tag passed to the repro measures as `advanced`.
        second: Tag passed to the repro measures as `baseline`.
        topic_level_df: Topic-level scores handed to effect_ratio and
            delta_relative_improvement.

    Returns:
        A dict with the tags, their source positions and scores, the effect
        ratio and the delta relative improvement. The key names are read by
        later cells and from saved files, so they are kept as they are.
    """
    first_entry = df[first]
    second_entry = df[second]

    pair = {
        'firstTag': first,
        'sourceFirstPos': first_entry['pos'],
        'sourceFirstScore': first_entry['measure'],
        'sourceSecondPost': second_entry['pos'],
        'sourceSecondTag': second,
        'sourceSecondScore': second_entry['measure'],
    }
    pair['effectRatio'] = effect_ratio(df=topic_level_df, baseline=second, advanced=first)
    pair['delta_relative_improvement'] = delta_relative_improvement(
        df=topic_level_df, baseline=second, advanced=first
    )
    return pair
    
def topics_to_performance_on_source_and_on_target_for_tag(corpus, tag, topics, measure):
    """Collect the per-topic scores of one tag from `query_dfs[corpus]`.

    Keeps the rows of `tag` whose topic is contained in `topics` and copies
    the `measure` column into a column named "measure".

    Returns:
        A frame with the columns "index" (the original row label),
        "collection", "topic" and "measure".
    """
    rows = query_dfs[corpus]
    rows = rows[rows['tag'] == tag]

    in_topics = rows['topic'].astype(int).isin(topics)
    selected = rows[in_topics].copy()
    selected['measure'] = selected[measure]

    return selected[['collection', 'topic', 'measure']].reset_index()

def query_df_for_topic(corpus, topics):
    """Aggregate the per-query scores of `corpus` over the given topics.

    The rows of `query_dfs[corpus]` are restricted to `topics` and then passed
    to build_topic_df with the same topic list.
    """
    per_query = query_dfs[corpus]
    topic_mask = per_query['topic'].astype(int).isin(topics)
    return build_topic_df(per_query[topic_mask], topics)

def performance_ranking_on_source(corpus, topics, measure):
    """Rank all tags by their aggregated `measure` on the source collection.

    Returns:
        A dict mapping each tag to {'pos': ..., 'measure': ...}, where 'pos'
        is the zero-based row position after sorting the source rows by
        `measure` in descending order and 'measure' is the score itself.
    """
    scores = query_df_for_topic(corpus, topics)
    source_scores = scores[scores['collection'] == 'source']
    ranked = source_scores.sort_values(measure, ascending=False).reset_index()

    ranking = {}
    for pos, row in ranked.iterrows():
        ranking[row['tag']] = {'pos': pos, 'measure': row[measure]}
    return ranking

In [14]:
query_df_for_topic('cc15', range(0,50))

Unnamed: 0,collection,body_lang.en,title_lang.en,meta_desc_lang.en,corpus,bpref,pseudoNDCG@10,pseudoNDCG,tag
429,webis_warc_commoncrawl15_002,1.0,1.0,1.0,cc15,0.445148,0.620208,0.687350,1.01.01.0
214,source,1.0,1.0,1.0,source,0.287577,0.229887,0.587927,1.01.01.0
213,source,1.0,1.0,0.8,source,0.294400,0.245944,0.589603,1.01.00.8
428,webis_warc_commoncrawl15_002,1.0,1.0,0.8,cc15,0.469490,0.654750,0.721659,1.01.00.8
212,source,1.0,1.0,0.6,source,0.301634,0.251413,0.593319,1.01.00.6
...,...,...,...,...,...,...,...,...,...
217,webis_warc_commoncrawl15_002,0.0,0.0,0.6,cc15,0.211302,0.275214,0.268169,0.00.00.6
216,webis_warc_commoncrawl15_002,0.0,0.0,0.4,cc15,0.211302,0.275214,0.268169,0.00.00.4
1,source,0.0,0.0,0.4,source,0.147564,0.248800,0.214156,0.00.00.4
0,source,0.0,0.0,0.2,source,0.147564,0.248800,0.214156,0.00.00.2


In [21]:
performance_ranking_on_source('cc15', range(0,50), 'pseudoNDCG')

Unnamed: 0,index,collection,body_lang.en,title_lang.en,meta_desc_lang.en,corpus,bpref,pseudoNDCG@10,pseudoNDCG,tag
0,149,source,0.8,0.2,0.0,source,0.334591,0.332952,0.627283,0.80.20.0
1,185,source,1.0,0.2,0.0,source,0.333549,0.352994,0.626348,1.00.20.0
2,203,source,1.0,0.8,0.0,source,0.334874,0.322741,0.622779,1.00.80.0
3,77,source,0.4,0.2,0.0,source,0.335073,0.318302,0.622776,0.40.20.0
4,155,source,0.8,0.4,0.0,source,0.335073,0.318302,0.622776,0.80.40.0
...,...,...,...,...,...,...,...,...,...,...
210,4,source,0.0,0.0,1.0,source,0.147564,0.248800,0.214156,0.00.01.0
211,3,source,0.0,0.0,0.8,source,0.147564,0.248800,0.214156,0.00.00.8
212,2,source,0.0,0.0,0.6,source,0.147564,0.248800,0.214156,0.00.00.6
213,1,source,0.0,0.0,0.4,source,0.147564,0.248800,0.214156,0.00.00.4


In [26]:
performance_ranking_on_source('cc15', range(0,50), 'pseudoNDCG')

{'0.80.20.0': {'pos': 0, 'measure': 0.6272834450859306},
 '1.00.20.0': {'pos': 1, 'measure': 0.6263484061190312},
 '1.00.80.0': {'pos': 2, 'measure': 0.6227788797816403},
 '0.40.20.0': {'pos': 3, 'measure': 0.6227760584885889},
 '0.80.40.0': {'pos': 4, 'measure': 0.6227760584885887},
 '1.00.40.0': {'pos': 5, 'measure': 0.62276308964051},
 '0.60.20.0': {'pos': 6, 'measure': 0.6227607987677791},
 '0.80.60.0': {'pos': 7, 'measure': 0.6222375525914834},
 '1.01.00.0': {'pos': 8, 'measure': 0.622009397267328},
 '0.40.40.0': {'pos': 9, 'measure': 0.622009397267328},
 '0.20.20.0': {'pos': 10, 'measure': 0.622009397267328},
 '0.80.80.0': {'pos': 11, 'measure': 0.622009397267328},
 '0.60.60.0': {'pos': 12, 'measure': 0.6220093972673278},
 '1.00.60.0': {'pos': 13, 'measure': 0.621282225061434},
 '0.60.40.0': {'pos': 14, 'measure': 0.6210745873790533},
 '0.81.00.0': {'pos': 15, 'measure': 0.6203828690436336},
 '0.60.80.0': {'pos': 16, 'measure': 0.6198693468173402},
 '0.40.60.0': {'pos': 17, 'meas

In [25]:
position_pair('cc15', range(0,50), 'pseudoNDCG')

{'firstTag': '1.00.00.2',
 'sourceFirstPos': 71,
 'sourceFirstScore': 0.5912597178679921,
 'sourceSecondPost': 147,
 'sourceSecondTag': '0.40.00.4',
 'sourceSecondScore': 0.5781622172177822}

In [22]:
position_pair('cc15', range(0,50), 'pseudoNDCG')

{'firstTag': '0.60.20.2',
 'sourceFirstPos': 53,
 'sourceFirstScore': 0.5972886373382029,
 'sourceSecondPost': 55,
 'sourceSecondTag': '0.80.60.4',
 'sourceSecondScore': 0.5966824873103584,
 'effectRatio': -32.01656061149727}

In [25]:
from tqdm import tqdm

def sample_pairs_and_save_them(corpus, topics, measure, number_queries):
    """Sample random position pairs and store them as a JSON-lines file.

    Calls position_pair `number_queries` times. Pairs that fail are skipped
    (best effort), but the number of skipped pairs is reported instead of
    being dropped silently.

    Args:
        corpus: Target corpus; also part of the output file name.
        topics: Topic numbers passed on to position_pair.
        measure: Score column; also part of the output file name.
        number_queries: Number of pairs to attempt.

    The file is written to `base_dir` under
    'data-26-10-2020/ir-reproducibility-with-transferred-relevance-judgments/'.
    """
    pairs = []
    failures = 0
    last_error = None
    for _ in tqdm(range(0, number_queries)):
        try:
            pairs.append(position_pair(corpus, topics, measure))
        # `Exception` rather than a bare except, so that Ctrl-C still stops
        # the loop.
        except Exception as error:
            failures += 1
            last_error = error

    if failures:
        print('Skipped ' + str(failures) + ' of ' + str(number_queries) + ' pairs; last error: ' + repr(last_error))

    pd.DataFrame(pairs).to_json(base_dir + 'data-26-10-2020/ir-reproducibility-with-transferred-relevance-judgments/' + corpus + '-' + measure + '.jsonl', lines=True, orient='records')
    
def all_top_pairs_and_save_them(corpus, topics, measure):
    """Store every pair among the first 20 source-ranking positions.

    Calls position_pair for all position combinations (left, right) with
    0 <= left < right < 20. Pairs that fail are skipped (best effort), and
    the number of skipped pairs is reported. Returns None.

    Args:
        corpus: Target corpus; also part of the output file name.
        topics: Topic numbers passed on to position_pair.
        measure: Score column; also part of the output file name.

    The file is written to `base_dir` under
    'data-26-10-2020/ir-reproducibility-with-transferred-relevance-judgments/'
    with the prefix 'top-20-'.
    """
    pairs = []
    failures = 0
    last_error = None
    for left in tqdm(range(0, 20)):
        for right in range(left + 1, 20):
            try:
                pairs.append(position_pair(corpus, topics, measure, firstPos=left, secondPos=right))
            # `Exception` rather than a bare except, so that Ctrl-C still
            # stops the loop.
            except Exception as error:
                failures += 1
                last_error = error

    if failures:
        print('Skipped ' + str(failures) + ' pairs; last error: ' + repr(last_error))

    pd.DataFrame(pairs).to_json(base_dir + 'data-26-10-2020/ir-reproducibility-with-transferred-relevance-judgments/top-20-' + corpus + '-' + measure + '.jsonl', lines=True, orient='records')
            

In [None]:
%%time

sample_pairs_and_save_them(corpus='cc15', topics=range(0,300), measure='pseudoNDCG', number_queries=300)

 35%|███▍      | 104/300 [00:16<00:32,  5.95it/s]

In [None]:
# all_top_pairs_and_save_them writes its result to disk and returns None, so it is
# called directly; iterating over the return value would raise a TypeError.
all_top_pairs_and_save_them(corpus='cc15', topics=range(0,300), measure='pseudoNDCG')

In [43]:
position_pair(corpus='cw12', topics=range(0,300), measure='pseudoNDCG', firstPos=50, secondPos=60)

{'firstTag': '1.00.80.4',
 'sourceFirstPos': 50,
 'sourceFirstScore': 0.5253966412021462,
 'sourceSecondPost': 60,
 'sourceSecondTag': '1.01.00.8',
 'sourceSecondScore': 0.5245060906269584,
 'effectRatio': 0.2703620951211685,
 'delta_relative_improvement': 0.001296841063476402}

In [26]:
%%time
all_top_pairs_and_save_them(corpus='cc15', topics=range(0,300), measure='pseudoNDCG')

100%|██████████| 20/20 [03:54<00:00, 11.72s/it]

CPU times: user 3min 54s, sys: 230 ms, total: 3min 55s
Wall time: 3min 54s





In [27]:
%%time
all_top_pairs_and_save_them(corpus='cw12', topics=range(0,300), measure='pseudoNDCG')

100%|██████████| 20/20 [04:29<00:00, 13.47s/it]

CPU times: user 4min 30s, sys: 321 ms, total: 4min 30s
Wall time: 4min 29s





In [28]:
%%time
all_top_pairs_and_save_them(corpus='cw12wb12', topics=range(0,300), measure='pseudoNDCG')

100%|██████████| 20/20 [04:01<00:00, 12.09s/it]

CPU times: user 4min 4s, sys: 374 ms, total: 4min 4s
Wall time: 4min 1s





In [29]:
%%time
all_top_pairs_and_save_them(corpus='cc15', topics=range(0,300), measure='bpref')

100%|██████████| 20/20 [02:55<00:00,  8.80s/it]


CPU times: user 2min 56s, sys: 111 ms, total: 2min 56s
Wall time: 2min 56s


In [30]:
%%time
all_top_pairs_and_save_them(corpus='cw12', topics=range(0,300), measure='bpref')

100%|██████████| 20/20 [03:26<00:00, 10.34s/it]


CPU times: user 3min 27s, sys: 211 ms, total: 3min 28s
Wall time: 3min 27s


In [31]:
%%time
all_top_pairs_and_save_them(corpus='cw12wb12', topics=range(0,300), measure='bpref')

100%|██████████| 20/20 [03:49<00:00, 11.47s/it]


CPU times: user 3min 49s, sys: 291 ms, total: 3min 50s
Wall time: 3min 49s


In [68]:
from tqdm import tqdm

def sample_pairs_and_save_them_with_top_topics(corpus, measure, number_queries):
    """Sample random position pairs over the stored top topics and save them.

    The topic list is read from the 'top-topics-<measure>-to-<corpus>.json'
    file. position_pair is then called `number_queries` times; pairs that
    fail are skipped (best effort), and the number of skipped pairs is
    reported.

    Args:
        corpus: Target corpus; part of the input and output file names.
        measure: Score column; part of the input and output file names.
        number_queries: Number of pairs to attempt.

    The result is written to `base_dir` with the prefix 'best-topics-'.
    """
    topics_file = '/mnt/ceph/storage/data-in-progress/kibi9872/sigir2021/data-26-10-2020/top-topics-' + measure + '-to-' + corpus + '.json'
    # The context manager closes the file handle again.
    with open(topics_file) as f:
        topics = [int(i) for i in json.load(f)]

    pairs = []
    failures = 0
    last_error = None
    for _ in tqdm(range(0, number_queries)):
        try:
            pairs.append(position_pair(corpus, topics, measure))
        # `Exception` rather than a bare except, so that Ctrl-C still stops
        # the loop.
        except Exception as error:
            failures += 1
            last_error = error

    if failures:
        print('Skipped ' + str(failures) + ' of ' + str(number_queries) + ' pairs; last error: ' + repr(last_error))

    pd.DataFrame(pairs).to_json(base_dir + 'data-26-10-2020/ir-reproducibility-with-transferred-relevance-judgments/best-topics-' + corpus + '-' + measure + '.jsonl', lines=True, orient='records')
    
def all_top_pairs_and_save_them_with_top_topics(corpus, measure):
    """Store every pair among the first 20 positions, using the stored top topics.

    The topic list is read from the 'top-topics-<measure>-to-<corpus>.json'
    file. position_pair is called for all position combinations
    (left, right) with 0 <= left < right < 20; pairs that fail are skipped
    (best effort), and the number of skipped pairs is reported. Returns None.

    Args:
        corpus: Target corpus; part of the input and output file names.
        measure: Score column; part of the input and output file names.

    The result is written to `base_dir` with the prefix 'best-topics-top-20-'.
    """
    topics_file = '/mnt/ceph/storage/data-in-progress/kibi9872/sigir2021/data-26-10-2020/top-topics-' + measure + '-to-' + corpus + '.json'
    # The context manager closes the file handle again.
    with open(topics_file) as f:
        topics = [int(i) for i in json.load(f)]

    pairs = []
    failures = 0
    last_error = None
    for left in tqdm(range(0, 20)):
        for right in range(left + 1, 20):
            try:
                pairs.append(position_pair(corpus, topics, measure, firstPos=left, secondPos=right))
            # `Exception` rather than a bare except, so that Ctrl-C still
            # stops the loop.
            except Exception as error:
                failures += 1
                last_error = error

    if failures:
        print('Skipped ' + str(failures) + ' pairs; last error: ' + repr(last_error))

    pd.DataFrame(pairs).to_json(base_dir + 'data-26-10-2020/ir-reproducibility-with-transferred-relevance-judgments/best-topics-top-20-' + corpus + '-' + measure + '.jsonl', lines=True, orient='records')


In [69]:
%%time
sample_pairs_and_save_them_with_top_topics(corpus='cc15', measure='pseudoNDCG', number_queries=2000)

100%|██████████| 2000/2000 [16:59<00:00,  1.96it/s]

CPU times: user 17min 15s, sys: 6.17 s, total: 17min 21s
Wall time: 16min 59s





In [70]:
%%time
sample_pairs_and_save_them_with_top_topics(corpus='cw12', measure='pseudoNDCG', number_queries=2000)

100%|██████████| 2000/2000 [12:25<00:00,  2.68it/s]

CPU times: user 12min 43s, sys: 5.2 s, total: 12min 49s
Wall time: 12min 25s





In [71]:
%%time
sample_pairs_and_save_them_with_top_topics(corpus='cw12wb12', measure='pseudoNDCG', number_queries=2000)

100%|██████████| 2000/2000 [12:52<00:00,  2.59it/s]

CPU times: user 13min 13s, sys: 5.65 s, total: 13min 19s
Wall time: 12min 52s





In [72]:
%%time
sample_pairs_and_save_them_with_top_topics(corpus='cc15', measure='bpref', number_queries=2000)

100%|██████████| 2000/2000 [14:43<00:00,  2.26it/s]

CPU times: user 15min 9s, sys: 6.47 s, total: 15min 15s
Wall time: 14min 44s





In [73]:
%%time
sample_pairs_and_save_them_with_top_topics(corpus='cw12', measure='bpref', number_queries=2000)

100%|██████████| 2000/2000 [15:18<00:00,  2.18it/s]

CPU times: user 15min 45s, sys: 7.56 s, total: 15min 53s
Wall time: 15min 18s





In [74]:
%%time
sample_pairs_and_save_them_with_top_topics(corpus='cw12wb12', measure='bpref', number_queries=2000)

100%|██████████| 2000/2000 [14:52<00:00,  2.24it/s]

CPU times: user 15min 18s, sys: 6.89 s, total: 15min 25s
Wall time: 14min 52s





In [35]:
%%time
all_top_pairs_and_save_them_with_top_topics(corpus='cc15', measure='pseudoNDCG')

100%|██████████| 20/20 [01:28<00:00,  4.41s/it]

CPU times: user 1min 28s, sys: 261 ms, total: 1min 29s
Wall time: 1min 28s





In [36]:
%%time
all_top_pairs_and_save_them_with_top_topics(corpus='cw12', measure='pseudoNDCG')

100%|██████████| 20/20 [01:21<00:00,  4.06s/it]

CPU times: user 1min 22s, sys: 233 ms, total: 1min 22s
Wall time: 1min 21s





In [37]:
%%time
all_top_pairs_and_save_them_with_top_topics(corpus='cw12wb12', measure='pseudoNDCG')

100%|██████████| 20/20 [01:30<00:00,  4.53s/it]

CPU times: user 1min 31s, sys: 232 ms, total: 1min 31s
Wall time: 1min 30s





In [38]:
%%time
all_top_pairs_and_save_them_with_top_topics(corpus='cc15', measure='bpref')

100%|██████████| 20/20 [01:38<00:00,  4.91s/it]

CPU times: user 1min 38s, sys: 160 ms, total: 1min 38s
Wall time: 1min 38s





In [39]:
%%time
all_top_pairs_and_save_them_with_top_topics(corpus='cw12', measure='bpref')

100%|██████████| 20/20 [01:56<00:00,  5.82s/it]

CPU times: user 1min 56s, sys: 250 ms, total: 1min 56s
Wall time: 1min 56s





In [40]:
%%time
all_top_pairs_and_save_them_with_top_topics(corpus='cw12wb12', measure='bpref')

100%|██████████| 20/20 [01:48<00:00,  5.42s/it]

CPU times: user 1min 48s, sys: 201 ms, total: 1min 48s
Wall time: 1min 48s





In [57]:
topics_to_performance_on_source_and_on_target_for_tag('cc15', '0.60.60.2', [i for i in range(0,50)], 'pseudoNDCG')

Unnamed: 0,collection,body_lang.en,title_lang.en,meta_desc_lang.en,corpus,topic,bpref,pseudoNDCG@10,pseudoNDCG,tag,is_source_collection,keep
4503,source,0.6,0.6,0.2,source,3,0.429591,0.313747,0.602354,0.60.60.2,True,True
4516,source,0.6,0.6,0.2,source,16,0.27377,0.071398,0.634622,0.60.60.2,True,True
4514,source,0.6,0.6,0.2,source,14,0.110469,0.168364,0.601521,0.60.60.2,True,True
4513,source,0.6,0.6,0.2,source,13,0.125,0.275553,0.419379,0.60.60.2,True,True
135982,webis_warc_commoncrawl15_002,0.6,0.6,0.2,cc15,26,0.333333,0.835693,0.835693,0.60.60.2,False,True
4523,source,0.6,0.6,0.2,source,24,0.266657,0.123515,0.595799,0.60.60.2,True,True
135985,webis_warc_commoncrawl15_002,0.6,0.6,0.2,cc15,31,0.885714,0.979921,0.979921,0.60.60.2,False,True
4524,source,0.6,0.6,0.2,source,25,0.236496,0.319972,0.638579,0.60.60.2,True,True
4530,source,0.6,0.6,0.2,source,31,0.635973,0.618149,0.766573,0.60.60.2,True,True
4525,source,0.6,0.6,0.2,source,26,0.395556,0.384266,0.708231,0.60.60.2,True,True


In [80]:
topics_to_performance_on_source_and_on_target_for_tag('cc15', '0.60.60.2', [i for i in range(0,50)], 'pseudoNDCG')

Unnamed: 0,index,collection,topic,measure
0,4503,source,3,0.602354
1,4516,source,16,0.634622
2,4514,source,14,0.601521
3,4513,source,13,0.419379
4,135982,webis_warc_commoncrawl15_002,26,0.835693
5,4523,source,24,0.595799
6,135985,webis_warc_commoncrawl15_002,31,0.979921
7,4524,source,25,0.638579
8,4530,source,31,0.766573
9,4525,source,26,0.708231


In [73]:
set(topics_to_performance_on_source_and_on_target_for_tag('cc15', '0.60.60.2', [i for i in range(0,50)], 'pseudoNDCG').topic.unique())

{3, 13, 14, 16, 24, 25, 26, 31, 44, 47, 49}

In [69]:
def effect_ratio(df, measure):
    """Divide the mean non-source score by the mean source score over all topics.

    For every distinct topic in `df`, m_new supplies the numerator term and
    m_old the denominator term.

    NOTE(review): this definition replaces the `effect_ratio` imported from
    repro_measures at the top of the notebook, which is called with different
    arguments elsewhere — confirm the cell order before re-running.
    """
    topic_ids = set(df.topic.unique())
    topic_count = len(topic_ids)

    new_total = 0
    old_total = 0
    for topic_id in topic_ids:
        new_total += m_new(df, topic_id, measure)
        old_total += m_old(df, topic_id, measure)

    mean_new = new_total / topic_count
    mean_old = old_total / topic_count
    return mean_new / mean_old
                 


def m_old(df, topic, measure):
    """Return the `measure` value of `topic` on the source collection.

    Args:
        df: Frame with the columns "collection", "topic" and `measure`.
        topic: Topic to look up.
        measure: Name of the column to read.

    Raises:
        ValueError: If `df` does not hold exactly one row with collection
            'source' for `topic`.
    """
    rows = df[(df['collection'] == 'source') & (df['topic'] == topic)]
    if len(rows) != 1:
        # A bare `raise` here only produced "No active exception to reraise".
        raise ValueError('Expected exactly one source row for topic ' + str(topic) + ', found ' + str(len(rows)) + '.')

    return rows.iloc[0][measure]

def m_new(df, topic, measure):
    """Return the `measure` value of `topic` on the non-source collection.

    Args:
        df: Frame with the columns "collection", "topic" and `measure`.
        topic: Topic to look up.
        measure: Name of the column to read.

    Raises:
        ValueError: If `df` does not hold exactly one row for `topic` whose
            collection differs from 'source'.
    """
    rows = df[(df['collection'] != 'source') & (df['topic'] == topic)]
    if len(rows) != 1:
        # A bare `raise` here only produced "No active exception to reraise".
        raise ValueError('Expected exactly one non-source row for topic ' + str(topic) + ', found ' + str(len(rows)) + '.')

    return rows.iloc[0][measure]

def relative_improvement_old(df, topic, measure):
    topics = set(df.topic.unique())
    mean_score_

In [67]:
m_old(topics_to_performance_on_source_and_on_target_for_tag('cc15', '0.60.60.2', [i for i in range(0,50)], 'pseudoNDCG'), 49, 'pseudoNDCG')

0.536546781718382

In [70]:
m_new(topics_to_performance_on_source_and_on_target_for_tag('cc15', '0.60.60.2', [i for i in range(0,50)], 'pseudoNDCG'), 49, 'pseudoNDCG')

0.8599797111848091

# Evaluation Tables

In [3]:
def count_df(corpus, measure):
    """Load the saved pairs of a corpus/measure combination and add 'posDist'.

    Reads the 'best-topics-top-20-' and the 'best-topics-' file of the
    combination from `base_dir`, concatenates them in that order and adds the
    column 'posDist' = 'sourceSecondPost' - 'sourceFirstPos'.
    """
    result_dir = base_dir + 'data-26-10-2020/ir-reproducibility-with-transferred-relevance-judgments/'
    file_suffix = corpus + '-' + measure + '.jsonl'

    frames = [
        pd.read_json(result_dir + prefix + file_suffix, lines=True)
        for prefix in ('best-topics-top-20-', 'best-topics-')
    ]
    pairs = pd.concat(frames)

    pairs['posDist'] = pairs['sourceSecondPost'] - pairs['sourceFirstPos']
    return pairs


In [45]:
count_stuff('cc15', 'pseudoNDCG')

{'effect-size-success-absolute-scores-success': 134,
 'effect-size-failure-absolute-scores-failure': 45,
 'effect-size-success-absolute-scores-failure': 11}

In [46]:
count_stuff('cw12', 'pseudoNDCG')

{'effect-size-failure-absolute-scores-failure': 103,
 'effect-size-success-absolute-scores-success': 81,
 'effect-size-success-absolute-scores-failure': 6}

In [47]:
count_stuff('cw12wb12', 'pseudoNDCG')

{'effect-size-failure-absolute-scores-failure': 31,
 'effect-size-success-absolute-scores-failure': 41,
 'effect-size-success-absolute-scores-success': 118}

In [48]:
count_stuff('cc15', 'bpref')

{'effect-size-failure-absolute-scores-failure': 114,
 'effect-size-success-absolute-scores-success': 36,
 'effect-size-success-absolute-scores-failure': 40}

In [49]:
count_stuff('cw12', 'bpref')

{'effect-size-failure-absolute-scores-failure': 97,
 'effect-size-success-absolute-scores-failure': 21,
 'effect-size-success-absolute-scores-success': 72}

In [50]:
count_stuff('cw12wb12', 'bpref')

{'effect-size-failure-absolute-scores-failure': 53,
 'effect-size-success-absolute-scores-success': 99,
 'effect-size-success-absolute-scores-failure': 38}

In [17]:
count_stuff('cw12wb12', 'bpref',300)

{'effect-size-failure-absolute-scores-failure': 124,
 'effect-size-success-absolute-scores-success': 1126,
 'effect-size-success-absolute-scores-failure': 752}

In [None]:
# Pairs whose 'posDist' is below this value form the first column group of the table,
# all other pairs the second one (see row_table_ir_reproducibility).
POSITION_THRESHOLD=10

def corpus_display_name(corpus):
    """Return the label used in the table for a corpus identifier.

    Args:
        corpus: One of 'cw12', 'cw12wb12' or 'cc15'.

    Raises:
        ValueError: If `corpus` is none of the known identifiers.
    """
    display_names = {
        'cw12': 'CW12',
        'cw12wb12': 'CW12+',
        'cc15': 'CC15',
    }
    try:
        return display_names[corpus]
    except (KeyError, TypeError):
        # A bare `raise` here only produced "No active exception to reraise".
        raise ValueError('Unknown corpus: ' + repr(corpus)) from None

def _repro_outcome_shares(pairs):
    """Classify `pairs` and format the three outcome shares as table cells.

    Returns three percentages with one decimal, joined by ' & ': the share of
    'effect-size-success-absolute-scores-success', of
    'effect-size-success-absolute-scores-failure', and of both
    'effect-size-failure-*' classes together. Returns '-- & -- & --' when no
    pair was classified, instead of dividing by zero.
    """
    counts = classify_repro_pairs(pairs)
    success_success = counts.get('effect-size-success-absolute-scores-success', 0)
    success_failure = counts.get('effect-size-success-absolute-scores-failure', 0)
    # Both effect-size failure classes end up in the same column.
    failure = counts.get('effect-size-failure-absolute-scores-failure', 0)
    failure += counts.get('effect-size-failure-absolute-scores-success', 0)

    total = success_success + success_failure + failure
    if total == 0:
        return ' & '.join(['--'] * 3)

    return ' & '.join(
        '{:.1f}'.format((count / total) * 100)
        for count in (success_success, success_failure, failure)
    )


def row_table_ir_reproducibility(corpus, measure):
    """Render one LaTeX table row for a corpus/measure combination.

    The pairs from count_df are split by 'posDist': pairs below
    POSITION_THRESHOLD form the first group of three cells, pairs at or above
    it the second group. The shares over all pairs are appended after the
    row terminator.

    NOTE(review): the table header labels the groups "<=" and ">" while the
    split here is "<" and ">=" — confirm which boundary is intended.
    NOTE(review): '#' does not start a comment in LaTeX ('%' does) — confirm
    that the '# for all' suffix is removed before the table is compiled.

    Returns:
        The row as a string, starting with '& <display name> & '.
    """
    df = count_df(corpus, measure)
    small_effect_sizes = df[df['posDist'] < POSITION_THRESHOLD]
    large_effect_sizes = df[df['posDist'] >= POSITION_THRESHOLD]

    small_cells = _repro_outcome_shares(small_effect_sizes)
    large_cells = _repro_outcome_shares(large_effect_sizes)
    overall_cells = _repro_outcome_shares(df)

    row = '& ' + corpus_display_name(corpus) + ' & ' + small_cells + ' & ' + large_cells + ' \\\\'
    return row + '# for all: ' + overall_cells + ' \\\\'
    
def create_table_ir_reproducibility():
    """Return the LaTeX source of the reproducibility table.

    The table has one block of three rows for 'pseudoNDCG' and one for
    'bpref'; each row comes from row_table_ir_reproducibility for the corpora
    'cw12', 'cw12wb12' and 'cc15'. POSITION_THRESHOLD is inserted into the
    two column-group headings.

    NOTE(review): the headings read "<= threshold" and "> threshold", while
    row_table_ir_reproducibility splits the pairs with "<" and ">=" — confirm
    which boundary is intended.
    """
    return """\\begin{table}[tb]
\\centering
\\small
\\setlength{\\tabcolsep}{3pt}%
\\caption{TBD.: This is $\\ndcg$. Add bpref also?}

\\label{table-ir-reproducibility}
\\begin{tabular}{@{}clcccccc@{}}
\\toprule

\\multicolumn{2}{c@{}}{\\bfseries Target} & \\multicolumn{3}{c@{}}{\\bfseries $\\leq$ """ + str(POSITION_THRESHOLD) + """ Positions} & \\multicolumn{3}{c@{}}{\\bfseries > """ + str(POSITION_THRESHOLD) + """ Positions} \\\\

\\cmidrule{3-5}
\\cmidrule(l@{1em}){6-8}

&& S/S & S/F & F/* & S/S & S/F & F/*\\\\

\\midrule
\\parbox[t]{2mm}{\\multirow{3}{*}{\\rotatebox[origin=c]{90}{\\small $\\ndcg$ \\kern-0.6em}}}
""" + row_table_ir_reproducibility('cw12', 'pseudoNDCG') + """
""" + row_table_ir_reproducibility('cw12wb12', 'pseudoNDCG') + """
""" + row_table_ir_reproducibility('cc15', 'pseudoNDCG') + """
\\midrule
\\parbox[t]{2mm}{\\multirow{3}{*}{\\rotatebox[origin=c]{90}{\\small bpref \\kern-0.6em}}}
""" + row_table_ir_reproducibility('cw12', 'bpref') + """
""" + row_table_ir_reproducibility('cw12wb12', 'bpref') + """
""" + row_table_ir_reproducibility('cc15', 'bpref') + """

\\bottomrule
\\end{tabular}
\\end{table}
"""

# Render the table once and write its LaTeX source to the paper directory.
table_ir_reproducibility = create_table_ir_reproducibility()
with open('/sigir21/sigir21-relevance-label-transfer-paper-submitted/table-ir-reproducibility.tex', 'w+') as f:
    f.write(table_ir_reproducibility)