In [1]:
import os
import pandas as pd
import glob
import numpy as np

from scipy.stats import zscore

import gc

In [2]:
# Notebook display settings: do not truncate the text inside DataFrame cells
# and render up to 200 rows before pandas abbreviates the output.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)

In [3]:
# Show the working directory: the result folders configured in this notebook
# are relative paths, so they are resolved against this location.
os.getcwd()

'/media/sf_eduseiti/unicamp/ia368v_dd/trabalho_final'

In [4]:
# Base folder for everything related to the validation queries.
VALIDATION_RESULTS_FOLDER = "validation_queries"
# ColBERT-X ranking files live in a sub-folder of the base folder; derive the
# path from it so the two settings cannot drift apart.
COLBERTX_VALIDATION_RESULTS_FOLDER = os.path.join(VALIDATION_RESULTS_FOLDER, "colbertx_retrieval")

# Glob pattern of the ranking files of one corpus part; `{:02}` receives the
# zero-padded corpus part number and `*` matches the query-batch suffix.
COLBERT_RANKING_RESULTS_FILENAME_FORMAT = "ranking_index_{:02}_parts_*.tsv"

In [5]:
# Offset added to the per-part `colbertx_id` values so that documents from
# different corpus parts do not share ids. Entry i belongs to corpus part i;
# consecutive parts start 2,000,000 ids apart.
CORPUS_PART_INICIAL_ID = [part_index * 2_000_000 for part_index in range(5)]

In [17]:
def combine_colbertx_retrievals(colbertx_retrievals_folder, corpus_parts=(0, 1, 2, 3, 4), combined_results_output_filename=None, verbose=True):
    """Merge the ColBERT-X rankings of several corpus parts into a single ranking.

    For each corpus part, every TSV file in `colbertx_retrievals_folder` that
    matches COLBERT_RANKING_RESULTS_FILENAME_FORMAT is read and concatenated.
    The scores of that part are then z-normalized together (one mean/std per
    corpus part, across all of its queries, using `scipy.stats.zscore` with its
    defaults), and the part-local document ids are shifted by the matching
    CORPUS_PART_INICIAL_ID entry.

    Parameters
    ----------
    colbertx_retrievals_folder : str
        Folder containing the ranking TSV files (no header; columns are read as
        query_id, colbertx_id, doc_position, doc_score).
    corpus_parts : iterable of int
        Corpus part numbers to combine; each one is used both in the file name
        pattern and as an index into CORPUS_PART_INICIAL_ID.
    combined_results_output_filename : str or None
        When given, the combined frame is also written there as a TSV file
        (without the index).
    verbose : bool
        When True, show the shape and the per-query row counts of every file read.

    Returns
    -------
    pandas.DataFrame
        Columns query_id, colbertx_id, doc_position, doc_score, score_znorm and
        part, sorted by score_znorm in descending order. The index is the one
        of the individual files, so its labels are not unique.

    Raises
    ------
    ValueError
        If `corpus_parts` is empty or a corpus part has no matching ranking file.
    """
    znormalized_result_dfs = []

    for corpus_part in corpus_parts:

        print("Hanlding corpus part: {}...".format(corpus_part))

        ranking_files_pattern = os.path.join(colbertx_retrievals_folder, COLBERT_RANKING_RESULTS_FILENAME_FORMAT.format(corpus_part))

        # glob yields files in arbitrary, filesystem-dependent order; sorting
        # keeps the concatenation order (and thus the result) reproducible.
        corpus_part_retrieval_results = sorted(glob.glob(ranking_files_pattern))

        print(corpus_part_retrieval_results)

        # Fail with an actionable message instead of the opaque
        # "No objects to concatenate" raised by pd.concat on an empty list.
        if not corpus_part_retrieval_results:
            raise ValueError("No ranking files found for corpus part {} (pattern: {})".format(corpus_part, ranking_files_pattern))

        all_retrieval_results = []

        for retrieval_part in corpus_part_retrieval_results:
            retrieval_part_df = pd.read_csv(retrieval_part, sep='\t', header=None, names=['query_id', 'colbertx_id', 'doc_position', 'doc_score'])

            if verbose:
                print(retrieval_part_df.shape)
                display(retrieval_part_df.groupby('query_id').count())

            all_retrieval_results.append(retrieval_part_df)

        all_retrieval_results_df = pd.concat(all_retrieval_results)

        # Normalize within the corpus part so that scores produced by the
        # different parts can be compared when they are merged below.
        all_retrieval_results_df['score_znorm'] = zscore(all_retrieval_results_df['doc_score'])
        all_retrieval_results_df['part'] = corpus_part
        # Shift the part-local ids by the offset configured for this part.
        all_retrieval_results_df['colbertx_id'] += CORPUS_PART_INICIAL_ID[corpus_part]

        znormalized_result_dfs.append(all_retrieval_results_df)

    if not znormalized_result_dfs:
        raise ValueError("No corpus parts were given, there is nothing to combine")

    combined_retrievals_df = pd.concat(znormalized_result_dfs).sort_values('score_znorm', ascending=False)

    if combined_results_output_filename is not None:
        combined_retrievals_df.to_csv(combined_results_output_filename, sep='\t', index=False)

    return combined_retrievals_df

### Combine all the validation query retrievals, performed over the different corpus parts

In [18]:
# Read, z-normalize and merge the validation retrievals of all default corpus
# parts; with the default verbose=True every file's shape and per-query row
# counts are displayed below.
validation_scores_df = combine_colbertx_retrievals(COLBERTX_VALIDATION_RESULTS_FOLDER)

Hanlding corpus part: 0...
['validation_queries/colbertx_retrieval/ranking_index_00_parts_0_1_2.tsv', 'validation_queries/colbertx_retrieval/ranking_index_00_parts_3_4_5.tsv', 'validation_queries/colbertx_retrieval/ranking_index_00_parts_6_7_8.tsv', 'validation_queries/colbertx_retrieval/ranking_index_00_parts_9.tsv']
(97154, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,659,659,659
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(97089, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,714,714,714
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(97289, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,707,707,707
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(29163, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,277,277,277
1,322,322,322
2,438,438,438
3,280,280,280
4,129,129,129
5,227,227,227
6,508,508,508
7,339,339,339
8,583,583,583
9,292,292,292


Hanlding corpus part: 1...
['validation_queries/colbertx_retrieval/ranking_index_01_parts_0_1_2.tsv', 'validation_queries/colbertx_retrieval/ranking_index_01_parts_3_4_5.tsv', 'validation_queries/colbertx_retrieval/ranking_index_01_parts_6_7_8.tsv', 'validation_queries/colbertx_retrieval/ranking_index_01_parts_9.tsv']
(97502, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,802,802,802
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(97476, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,763,763,763
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(97486, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,698,698,698
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(29937, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,268,268,268
1,308,308,308
2,432,432,432
3,324,324,324
4,156,156,156
5,227,227,227
6,469,469,469
7,381,381,381
8,502,502,502
9,284,284,284


Hanlding corpus part: 2...
['validation_queries/colbertx_retrieval/ranking_index_02_parts_0_1_2.tsv', 'validation_queries/colbertx_retrieval/ranking_index_02_parts_3_4_5.tsv', 'validation_queries/colbertx_retrieval/ranking_index_02_parts_6_7_8.tsv', 'validation_queries/colbertx_retrieval/ranking_index_02_parts_9.tsv']
(97415, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,736,736,736
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(97344, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,717,717,717
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(97439, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,742,742,742
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(29640, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,301,301,301
1,323,323,323
2,397,397,397
3,291,291,291
4,119,119,119
5,251,251,251
6,419,419,419
7,335,335,335
8,546,546,546
9,253,253,253


Hanlding corpus part: 3...
['validation_queries/colbertx_retrieval/ranking_index_03_parts_0_1_2.tsv', 'validation_queries/colbertx_retrieval/ranking_index_03_parts_3_4_5.tsv', 'validation_queries/colbertx_retrieval/ranking_index_03_parts_6_7_8.tsv', 'validation_queries/colbertx_retrieval/ranking_index_03_parts_9.tsv']
(97504, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,673,673,673
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(96966, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,655,655,655
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(97272, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,666,666,666
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(30084, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,280,280,280
1,320,320,320
2,433,433,433
3,277,277,277
4,116,116,116
5,276,276,276
6,533,533,533
7,286,286,286
8,416,416,416
9,275,275,275


Hanlding corpus part: 4...
['validation_queries/colbertx_retrieval/ranking_index_04_parts_0_1_2.tsv', 'validation_queries/colbertx_retrieval/ranking_index_04_parts_3_4_5.tsv', 'validation_queries/colbertx_retrieval/ranking_index_04_parts_6_7_8.tsv', 'validation_queries/colbertx_retrieval/ranking_index_04_parts_9.tsv']
(96942, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,688,688,688
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(97058, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,715,715,715
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(97120, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1000,1000,1000
1,1000,1000,1000
2,1000,1000,1000
3,1000,1000,1000
4,680,680,680
5,1000,1000,1000
6,1000,1000,1000
7,1000,1000,1000
8,1000,1000,1000
9,1000,1000,1000


(31023, 4)


Unnamed: 0_level_0,colbertx_id,doc_position,doc_score
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,268,268,268
1,309,309,309
2,427,427,427
3,291,291,291
4,132,132,132
5,251,251,251
6,455,455,455
7,368,368,368
8,657,657,657
9,264,264,264


In [8]:
# Peek at the ten highest z-normalized scores (the frame is returned sorted by
# score_znorm in descending order).
validation_scores_df.head(10)

Unnamed: 0,query_id,colbertx_id,doc_position,doc_score,score_znorm,part
67127,69,6614011,1,30.673712,4.722968,3
63426,65,3655302,1,30.673641,4.712574,1
96550,99,5475501,1,30.701267,4.70561,2
36420,37,6374446,1,30.603916,4.685387,3
18733,19,5531005,1,30.612341,4.658078,2
63427,65,3751617,2,30.571152,4.657566,1
63344,65,503170,1,30.459084,4.636068,0
26722,27,2994127,1,30.527214,4.633984,1
18714,19,972162,1,30.410461,4.609721,0
96182,99,830998,1,30.40044,4.604291,0


In [9]:
# Total number of combined retrieval rows and columns.
validation_scores_df.shape

(1608903, 6)

In [12]:
# Number of distinct queries present in the combined results
# (np.unique returns the sorted unique values).
np.unique(validation_scores_df['query_id'].to_numpy()).shape

(100,)

In [14]:
# Number of retrieved rows for every (query_id, part) pair.
validation_scores_df.groupby(['query_id', 'part']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,colbertx_id,doc_position,doc_score,score_znorm
query_id,part,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,3277,3277,3277,3277
0,1,3268,3268,3268,3268
0,2,3301,3301,3301,3301
0,3,3280,3280,3280,3280
0,4,3268,3268,3268,3268
...,...,...,...,...,...
99,0,2766,2766,2766,2766
99,1,2938,2938,2938,2938
99,2,2969,2969,2969,2969
99,3,3006,3006,3006,3006


In [19]:
# Persist the combined, z-normalized validation scores as a TSV file (no index column).
validation_scores_path = os.path.join(VALIDATION_RESULTS_FOLDER, "colbertx_all_znormalized_validation_scores.tsv")
validation_scores_df.to_csv(validation_scores_path, sep='\t', index=False)