# Setup
### Imports

In [1]:
import sys
sys.path.append('../')
del sys

%reload_ext autoreload
%autoreload 2

import modeling.models as models
from toolbox.utils import to_class_name, add_task_argument, load_task, get_word2vec, \
    get_bart, play
from toolbox.parameters import SCORES_NAMES, MODELS_RANDOM_SEED
from toolbox.paths import MODELING_TASK_RESULTS_PATH, PRETRAINED_MODELS_PATH, \
    TENSORBOARD_LOGS_PATH

# Relative path prefix (same location that is appended to sys.path above);
# it is prepended to the path constants imported from toolbox.paths.
root = "../"

### Parameters

In [2]:
# Settings handed to `load_task` to pick the task to load.
# NOTE(review): the file name reported on load
# ("contextfreesametype_50-25-25_rs32_bs16_short.pkl") looks derived from these
# values - confirm in toolbox.utils.load_task.
args = dict(
    task="context_free_same_type",
    valid_proportion=0.25,
    test_proportion=0.25,
    ranking_size=32,
    batch_size=16,
    cross_validation=False,
    short=True,
    experiment=None,  # also forwarded to the models as `experiment_name`
)

### Load the data

In [3]:
# Load the task selected by `args` from the modeling-task results folder.
task_results_folder = root + MODELING_TASK_RESULTS_PATH
task = load_task(args=args, folder_path=task_results_folder)

Task loaded from ../results/modeling_task/contextfreesametype_50-25-25_rs32_bs16_short.pkl.



# Run the baselines
## Basic baselines
### Models

In [4]:
# Snake-case names of the basic baselines; each one is converted to a class
# name and looked up on `modeling.models` when the models are run.
model_names = [
    # Names that mention neither summaries nor context.
    "random", "frequency",
    # Names that mention summaries only.
    "summaries_count", "summaries_unique_count", "summaries_overlap", "activated_summaries",
    # Names that mention context only.
    "context_count", "context_unique_count",
    # Names that mention both summaries and context.
    "summaries_context_count", "summaries_context_unique_count", "summaries_context_overlap",
]

### Run the models

In [5]:
# Constructor arguments shared by every basic baseline; no pretrained model is
# passed to them (`trained_model=None`).
baseline_kwargs = dict(
    scores_names=SCORES_NAMES,
    relevance_level=task.relevance_level,
    trained_model=None,
    tensorboard_logs_path=root + TENSORBOARD_LOGS_PATH,
    experiment_name=args['experiment'],
    random_seed=MODELS_RANDOM_SEED,
)

# `map` is lazy: each snake-case name is converted to its class name right
# before the corresponding model is built.
for model_name in map(to_class_name, model_names):
    print(model_name)

    # Resolve the model class by name on the `modeling.models` module.
    model = getattr(models, model_name)(**baseline_kwargs)

    play(task=task, model=model)

Random
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.17178 (+/-0.12740)
recall_at_10: 0.29504 (+/-0.27525)
reciprocal_best_rank: 0.22885 (+/-0.25701)
reciprocal_average_rank: 0.06968 (+/-0.05188)
ndcg_at_10: 0.18111 (+/-0.18586)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.18693 (+/-0.13601)
recall_at_10: 0.32522 (+/-0.27985)
reciprocal_best_rank: 0.26202 (+/-0.28731)
reciprocal_average_rank: 0.07151 (+/-0.03812)
ndcg_at_10: 0.20541 (+/-0.19494)

Frequency
Learning answers counts...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


Learning answers counts...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.57757 (+/-0.28139)
recall_at_10: 0.72549 (+/-0.28205)
reciprocal_best_rank: 0.77017 (+/-0.33979)
reciprocal_average_rank: 0.24476 (+/-0.21855)
ndcg_at_10: 0.64894 (+/-0.27912)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.58709 (+/-0.26811)
recall_at_10: 0.74218 (+/-0.26832)
reciprocal_best_rank: 0.77590 (+/-0.32974)
reciprocal_average_rank: 0.23966 (+/-0.20630)
ndcg_at_10: 0.66257 (+/-0.26370)

SummariesCount
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.32534 (+/-0.25290)
recall_at_10: 0.47588 (+/-0.32628)
reciprocal_best_rank: 0.46408 (+/-0.39335)
reciprocal_average_rank: 0.12017 (+/-0.13507)
ndcg_at_10: 0.37253 (+/-0.29316)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.32469 (+/-0.25549)
recall_at_10: 0.48112 (+/-0.33162)
reciprocal_best_rank: 0.45614 (+/-0.38943)
reciprocal_average_rank: 0.11923 (+/-0.13196)
ndcg_at_10: 0.37303 (+/-0.29708)

SummariesUniqueCount
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.29602 (+/-0.22660)
recall_at_10: 0.47634 (+/-0.32048)
reciprocal_best_rank: 0.42159 (+/-0.36716)
reciprocal_average_rank: 0.11137 (+/-0.11641)
ndcg_at_10: 0.34970 (+/-0.27227)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.30027 (+/-0.23496)
recall_at_10: 0.48101 (+/-0.32643)
reciprocal_best_rank: 0.41909 (+/-0.36560)
reciprocal_average_rank: 0.11350 (+/-0.12108)
ndcg_at_10: 0.35352 (+/-0.28145)

SummariesOverlap
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.29016 (+/-0.23691)
recall_at_10: 0.43223 (+/-0.32230)
reciprocal_best_rank: 0.41379 (+/-0.38078)
reciprocal_average_rank: 0.10978 (+/-0.12348)
ndcg_at_10: 0.32881 (+/-0.28426)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.31016 (+/-0.24491)
recall_at_10: 0.45435 (+/-0.31447)
reciprocal_best_rank: 0.44549 (+/-0.37515)
reciprocal_average_rank: 0.11425 (+/-0.13292)
ndcg_at_10: 0.35473 (+/-0.28375)

ActivatedSummaries
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.30921 (+/-0.24113)
recall_at_10: 0.47675 (+/-0.32641)
reciprocal_best_rank: 0.43216 (+/-0.37505)
reciprocal_average_rank: 0.12012 (+/-0.13955)
ndcg_at_10: 0.35879 (+/-0.28463)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.30690 (+/-0.24237)
recall_at_10: 0.47677 (+/-0.32907)
reciprocal_best_rank: 0.42173 (+/-0.37020)
reciprocal_average_rank: 0.11791 (+/-0.13763)
ndcg_at_10: 0.35604 (+/-0.28688)

ContextCount
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.27283 (+/-0.21885)
recall_at_10: 0.40828 (+/-0.31532)
reciprocal_best_rank: 0.40159 (+/-0.37345)
reciprocal_average_rank: 0.10106 (+/-0.10955)
ndcg_at_10: 0.30757 (+/-0.26843)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.27132 (+/-0.21980)
recall_at_10: 0.40922 (+/-0.31130)
reciprocal_best_rank: 0.39875 (+/-0.36043)
reciprocal_average_rank: 0.10083 (+/-0.11191)
ndcg_at_10: 0.30825 (+/-0.26337)

ContextUniqueCount
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.26986 (+/-0.21377)
recall_at_10: 0.40876 (+/-0.31652)
reciprocal_best_rank: 0.39494 (+/-0.36519)
reciprocal_average_rank: 0.09999 (+/-0.10324)
ndcg_at_10: 0.30508 (+/-0.26434)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.26488 (+/-0.21466)
recall_at_10: 0.40773 (+/-0.30965)
reciprocal_best_rank: 0.38335 (+/-0.35142)
reciprocal_average_rank: 0.09933 (+/-0.11206)
ndcg_at_10: 0.30180 (+/-0.25893)

SummariesContextCount
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.35301 (+/-0.25328)
recall_at_10: 0.50822 (+/-0.32395)
reciprocal_best_rank: 0.50490 (+/-0.39435)
reciprocal_average_rank: 0.12752 (+/-0.13623)
ndcg_at_10: 0.40696 (+/-0.29140)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.33892 (+/-0.26118)
recall_at_10: 0.50269 (+/-0.32989)
reciprocal_best_rank: 0.47304 (+/-0.38723)
reciprocal_average_rank: 0.12379 (+/-0.13332)
ndcg_at_10: 0.39134 (+/-0.29741)

SummariesContextUniqueCount
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.31861 (+/-0.22838)
recall_at_10: 0.50753 (+/-0.31598)
reciprocal_best_rank: 0.45686 (+/-0.37234)
reciprocal_average_rank: 0.11809 (+/-0.11955)
ndcg_at_10: 0.37952 (+/-0.26988)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.30432 (+/-0.23949)
recall_at_10: 0.49729 (+/-0.32487)
reciprocal_best_rank: 0.41846 (+/-0.36229)
reciprocal_average_rank: 0.11681 (+/-0.12466)
ndcg_at_10: 0.36022 (+/-0.27887)

SummariesContextOverlap
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.32401 (+/-0.23814)
recall_at_10: 0.48385 (+/-0.32377)
reciprocal_best_rank: 0.46945 (+/-0.38297)
reciprocal_average_rank: 0.11994 (+/-0.12482)
ndcg_at_10: 0.37599 (+/-0.28289)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.31654 (+/-0.24526)
recall_at_10: 0.48925 (+/-0.31687)
reciprocal_best_rank: 0.44145 (+/-0.36692)
reciprocal_average_rank: 0.11881 (+/-0.12779)
ndcg_at_10: 0.36862 (+/-0.28183)



## Embedding baselines
### Models

In [6]:
# Snake-case names of the embedding baselines run in this section (this
# rebinds the `model_names` list used by the basic baselines).
model_names = [
    "summaries_average_embedding", "summaries_overlap_average_embedding",
    "context_average_embedding",
    "summaries_context_average_embedding", "summaries_context_overlap_average_embedding",
]

### Pretrained model

In [7]:
# Load the Word2Vec embedding from the pretrained-models folder; it is passed
# to the embedding baselines as their `trained_model`.
pretrained_models_folder = root + PRETRAINED_MODELS_PATH
word2vec_embedding = get_word2vec(pretrained_models_folder)

Word2Vec embedding loaded.



### Run the models

In [9]:
# Constructor arguments shared by every embedding baseline; the loaded
# Word2Vec embedding is supplied as the `trained_model`.
embedding_kwargs = dict(
    scores_names=SCORES_NAMES,
    relevance_level=task.relevance_level,
    trained_model=word2vec_embedding,
    tensorboard_logs_path=root + TENSORBOARD_LOGS_PATH,
    experiment_name=args['experiment'],
    random_seed=MODELS_RANDOM_SEED,
)

# The generator converts each snake-case name to its class name lazily, one
# model at a time.
for model_name in (to_class_name(name) for name in model_names):
    print(model_name)

    # Resolve the model class by name on the `modeling.models` module.
    model_class = getattr(models, model_name)
    model = model_class(**embedding_kwargs)

    play(task=task, model=model)

SummariesAverageEmbedding
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.36380 (+/-0.27860)
recall_at_10: 0.53927 (+/-0.35898)
reciprocal_best_rank: 0.47815 (+/-0.38885)
reciprocal_average_rank: 0.14940 (+/-0.15595)
ndcg_at_10: 0.41580 (+/-0.31898)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.35204 (+/-0.27668)
recall_at_10: 0.52027 (+/-0.36628)
reciprocal_best_rank: 0.46311 (+/-0.38478)
reciprocal_average_rank: 0.14613 (+/-0.15051)
ndcg_at_10: 0.40030 (+/-0.32237)

SummariesOverlapAverageEmbedding
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))


average_precision: 0.39632 (+/-0.29534)
recall_at_10: 0.55842 (+/-0.36701)
reciprocal_best_rank: 0.52359 (+/-0.39791)
reciprocal_average_rank: 0.16864 (+/-0.18361)
ndcg_at_10: 0.44902 (+/-0.33315)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))


average_precision: 0.39849 (+/-0.29205)
recall_at_10: 0.54435 (+/-0.35726)
reciprocal_best_rank: 0.54345 (+/-0.39941)
reciprocal_average_rank: 0.16380 (+/-0.17444)
ndcg_at_10: 0.44896 (+/-0.32792)

ContextAverageEmbedding
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=670.0), HTML(value='')))

KeyboardInterrupt: 

# Run the finetuned models
## BART classifier
### Models

In [4]:
# Single finetuned model run in this section (rebinds `model_names`).
model_names = ["classifier_bart"]

### Pretrained model

In [5]:
# Load the BART model that is passed to the classifier as its `trained_model`.
# NOTE(review): this folder is hard-coded here, whereas the other locations in
# this notebook come from constants in toolbox.paths - consider adding one.
bart_bin_path = root + "results/models/RTE-bin_vanilla"
bart_checkpoint_file = "checkpoints/checkpoint1.pt"
bart = get_bart(bin_path=bart_bin_path, checkpoint_file=bart_checkpoint_file)

BART loaded (in evaluation mode).



### Run the models

In [6]:
# Constructor arguments for the finetuned model(s); the loaded BART model is
# supplied as the `trained_model`.
# NOTE(review): the output recorded for this cell shows a single evaluation
# step (max=1.0) and a spread of 0.00000 on every score, unlike the 670/335
# steps recorded for the baselines - presumably it was run on a different
# (much smaller) task; confirm before comparing these scores.
finetuned_kwargs = dict(
    scores_names=SCORES_NAMES,
    relevance_level=task.relevance_level,
    trained_model=bart,
    tensorboard_logs_path=root + TENSORBOARD_LOGS_PATH,
    experiment_name=args['experiment'],
    random_seed=MODELS_RANDOM_SEED,
)

# Each snake-case name is converted to its class name lazily, right before
# the corresponding model is built.
for model_name in map(to_class_name, model_names):
    print(model_name)

    # Resolve the model class by name on the `modeling.models` module.
    model_class = getattr(models, model_name)
    model = model_class(**finetuned_kwargs)

    play(task=task, model=model)

ClassifierBart
Evaluation on the train_loader...


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


average_precision: 1.00000 (+/-0.00000)
recall_at_10: 1.00000 (+/-0.00000)
reciprocal_best_rank: 1.00000 (+/-0.00000)
reciprocal_average_rank: 1.00000 (+/-0.00000)
ndcg_at_10: 1.00000 (+/-0.00000)

Evaluation on the valid_loader...


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


average_precision: 0.88750 (+/-0.00000)
recall_at_10: 1.00000 (+/-0.00000)
reciprocal_best_rank: 1.00000 (+/-0.00000)
reciprocal_average_rank: 0.33333 (+/-0.00000)
ndcg_at_10: 0.95583 (+/-0.00000)

