## Setup

In [1]:
import sys

# Put the parent directory on the import path so the `modeling` package used
# below can be imported from this notebook's location.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate '../' entry each time.
if '../' not in sys.path:
    sys.path.append('../')
del sys  # keep the notebook namespace free of the helper module
%reload_ext autoreload
%autoreload 2

from modeling.pipeline import Pipeline

## Data pipeline parameters

In [2]:
# Batching options forwarded to `pipeline.process_data`.
batch_size, drop_last = 64, False

# Split proportions forwarded to `pipeline.process_data`.
test_proportion = 0.2
valid_proportion = 0.2

# K-fold settings: `use_k_fold` goes to the `Pipeline` constructor,
# `k_k_fold` is forwarded as `k` to `process_data`.
use_k_fold, k_k_fold = False, None

## Load the data

In [3]:
# Create the pipeline, then run its data-processing step with the
# parameters defined in the "Data pipeline parameters" cell.
pipeline = Pipeline(use_k_fold=use_k_fold)
pipeline.process_data(
    batch_size=batch_size,
    drop_last=drop_last,
    test_proportion=test_proportion,
    valid_proportion=valid_proportion,
    k=k_k_fold,
)

Processing the modeling task...
Computing the annotated queries...
Initial length of queries: 0
Object loaded from ../results/../results/task_annotation/v2_0/task/queries_size10k_shuffle_articles1_queries1_seed0.pkl
Object loaded from ../results/../results/task_annotation/v2_1/task/queries_sizemax_shuffle_articles1_queries1_seed0.pkl
Object loaded from ../results/../results/task_annotation/v2_2/task/queries_sizemax_shuffle_articles1_queries1_seed0.pkl
Final length of queries: 61056
Done (elapsed time: 2s).

Computing the annotations...
Initial length of annotations: 0
Object loaded from ../results/task_annotation/v2_0/results/batch_00_complete.csv
   Correcting "n this article, Nevada and Ohio are discussed. The two American states..." to " The two American states..."
   Correcting "In this article, California and Oregon are discussed. The two neighboring states..." to " The two neighboring states..."
   Correcting "In this article, California and Oregon are discussed. The two West Coa

## Metrics

In [4]:
# Forwarded as `n_updates` to every `pipeline.evaluate_baseline` call below.
n_updates = 200

# Names of the scores each baseline is constructed with (`scores_names=...`).
scores_names = [
    'average_precision',
    'precision_at_k',
    'recall_at_k',
    'reciprocal_best_rank',
    'reciprocal_average_rank',
    'ndcg',
]

## Random

In [5]:
from modeling.models import RandomBaseline

# Build the random baseline with the shared list of score names.
model = RandomBaseline(scores_names=scores_names)

# Preview the data for this model (`include_valid=True`).
pipeline.preview_data(model=model, include_valid=True)

In [6]:
# Evaluate the current `model`; `n_updates` comes from the Metrics cell.
pipeline.evaluate_baseline(model=model, n_updates=n_updates)

Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=11927.0), HTML(value='')))


Test Score: 0.07633
Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=344.0), HTML(value='')))


Test Score: 0.01269


In [7]:
# Print one line per score name with its training-set and validation-set value.
model.display_metrics()

average_precision: training set 0.07633 validation set 0.01269
precision_at_k: training set 0.01774 validation set 0.00250
recall_at_k: training set 0.16051 validation set 0.00833
reciprocal_best_rank: training set 0.07934 validation set 0.02349
reciprocal_average_rank: training set 0.07128 validation set 0.00398
ndcg: training set nan validation set 0.00649


In [8]:
# Display explanations for the current `model`: for each example, the entities,
# the batch scores and the top-ranked answers (see the output below).
# The shared `scores_names` list is passed instead of a repeated literal so the
# explanation always uses the same scores the model was constructed with.
pipeline.explain_model(model=model,
                       display_explanations=True,
                       n_samples=5,
                       n_answers=10,
                       scores_names=scores_names)


Entities (location): New York City,  New York State
Scores of the batch: average_precision: 0.00482, precision_at_k: 0.00000, recall_at_k: 0.00000, reciprocal_best_rank: 0.00282, reciprocal_average_rank: 0.00249, ndcg: 0.00000
1: political parties (1.000)
2: pension cutters (0.996)
3: western asian places (0.996)
4: neighboring republics (0.994)
5: islands (0.993)
6: moldova region (0.990)
7: european territories (0.990)
8: major political parties (0.987)
9: north asian countries (0.987)
10: anti-west nations (0.982)

Entities (org): Magna International Inc.,  Onex Corp.
Scores of the batch: average_precision: 0.02020, precision_at_k: 0.00000, recall_at_k: 0.00000, reciprocal_best_rank: 0.02941, reciprocal_average_rank: 0.00926, ndcg: 0.00000
1: opposing parties (0.996)
2: professional sports teams (0.988)
3: american pharmaceutical company (0.984)
4: transportation companies (0.982)
5: transportation entities (0.982)
6: american professional basketball teams (0.982)
7: institutes (0.

## Counts

In [9]:
from modeling.models import CountsBaseline

# Build the counts baseline with the shared list of score names.
model = CountsBaseline(scores_names=scores_names)

# Preview the data for this model (`include_valid=True`); the cell output
# shows a "Learning answers counts..." step with a progress bar.
pipeline.preview_data(model=model, include_valid=True)

Learning answers counts...


HBox(children=(FloatProgress(value=0.0, max=12271.0), HTML(value='')))




In [10]:
# Evaluate the current `model`; `n_updates` comes from the Metrics cell.
pipeline.evaluate_baseline(model=model, n_updates=n_updates)

Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=11927.0), HTML(value='')))


Test Score: 0.44661
Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=344.0), HTML(value='')))


Test Score: 0.20347


In [11]:
# Print one line per score name with its training-set and validation-set value.
model.display_metrics()

average_precision: training set 0.44661 validation set 0.20347
precision_at_k: training set 0.07283 validation set 0.09400
recall_at_k: training set 0.66021 validation set 0.35242
reciprocal_best_rank: training set 0.46137 validation set 0.35652
reciprocal_average_rank: training set 0.42198 validation set 0.06037
ndcg: training set nan validation set 0.27141


In [12]:
# Display explanations for the current `model`: for each example, the entities,
# the batch scores and the top-ranked answers with their counts (see output).
# The shared `scores_names` list is passed instead of a repeated literal so the
# explanation always uses the same scores the model was constructed with.
pipeline.explain_model(model=model,
                       display_explanations=True,
                       n_samples=5,
                       n_answers=10,
                       scores_names=scores_names)


Entities (person): Amanda Plummer,  Michael Wilson,  Tennessee Williams
Scores of the batch: average_precision: 0.14266, precision_at_k: 0.10000, recall_at_k: 0.33333, reciprocal_best_rank: 0.33333, reciprocal_average_rank: 0.00362, ndcg: 0.23464
1: politicians (186)
2: political powers (141)
3: entertainers (92)
4: american politicians (78)
5: americans (63)
6: men (45)
7: artists (32)
8: businessmen (32)
9: actors (31)
10: political figures (23)

Entities (org): Coleman Co.,  Morgan Stanley
Scores of the batch: average_precision: 0.50000, precision_at_k: 0.10000, recall_at_k: 1.00000, reciprocal_best_rank: 0.50000, reciprocal_average_rank: 0.50000, ndcg: 0.63093
1: political powers (141)
2: companies (26)
3: musicians (13)
4: corporations (9)
5: tech companies (8)
6: technology companies (7)
7: political entities (5)
8: business partners (4)
9: palestinian organizations (4)
10: steel producers (4)

Entities (location): Iran,  Iraq
Scores of the batch: average_precision: 0.35379, pre

## Summaries Counts

In [13]:
from modeling.models import SummariesCountBaseline

# Build the summaries-count baseline with the shared list of score names.
model = SummariesCountBaseline(scores_names=scores_names)

# Preview the data for this model (`include_valid=True`).
pipeline.preview_data(model=model, include_valid=True)

In [14]:
# Evaluate the current `model`; `n_updates` comes from the Metrics cell.
pipeline.evaluate_baseline(model=model, n_updates=n_updates)

Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=11927.0), HTML(value='')))


Test Score: 0.18011
Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=344.0), HTML(value='')))


Test Score: 0.06380


In [15]:
# Print one line per score name with its training-set and validation-set value.
model.display_metrics()

average_precision: training set 0.18011 validation set 0.06380
precision_at_k: training set 0.03725 validation set 0.03300
recall_at_k: training set 0.33532 validation set 0.12283
reciprocal_best_rank: training set 0.18768 validation set 0.11144
reciprocal_average_rank: training set 0.16863 validation set 0.01694
ndcg: training set nan validation set 0.08341


In [16]:
# Display explanations for the current `model`: for each example, the entities,
# the batch scores and the top-ranked answers, each followed by a line of
# words (see the output below).
# The shared `scores_names` list is passed instead of a repeated literal so the
# explanation always uses the same scores the model was constructed with.
pipeline.explain_model(model=model,
                       display_explanations=True,
                       n_samples=5,
                       n_answers=10,
                       scores_names=scores_names)


Entities (location): Baghdad,  Iraq
Scores of the batch: average_precision: 0.06845, precision_at_k: 0.00000, recall_at_k: 0.00000, reciprocal_best_rank: 0.07143, reciprocal_average_rank: 0.03333, ndcg: 0.00000
1: country and its capital city (7)
   capital, city, city, capital, country, capital, city
2: republic of iraq (5)
   iraq, iraq, republic, iraq, iraq
3: areas in iraq (4)
   iraq, iraq, iraq, iraq
4: cities in iraq (4)
   iraq, iraq, iraq, iraq
5: places in west asia (4)
   west, asia, asia, west
6: capital cities (3)
   capital, capital, capital
7: areas in western asia (3)
   asia, western, asia
8: western asia nations (3)
   asia, western, asia
9: major united states city and the state nearby (3)
   city, city, city
10: western asia cities (3)
   asia, western, asia

Entities (person): George E. Pataki,  Larry A. Silverstein,  Norman Foster
Scores of the batch: average_precision: 0.00496, precision_at_k: 0.00000, recall_at_k: 0.00000, reciprocal_best_rank: 0.00676, recipro

## Summaries Soft Overlap

In [19]:
from modeling.models import SummariesSoftOverlapBaseline

# Build the summaries soft-overlap baseline with the shared list of score names.
model = SummariesSoftOverlapBaseline(scores_names=scores_names)

# Preview the data for this model (`include_valid=True`).
pipeline.preview_data(model=model, include_valid=True)

In [20]:
# Evaluate the current `model`; `n_updates` comes from the Metrics cell.
pipeline.evaluate_baseline(model=model, n_updates=n_updates)

Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=11927.0), HTML(value='')))


Test Score: 0.15158
Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=344.0), HTML(value='')))


Test Score: 0.04528


In [21]:
# Print one line per score name with its training-set and validation-set value.
model.display_metrics()

average_precision: training set 0.15158 validation set 0.04528
precision_at_k: training set 0.03643 validation set 0.02100
recall_at_k: training set 0.32802 validation set 0.07533
reciprocal_best_rank: training set 0.15771 validation set 0.08334
reciprocal_average_rank: training set 0.14200 validation set 0.00994
ndcg: training set nan validation set 0.05196


In [22]:
# Display explanations for the current `model`: for each example, the entities,
# the batch scores and the top-ranked answers, each followed by a line of
# words (see the output below).
# The shared `scores_names` list is passed instead of a repeated literal so the
# explanation always uses the same scores the model was constructed with.
pipeline.explain_model(model=model,
                       display_explanations=True,
                       n_samples=5,
                       n_answers=10,
                       scores_names=scores_names)


Entities (location): Dubai,  New Jersey
Scores of the batch: average_precision: 0.00525, precision_at_k: 0.00000, recall_at_k: 0.00000, reciprocal_best_rank: 0.00383, reciprocal_average_rank: 0.00274, ndcg: 0.00000
1: major united states city and the state nearby (5)
   major, city, states, united, state
2: us gulf coast states (4)
   states, gulf, us, coast
3: new york state region (4)
   york, state, region, new
4: middle east region (3)
   middle, east, region
5: states in the southern region of the united states (3)
   states, united, region
6: states in the southeast region of america (3)
   states, southeast, region
7: areas in north central united states (3)
   states, united, north
8: major new york cities (3)
   york, major, new
9: areas in new york state (3)
   york, state, new
10: entities of the state of new jersey (3)
   jersey, state, new

Entities (person): Mickey Mantle,  Peter Golenbock
Scores of the batch: average_precision: 0.33665, precision_at_k: 0.10000, recall_a

## Summaries Hard Overlap

In [23]:
from modeling.models import SummariesHardOverlapBaseline

# Build the summaries hard-overlap baseline with the shared list of score names.
model = SummariesHardOverlapBaseline(scores_names=scores_names)

# Preview the data for this model (`include_valid=True`).
pipeline.preview_data(model=model, include_valid=True)

In [24]:
# Evaluate the current `model`; `n_updates` comes from the Metrics cell.
pipeline.evaluate_baseline(model=model, n_updates=n_updates)

Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=11927.0), HTML(value='')))


Test Score: 0.16234
Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=344.0), HTML(value='')))


Test Score: 0.03773


In [25]:
# Print one line per score name with its training-set and validation-set value.
model.display_metrics()

average_precision: training set 0.16234 validation set 0.03773
precision_at_k: training set 0.03287 validation set 0.01700
recall_at_k: training set 0.29496 validation set 0.06267
reciprocal_best_rank: training set 0.17024 validation set 0.07706
reciprocal_average_rank: training set 0.15169 validation set 0.00757
ndcg: training set nan validation set 0.04424


In [26]:
# Display explanations for the current `model`: for each example, the entities,
# the batch scores and the top-ranked answers, each followed by a line of
# words (see the output below).
# The shared `scores_names` list is passed instead of a repeated literal so the
# explanation always uses the same scores the model was constructed with.
pipeline.explain_model(model=model,
                       display_explanations=True,
                       n_samples=5,
                       n_answers=10,
                       scores_names=scores_names)


Entities (person): John G. Rowland,  M. Jodi Rell
Scores of the batch: average_precision: 0.00129, precision_at_k: 0.00000, recall_at_k: 0.00000, reciprocal_best_rank: 0.00129, reciprocal_average_rank: 0.00129, ndcg: 0.00000
1: american politician (2)
   american, politician
2: american legal professionals (1)
   american
3: american professional basketball players (1)
   american
4: american politically inclined (1)
   american
5: politician and the organ builder (1)
   politician
6: american attorneys (1)
   american
7: american men named robert (1)
   american
8: onetime rivals for the democratic nomination for governor of new york (1)
   governor
9: american producers (1)
   american
10: american statesmen (1)
   american

Entities (person): David Hyde Pierce,  Frank Langella,  John Gallagher,  Steven Sater
Scores of the batch: average_precision: 0.00438, precision_at_k: 0.00000, recall_at_k: 0.00000, reciprocal_best_rank: 0.00431, reciprocal_average_rank: 0.00216, ndcg: 0.00000
1

## Closest Average Embedding

In [27]:
from modeling.models import ClosestAverageEmbedding

# Build the closest-average-embedding model with the shared list of score names.
model = ClosestAverageEmbedding(scores_names=scores_names)

# Preview the data for this model (`include_valid=True`). The cell output
# logs "Initializing the Word2Vec pretrained embedding...".
pipeline.preview_data(model=model, include_valid=True)

Initializing the Word2Vec pretrained embedding...


In [None]:
# Evaluate the current `model`; `n_updates` comes from the Metrics cell.
# NOTE(review): the saved output stops at the first progress bar with no
# "Test Score" line and the cell has no execution count, so this run appears
# not to have completed. Re-run to confirm.
pipeline.evaluate_baseline(model=model, n_updates=n_updates)

Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=11927.0), HTML(value='')))

In [None]:
# Display the metrics of the current `model` (no output was saved for this cell).
model.display_metrics()

In [None]:
# Display explanations for the current `model` (`n_samples=5`, `n_answers=10`).
# The shared `scores_names` list is passed instead of a repeated literal so the
# explanation always uses the same scores the model was constructed with.
pipeline.explain_model(model=model,
                       display_explanations=True,
                       n_samples=5,
                       n_answers=10,
                       scores_names=scores_names)

## Closest Hard Overlap Embedding

In [18]:
from modeling.models import ClosestHardOverlapEmbedding

# Build the closest hard-overlap embedding model with the shared list of score names.
model = ClosestHardOverlapEmbedding(scores_names=scores_names)

# Preview the data for this model (`include_valid=True`).
pipeline.preview_data(model=model, include_valid=True)

In [7]:
# Evaluate the current `model`; `n_updates` comes from the Metrics cell.
pipeline.evaluate_baseline(model=model, n_updates=n_updates)

Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=11927.0), HTML(value='')))


Test Score: 0.07732
Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=344.0), HTML(value='')))


Test Score: 0.01160


In [13]:
# Display the metrics of the current `model`. The saved output interleaves raw
# float values with the formatted per-metric lines and lists only two metrics.
model.display_metrics()

0.07731831060910294
0.011604027666229135
average_precision: training set 0.07732 validation set 0.01160
0.018228786475507547
0.002666666706403097
precision_at_k: training set 0.01823 validation set 0.00267


In [None]:
# Display explanations for the current `model` (`n_samples=5`, `n_answers=10`).
# The shared `scores_names` list is passed instead of a repeated literal so the
# explanation always uses the same scores the model was constructed with.
pipeline.explain_model(model=model,
                       display_explanations=True,
                       n_samples=5,
                       n_answers=10,
                       scores_names=scores_names)

## Closest Soft Overlap Embedding

In [18]:
from modeling.models import ClosestSoftOverlapEmbedding

# Build the closest soft-overlap embedding model with the shared list of score names.
model = ClosestSoftOverlapEmbedding(scores_names=scores_names)

# Preview the data for this model (`include_valid=True`).
pipeline.preview_data(model=model, include_valid=True)

In [7]:
# Evaluate the current `model`; `n_updates` comes from the Metrics cell.
# NOTE(review): the scores saved under this cell (0.07732 / 0.01160) are
# identical to those saved in the "Closest Hard Overlap Embedding" section, and
# the execution counts in this section are out of order. The output may be
# stale, so re-run on a fresh kernel to confirm.
pipeline.evaluate_baseline(model=model, n_updates=n_updates)

Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=11927.0), HTML(value='')))


Test Score: 0.07732
Evaluation of the model...



HBox(children=(FloatProgress(value=0.0, max=344.0), HTML(value='')))


Test Score: 0.01160


In [13]:
# Display the metrics of the current `model`. The saved output interleaves raw
# float values with the formatted per-metric lines and lists only two metrics.
model.display_metrics()

0.07731831060910294
0.011604027666229135
average_precision: training set 0.07732 validation set 0.01160
0.018228786475507547
0.002666666706403097
precision_at_k: training set 0.01823 validation set 0.00267


In [None]:
# Display explanations for the current `model` (`n_samples=5`, `n_answers=10`).
# The shared `scores_names` list is passed instead of a repeated literal so the
# explanation always uses the same scores the model was constructed with.
pipeline.explain_model(model=model,
                       display_explanations=True,
                       n_samples=5,
                       n_answers=10,
                       scores_names=scores_names)