# Getting started with the WCEP dataset

## Clone repository & install dependencies

In [None]:
# Clone the WCEP repository; the following cells change into this clone and import modules from it.
!git clone https://github.com/complementizer/wcep-mds-dataset

In [None]:
cd wcep-mds-dataset

In [None]:
# Check out `experiments` inside the clone before changing into the directory of the same name.
!git checkout experiments

In [None]:
cd experiments

In [None]:
!pip install -r requirements.txt
!python -m nltk.downloader punkt

In [None]:
cd experiments

## Download dataset


In [None]:
!mkdir WCEP
!gdown https://drive.google.com/uc?id=1kUjSRXzKnTYdJ732BkKVLg3CFxDKo25u -O WCEP/train.jsonl.gz
!gdown https://drive.google.com/uc?id=1_kHTZ32jazTbXaFRg0vBeIsVcpI7CTmy -O WCEP/val.jsonl.gz
!gdown https://drive.google.com/uc?id=1qsd5pOCpeSXsaqNobXCrcAzhcjtG1wA1 -O WCEP/test.jsonl.gz

## Load dataset

We use the WCEP validation data as an example. <br> Each item in the dataset corresponds to a cluster of news articles about a news event and contains some metadata, most importantly the ground-truth summary for the cluster.

In [None]:
import utils

# Materialise the validation split as a list so items can be indexed and sliced later.
val_data = [item for item in utils.read_jsonl_gz('WCEP/val.jsonl.gz')]

# Show which fields the first item provides.
first_item = val_data[0]
print(first_item.keys())

## Run extractive baselines & oracles



In [None]:
from baselines import RandomBaseline, TextRankSummarizer, CentroidSummarizer, SubmodularSummarizer
from oracles import Oracle

First we create summarizer objects and set their hyperparameters.

In [None]:
# One object per system; hyperparameters are fixed here and reused for every cluster.
random_sum = RandomBaseline()
textrank = TextRankSummarizer(max_redundancy=0.5)
centroid = CentroidSummarizer(max_redundancy=0.5)
submod = SubmodularSummarizer(
    a=5,
    div_weight=6,  # div_weight encourages diversity/non-redundancy
    cluster_factor=0.2,
)
oracle = Oracle()

Below we pick one set of settings for extractive summarization that we will use for all baselines. <br>
* `in_titles` means we add article titles as sentences in the input, and `out_titles` means we also allow these titles to be part of a summary
* we set a minimum sentence length (`min_sent_tokens`) because short broken sentences appear frequently and are usually not desirable
* you can set the length constraint (`len_type`) to `words`, `sents` or `chars`

In [None]:
# Keyword arguments passed unchanged to every summarizer's `summarize` call.
settings = dict(
    max_len=40,
    len_type='words',
    in_titles=False,
    out_titles=False,
    min_sent_tokens=7,
    max_sent_tokens=40,
)

# Number of leading articles kept per cluster when building the example inputs.
max_articles = 20

For a quick experiment, we only select the first 10 clusters of the WCEP validation data and use at most the first `max_articles` articles (20, as set above) of each cluster as inputs.

In [None]:
# Walk the first 10 validation clusters once, collecting the truncated article
# lists and the matching reference summaries in parallel (index-aligned) lists.
example_clusters = []
ref_summaries = []
for cluster in val_data[:10]:
    example_clusters.append(cluster['articles'][:max_articles])
    ref_summaries.append(cluster['summary'])

In [None]:
# Run each baseline over the same example clusters with the shared settings.
# The systems are run in the same order as before (random baseline last).
textrank_summaries = [
    textrank.summarize(cluster_articles, **settings)
    for cluster_articles in example_clusters
]
centroid_summaries = [
    centroid.summarize(cluster_articles, **settings)
    for cluster_articles in example_clusters
]
submod_summaries = [
    submod.summarize(cluster_articles, **settings)
    for cluster_articles in example_clusters
]
random_summaries = [
    random_sum.summarize(cluster_articles, **settings)
    for cluster_articles in example_clusters
]

In [None]:
# The oracle additionally receives the reference summary of each cluster,
# so references and article lists are walked in lockstep.
oracle_summaries = [
    oracle.summarize(reference, cluster_articles, **settings)
    for reference, cluster_articles in zip(ref_summaries, example_clusters)
]

## Evaluate summaries

**Note:** our `evaluate` function uses a wrapper from the [newsroom library](https://github.com/lil-lab/newsroom) to compute ROUGE scores.  


In [None]:
from pprint import pprint
from evaluate import evaluate

In [None]:
# Display names and their summaries; the two lists are index-aligned.
names = ['TextRank', 'Centroid', 'Submodular', 'Oracle', 'Random']
outputs = [
    textrank_summaries,
    centroid_summaries,
    submod_summaries,
    oracle_summaries,
    random_summaries,
]

# Evaluate every system against the same references (with lowercase=True)
# and print its name followed by the returned results.
for name, preds in zip(names, outputs):
    print(name)
    results = evaluate(ref_summaries, preds, lowercase=True)
    pprint(results)
    print()

Let's look at some example summaries.

In [None]:
# Index (within the 10 example clusters) of the cluster to inspect.
cluster_idx = 6

# Reference summary first, followed by a blank line.
print('Ground-truth', ref_summaries[cluster_idx], '', sep='\n')

# Then each system's summary for the same cluster, each followed by a blank line.
for name, preds in zip(names, outputs):
    print(name, preds[cluster_idx], '', sep='\n')

### Blog Example


In [None]:
from utils import read_jsonl_gz
from baselines import TextRankSummarizer
from evaluate import evaluate
from pprint import pprint

# Self-contained example: summarize a single validation cluster with TextRank.
# Note: this rebinds `textrank` to a summarizer built with default arguments.
textrank = TextRankSummarizer()

# Load the validation split and take one cluster with its first 10 articles.
dataset = list(read_jsonl_gz('WCEP/val.jsonl.gz'))
cluster = dataset[954]
articles = cluster['articles'][:10]
human_summary = cluster['summary']

# Summarize with the summarizer's default settings, then score the single
# prediction against the single reference.
automatic_summary = textrank.summarize(articles)
results = evaluate([human_summary], [automatic_summary])

print('Summary:', automatic_summary, '', 'ROUGE scores:', sep='\n')
pprint(results)