In [1]:
import tarfile
import pathlib
import json


def read_duc_2004_(root_dir):
    """Load the DUC 2004 task-2 clusters found under ``root_dir``.

    Summaries are read from the two archives
    ``duc2004_results/ROUGE/duc2004.task2.ROUGE.{models,peers}.tar.gz``;
    "models" are the reference summaries and "peers" are the participant
    system summaries. For every cluster id seen in those archives, the source
    documents are then read from the ``d<cluster_id>t`` directory below the
    DUC documents folder.

    Args:
        root_dir: Path (``str`` or ``pathlib.Path``) of the DUC 2004 data root.

    Returns:
        A list of cluster dicts sorted by ``'id'``. Each dict has the keys
        ``'id'``, ``'ref_summaries'``, ``'peer_summaries'`` (lists of
        ``{'author_id', 'text', 'cluster_id'}`` dicts) and ``'documents'``
        (list of ``{'fname', 'cluster_id', 'text'}`` dicts sorted by file name).
        All texts are whitespace-normalised to single spaces.

    Side effects:
        Prints one ``CLUSTER:`` line per cluster and a final ``#clusters:`` line.
    """
    root_dir = pathlib.Path(root_dir)
    docs_dir = root_dir / 'DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs'
    result_dir = root_dir / 'duc2004_results'

    def get_duc_cluster_docs(cluster_id):
        """Return the documents of one cluster, sorted by file name."""
        docs = []
        cluster_path = docs_dir / f'd{cluster_id}t'
        for fpath in cluster_path.iterdir():
            with open(fpath) as f:
                raw = f.read()
            # Keep only the body between the <TEXT> ... </TEXT> markers.
            text = raw.split("<TEXT>")[1].split("</TEXT>")[0]
            text = " ".join(text.split())
            docs.append({
                'fname': fpath.name,
                'cluster_id': cluster_id,
                'text': text
            })
        docs.sort(key=lambda x: x['fname'])
        return docs

    # Archive group name -> key of the cluster dict that collects its summaries.
    group_to_key = {"models": 'ref_summaries', "peers": 'peer_summaries'}

    cid_to_clusters = {}
    # get reference (models) and peer (participant systems) summaries
    for group, summaries_key in group_to_key.items():
        gz_path = result_dir / f'ROUGE/duc2004.task2.ROUGE.{group}.tar.gz'
        # The context manager guarantees the archive is closed, even on error.
        with tarfile.open(gz_path, "r:gz") as tar:
            for member in tar.getmembers():
                # Directory (and other non-regular) entries carry no summary
                # text; extractfile() returns None for them.
                if not member.isfile():
                    continue

                # The author id is the last dot-separated field of the member
                # name; the cluster id is the first field of the base name
                # with leading "D" characters removed.
                author_id = member.name.split(".")[-1]
                cluster_id = member.name.split("/")[-1].split(".")[0].lstrip("D")

                with tar.extractfile(member) as f:
                    text = str(f.read(), encoding="UTF-8")
                text = " ".join(text.split())

                cluster = cid_to_clusters.setdefault(cluster_id, {
                    'peer_summaries': [],
                    'ref_summaries': [],
                    'id': cluster_id
                })
                cluster[summaries_key].append({
                    'author_id': author_id,
                    'text': text,
                    'cluster_id': cluster_id
                })

    # get source documents
    clusters = []
    for cid, c in cid_to_clusters.items():
        c['documents'] = get_duc_cluster_docs(cid)
        print('CLUSTER:', cid, len(c['documents']))
        clusters.append(c)
    clusters.sort(key=lambda x: x['id'])
    print('#clusters:', len(clusters))
    return clusters


def read_duc_2004(path):
    """Yield a ``(docs, summaries)`` pair for each cluster under ``path``.

    ``docs`` is the list of document texts of the cluster and ``summaries``
    is the list of its reference-summary texts, both taken from the cluster
    dicts returned by ``read_duc_2004_``.
    """
    for cluster in read_duc_2004_(path):
        doc_texts = []
        for document in cluster['documents']:
            doc_texts.append(document['text'])

        ref_texts = []
        for reference in cluster['ref_summaries']:
            ref_texts.append(reference['text'])

        yield doc_texts, ref_texts


In [2]:
from collections import Counter
import numpy as np
from pathlib import Path


DATADIR = '/home/chris/projects/aylien/dynamic-ensembles/data/DUC2004'

cluster_rows = []

# Running statistics collected while the dataset is exported:
# how many articles each cluster has, and word counts per source / summary.
article_cnts = Counter()
summary_lens = []
source_lens = []

# DUC only has a test set
with open(Path(DATADIR) / 'DUC2004_test.jsonl', 'w') as out:
    for srcs, tgts in read_duc_2004(DATADIR):
        # One JSON line per cluster; articles carry an empty title.
        articles = [dict(title='', text=t) for t in srcs]
        out.write(json.dumps({"articles": articles, "summary": tgts}) + '\n')

        article_cnts[len(articles)] += 1
        source_lens.extend(len(a.split()) for a in srcs)
        summary_lens.extend(len(t.split()) for t in tgts)

print(article_cnts.most_common())

# Mean, variance and standard deviation of the whitespace-token counts.
print('Input stats:')
print((
    np.mean(source_lens),
    np.var(source_lens),
    np.std(source_lens),
))
print('Summary stats:')
print((
    np.mean(summary_lens),
    np.var(summary_lens),
    np.std(summary_lens),
))


CLUSTER: 30001 10
CLUSTER: 30002 10
CLUSTER: 30003 10
CLUSTER: 30005 10
CLUSTER: 30006 10
CLUSTER: 30007 10
CLUSTER: 30008 10
CLUSTER: 30010 10
CLUSTER: 30011 10
CLUSTER: 30015 10
CLUSTER: 30017 10
CLUSTER: 30020 10
CLUSTER: 30022 10
CLUSTER: 30024 10
CLUSTER: 30026 10
CLUSTER: 30027 10
CLUSTER: 30028 10
CLUSTER: 30029 10
CLUSTER: 30031 10
CLUSTER: 30033 10
CLUSTER: 30034 10
CLUSTER: 30036 10
CLUSTER: 30037 10
CLUSTER: 30038 10
CLUSTER: 30040 10
CLUSTER: 30042 10
CLUSTER: 30044 10
CLUSTER: 30045 10
CLUSTER: 30046 10
CLUSTER: 30047 10
CLUSTER: 30048 10
CLUSTER: 30049 10
CLUSTER: 30050 10
CLUSTER: 30051 10
CLUSTER: 30053 10
CLUSTER: 30055 10
CLUSTER: 30056 10
CLUSTER: 30059 10
CLUSTER: 31001 10
CLUSTER: 31008 10
CLUSTER: 31009 10
CLUSTER: 31013 10
CLUSTER: 31022 10
CLUSTER: 31026 10
CLUSTER: 31031 10
CLUSTER: 31032 10
CLUSTER: 31033 10
CLUSTER: 31038 10
CLUSTER: 31043 10
CLUSTER: 31050 10
#clusters: 50
[(10, 50)]
Input stats:
(588.272, 191996.402016, 438.17394036615184)
Summary stats:
(1