# PreSumm

**Source**:

Code: https://github.com/nlpyang/PreSumm/


Paper: https://arxiv.org/abs/1908.08345

#### Prerequisites

**Libraries**: 

Torch 1.1.0 (download instructions from https://pytorch.org/get-started/previous-versions/)

**Stanford CoreNLP**

We will need Stanford CoreNLP to tokenize the data. Download it [here](https://stanfordnlp.github.io/CoreNLP/) and unzip it. Then add the following command to your bash_profile:
```
export CLASSPATH=/path/to/stanford-corenlp-full-2017-06-09/stanford-corenlp-3.8.0.jar
```
replacing `/path/to/` with the path to where you saved the `stanford-corenlp-full-2017-06-09` directory. 


#### Code

In [1]:
from others.tokenization import BertTokenizer

In [2]:
import lzma
import os
import json
import re

from pandas.io.json import json_normalize
import pandas as pd
from bs4 import BeautifulSoup
import subprocess
import torch
import lxml
import numpy as np

# Read data

In [3]:
# Load one state's dump: an xz-compressed file holding one JSON record per line.
base_path = "./data/xml"
state='north_carolina.xz'
# The context manager closes the archive even if reading/decompression raises.
with lzma.open(os.path.join(base_path, state), "rb") as f:
    state_data = f.readlines()
data_json = [json.loads(line) for line in state_data]
print(f'Flattening data for {state}')
# Flatten the nested records into a single table (one row per record).
data = json_normalize(data_json)

Flattening data for north_carolina.xz


  


In [4]:
# Parse the decision dates once; values that cannot be parsed are coerced to NaT,
# which in turn yields a missing year.
parsed_decision_dates = pd.to_datetime(data['decision_date'], errors='coerce')
data['decision_date_p'] = parsed_decision_dates
data['decision_year'] = parsed_decision_dates.dt.year

In [5]:
# Keep decisions from 2008 onward (rows with a missing year fail the comparison).
recent_mask = data['decision_year'] >= 2008
data_2008 = data[recent_mask]

# Tokenize Data

In [6]:
def tokenize(raw_path,save_path):
    """Tokenize and sentence-split every file in a directory with Stanford CoreNLP.

    A temporary file list (``mapping_for_corenlp.txt``, written to the current
    working directory) is handed to the ``java`` CoreNLP pipeline, which is
    asked to write JSON output into ``save_path``.

    Args:
      raw_path: Directory containing the raw text files to tokenize.
      save_path: Directory that receives the CoreNLP JSON output; it is created
        if it does not exist yet.

    Raises:
      RuntimeError: If the CoreNLP process exits with a non-zero status.
    """
    stories_dir = os.path.abspath(raw_path)
    tokenized_stories_dir = os.path.abspath(save_path)
    mapping_file = "mapping_for_corenlp.txt"

    print("Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir))
    stories = os.listdir(stories_dir)
    os.makedirs(tokenized_stories_dir, exist_ok=True)

    print("Making list of files to tokenize...")
    try:
        # CoreNLP reads the absolute path of every input file from this list.
        with open(mapping_file, "w") as f:
            for s in stories:
                f.write("%s\n" % (os.path.join(stories_dir, s)))
        command = ['java', 'edu.stanford.nlp.pipeline.StanfordCoreNLP', '-annotators', 'tokenize,ssplit',
                   '-ssplit.newlineIsSentenceBreak', 'always', '-filelist', mapping_file, '-outputFormat',
                   'json', '-outputDirectory', tokenized_stories_dir]
        print("Tokenizing %i files in %s and saving in %s..." % (len(stories), stories_dir, tokenized_stories_dir))
        return_code = subprocess.call(command)
    finally:
        # Always clean up the temporary list, even when java is missing or fails.
        if os.path.exists(mapping_file):
            os.remove(mapping_file)

    # Do not report success when the tokenizer itself reported a failure.
    if return_code != 0:
        raise RuntimeError("Stanford CoreNLP Tokenizer exited with status %s" % return_code)
    print("Stanford CoreNLP Tokenizer has finished.")


In [7]:
# First 100 filtered rows, handy for a quick dry run.
sample_data = data_2008.head(100)

In [8]:
# For every case, pull the first <opinion> (source text) and all <headnotes>
# (reference summary) out of the XML body and write each to a per-case text file.
opinions_dir = 'presumm_data/parsed_text/opinions'
headnotes_dir = 'presumm_data/parsed_text/headnotes'
# Without these directories every write below would fail.
os.makedirs(opinions_dir, exist_ok=True)
os.makedirs(headnotes_dir, exist_ok=True)

for _, row in data_2008.iterrows():
    # Resolve the id before the try block so the failure message never refers
    # to an unbound name or to the previous row's id.
    caseid = row.get('id')
    try:
        markup = row['casebody.data']
        soup = BeautifulSoup(markup, "xml")
        opinion = soup.find_all('opinion')[0]
        opinion_text = opinion.getText()
        # Drop every non-ASCII character from the extracted text.
        opinion_text = opinion_text.encode("ascii", "ignore").strip().decode("ascii")
        headnotes = ' '.join(node.getText() for node in soup.find_all('headnotes')).replace('\n', ' ')
        headnotes = headnotes.encode("ascii", "ignore").strip().decode("ascii")

        # Keep only cases with a headnote longer than 150 characters that is
        # still shorter than the opinion it accompanies.
        if len(headnotes) > 150 and len(opinion_text) > len(headnotes):
            with open(f'{opinions_dir}/{caseid}.txt','w') as f:
                f.write(opinion_text)

            with open(f'{headnotes_dir}/{caseid}.txt','w') as f:
                f.write(headnotes)
    except Exception as exc:
        # Best effort: report the failing case with its cause and carry on.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f'Case ID {caseid} parsing failed: {exc!r}')

In [9]:
# Run the tokenizer over the extracted opinion texts.
parsed_opinions_path = 'presumm_data/parsed_text/opinions'
tokenized_opinions_path = 'presumm_data/tokenized_text/opinions'
tokenize(raw_path=parsed_opinions_path, save_path=tokenized_opinions_path)

Preparing to tokenize C:\Users\gufra\OneDrive\Documents\Academics\AdvancedTopicsInDataScience\final_project\presumm_data\parsed_text\opinions to C:\Users\gufra\OneDrive\Documents\Academics\AdvancedTopicsInDataScience\final_project\presumm_data\tokenized_text\opinions...
Making list of files to tokenize...
Tokenizing 3693 files in C:\Users\gufra\OneDrive\Documents\Academics\AdvancedTopicsInDataScience\final_project\presumm_data\parsed_text\opinions and saving in C:\Users\gufra\OneDrive\Documents\Academics\AdvancedTopicsInDataScience\final_project\presumm_data\tokenized_text\opinions...
Stanford CoreNLP Tokenizer has finished.


In [10]:
# Run the tokenizer over the extracted headnote texts.
parsed_headnotes_path = 'presumm_data/parsed_text/headnotes'
tokenized_headnotes_path = 'presumm_data/tokenized_text/headnotes'
tokenize(raw_path=parsed_headnotes_path, save_path=tokenized_headnotes_path)

Preparing to tokenize C:\Users\gufra\OneDrive\Documents\Academics\AdvancedTopicsInDataScience\final_project\presumm_data\parsed_text\headnotes to C:\Users\gufra\OneDrive\Documents\Academics\AdvancedTopicsInDataScience\final_project\presumm_data\tokenized_text\headnotes...
Making list of files to tokenize...
Tokenizing 3693 files in C:\Users\gufra\OneDrive\Documents\Academics\AdvancedTopicsInDataScience\final_project\presumm_data\parsed_text\headnotes and saving in C:\Users\gufra\OneDrive\Documents\Academics\AdvancedTopicsInDataScience\final_project\presumm_data\tokenized_text\headnotes...
Stanford CoreNLP Tokenizer has finished.


# Converting to JSON

In [3]:

# Lower-case bracket/quote placeholder tokens and the literal character each
# one is replaced with.
REMAP = {
    "-lrb-": "(",
    "-rrb-": ")",
    "-lcb-": "{",
    "-rcb-": "}",
    "-lsb-": "[",
    "-rsb-": "]",
    "``": '"',
    "''": '"',
}


def clean(x):
    """Replace every placeholder listed in ``REMAP`` found in ``x``.

    Args:
      x: The string to process. Matching is case-sensitive.

    Returns:
      ``x`` with each placeholder substituted by its literal character.
    """
    def _restore(match):
        return REMAP.get(match.group())

    return re.sub(r"-lrb-|-rrb-|-lcb-|-rcb-|-lsb-|-rsb-|``|''", _restore, x)

def _read_tokenized_sentences(path):
    """Read one tokenizer JSON file into a list of cleaned, lower-cased token lists."""
    # The context manager guarantees the handle is closed after parsing.
    with open(path, encoding='utf-8') as fh:
        document = json.load(fh)

    sentences = []
    for sent in document['sentences']:
        # Drop non-ASCII characters from each token, then lower-case it.
        tokens = [t['word'].encode("ascii", "ignore").strip().decode("utf-8").lower()
                  for t in sent['tokens']]
        # Re-splitting after clean() also discards tokens that became empty.
        sentences.append(clean(' '.join(tokens)).split())
    return sentences


def load_json(case_id,
              source_dir='presumm_data/tokenized_text/opinions',
              target_dir='presumm_data/tokenized_text/headnotes'):
    """Load the tokenized opinion and headnotes of one case.

    Args:
      case_id: Case identifier; the files read are ``<case_id>.txt.json``.
      source_dir: Directory holding the tokenized opinions.
      target_dir: Directory holding the tokenized headnotes.

    Returns:
      A ``(source, tgt)`` pair; each element is a list of sentences and each
      sentence is a list of lower-cased tokens with placeholders restored by
      ``clean``.
    """
    source_path = os.path.join(source_dir, f'{case_id}.txt.json')
    target_path = os.path.join(target_dir, f'{case_id}.txt.json')
    source = _read_tokenized_sentences(source_path)
    tgt = _read_tokenized_sentences(target_path)
    return source, tgt

### Greedy Selection

In [4]:
import re

def _get_ngrams(n, text):
    """Calcualtes n-grams.

    Args:
      n: which n-grams to calculate
      text: An array of tokens

    Returns:
      A set of n-grams
    """
    ngram_set = set()
    text_length = len(text)
    max_index_ngram_start = text_length - n
    for i in range(max_index_ngram_start + 1):
        ngram_set.add(tuple(text[i:i + n]))
    return ngram_set


def _get_word_ngrams(n, sentences):
    """Calculate the word n-grams of several sentences taken as one token stream.

    Args:
      n: Length of each n-gram; must be positive.
      sentences: A non-empty sequence of token sequences. They are
        concatenated, so n-grams may span sentence boundaries.

    Returns:
      A set of n-gram tuples, as produced by ``_get_ngrams``.
    """
    assert len(sentences) > 0
    assert n > 0

    # Flatten in linear time; sum(sentences, []) copies the accumulated list
    # on every step and is quadratic in the number of sentences.
    words = [word for sentence in sentences for word in sentence]
    return _get_ngrams(n, words)


def cal_rouge(evaluated_ngrams, reference_ngrams):
    """Score a candidate n-gram set against a reference n-gram set.

    Args:
      evaluated_ngrams: Set of n-grams from the candidate.
      reference_ngrams: Set of n-grams from the reference.

    Returns:
      A dict with keys ``"f"``, ``"p"`` and ``"r"``. Precision is
      overlap / candidate size and recall is overlap / reference size; each is
      0.0 when its denominator is zero.
    """
    overlapping_count = len(evaluated_ngrams.intersection(reference_ngrams))
    evaluated_count = len(evaluated_ngrams)
    reference_count = len(reference_ngrams)

    precision = overlapping_count / evaluated_count if evaluated_count else 0.0
    recall = overlapping_count / reference_count if reference_count else 0.0

    # The small constant keeps the denominator non-zero when both terms are 0.
    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
    return dict(f=f1_score, p=precision, r=recall)


def greedy_selection(doc_sent_list, abstract_sent_list, summary_size):
    """Greedily choose document sentences that best cover the abstract.

    In each round the sentence whose addition gives the highest combined
    unigram + bigram F-score (see ``cal_rouge``) is selected, provided that
    score is strictly higher than the best score so far. Selection ends after
    ``summary_size`` rounds or as soon as no sentence improves the score.

    Args:
      doc_sent_list: Document sentences, each a list of tokens.
      abstract_sent_list: Abstract sentences, each a list of tokens.
      summary_size: Maximum number of sentences to select.

    Returns:
      The selected sentence indices in ascending order.
    """
    def _rouge_clean(s):
        # Keep only ASCII letters, digits and spaces.
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    reference_tokens = _rouge_clean(' '.join(sum(abstract_sent_list, []))).split()
    candidate_sents = [_rouge_clean(' '.join(tokens)).split() for tokens in doc_sent_list]

    # Per-sentence n-gram sets are computed once and combined per trial below.
    unigrams_per_sent = [_get_word_ngrams(1, [tokens]) for tokens in candidate_sents]
    reference_unigrams = _get_word_ngrams(1, [reference_tokens])
    bigrams_per_sent = [_get_word_ngrams(2, [tokens]) for tokens in candidate_sents]
    reference_bigrams = _get_word_ngrams(2, [reference_tokens])

    best_score = 0.0
    chosen = []
    for _ in range(summary_size):
        round_best_score = best_score
        round_best_idx = -1

        for idx in range(len(candidate_sents)):
            if idx in chosen:
                continue

            trial = chosen + [idx]
            trial_unigrams = set.union(*(set(unigrams_per_sent[j]) for j in trial))
            trial_bigrams = set.union(*(set(bigrams_per_sent[j]) for j in trial))
            rouge_1 = cal_rouge(trial_unigrams, reference_unigrams)['f']
            rouge_2 = cal_rouge(trial_bigrams, reference_bigrams)['f']
            trial_score = rouge_1 + rouge_2
            if trial_score > round_best_score:
                round_best_score = trial_score
                round_best_idx = idx

        # No remaining sentence improves on the current selection: stop early.
        if round_best_idx == -1:
            break
        chosen.append(round_best_idx)
        best_score = round_best_score

    return sorted(chosen)

### Bert Data

In [5]:
# Default cap on the number of source sentences kept per document.
max_src_nsents = 10000


class BertData():
    """Turns tokenized source/target sentences into BERT sub-token id sequences."""

    def __init__(self, min_src_ntokens_per_sent=5,
                max_src_ntokens_per_sent=200,
                max_src_nsents=max_src_nsents,
                min_src_nsents=1,
                max_tgt_ntokens=500,
                min_tgt_ntokens=5):
        """Store the length limits and load the ``bert-base-uncased`` tokenizer.

        Args:
          min_src_ntokens_per_sent: Source sentences must have more tokens than this.
          max_src_ntokens_per_sent: Source sentences are cut to this many tokens.
          max_src_nsents: Maximum number of source sentences kept.
          min_src_nsents: Minimum number of kept source sentences (unless testing).
          max_tgt_ntokens: Target sub-token sequence is cut to this length.
          min_tgt_ntokens: Minimum target sub-token count (unless testing).
        """
        # Length limits applied in preprocess().
        self.min_src_ntokens_per_sent = min_src_ntokens_per_sent
        self.max_src_ntokens_per_sent = max_src_ntokens_per_sent
        self.max_src_nsents = max_src_nsents
        self.min_src_nsents = min_src_nsents
        self.max_tgt_ntokens = max_tgt_ntokens
        self.min_tgt_ntokens = min_tgt_ntokens

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        # Special tokens and the vocabulary ids of the ones looked up below.
        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.tgt_bos = '[unused0]'
        self.tgt_eos = '[unused1]'
        self.tgt_sent_split = '[unused2]'
        vocab = self.tokenizer.vocab
        self.sep_vid = vocab[self.sep_token]
        self.cls_vid = vocab[self.cls_token]
        self.pad_vid = vocab[self.pad_token]

    def preprocess(self, src, tgt, sent_labels, use_bert_basic_tokenizer=False, is_test=False):
        """Convert one document/summary pair into model-ready id sequences.

        Args:
          src: Source sentences, each a list of tokens.
          tgt: Target sentences, each a list of tokens.
          sent_labels: Indices into ``src`` of the sentences labelled 1.
          use_bert_basic_tokenizer: Forwarded to the tokenizer for target text.
          is_test: When true, the emptiness/minimum-length checks are skipped.

        Returns:
          ``None`` when a check fails, otherwise the tuple
          ``(src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids,
          cls_ids, src_txt, tgt_txt)``.
        """
        training = not is_test
        if training and len(src) == 0:
            return None

        joined_originals = [' '.join(tokens) for tokens in src]

        # Positions of sentences strictly longer than the minimum token count.
        kept = [pos for pos, tokens in enumerate(src)
                if len(tokens) > self.min_src_ntokens_per_sent]

        # Expand the selected indices into one 0/1 flag per original sentence.
        flags = [0] * len(src)
        for chosen in sent_labels:
            flags[chosen] = 1

        # Truncate each kept sentence, then cap the number of sentences.
        src = [src[pos][:self.max_src_ntokens_per_sent] for pos in kept][:self.max_src_nsents]
        sent_labels = [flags[pos] for pos in kept][:self.max_src_nsents]

        if training and len(src) < self.min_src_nsents:
            return None

        # Sentences are joined with "[SEP] [CLS]" so each one is wrapped in its
        # own [CLS] ... [SEP] pair once the outer tokens are added.
        boundary = ' {} {} '.format(self.sep_token, self.cls_token)
        text = boundary.join(' '.join(tokens) for tokens in src)

        src_subtokens = [self.cls_token, *self.tokenizer.tokenize(text), self.sep_token]
        src_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(src_subtokens)

        # Record where every [SEP] and [CLS] id sits in the id sequence.
        sep_positions = [-1]
        cls_ids = []
        for pos, token_id in enumerate(src_subtoken_idxs):
            if token_id == self.sep_vid:
                sep_positions.append(pos)
            if token_id == self.cls_vid:
                cls_ids.append(pos)

        # Segment ids alternate 0/1 per [SEP]-terminated span.
        segments_ids = []
        for seg_no, (prev_sep, cur_sep) in enumerate(zip(sep_positions, sep_positions[1:])):
            segments_ids.extend([seg_no % 2] * (cur_sep - prev_sep))

        sent_labels = sent_labels[:len(cls_ids)]

        tgt_pieces = [
            ' '.join(self.tokenizer.tokenize(' '.join(tt), use_bert_basic_tokenizer=use_bert_basic_tokenizer))
            for tt in tgt
        ]
        tgt_subtokens_str = '[unused0] ' + ' [unused2] '.join(tgt_pieces) + ' [unused1]'
        tgt_subtoken = tgt_subtokens_str.split()[:self.max_tgt_ntokens]
        if training and len(tgt_subtoken) < self.min_tgt_ntokens:
            return None

        tgt_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(tgt_subtoken)

        tgt_txt = '<q>'.join(' '.join(tt) for tt in tgt)
        # Untruncated text of every sentence that passed the length filter.
        src_txt = [joined_originals[pos] for pos in kept]

        return src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt


In [14]:
# Case ids are the tokenized opinion file names with the '.txt.json' part removed.
tokenized_opinions_dir = './presumm_data/tokenized_text/opinions'
case_files = os.listdir(tokenized_opinions_dir)
case_ids = [file_name.replace(".txt.json", "") for file_name in case_files]
# Ids that already have a JSON file in json_data; only needed when the
# commented-out filter below is enabled to skip finished cases.
parsed_files = [file_name.replace(".json", "") for file_name in os.listdir('./presumm_data/json_data')]
#case_ids = list(set(case_ids).difference(parsed_files))
len(case_ids)

3693

In [17]:
def generate_bert_data(case_id):
    """Build the BERT-ready record for one case.

    Loads the tokenized case, labels up to five source sentences with
    ``greedy_selection`` and runs ``BertData.preprocess`` on the result.

    Args:
      case_id: Identifier of the case to process.

    Returns:
      ``(case_id, record)`` where ``record`` is a dict with the keys ``src``,
      ``tgt``, ``src_sent_labels``, ``segs``, ``clss``, ``src_txt`` and
      ``tgt_txt``; ``None`` when preprocessing rejects the case.
    """
    def _relower(sentences):
        return [' '.join(tokens).lower().split() for tokens in sentences]

    source, tgt = load_json(case_id)
    sent_labels = greedy_selection(source[:max_src_nsents], tgt, 5)
    source = _relower(source)
    tgt = _relower(tgt)

    bert = BertData()
    processed = bert.preprocess(source, tgt, sent_labels,
                                use_bert_basic_tokenizer=True, is_test=False)
    if processed is None:
        return None

    (src_subtoken_idxs, sent_labels, tgt_subtoken_idxs,
     segments_ids, cls_ids, src_txt, tgt_txt) = processed
    record = {
        "src": src_subtoken_idxs,
        "tgt": tgt_subtoken_idxs,
        "src_sent_labels": sent_labels,
        "segs": segments_ids,
        'clss': cls_ids,
        'src_txt': src_txt,
        "tgt_txt": tgt_txt,
    }
    return (case_id, record)


In [18]:
from mutliprocessing_funcs import generate_bert_data

In [19]:
from multiprocessing import Pool

# Process the cases in parallel and write one JSON file per successful case.
# The context manager tears the workers down even if a worker or a file write
# raises, so a failed run does not leave 32 processes behind.
with Pool(32) as pool:
    for b_data_tp in pool.imap_unordered(generate_bert_data,case_ids):
        # generate_bert_data returns None for cases rejected by preprocessing.
        if b_data_tp is None:
            continue
        with open(f'./presumm_data/json_data/{b_data_tp[0]}.json', 'w') as fp:
            json.dump(b_data_tp[1], fp)
    # Normal completion: let the workers exit cleanly before leaving the block.
    pool.close()
    pool.join()

### Create Train, test and validation datasets

In [20]:
# os.listdir returns names in arbitrary order; sorting makes the seeded
# train/validation/test shuffle that follows reproducible across machines and runs.
all_cases = sorted(case_id.replace(".json","") for case_id in os.listdir('./presumm_data/json_data/'))

In [21]:
# Split sizes: 80% train (rounded up), half of the remainder (rounded up) for
# validation, and whatever is left for test.
num_cases = len(all_cases)
train_cases = int(np.ceil(0.8 * num_cases))
val_cases = int(np.ceil((num_cases - train_cases) / 2))
test_cases = num_cases - train_cases - val_cases

# Shuffle the positions with a fixed seed, then cut them at the two boundaries.
all_index = np.arange(num_cases)
np.random.seed(1)
np.random.shuffle(all_index)
train_indices, val_indices, test_indices = np.split(
    all_index, [train_cases, train_cases + val_cases])

In [22]:
# Replace the split sizes with the actual case ids belonging to each split.
all_cases_array = np.array(all_cases)
train_cases = all_cases_array[train_indices]
val_cases = all_cases_array[val_indices]
test_cases = all_cases_array[test_indices]

In [23]:
def append_samples(case_list, data_dir='./presumm_data/json_data'):
    """Load the per-case JSON records for a list of case ids.

    Args:
      case_list: Iterable of case ids; ``<case_id>.json`` is read for each.
      data_dir: Directory containing the per-case JSON files.

    Returns:
      A list with the parsed content of every file that could be read, in
      input order. Cases that fail to load are reported and skipped.
    """
    appended_samples = []
    for case_id in case_list:
        case_path = os.path.join(data_dir, f'{case_id}.json')
        try:
            with open(case_path, 'r') as f:
                case_content = json.load(f)
        except Exception as exc:
            # Best effort: skip unreadable cases, but say why, and do not
            # swallow KeyboardInterrupt/SystemExit as a bare except would.
            print(f'Error reading case {case_id}: {exc}')
            continue
        appended_samples.append(case_content)
    return appended_samples

In [24]:
# Load the stored records of each split, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    append_samples(split_ids) for split_ids in (train_cases, val_cases, test_cases)
)

In [25]:
# Serialize each split to its own file.
for split_dataset, split_path in (
        (train_dataset, 'presumm_data/train_dataset.pt'),
        (val_dataset, 'presumm_data/val_dataset.pt'),
        (test_dataset, 'presumm_data/test_dataset.pt')):
    torch.save(split_dataset, split_path)

In [26]:
# A small sample: the records of the first ten test cases.
sample_case_ids = test_cases[:10]
test_sample_dataset = append_samples(sample_case_ids)
len(test_sample_dataset)

10

In [27]:
# Store the sample under the file name used in the PreSumm sample-data directory.
sample_output_path = 'presumm/bert_data/sample/cnndm.test.1.bert.pt'
torch.save(test_sample_dataset, sample_output_path)

In [28]:
# Persist the case ids of every split so the same partition can be reloaded later.
for split_file, split_ids in (
        ('test_cases.json', test_cases),
        ('train_cases.json', train_cases),
        ('val_cases.json', val_cases)):
    with open(split_file,'w') as f:
        json.dump(split_ids.tolist(),f)


### Data Prep for Matchsum data

In [7]:
# For Match Sum Data
with open('test_cases.json','r') as f:
    test_cases = json.load(f)

# Not filled by this cell; kept so any later cell that refers to them still works.
text_summary=[]
sent_id = []

# Open both outputs once, in write mode: the original reopened them in append
# mode for every case, so re-running the cell duplicated every record.
with open('sentence_id.json','w') as sent_id_file, \
        open('match_summ_sample.json','w') as sample_file:
    for case_id in test_cases:
        source, tgt = load_json(case_id)
        # Label up to five source sentences against the reference summary.
        sent_labels = greedy_selection(source[:max_src_nsents], tgt, 5)
        source = [' '.join(s).lower() for s in source]
        tgt = [' '.join(s).lower() for s in tgt]

        # One JSON object per line in each file; line i of both files belongs
        # to the same case.
        json.dump({'sent_id':sent_labels}, sent_id_file)
        sent_id_file.write('\n')

        json.dump({'text':source, 'summary':tgt}, sample_file)
        sample_file.write('\n')