In [None]:
# using a pre-trained summarization model, create one instance for every input, then decode from the ensemble

In [1]:
# https://github.com/nlpyang/PreSumm

#### Updates: For encoding a text longer than 512 tokens, for example 800. Set max_pos to 800 during both preprocessing and training.

# -mode can be {validate, test}: validate inspects the model directory and evaluates the model for each newly saved checkpoint; test needs to be used with -test_from, indicating the checkpoint you want to use.
# MODEL_PATH is the directory of saved checkpoints.
# Use -mode validate with -test_all and the system will load all saved checkpoints and select the top ones to generate summaries (this will take a while).

In [None]:
# the baseline setup results in memory error, try building on MT-GPU, or containerize for ease of use 

In [None]:
%%bash
# Run PreSumm's abstractive model in `validate` mode on the CNN/DM BERT data.
# (`%%bash` is the cell magic; a single-% `%bash` is not a valid magic, so the cell never ran as a script.)

# Observed failure -- probably caused by the pytorch version in their requirements.txt:
# RuntimeError: cuda runtime error (38) : no CUDA-capable device is detected at /pytorch/aten/src/THC/THCGeneral.cpp:51


# Abort if the checkout is missing instead of launching train.py from the wrong directory.
cd ~/projects/PreSumm/src || exit 1

source activate presumm

BATCH_SIZE=1
# note last part of BERT_DATA_PATH is file prefix
BERT_DATA_PATH=/data/PreSumm_data/bert_data/bert_data_cnndm_final/cnndm
MODEL_PATH=/data/PreSumm_data/models

python train.py \
  -task abs \
  -mode validate \
  -batch_size "${BATCH_SIZE}" \
  -test_batch_size "${BATCH_SIZE}" \
  -bert_data_path "${BERT_DATA_PATH}" \
  -log_file ../logs/val_abs_bert_cnndm \
  -model_path "${MODEL_PATH}" \
  -sep_optim true \
  -use_interval true \
  -visible_gpus 0 \
  -max_pos 512 \
  -max_length 200 \
  -alpha 0.95 \
  -min_length 50 \
  -result_path ../logs/abs_bert_cnndm



In [None]:
# export CORENLP_HOME=/data/stanford_core_nlp/stanford-corenlp-full-2018-10-05

In [None]:
%%bash
# End-to-end notes for running PreSumm on MultiNews:
#   (a) tokenize / sentence-split with Stanford CoreNLP
#   (b) build {src, tgt} JSON with `annotate` + jq
#   (c) convert the JSON to BERT .pt files with preprocess.py
#   (d) run train.py in validate mode on the converted data
# (`%%bash` is the cell magic; a single-% `%bash` is not a valid magic.)

# ---------------------------------------------------------------------------
# (a) CoreNLP tokenization
# ---------------------------------------------------------------------------
# NOTE(review): this CLASSPATH looks truncated compared to the CORENLP_HOME used below
# (/data/stanford_core_nlp/stanford-corenlp-full-2018-10-05) -- confirm it points at the CoreNLP jars.
export CLASSPATH=/data/stanford_core


# Every continuation backslash must be the last character on its line: a trailing space after it
# ends the command early and the remaining options are run as a separate command.
java edu.stanford.nlp.pipeline.StanfordCoreNLP \
  -annotators tokenize,ssplit \
  -ssplit.newlineIsSentenceBreak always \
  -filelist mapping_for_corenlp.txt \
  -outputFormat json \
  -outputDirectory tokenized_stories_dir


# Reference only: Python snippet that builds the equivalent CoreNLP command line.
# Kept as comments because this is a bash cell and the snippet is not valid shell.
# command = ['java', 'edu.stanford.nlp.pipeline.StanfordCoreNLP', '-annotators', 'tokenize,ssplit',
#            '-ssplit.newlineIsSentenceBreak', 'always', '-filelist', 'mapping_for_corenlp.txt', '-outputFormat',
#            'json', '-outputDirectory', tokenized_stories_dir]
# print("Tokenizing %i files in %s and saving in %s..." % (len(stories), stories_dir, tokenized_stories_dir))


# ---------------------------------------------------------------------------
# (b) Build the {src, tgt} JSON
# ---------------------------------------------------------------------------
# NOTE: we still need to clean the multinews format (removing NEWLINE tokens and document separators, etc...)

export CORENLP_HOME=/data/stanford_core_nlp/stanford-corenlp-full-2018-10-05

# annotate -i val.src.100 -f json --annotators tokenize ssplit | jq '{src: [.[][] | [.tokens[].word]]}' > val.src.100.corenlp.json


# WORKING one-liner (preview only: a pager such as `less` is not interactive inside a notebook cell,
# so only the first lines of the result are shown)
jq -n \
  --slurpfile o1 <(annotate -i val.src.50 -f json --annotators tokenize ssplit | jq '{src: [.[][] | [.tokens[].word]]}') \
  --slurpfile o2 <(annotate -i val.tgt.50 -f json --annotators tokenize ssplit | jq '{tgt: [.[][] | [.tokens[].word]]}') \
  'reduce range(0; $o1|length) as $i ([]; . + [{ "src": $o1[$i].src, "tgt": $o2[$i].tgt}])' | head -n 40


# Same pipeline on the real data, written to a file.
# NOTE(review): the variables are named VALID_* but point at the *test* split files.
DATADIR=/data/PreSumm_data/multi-news/preprocessed_truncated
VALID_SRC=${DATADIR}/test.txt.src.tokenized.fixed.cleaned.final.truncated.txt
VALID_TGT=${DATADIR}/test.txt.tgt.tokenized.fixed.cleaned.final.truncated.txt
VALID_OUT=${DATADIR}/test.corenlp.json
jq -n \
  --slurpfile o1 <(annotate -i "${VALID_SRC}" -f json --annotators tokenize ssplit | jq '{src: [.[][] | [.tokens[].word]]}') \
  --slurpfile o2 <(annotate -i "${VALID_TGT}" -f json --annotators tokenize ssplit | jq '{tgt: [.[][] | [.tokens[].word]]}') \
  'reduce range(0; $o1|length) as $i ([]; . + [{ "src": $o1[$i].src, "tgt": $o2[$i].tgt}])' > "${VALID_OUT}"


# ---------------------------------------------------------------------------
# (c) Map the JSON into .pt files
# ---------------------------------------------------------------------------
# After the one-liner above we need to map into .pt files
# Note file must have prefix in ['train', 'valid', 'test']
# NOTE(review): the JSON above is written to ${DATADIR}/test.corenlp.json, while this step reads
# ${JSON_DIR} and the rename below expects test.multinews.corenlp.* -- presumably the file was
# moved/renamed by hand in between; confirm.

source activate presumm
PRESUM=${HOME}/projects/PreSumm
JSON_DIR=${DATADIR}/presumm_json_input
OUTPUT_DIR=${JSON_DIR}/bert_files_for_presumm
mkdir -p "${OUTPUT_DIR}"
# preprocess.log is given as a relative path, so run from JSON_DIR; abort if it does not exist.
cd "${JSON_DIR}" || exit 1

python "${PRESUM}/src/preprocess.py" \
 -mode format_to_bert \
 -raw_path "${JSON_DIR}" \
 -save_path "${OUTPUT_DIR}" \
 -lower \
 -n_cpus 1 \
 -log_file preprocess.log


# now rename files so that the prefixes work
# The .pt files are written to -save_path (OUTPUT_DIR), and BERT_DATA_PATH below uses
# ${OUTPUT_DIR}/multinews as its prefix, so copy inside OUTPUT_DIR rather than the current directory.
cp "${OUTPUT_DIR}/test.multinews.corenlp.bert.pt" "${OUTPUT_DIR}/multinews.test.corenlp.bert.pt"


# ---------------------------------------------------------------------------
# (d) Summarize
# ---------------------------------------------------------------------------
# Try summarizing the (flattened) multinews file
# TODO: increase max length of summaries to fit with MultiNews dataset
cd "${PRESUM}/src" || exit 1

BATCH_SIZE=32
MAX_SUMMARY_LENGTH=128
# note last part of BERT_DATA_PATH is file prefix
BERT_DATA_PATH=${OUTPUT_DIR}/multinews
MODEL_PATH=/data/PreSumm_data/models

# NOTE(review): -log_file and -result_path still carry the "cnndm" names from the CNN/DM run,
# so this run writes to the same log/result locations -- confirm that is intended.
python train.py \
  -task abs \
  -mode validate \
  -batch_size "${BATCH_SIZE}" \
  -test_batch_size "${BATCH_SIZE}" \
  -bert_data_path "${BERT_DATA_PATH}" \
  -log_file ../logs/val_abs_bert_cnndm \
  -model_path "${MODEL_PATH}" \
  -sep_optim true \
  -use_interval true \
  -visible_gpus 0 \
  -max_pos 512 \
  -max_length "${MAX_SUMMARY_LENGTH}" \
  -alpha 0.95 \
  -min_length 50 \
  -result_path ../logs/abs_bert_cnndm




In [None]:
# multinews has rouge from opennmt, presumably this is what they used 
# https://github.com/Alex-Fabbri/Multi-News/blob/3675e7c422ae3b4020617a324ac264f50333357d/code/OpenNMT-py-baselines/tools/test_rouge.py

In [None]:
# split every multinews line into constituent story files

# download Stanford NLP and set classpath accordingly

# remember presumm does a lot of idiosyncratic things with the BERT special tokens

def multinews_to_presumm_json_format(multinews_file):
    """Placeholder for converting a MultiNews file into PreSumm's JSON input format.

    Intended approach (not implemented yet): the simplest possible thing, i.e. just
    flatten a multinews row into a single document.

    Args:
        multinews_file: the MultiNews input to convert; currently unused.
            (presumably a file path -- TODO confirm once implemented)

    Returns:
        None. The function body is still a stub and performs no work.
    """
    # TODO: implement the flattening described in the docstring.
    return None


# Multi-News repository: https://github.com/Alex-Fabbri/Multi-News



In [None]:
# Preprocessing to prepare a new test dataset

# Note we try to go around having to use their clunky preprocessing



In [None]:
# (1) Format MultiNews to the .json format expected by PreSumm

In [None]:
# (2) Map json-formatted data to pytorch tensors for BERT, store them in a file that we can use 
#   to get the summaries for the MultiNews dev+test sets