In [4]:
import sys
import json
import ast
import pickle
import pandas as pd
from transformers import AutoTokenizer
from lexrank import STOPWORDS, LexRank

In [5]:
# useful functions:
def to_paragraph(text):
    """
    converts text comprised of lists of sentances to a single to_paragraph

    text - list of multiple string sentances

    continuous_text - single continuous text string
    """
    continuous_text = ''
    for line in text:
        continuous_text += line
    return continuous_text

def shuffle_and_sample(data):
    random.seed(42)
    random.shuffle(data)
    return data[:600]

def load_data(path):
    with open(path) as f:
        contents = f.readlines()
    return contents

def parse_paper(json_paper):
    first_section = to_paragraph(json_paper['sections'][0])
    last_section = to_paragraph(json_paper['sections'][-1])
    first_n_last = first_section + last_section
    return first_n_last

def to_json(str_blob):
    json_paper = json.loads(str_blob)
    return json_paper

def to_paragraph(text):
    """
    converts text comprised of lists of sentances to a single to_paragraph

    text - list of multiple string sentances

    continuous_text - single continuous text string
    """
    continuous_text = ''
    for line in text:
        continuous_text += line
    return continuous_text


In [31]:
with open('raw_data/val.txt') as f:
    val = f.readlines()
# creates json list of validation documents
val_json_list = []
for i in val:
    val_json_list.append(to_json(i[:-1]))
val_json_dict = {"data": val_json_list}

with open('json_val.txt', 'w') as outfile:
    json.dump(val_json_dict, outfile)

In [6]:
# open mini-val data as a list of dicts
with open('json_val.txt') as json_file:
    val = json.load(json_file)
val = val['data']

In [7]:
def re_order_temporal(data):
    #print('hi 1')
    """
    data - input data in json format with attribute "article_text" and "abstract_text"
    
    re_ordered_df - dataframe of schema: text | target, where text is the truncated and temporally corrected text, and the target is the raw abstract
    
    """
    documents = [i['article_text'] for i in data]
    truncated_df = pd.DataFrame(columns=['text', 'target'])
    # instantiate tokenizer and lexranker
    lxr = LexRank(documents[:1000], stopwords=STOPWORDS['en'])
    #iterate over dataset and assemble dataframe
    count = 0
    for i in data:
        summary = to_paragraph(i['abstract_text'])
        # create data frame of schema sentance | rank | order 
        rank = lxr.rank_sentences(sentences = i['article_text'], threshold=None, fast_power_method=False) # generate list of ranks for each sentance
        rank = pd.DataFrame(i['article_text'], rank) # put each sentance in a data frame with corresponding rank
        rank.reset_index(inplace=True)  # reset index
        rank['order'] = rank.index  # create column to keep track of the order of each sentance
        rank.rename({'index':'rank', 0:'sentance'}, inplace=True, axis=1) # rename columns to make sense
        #check length of article and select sentance cuttoff start point
        approx_len = sum([len(sentance.split(' ')) for sentance in i['article_text']])
        if approx_len == 0:
            continue
        ratio = 4096 / approx_len  # should be something like 0.5 if its 100% oversized used to approximate split point of data to speed up algorithm
        sentance_cuttoff = int(len(rank) * ratio) + 10 # attempt to gather extra to pair down as needed (select 10 extra sentances)
        # if the article is too long, slice away sentances until it is short enough
        while approx_len > 4096:
            temp_article = []
            # append sentances from sorted rank data frame tp temp_article until the sentance cuttoff is reached (only the most important sentances remain)
            for j in rank.sort_values('rank', ascending=False).sentance[:sentance_cuttoff]:
                temp_article.append(j)
            # check token length of temp_article to see if we have cutt off enough of the document
            approx_len = sum([len(sentance.split(' ')) for sentance in temp_article])
            # reduce sentance cuttoff by 1
            sentance_cuttoff -=1
        # when token lenght is satisfied, add record to final dataframe
        rank = rank.sort_values('rank', ascending=False)[:sentance_cuttoff] # select only up to cuttoff point
        rank.sort_values('order', ascending=True, inplace=True) # implement temporal correction
        # re-assemble truncated text and create a single continuous text field
        truncated_text = []
        for k in rank.sentance:
            truncated_text.append(k)
        truncated_text = to_paragraph(truncated_text) # truncated text
        summary = to_paragraph(i['abstract_text']) # summary
        addition = pd.DataFrame([[truncated_text, summary]],columns=['text', 'target'])
        truncated_df = truncated_df.append(addition, ignore_index=True)
        #print(len(truncated_df))
        if count % 100 == 0 and count !=0:
            print(f'processed {count} documents')
            print(len(truncated_df))
        count +=1
    return truncated_df

In [10]:
# create df and save as csv
ranked_val_df = re_order_temporal(val)
ranked_val_df.to_csv('ranked_temporal_val_df.csv', index=False)

processed 100 documents
101
processed 200 documents
201
processed 300 documents
301
processed 400 documents
401
processed 500 documents
501
processed 600 documents
601
processed 700 documents
701
processed 800 documents
801
processed 900 documents
901
processed 1000 documents
1001
processed 1100 documents
1101
processed 1200 documents
1201
processed 1300 documents
1301
processed 1400 documents
1401
processed 1500 documents
1501
processed 1600 documents
1601
processed 1700 documents
1701
processed 1800 documents
1801
processed 1900 documents
1901
processed 2000 documents
2001
processed 2100 documents
2101
processed 2200 documents
2201
processed 2300 documents
2301
processed 2400 documents
2401
processed 2500 documents
2501
processed 2600 documents
2601
processed 2700 documents
2701
processed 2800 documents
2801
processed 2900 documents
2901
processed 3000 documents
3001
processed 3100 documents
3101
processed 3200 documents
3201
processed 3300 documents
3301
processed 3400 documents
340