In [6]:
import sys
import json
import ast
import pickle
import pandas as pd
from transformers import AutoTokenizer
from lexrank import STOPWORDS, LexRank

In [7]:
len(pd.read_csv('ranked_temporal_train_df.csv'))

119920

In [2]:
# useful functions:
def to_paragraph(text):
    """
    converts text comprised of lists of sentances to a single to_paragraph

    text - list of multiple string sentances

    continuous_text - single continuous text string
    """
    continuous_text = ''
    for line in text:
        continuous_text += line
    return continuous_text

def shuffle_and_sample(data):
    random.seed(42)
    random.shuffle(data)
    return data[:600]

def load_data(path):
    with open(path) as f:
        contents = f.readlines()
    return contents

def parse_paper(json_paper):
    first_section = to_paragraph(json_paper['sections'][0])
    last_section = to_paragraph(json_paper['sections'][-1])
    first_n_last = first_section + last_section
    return first_n_last

def to_json(str_blob):
    json_paper = json.loads(str_blob)
    return json_paper

def to_paragraph(text):
    """
    converts text comprised of lists of sentances to a single to_paragraph

    text - list of multiple string sentances

    continuous_text - single continuous text string
    """
    continuous_text = ''
    for line in text:
        continuous_text += line
    return continuous_text


In [3]:
# open mini-val data as a list of dictionaryies
with open('mini_val_set_json.txt') as json_file:
    mini_val = json.load(json_file)
mini_val = mini_val['data']

In [4]:
# test to evaluate if these are the correct indexes (should print 600)
# read indexes
with open('mini_val_indexes.txt') as f:
    indexes = set(f.read().split('\t'))
# compare indexes
correct = 0
for i in mini_val:
    if i['article_id'] in indexes:
        correct += 1
print(f'{correct} correct indexes.')

600 correct indexes.


In [30]:
def re_order_temporal(data):
    """
    data - input data in json format with attribute "article_text" and "abstract_text"
    
    re_ordered_df - dataframe of schema: text | target, where text is the truncated and temporally corrected text, and the target is the raw abstract
    
    """
    documents = [i['article_text'] for i in data]
    truncated_df = pd.DataFrame(columns=['text', 'target'])
    # instantiate tokenizer and lexranker
    tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-pubmed")
    lxr = LexRank(documents, stopwords=STOPWORDS['en'])
    #iterate over dataset and assemble dataframe
    count = 0
    for i in data:
        summary = to_paragraph(i['abstract_text'])
        # create data frame of schema sentance | rank | order 
        rank = lxr.rank_sentences(sentences = i['article_text'], threshold=None, fast_power_method=False) # generate list of ranks for each sentance
        rank = pd.DataFrame(i['article_text'], rank) # put each sentance in a data frame with corresponding rank
        rank.reset_index(inplace=True)  # reset index
        rank['order'] = rank.index  # create column to keep track of the order of each sentance
        rank.rename({'index':'rank', 0:'sentance'}, inplace=True, axis=1) # rename columns to make sense
        #check length of article and select sentance cuttoff start point
        token_len = len(tokenizer(i['article_text'], return_tensors='pt', is_split_into_words=True)[0])
        ratio = 4096 / token_len  # should be something like 0.5 if its 100% oversized used to approximate split point of data to speed up algorithm
        sentance_cuttoff = int(len(rank) * ratio) + 10 # attempt to gather extra to pair down as needed (select 10 extra sentances)
        
        # if the article is too long, slice away sentances until it is short enough
        while token_len > 4096:
            temp_article = []
            # append sentances from sorted rank data frame tp temp_article until the sentance cuttoff is reached (only the most important sentances remain)
            for j in rank.sort_values('rank', ascending=False).sentance[:sentance_cuttoff]:
                temp_article.append(j)
            # check token length of temp_article to see if we have cutt off enough of the document
            token_len = len(tokenizer(temp_article, return_tensors='pt', is_split_into_words=True)[0])
            # reduce sentance cuttoff by 1
            sentance_cuttoff -=1
        # when token lenght is satisfied, add record to final dataframe
        rank = rank.sort_values('rank', ascending=False)[:sentance_cuttoff] # select only up to cuttoff point
        rank.sort_values('order', ascending=True, inplace=True) # implement temporal correction
        # re-assemble truncated text and create a single continuous text field
        truncated_text = []
        for k in rank.sentance:
            truncated_text.append(k)
        truncated_text = to_paragraph(truncated_text) # truncated text
        summary = to_paragraph(i['abstract_text']) # summary
        addition = pd.DataFrame([[truncated_text, summary]],columns=['text', 'target'])
        truncated_df = truncated_df.append(addition, ignore_index=True)
        if count % 10 == 0:
            print(f'processed {count} files.')
    return truncated_df

In [31]:
# create df and save as csv
ranked_temporal_mini_val_df = re_order_temporal(mini_val)
ranked_temporal_mini_val_df.to_csv('ranked_temporal_mini_val_df.csv', index=False)

processed 0 files.


Token indices sequence length is longer than the specified maximum sequence length for this model (6506 > 4096). Running this sequence through the model will result in indexing errors


processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 

In [5]:
def re_order(data):
    """
    data - input data in json format with attribute "article_text" and "abstract_text"
    
    re_ordered_df - dataframe of schema: text | target, where text is the truncated and temporally corrected text, and the target is the raw abstract
    
    """
    documents = [i['article_text'] for i in data]
    truncated_df = pd.DataFrame(columns=['text', 'target'])
    # instantiate tokenizer and lexranker
    tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-pubmed")
    lxr = LexRank(documents, stopwords=STOPWORDS['en'])
    #iterate over dataset and assemble dataframe
    count = 0
    for i in data:
        summary = to_paragraph(i['abstract_text'])
        # create data frame of schema sentance | rank | order 
        rank = lxr.rank_sentences(sentences = i['article_text'], threshold=None, fast_power_method=False) # generate list of ranks for each sentance
        rank = pd.DataFrame(i['article_text'], rank) # put each sentance in a data frame with corresponding rank
        rank.reset_index(inplace=True)  # reset index
        rank['order'] = rank.index  # create column to keep track of the order of each sentance
        rank.rename({'index':'rank', 0:'sentance'}, inplace=True, axis=1) # rename columns to make sense
        #check length of article and select sentance cuttoff start point
        token_len = len(tokenizer(i['article_text'], return_tensors='pt', is_split_into_words=True)[0])
        ratio = 4096 / token_len  # should be something like 0.5 if its 100% oversized used to approximate split point of data to speed up algorithm
        sentance_cuttoff = int(len(rank) * ratio) + 10 # attempt to gather extra to pair down as needed (select 10 extra sentances)
        
        # if the article is too long, slice away sentances until it is short enough
        while token_len > 4096:
            temp_article = []
            # append sentances from sorted rank data frame tp temp_article until the sentance cuttoff is reached (only the most important sentances remain)
            for j in rank.sort_values('rank', ascending=False).sentance[:sentance_cuttoff]:
                temp_article.append(j)
            # check token length of temp_article to see if we have cutt off enough of the document
            token_len = len(tokenizer(temp_article, return_tensors='pt', is_split_into_words=True)[0])
            # reduce sentance cuttoff by 1
            sentance_cuttoff -=1
        # when token lenght is satisfied, add record to final dataframe
        rank = rank.sort_values('rank', ascending=False)[:sentance_cuttoff] # select only up to cuttoff point
        # re-assemble truncated text and create a single continuous text field
        truncated_text = []
        for k in rank.sentance:
            truncated_text.append(k)
        truncated_text = to_paragraph(truncated_text) # truncated text
        summary = to_paragraph(i['abstract_text']) # summary
        addition = pd.DataFrame([[truncated_text, summary]],columns=['text', 'target'])
        truncated_df = truncated_df.append(addition, ignore_index=True)
        if count % 10 == 0:
            print(f'processed {count} files.')
    return truncated_df

In [6]:
# create df and save as csv
ranked_mini_val_df = re_order(mini_val)
ranked_mini_val_df.to_csv('ranked_mini_val_df.csv', index=False)

processed 0 files.


Token indices sequence length is longer than the specified maximum sequence length for this model (6506 > 4096). Running this sequence through the model will result in indexing errors


processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 files.
processed 0 