## Use:

### This notebook is used to create .csv files that contain augmented data

ranked_temporal_train_df.csv - contains articles truncated by dropping their least important sentences, with the remaining sentences put back in their original (temporal) order

ranked_train_df.csv - contains data truncated based on the least important information, with the sentences of each article ordered by importance rank

### Imports

In [3]:
# imports
#import ast # ?
import json
#import sys # ?
#import pickle # ?
#from rouge_score import rouge_scorer # dont need
#from rouge_score import scoring # dont need
import random # need for shuffle and sample (delete when rolled into utils.py)
import pandas as pd
#import matplotlib.pyplot as plt # dont need
#import seaborn as sns # dont need
#import numpy as np # ?
#import scipy # ?
from lexrank import STOPWORDS, LexRank

### Load Data

In [5]:
# Parse the training split from its JSON file and keep only the value
# stored under the top-level 'data' key.
with open('json_train.txt') as json_file:
    train = json.load(json_file)['data']

### Utility Functions

In [6]:
# useful functions:
def to_paragraph(text):
    """
    Converts text comprised of a list of sentences to a single paragraph.
    The sentences are concatenated as-is; no separator is inserted.

    text - list of multiple string sentences

    continuous_text - single continuous text string
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return ''.join(text)

def shuffle_and_sample(data, sample_size=600, seed=42):
    """
    Returns a reproducible random sample taken from the front of a shuffled
    copy of data.

    data - list of records to sample from (left unmodified)
    sample_size - maximum number of records to return (default 600)
    seed - seed for the shuffle, so repeated calls give the same sample (default 42)

    sample - list holding the first sample_size records of the shuffled copy;
             shorter than sample_size when data has fewer records
    """
    # Shuffle a copy with a private generator: the caller's list keeps its
    # order and the module-level random state is not reseeded as a side effect.
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)
    return shuffled[:sample_size]

def load_data(path):
    """
    Reads a text file and returns its lines.

    path - location of the file to open

    lines - list with one string per line, line endings included
    """
    with open(path) as handle:
        return handle.readlines()

def parse_paper(json_paper):
    """
    Builds one string from the first and last sections of a paper.

    json_paper - mapping with a 'sections' entry; each section is passed to
                 to_paragraph, so it is expected to be a list of sentence strings

    first_n_last - first section text immediately followed by last section text
                   (a paper with a single section contributes that section twice)
    """
    sections = json_paper['sections']
    opening = to_paragraph(sections[0])
    closing = to_paragraph(sections[-1])
    return opening + closing

def to_json(str_blob):
    """
    Parses a JSON document held in a string.

    str_blob - string containing JSON text

    json_paper - the decoded Python object
    """
    return json.loads(str_blob)

def to_paragraph(text):
    """
    Converts text comprised of a list of sentences to a single paragraph.
    The sentences are concatenated as-is; no separator is inserted.

    NOTE: this repeats the to_paragraph definition that appears earlier in
    this cell; being the later one, it is the definition bound to the name
    once the cell has run.

    text - list of multiple string sentences

    continuous_text - single continuous text string
    """
    # str.join builds the result in one pass instead of repeated `+=`.
    return ''.join(text)


### Generate truncated and temporally corrected dataframe and save to .csv

In [7]:
def re_order_temporal(data, max_tokens=4096, corpus_size=1000):
    """
    Truncates every article to its highest-ranked sentences and puts the
    surviving sentences back in their original order.

    Sentences are scored with LexRank. An article whose approximate length
    (number of space-separated words) exceeds max_tokens loses its
    lowest-scoring sentences until what is left fits. Articles with an empty
    "article_text" are skipped.

    data - input data in json format with attribute "article_text" and "abstract_text"
    max_tokens - approximate word budget for each truncated article (default 4096)
    corpus_size - number of leading documents used to build the LexRank model (default 1000)

    re_ordered_df - dataframe of schema: text | target, where text is the truncated and temporally corrected text, and the target is the raw abstract
    """
    documents = [paper['article_text'] for paper in data]
    # instantiate lexranker on the leading slice of the corpus
    lxr = LexRank(documents[:corpus_size], stopwords=STOPWORDS['en'])
    # Rows are gathered in a plain list and turned into a frame once at the
    # end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []
    count = 0
    for paper in data:
        sentences = paper['article_text']
        approx_len = sum(len(sentence.split(' ')) for sentence in sentences)
        # Nothing to rank or truncate; skipped before the ranking call.
        if approx_len == 0:
            continue
        scores = lxr.rank_sentences(sentences=sentences, threshold=None, fast_power_method=False)
        # frame of schema rank | sentence | order, most important sentence first
        ranked = pd.DataFrame({
            'rank': scores,
            'sentence': sentences,
            'order': range(len(sentences)),
        }).sort_values('rank', ascending=False)
        ranked_sentences = ranked['sentence'].tolist()
        cutoff = len(ranked_sentences)
        if approx_len > max_tokens:
            # Start near the expected split point (plus 10 spare sentences)
            # rather than from the full article, then pare down as needed.
            ratio = max_tokens / approx_len
            cutoff = min(int(len(ranked_sentences) * ratio) + 10, cutoff)
            token_counts = [len(sentence.split(' ')) for sentence in ranked_sentences]
            kept_len = sum(token_counts[:cutoff])
            # Drop the least important kept sentence until the text fits.
            # The cutoff stops at the first size that fits (no extra sentence
            # is removed afterwards) and never goes below zero, so a negative
            # slice can never select "all but the last" sentence.
            while cutoff > 0 and kept_len > max_tokens:
                cutoff -= 1
                kept_len -= token_counts[cutoff]
        # keep the top sentences, then implement the temporal correction
        kept = ranked.iloc[:cutoff].sort_values('order', ascending=True)
        truncated_text = to_paragraph(kept['sentence'].tolist())
        summary = to_paragraph(paper['abstract_text'])
        rows.append([truncated_text, summary])
        if count % 100 == 0 and count != 0:
            print(f'processed {count} documents')
            print(len(rows))
        count += 1
    return pd.DataFrame(rows, columns=['text', 'target'])

In [8]:
# # create df and save as csv
# ranked_temporal_train_df = re_order_temporal(train)
# ranked_temporal_train_df.to_csv('ranked_temporal_train_df.csv', index=False)

### Generate truncated and ranked dataframe and save to .csv

In [37]:
def re_order(data, corpus_size=1000):
    """
    Re-orders the sentences of every article from most to least important.

    Sentences are scored with LexRank and concatenated in descending score
    order; no sentence is dropped. Articles with an empty "article_text" are
    skipped.

    data - input data in json format with attribute "article_text" and "abstract_text"
    corpus_size - number of leading documents used to build the LexRank model (default 1000)

    ranked_df - dataframe of schema: text | target, where text is the article with its sentences ordered by rank, and the target is the raw abstract
    """
    documents = [paper['article_text'] for paper in data]
    # instantiate lexranker on the leading slice of the corpus
    lxr = LexRank(documents[:corpus_size], stopwords=STOPWORDS['en'])
    # Rows are gathered in a plain list and turned into a frame once at the
    # end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []
    count = 0
    for paper in data:
        sentences = paper['article_text']
        # Nothing to rank; skipped before the ranking call.
        if len(sentences) == 0:
            continue
        scores = lxr.rank_sentences(sentences=sentences, threshold=None, fast_power_method=False)
        # frame of schema rank | sentence, most important sentence first
        ranked = pd.DataFrame({
            'rank': scores,
            'sentence': sentences,
        }).sort_values('rank', ascending=False)
        ranked_text = to_paragraph(ranked['sentence'].tolist())
        summary = to_paragraph(paper['abstract_text'])
        rows.append([ranked_text, summary])
        if count % 100 == 0 and count != 0:
            print(f'processed {count} documents')
        count += 1
    return pd.DataFrame(rows, columns=['text', 'target'])

In [40]:
# Parse the test split from its JSON file and keep only the value stored
# under the top-level 'data' key.
with open('json_test.txt') as json_file:
    test = json.load(json_file)['data']

In [41]:
# Run re_order over the test split to get a text | target frame, then save
# it as csv without the index column.
ranked_test_df = re_order(test)
ranked_test_df.to_csv('ranked_test_df.csv', index=False)

processed 100 documents
processed 200 documents
processed 300 documents
processed 400 documents
processed 500 documents
processed 600 documents
processed 700 documents
processed 800 documents
processed 900 documents
processed 1000 documents
processed 1100 documents
processed 1200 documents
processed 1300 documents
processed 1400 documents
processed 1500 documents
processed 1600 documents
processed 1700 documents
processed 1800 documents
processed 1900 documents
processed 2000 documents
processed 2100 documents
processed 2200 documents
processed 2300 documents
processed 2400 documents
processed 2500 documents
processed 2600 documents
processed 2700 documents
processed 2800 documents
processed 2900 documents
processed 3000 documents
processed 3100 documents
processed 3200 documents
processed 3300 documents
processed 3400 documents
processed 3500 documents
processed 3600 documents
processed 3700 documents
processed 3800 documents
processed 3900 documents
processed 4000 documents
processed

In [42]:
# Also write the ranked test frame out in JSON form.
ranked_test_df.to_json('ranked_test.json')

In [12]:
# Load the ranked training frame from its csv file.
ranked_train_df = pd.read_csv('ranked_train_df.csv')

In [14]:
# Split the ranked training frame at row 60000 and save each part as its
# own JSON file: the first 60000 rows, then everything after them.
ranked_pt1 = ranked_train_df.iloc[:60000]
ranked_pt1.to_json('ranked_train_pt1.json')

ranked_pt2 = ranked_train_df.iloc[60000:]
ranked_pt2.to_json('ranked_train_pt2.json')