## Loading the train and validation dataset

In [1]:
import pandas as pd
import numpy as np
import re
import pickle

from google.cloud import storage


In [40]:
"""
Helper function to download data from Google Cloud Storage
  # Arguments:
      bucket_name: string, the name of the GCS bucket, without the 'gs://'
      prefix (e.g. 'bucket')
      source: string, the path of the object inside the bucket
      (e.g. 'folder/file.csv', NOT 'gs://bucket/folder/file.csv')
      destination: string, the filename to save as on local disk. MUST be filename
      ONLY, doesn't support folders. (e.g. 'file.csv', NOT 'folder/file.csv')
  # Returns: nothing, downloads file to local disk
"""
def download_from_gcs(bucket_name, source, destination):
    """Download a single object from Google Cloud Storage to local disk.

    # Arguments:
        bucket_name: string, name of the bucket looked up with `get_bucket`.
        source: string, name of the blob inside that bucket.
        destination: string, local filename the blob is written to.
    # Returns: nothing, the blob is saved as `destination`.
    """
    client = storage.Client()
    remote_blob = client.get_bucket(bucket_name).blob(source)
    remote_blob.download_to_filename(destination)

"""
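Downloads the train and eval csv files from Google Cloud Storage, parses them and
returns (texts, labels) pairs
  # Arguments:
      bucket: string, name of the GCS bucket that holds both files
      train_data_path: string, path inside the bucket of the csv containing
        training data
      train_filename: string, local filename the training csv is downloaded to
      eval_data_path: string, path inside the bucket of the csv containing
        eval data
      eval_filename: string, local filename the eval csv is downloaded to
  # Returns:
      ((train_texts, train_labels), (test_texts, test_labels)): plain Python
        lists built from the 'text' and 'label' columns. The csv is read with
        the column names ('label', 'text'), which replace the file's own
        header row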
"""
def load_news_data(bucket, train_data_path, train_filename, eval_data_path, eval_filename):
    """Download the train and eval CSV files from GCS and parse them.

    # Arguments:
        bucket: string, name of the bucket both files are downloaded from.
        train_data_path: string, blob name of the training CSV.
        train_filename: string, local file the training CSV is saved to.
        eval_data_path: string, blob name of the eval CSV.
        eval_filename: string, local file the eval CSV is saved to.
    # Returns:
        ((train_texts, train_labels), (eval_texts, eval_labels)), each member
        being a plain list taken from the 'text' / 'label' column.
    """
    # Fetch both files (train first, then eval) before any parsing starts.
    for remote_blob, local_csv in ((train_data_path, train_filename),
                                   (eval_data_path, eval_filename)):
        download_from_gcs(bucket, remote_blob, destination=local_csv)

    def _read_split(local_csv):
        # header=0 together with names= replaces the file's own header row.
        frame = pd.read_csv(local_csv, names=('label', 'text'), sep=',', header=0)
        return list(frame['text']), list(frame['label'])

    return (_read_split(train_filename), _read_split(eval_filename))


Let's define the global parameters.

In [41]:
# Location of the dataset in Google Cloud Storage, i.e.
# gs://mlend_text_summarization/data/amazon-fine-food-reviews/
bucket = 'mlend_text_summarization'
gs_data_path = 'data/amazon-fine-food-reviews/'

# File names: appended to gs_data_path to form the remote object name and
# also used as the name of the local copy
train_filename = 'train.csv'
eval_filename = 'validation.csv'

In [42]:
# Download both CSV files from GCS to local disk, then parse them into
# (texts, labels) lists for the train and eval splits.
# The statement must start at column 0: an indented top-level statement
# raises "IndentationError: unexpected indent".
((train_texts, train_labels), (test_texts, test_labels)) = load_news_data(
    bucket, gs_data_path + train_filename, train_filename,
    gs_data_path + eval_filename, eval_filename)


In [43]:
# Sanity check: every split must have as many texts as targets
for split_name, split_texts, split_labels in (
        ('Train data: ', train_texts, train_labels),
        ('Eval data: ', test_texts, test_labels)):
    print(split_name, len(split_texts), ' target: ', len(split_labels))

Train data:  776  target:  776
Eval data:  138  target:  138


## Preprocess the data to be consumed by the model

Remember to add the START and END special tokens at the beginning and end of the summary. Here, I have chosen `sostok` and `eostok` as the START and END tokens, respectively.

Note: Be sure that the chosen special tokens never appear in the summary

In [37]:
# Special tokens marking the start and the end of every summary
eos_token = 'eostok'
sos_token = 'sostok'

def include_EOS_SOS(texts, eos_token, sos_token):
    """Wrap every string in `texts` with the start and end tokens.

    # Arguments:
        texts: iterable of strings (e.g. the summaries).
        eos_token: string appended, after a space, to each text.
        sos_token: string prepended, followed by a space, to each text.
    # Returns:
        a new list holding one '<sos_token> <text> <eos_token>' string per
        input text; the input itself is left untouched.
    """
    # Iterate over the `texts` argument: mapping over the global
    # `train_labels` here would silently ignore whatever the caller passed in.
    return [sos_token + ' ' + text + ' ' + eos_token for text in texts]

In [38]:
# Wrap every training summary with the start/end tokens. `eos_token` and
# `sos_token` are the ones set in the previous cell, so they are not
# redefined here.
result = include_EOS_SOS(train_labels, eos_token, sos_token)

### Preparing the Tokenizer

A tokenizer builds the vocabulary and converts a word sequence to an integer sequence. Go ahead and build tokenizers for text and summary:

Text Tokenizer

In [44]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences

# Prepare a tokenizer for the reviews: it is fitted on the training texts
# only (the eval texts are not passed to fit_on_texts)
x_tokenizer = Tokenizer() 
x_tokenizer.fit_on_texts(train_texts)

Using TensorFlow backend.


#### Rare words and their coverage

Let us look at the proportion of rare words and their total coverage in the entire text.

Here, I define the threshold to be 4, which means that a word whose count is below 4 is considered a rare word.

In [45]:
def _rare_word_stats(thresh, tokenizer):
    """Count the rare words of a fitted tokenizer and the occurrences they cover.

    # Arguments:
        thresh: int, a word is rare when its count is strictly below this value.
        tokenizer: object exposing a `word_counts` mapping of word -> count.
    # Returns:
        (cnt, tot_cnt, freq, tot_freq): number of rare words, number of
        distinct words, summed counts of the rare words, summed counts of
        all words.
    """
    counts = list(tokenizer.word_counts.values())
    rare_counts = [value for value in counts if value < thresh]
    return len(rare_counts), len(counts), sum(rare_counts), sum(counts)


def set_vocab_threshold(thresh, tokenizer):
    """Return (number of rare words, vocabulary size) for the given threshold.

    # Arguments:
        thresh: int, a word is rare when its count is strictly below this value.
        tokenizer: object exposing a `word_counts` mapping of word -> count.
    # Returns:
        (cnt, tot_cnt): number of rare words and number of distinct words.
    """
    cnt, tot_cnt, _, _ = _rare_word_stats(thresh, tokenizer)
    return cnt, tot_cnt


# A word seen fewer than `thresh` times in the training texts is rare
thresh = 4
cnt, tot_cnt, freq, tot_freq = _rare_word_stats(thresh, x_tokenizer)

print("% of rare words in vocabulary:",(cnt/tot_cnt)*100)
print("Total Coverage of rare words:",(freq/tot_freq)*100)

% of rare words in vocabulary: 76.23762376237624
Total Coverage of rare words: 23.87706855791962
