In [1]:
import news_data
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
# Run configuration -- adjust these values before executing the cells below.
tag = 'with-stops'  # suffix embedded in the output pickle filenames
keep_stops = True   # retain stop words in the vocabulary
min_sents = 3       # articles with fewer sentences than this are not kept
max_sents = 100     # sentences beyond this count are truncated

In [None]:
# Load the whole corpus into memory.  Every item is a (text, label, url) tuple.
# A generator-based reader would also work, but the corpus is small enough
# that holding it all in memory is fine.
data = news_data.read_data_in_memory()
len(data)

21087

In [None]:
# Parse only the text field (index 0) of each record, using the options set in
# the configuration cell.
parsed_text = news_data.parse_corpus(
    [record[0] for record in data],
    keep_stops=keep_stops,
    min_sents=min_sents,
    max_sents=max_sents,
)

In [None]:
# Rebuild the dataset, keeping only articles whose parse result is non-empty
# (articles with fewer than min_sents sentences are discarded).
# Each kept item is a tuple: (parsed_text, original_text, label, url).
parsed_data = [
  (parsed_text[pos], item[0], item[1], item[2])
  for pos, item in enumerate(data)
  if parsed_text[pos]
]
len(parsed_data)

In [None]:
# Drop the raw and intermediate collections; parsed_data now carries
# everything the remaining cells use.
del data
del parsed_text

In [None]:
# Pickle parsed data to 'parsed-data-<tag>.pkl'.
# Pickle streams are binary, so the file must be opened in 'wb' mode: text
# mode ('w') raises TypeError on Python 3 and can corrupt binary-protocol
# pickles on platforms that translate newlines.
with open('parsed-data-{}.pkl'.format(tag), 'wb') as f:
  pickle.dump(parsed_data, f)

In [None]:
# Optional: reload previously pickled parsed data instead of re-parsing.
# with open('parsed-data-{}.pkl'.format(tag), 'rb') as f:
#   parsed_data = pickle.load(f)

In [None]:
# Derive the vocabulary structures from the parsed corpus: two lookup
# objects plus a ranked word list.
(vocab_dict,
 vocab_dict_rev,
 ranked_word_list) = news_data.build_vocabulary(parsed_data)

In [None]:
# Encode the parsed text using the vocab_dict_rev lookup.
encoded_corpus = news_data.encode_corpus(
    parsed_data,
    vocab_dict_rev,
)

In [None]:
# Convert the label field (index 2 of each parsed_data item) to integer codes.
labels_int = LabelEncoder().fit_transform(
    [item[2] for item in parsed_data]
)
len(labels_int)

In [None]:
# Pickle the final dataset to 'data-<tag>.pkl' as a dict holding the encoded
# corpus ('X'), the integer labels ('y') and both vocabulary lookups
# ('vocab', 'vocab_index').
# Pickle streams are binary, so the file must be opened in 'wb' mode: text
# mode ('w') raises TypeError on Python 3 and can corrupt binary-protocol
# pickles on platforms that translate newlines.
with open('data-{}.pkl'.format(tag), 'wb') as f:
  pickle.dump({
      'X': encoded_corpus,
      'y': labels_int,
      'vocab': vocab_dict,
      'vocab_index': vocab_dict_rev,
  }, f)