# Data Prep

* Prep the `imdb_reviews` dataset from `tensorflow_datasets`.
* Prep data from [The Signal Media One-Million News Articles Dataset](https://research.signal-ai.com/newsir16/signal-dataset.html).

# Imports

In [1]:
import os
# Import TensorFlow
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import json
import gzip
import pickle
import numpy as np

TensorFlow 2.x selected.


# Movie Reviews

## Get Dataset

In [2]:
# Load the `imdb_reviews/subwords8k` config: the TRAIN and TEST splits as
# (input, label) tuples, together with the dataset metadata.
imdb_splits, info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    as_supervised=True,
    with_info=True,
)
train_data, test_data = imdb_splits

# Encoder attached to the 'text' feature; used below to decode ids back to text.
text_features = info.features['text'].encoder

[1mDownloading and preparing dataset imdb_reviews (80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0...[0m


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…






HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0.incomplete0X1302/imdb_reviews-train.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0.incomplete0X1302/imdb_reviews-test.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0.incomplete0X1302/imdb_reviews-unsupervised.tfrecord


HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0. Subsequent calls will reuse this data.[0m


## Text Encoder - Decoder

In [0]:
# Persist the text encoder to a pickle file, then read it straight back.
encoder_path = 'imdb_reviews_subwords8k_text_features.pickle'

# write the encoder to disk in binary form
with open(encoder_path, 'wb') as out_file:
    pickle.dump(text_features, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# restore the encoder from the file that was just written
with open(encoder_path, 'rb') as in_file:
    text_features = pickle.load(in_file)

## Save to Data Frame

In [4]:
def decode_examples(dataset, encoder):
  """Decode every (ids, label) pair yielded by `dataset`.

  Args:
    dataset: iterable of (ids, label) tensor pairs, e.g. one split returned
      by `tfds.load(..., as_supervised=True)`.
    encoder: object exposing `decode(ids)` that maps ids back to text.

  Returns:
    A `(texts, labels)` tuple of two equally long lists: the decoded text of
    each example and its label as a plain Python int.
  """
  texts, labels = [], []
  for ids, label in dataset:
    # from tf tensor to numpy, then decode ids to text
    texts.append(encoder.decode(ids.numpy()))
    # from tf tensor to numpy, then int value from numpy
    labels.append(int(label.numpy()))
  return texts, labels


reviews_texts, reviews_sentiments = [], []

# get all examples from the train set first, then from the test set
for split_name, split_data in (('train', train_data), ('test', test_data)):
  split_texts, split_sentiments = decode_examples(split_data, text_features)
  reviews_texts.extend(split_texts)
  reviews_sentiments.extend(split_sentiments)
  print('\tFinished %s generator' % split_name)

# one row per review: decoded text plus its integer sentiment label
review_sentiments_df = pd.DataFrame({
    'review': reviews_texts,
    'sentiment': reviews_sentiments,
})

review_sentiments_df.to_csv('imdb_reviews_50k.csv', index=False)

review_sentiments_df.head()

	Finished train generator
	Finished test generator


Unnamed: 0,review,sentiment
0,This was an absolutely terrible movie. Don't b...,0
1,"I have been known to fall asleep during films,...",0
2,Mann photographs the Alberta Rocky Mountains i...,0
3,This is the kind of film for a snowy Sunday af...,1
4,"As others have mentioned, all the women that g...",1


# Signalmedia

## Download

In [5]:
if not os.path.isfile('signalmedia-1m.jsonl.gz'):
  !wget http://research.signalmedia.co/newsir16/signalmedia-1m.jsonl.gz --quiet

--2020-03-01 18:48:52--  http://research.signalmedia.co/newsir16/signalmedia-1m.jsonl.gz
Resolving research.signalmedia.co (research.signalmedia.co)... 54.194.177.11, 54.154.193.71, 52.211.36.95
Connecting to research.signalmedia.co (research.signalmedia.co)|54.194.177.11|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://research.signalmedia.co:443/newsir16/signalmedia-1m.jsonl.gz [following]
--2020-03-01 18:48:52--  https://research.signalmedia.co/newsir16/signalmedia-1m.jsonl.gz
Connecting to research.signalmedia.co (research.signalmedia.co)|54.194.177.11|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://research.signal-ai.com:443/newsir16/signalmedia-1m.jsonl.gz [following]
--2020-03-01 18:48:53--  https://research.signal-ai.com/newsir16/signalmedia-1m.jsonl.gz
Resolving research.signal-ai.com (research.signal-ai.com)... 104.26.3.247, 104.26.2.247, 2606:4700:20::681a:3f7, ...
Connecting 

## Get Dataset

In [6]:
file_name = "signalmedia-1m.jsonl.gz"

# Collect the 'content' field of every JSON line in the gzipped archive.
news_contents = []
with gzip.open(file_name, "rb") as archive:
  line_number = 0
  for raw_line in archive:
    line_number += 1
    # each line is one JSON document stored as bytes
    record = json.loads(raw_line.decode())
    news_contents.append(record['content'])
    # progress marker every 500000 lines
    if not line_number % 500000:
      print('\t', line_number)

print('news data: ', len(news_contents))

	 500000
	 1000000
news data:  1000000
