Read the review dataset using tf.data.Dataset and perform the following transformations:

1. Read each text review and generate a label from its folder name. Your dataset should contain the review text and label as a tuple.
2. Filter out blank text reviews. Two files in this dataset are blank.
3. Do all of the above transformations in a single line of code. Also shuffle all the reviews.

In [4]:
import tensorflow as tf
import numpy as np

In [2]:
# Build a dataset of file paths matching <root>/<label folder>/<file>.
# shuffle=False turns off list_files' default shuffling of the matched names.
reviews_ds = tf.data.Dataset.list_files(
    file_pattern='./reviews/*/*',
    shuffle=False,
)

In [3]:
# Number of elements in the dataset, i.e. how many file paths were matched.
count = len(reviews_ds)
# Bare expression so the notebook displays the value.
count

6

In [5]:
# Each dataset element is a string tensor holding one file path;
# .numpy() converts it to a Python bytes object for display.
for file_path in reviews_ds:
    path_bytes = file_path.numpy()
    print(path_bytes)

b'.\\reviews\\negative\\neg_1.txt'
b'.\\reviews\\negative\\neg_2.txt'
b'.\\reviews\\negative\\neg_3.txt'
b'.\\reviews\\positive\\pos_1.txt'
b'.\\reviews\\positive\\pos_2.txt'
b'.\\reviews\\positive\\pos_3.txt'


In [6]:
# A shuffle buffer smaller than the dataset only mixes elements within a
# sliding window (with buffer 4, the first element could only ever be one of
# the first 4 files). Sizing the buffer to the whole dataset gives a uniform
# shuffle of all the review paths.
reviews_ds = reviews_ds.shuffle(buffer_size=len(reviews_ds))
for review in reviews_ds:
    print(review.numpy())

b'.\\reviews\\positive\\pos_1.txt'
b'.\\reviews\\negative\\neg_3.txt'
b'.\\reviews\\positive\\pos_2.txt'
b'.\\reviews\\positive\\pos_3.txt'
b'.\\reviews\\negative\\neg_1.txt'
b'.\\reviews\\negative\\neg_2.txt'


In [11]:
import os

def get_reviews_and_label(path):
    """Read one review file and derive its label from the parent folder name.

    Args:
        path: Path of a review file (a string tensor when this function is
            applied through ``Dataset.map``).

    Returns:
        A ``(data, label)`` tuple: the file contents as returned by
        ``tf.io.read_file`` and the second-to-last component of ``path``
        when split on ``os.path.sep``.
    """
    # Split on the platform separator; the component just before the file
    # name is the folder the file sits in, which serves as the label.
    path_parts = tf.strings.split(path, os.path.sep)
    review_text = tf.io.read_file(path)
    return review_text, path_parts[-2]

In [13]:
# Turn every file path into a (review text, label) pair.
df = reviews_ds.map(map_func=get_reviews_and_label)

In [15]:
# Show the first 100 bytes of every review together with its label.
for review_text, review_label in df:
    preview = review_text.numpy()[:100]
    print("Review: ", preview)
    print("Label: ", review_label.numpy())

Review:  b"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. The"
Label:  b'positive'
Review:  b''
Label:  b'negative'
Review:  b"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his par"
Label:  b'negative'
Review:  b"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 or 8 "
Label:  b'negative'
Review:  b'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-B'
Label:  b'positive'
Review:  b''
Label:  b'positive'


In [17]:
# Filtering the blank reviews: keep only pairs whose text is not the empty
# string. The label is accepted by the predicate but not inspected.
df1 = df.filter(lambda text, _label: text != "")
for review_text, review_label in df1:
    preview = review_text.numpy()[:100]
    print("Review: ", preview)
    print("Label: ", review_label.numpy())

Review:  b"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 or 8 "
Label:  b'negative'
Review:  b'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-B'
Label:  b'positive'
Review:  b"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. The"
Label:  b'positive'
Review:  b"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his par"
Label:  b'negative'


In [21]:
# Whole pipeline in a single line of code: read + label each file, drop
# blank reviews, then shuffle.
# The shuffle buffer is sized from the unfiltered dataset length, so it is
# never smaller than the number of reviews remaining after the filter; a
# smaller buffer would only shuffle within a sliding window.
final_ds = reviews_ds.map(get_reviews_and_label).filter(lambda rew, lab: rew != "").shuffle(len(reviews_ds))

# Show the first 150 bytes of every remaining review together with its label.
for data, label in final_ds:
    print("Review: ", data.numpy()[:150])
    print("Label: ", label.numpy())

Review:  b"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This mo"
Label:  b'negative'
Review:  b'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes d'
Label:  b'positive'
Review:  b"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with"
Label:  b'positive'
Review:  b"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 or 8 years were brilliant, but things dropped off after"
Label:  b'negative'
