In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf

# Fix the NumPy and TensorFlow global random seeds so stochastic steps are
# repeatable across runs of this notebook.
seed_ = 20200218
np.random.seed(seed_)
tf.random.set_seed(seed_)

from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn/matplotlib look: white grid, "paper" context, slightly enlarged
# fonts, wide high-DPI figures and a very faint grid.
# NOTE(review): "Operator Mono" is presumably not installed on every machine;
# matplotlib is expected to fall back to a default font in that case — confirm.
sns.set_theme(style="whitegrid",
              context="paper",
              font_scale=1.25,
              rc={
                  "figure.figsize": (10.5, 4.5),
                  "figure.dpi": 150,
                  "grid.alpha": 0.1,
                  "grid.color": "#1b262c",
                  "grid.linewidth": 0.5,
                  "font.family": "Operator Mono"
              })

# Custom seven-colour palette applied as the seaborn default.
_30k = ["#202f66", "#ff7048", "#7f68d0", "#f3d36e", "#d869ab", "#48ADA9", "#1b262c"]
sns.set_palette(_30k)

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and parser warnings
# raised by later cells; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# Use the dark template as the default for every Plotly figure created below.
pio.templates.default = "plotly_dark"

# Image-export settings for Plotly figures (file type, file name, pixel size).
# NOTE(review): presumably passed as `config=` when showing a figure; no use of
# this dict is visible in this part of the notebook.
plotly_config = dict(
    toImageButtonOptions=dict(
        format='png',  # one of png, svg, jpeg, webp
        filename='custom_image',
        height=900,
        width=2100,
        scale=1,  # multiply title/legend/axis/canvas sizes by this factor
    )
)

In [3]:
import tensorflow_datasets as tfds

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [5]:
from bs4 import BeautifulSoup

# Load Data

In [6]:
# Load the IMDB reviews dataset through TensorFlow Datasets.
# `with_info=True` makes the call also return the dataset metadata (`info`);
# `as_supervised=True` requests each example as a two-element tuple.
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [7]:
imdb

{'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [8]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='C:\\Users\\Chuan\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <SplitInfo num_examples=2500

In [9]:
# Materialise the 'train' and 'test' splits as plain Python lists.
train_sentences, train_labels = [], []
test_sentences, test_labels = [], []

for split_name, split_sentences, split_labels in (
    ('train', train_sentences, train_labels),
    ('test', test_sentences, test_labels),
):
    for sentence, label in imdb[split_name]:
        # Each review arrives as a bytes tensor; decode it to a Python str.
        split_sentences.append(sentence.numpy().decode('utf-8'))
        split_labels.append(label.numpy())

In [10]:
# Stack the train and test splits (train rows first) into a single frame with
# one review per row.
df = pd.DataFrame({
    "sentence": [*train_sentences, *test_sentences],
    "label": [*train_labels, *test_labels],
})

In [11]:
df.head()

Unnamed: 0,sentence,label
0,This was an absolutely terrible movie. Don't b...,0
1,"I have been known to fall asleep during films,...",0
2,Mann photographs the Alberta Rocky Mountains i...,0
3,This is the kind of film for a snowy Sunday af...,1
4,"As others have mentioned, all the women that g...",1


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  50000 non-null  object
 1   label     50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [13]:
# The per-split lists are now held by `df`; drop the names so the lists can be
# garbage-collected.
del train_labels, test_labels, test_sentences, train_sentences

# Prepare Data for Training Word Embeddings

In [14]:
# Peek at the first three raw reviews. Slicing the Series before converting
# avoids copying the whole column into a list just to show three items.
df['sentence'].head(3).tolist()

["This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
 'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development wa

> ลองดูข้อมูลแบบเร็วๆ พบว่ามี html ปนมา ... ต้อง clean ก่อน

## Clean HTML

In [15]:
def clean_html(text, separator=" "):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    Parameters
    ----------
    text : str
        Raw review text that may contain HTML tags (e.g. ``<br />``).
    separator : str, default " "
        String inserted between adjacent text fragments. A space keeps words
        on either side of a removed tag apart, so ``"bad.<br /><br />The"``
        does not collapse into ``"bad.The"``. Pass ``""`` to join fragments
        directly.

    Returns
    -------
    str
        The text content with all tags removed.
    """
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed, so results can differ between machines and a
    # "no parser specified" warning is emitted.
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=separator)

In [16]:
# Replace every review with its HTML-stripped version. This overwrites the
# column in place, so re-running the cell re-cleans already-cleaned text.
df['sentence'] = df['sentence'].map(clean_html)

In [17]:
# Peek at the first three cleaned reviews. Slicing the Series before
# converting avoids copying the whole column into a list.
df['sentence'].head(3).tolist()

["This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
 'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development wa

## Tokenize and Remove Stop Words

In [18]:
# load stop word
# Load NLTK's English stop-word list.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be present
# locally; no nltk.download(...) call is visible in this notebook — confirm.
stop_words = stopwords.words('english')

# Show the ten alphabetically-first stop words as a sanity check.
sorted(stop_words)[:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [19]:
# Tokenize every review and keep only purely alphabetic, non-stop-word tokens.
# A set gives O(1) membership tests; the original checked each token against
# the stop-word list, a linear scan repeated for every token of every review.
stop_word_set = set(stop_words)

sentences = [
    [
        token
        for token in nltk.word_tokenize(review.lower())
        # Both conditions are pure filters, so one pass gives the same
        # result as the original two passes.
        if token.isalpha() and token not in stop_word_set
    ]
    for review in df['sentence']
]

In [20]:
sentences[:5]

[['absolutely',
  'terrible',
  'movie',
  'lured',
  'christopher',
  'walken',
  'michael',
  'ironside',
  'great',
  'actors',
  'must',
  'simply',
  'worst',
  'role',
  'history',
  'even',
  'great',
  'acting',
  'could',
  'redeem',
  'movie',
  'ridiculous',
  'storyline',
  'movie',
  'early',
  'nineties',
  'us',
  'propaganda',
  'piece',
  'pathetic',
  'scenes',
  'columbian',
  'rebels',
  'making',
  'cases',
  'revolutions',
  'maria',
  'conchita',
  'alonso',
  'appeared',
  'phony',
  'affair',
  'walken',
  'nothing',
  'pathetic',
  'emotional',
  'plug',
  'movie',
  'devoid',
  'real',
  'meaning',
  'disappointed',
  'movies',
  'like',
  'ruining',
  'actor',
  'like',
  'christopher',
  'walken',
  'good',
  'name',
  'could',
  'barely',
  'sit'],
 ['known',
  'fall',
  'asleep',
  'films',
  'usually',
  'due',
  'combination',
  'things',
  'including',
  'really',
  'tired',
  'warm',
  'comfortable',
  'sette',
  'eaten',
  'lot',
  'however',
  'occa

In [21]:
# Write the cleaned corpus to disk: one review per line, tokens separated by
# a single space.
import os

output_path = "./outputs/imdb_cleaned_sentences.txt"
# Create the target directory first; open(..., "w") raises FileNotFoundError
# when the parent folder does not exist (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(' '.join(line) + '\n' for line in sentences)