# 1. Retrieving the review descriptions from the DataFrame JSON file

In [None]:
import json
import pandas as pd 


#### Writing the review descriptions of the proportionate dataset to a .txt file

In [None]:
# Read the sampled reviews and dump each description, followed by a newline,
# into a plain-text file.
dfSamples = pd.read_json('train_samples_prop_8k.json')

with open('train_reviews_prop_8k.txt', 'w') as text_f:
    text_f.writelines(description + "\n" for description in dfSamples['description'])
    

# Loading reviews

In [None]:
# load doc into memory
def load_doc(filename):
    """Return the entire contents of the text file at `filename` as one string.

    The file is opened read-only in text mode (platform default encoding).
    """
    # the context manager closes the file even if read() raises
    with open(filename, 'r') as file:
        return file.read()

In [None]:
# read back the review text file written by the earlier cell
in_filename = 'train_reviews_prop_8k.txt'
doc = load_doc(in_filename)
# sanity check: show the first 200 characters
print(doc[0:200])
# total number of characters (displayed as the cell's result)
len(doc)

Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country win


1929877

# 2. Cleaning reviews

In [None]:
import string
 
# turn a doc into clean tokens
def clean_doc(doc):
    """Split raw text into lower-case, purely alphabetic word tokens.

    '--' is treated as a word separator. Every other character listed in
    ``string.punctuation`` is deleted from inside the tokens (so a hyphenated
    word collapses into a single token), and any token that is not entirely
    alphabetic after that is dropped.

    Returns a list of strings in their original order.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in doc.replace('--', ' ').split():
        word = raw.translate(strip_punct)
        # drops numbers, mixed alphanumerics and tokens emptied by the strip
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned

In [None]:
# tokenise the loaded text and report corpus statistics
tokens = clean_doc(doc)
print(tokens[0:200])
n_tokens_total = len(tokens)
unique_token_set = set(tokens)
print('Total Tokens: %d' % n_tokens_total)
print('Unique Tokens: %d' % len(unique_token_set))

['much', 'like', 'the', 'regular', 'bottling', 'from', 'this', 'comes', 'across', 'as', 'rather', 'rough', 'and', 'tannic', 'with', 'rustic', 'earthy', 'herbal', 'characteristics', 'nonetheless', 'if', 'you', 'think', 'of', 'it', 'as', 'a', 'pleasantly', 'unfussy', 'country', 'wine', 'its', 'a', 'good', 'companion', 'to', 'a', 'hearty', 'winter', 'stew', 'a', 'sleek', 'mix', 'of', 'tart', 'berry', 'stem', 'and', 'herb', 'along', 'with', 'a', 'hint', 'of', 'oak', 'and', 'chocolate', 'this', 'is', 'a', 'fair', 'value', 'in', 'a', 'widely', 'available', 'drinknow', 'oregon', 'pinot', 'the', 'wine', 'oakaged', 'for', 'six', 'months', 'whether', 'in', 'neutral', 'or', 'restaved', 'is', 'not', 'indicated', 'oak', 'and', 'earth', 'intermingle', 'around', 'robust', 'aromas', 'of', 'wet', 'forest', 'floor', 'in', 'this', 'vineyarddesignated', 'pinot', 'that', 'hails', 'from', 'a', 'highelevation', 'site', 'small', 'in', 'production', 'it', 'offers', 'intense', 'fullbodied', 'raspberry', 'and', 

# 3. Tokenizing

In [None]:
# organize into sequences of tokens; each sequence holds 50 + 1 words
# (the training cell later uses the first 50 as input and the last as target)
length = 50 + 1
# Slide a window of `length` tokens over the corpus, one token at a time,
# and store each window as a space-separated line.
# The stop bound is len(tokens) + 1 because range() excludes its stop value:
# without the + 1 the final window (the one ending on the last token) is lost.
sequences = [
    ' '.join(tokens[i - length:i])
    for i in range(length, len(tokens) + 1)
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 316364


In [None]:
# save lines to file, one entry per line
def save_doc(lines, filename):
    """Write `lines` to `filename`, joined by newlines.

    No trailing newline is written after the last entry. An existing file
    with the same name is overwritten.
    """
    data = '\n'.join(lines)
    # the context manager closes the file even if write() raises
    with open(filename, 'w') as file:
        file.write(data)

In [None]:
# persist the generated sequences to a text file
out_filename = 'reviews_sequences.txt'
save_doc(lines=sequences, filename=out_filename)

# 4. Creating the language generation model

In [None]:
# Optional environment setup, left commented out. Uncommenting these lines
# would install tensorflow 1.15.0 and keras 2.0.8 and uninstall keras-nightly.
# NOTE(review): presumably the versions this notebook was developed against - confirm.
#! pip install tensorflow==1.15.0
#! pip uninstall keras-nightly
#! pip install keras==2.0.8 


In [None]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# load doc into memory
# NOTE(review): same definition as the load_doc in the "Loading reviews" cell
# earlier in this notebook; consider keeping a single copy.
def load_doc(filename):
    """Return the entire contents of the text file at `filename` as one string.

    The file is opened read-only in text mode (platform default encoding).
    """
    # the context manager closes the file even if read() raises
    with open(filename, 'r') as file:
        return file.read()

# load the saved training sequences, one sequence per line
in_filename = 'reviews_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

# fit a word -> integer mapping on the lines and encode every line with it
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = array(tokenizer.texts_to_sequences(lines))
# vocabulary size: number of distinct words seen by the tokenizer, plus one
vocab_size = 1 + len(tokenizer.word_index)

# every column except the last is model input; the last column is the target,
# one-hot encoded over the vocabulary
# NOTE(review): y is a dense (num_sequences, vocab_size) matrix, which can be
# very large for this corpus - sparse targets may be worth considering.
X = sequences[:, :-1]
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
seq_length = X.shape[1]

# define model: embedding -> two stacked LSTMs -> dense -> softmax over the vocabulary
model = Sequential()
# each word id becomes a 50-dimensional vector; inputs are seq_length ids long
model.add(Embedding(vocab_size, 50, input_length=seq_length))
# return_sequences=True so the second LSTM receives the full sequence of outputs
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
# one output unit per vocabulary entry
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output
model.summary()

# compile model: cross-entropy against the one-hot targets, Adam optimizer,
# accuracy reported during training
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# fit model
model.fit(
    X,
    y,
    batch_size=128,
    epochs=100,
)

# save the model to file
model.save('model_prop_8k_100x.h5')
# save the tokenizer; the with-block guarantees the pickle file is flushed
# and closed once the dump finishes (the bare open() was never closed)
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

Using TensorFlow backend.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            579900    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 11598)             1171398   
Total params: 1,902,198
Trainable params: 1,902,198
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100