In [1]:
import gzip, math, numpy, re, string, random, os.path
import tensorflow as tf

path = "reviews_Amazon_Instant_Video_5.json.gz"
vocab_path = "AFINN-111.txt"

# Hyperparameters
learning_rate = 0.01

def parse_review_dataset(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield eval(l)
    
def parse_vocabulary(path):
    vocabulary_dict = {}
    for line in open(path, 'r').readlines():
        (word, measure) = line.strip().split('\t')
        
        # Measure is an int in range [-5, 5]
        vocabulary_dict[word] = int(measure)
    return vocabulary_dict

def split_strip_punctuation(text):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    no_punc_text = regex.sub('', text.lower())
    words_only = no_punc_text.split()
    #list of words in review
    #print(words_only)
    return words_only

word_in_text = lambda word, text: 1.0 if word in text else 0.0

def encode_review_text(text):
    word_list = split_strip_punctuation(text)
    encoded_vector = [word_in_text(vocab_word, text) for vocab_word in vocabulary_dict]
    return encoded_vector

vocabulary_dict = parse_vocabulary(vocab_path)
print("Vocabulary size: ", len(vocabulary_dict), "\n")

data = []

print(" === Reading and encoding data... === \n")

k = 0
for review in parse_review_dataset(path):
    review_item = []
    review_item.append(encode_review_text(review['summary'] + " " + review['reviewText']))
    review_item.append(int(review['overall']))
    data.append(review_item)
    k += 1
    if k % 5000 == 0:
        print("Progress: ", k)

data_size = len(data)
training_data_size = math.floor(data_size * 0.8)

print("\nDataset size: ", len(data), "\n")
print("Training dataset size: ", training_data_size, "\n")


num_features = len(vocabulary_dict)
num_samples = training_data_size
num_classes = 5
   
# Features = Words
# Feature Matrix

X = tf.placeholder(tf.float32, [num_samples, num_features])
Y = tf.placeholder(tf.int32, [num_samples])

Y_one_hot = tf.one_hot(Y, depth=num_classes)

weights = tf.Variable(tf.random_normal([num_features, num_classes],
                                       mean=0,
                                       stddev=0.01,
                                       name="weights"))

bias = tf.Variable(tf.zeros([1,num_classes], name="bias"))


init_op = tf.global_variables_initializer()

apply_weights_op = tf.matmul(X, weights, name="apply_weights")
add_bias_op = tf.add(apply_weights_op, bias, name="add_bias") 
activation_op = tf.nn.sigmoid(add_bias_op, name="activation")
cost_op = tf.nn.l2_loss(tf.nn.sigmoid_cross_entropy_with_logits(logits=activation_op, labels=Y_one_hot), name="cost")
    
training_op = tf.train.AdamOptimizer(learning_rate).minimize(cost_op)

cost = 0
diff = 1
num_epochs = 10 #100



with tf.Session() as sess:
    if os.path.isfile("LogisticRegressionAmazon.ckpt"):
        saver.restore(sess, "LogisticRegressionAmazon.ckpt")
    else:
        sess.run(tf.global_variables_initializer())
        print("\nInitializing session...\n")

    for i in range(num_epochs):
        #if i > 1 and diff < .0001:
        #    print("\nConvergence. Diff: "%diff)
        #    break
        #else:
            print("\n =========== STEP ", i, " =========== \n")
            
            train_data = random.sample(data, training_data_size)
            train_data_X, train_data_Y = zip(*random.sample(data, training_data_size))
            #print(train_data_X[0], "  ===>  Rating: ", train_data_Y[0], "\n")
            
            newCost = sess.run(cost_op, feed_dict={X: train_data_X, Y: train_data_Y})
            diff = abs(newCost - cost)
            cost = newCost

            print("Step: ", i, "   Cost:  ", newCost)
            print("Step: ", i, "   Diff: ", diff)

            saver = tf.train.Saver()
            saver.save(sess, "LogisticRegressionAmazon.ckpt")
            
            biasSummary = tf.summary.histogram("biases", bias.eval(session=sess))
    
sess.close()

# How to get the unsampled data?

Vocabulary size:  2477 

 === Reading and encoding data... === 

Progress:  5000
Progress:  10000
Progress:  15000
Progress:  20000
Progress:  25000
Progress:  30000
Progress:  35000

Dataset size:  37126 

Training dataset size:  29700 


Initializing session...



Step:  0    Cost:   65600.0
Step:  0    Diff:  65600.0


Step:  1    Cost:   65570.5
Step:  1    Diff:  29.5156


Step:  2    Cost:   65581.6
Step:  2    Diff:  11.0781


Step:  3    Cost:   65588.0
Step:  3    Diff:  6.47656


Step:  4    Cost:   65595.8
Step:  4    Diff:  7.72656


Step:  5    Cost:   65566.1
Step:  5    Diff:  29.7031


Step:  6    Cost:   65593.8
Step:  6    Diff:  27.7188


Step:  7    Cost:   65588.7
Step:  7    Diff:  5.08594


Step:  8    Cost:   65594.6
Step:  8    Diff:  5.92188


Step:  9    Cost:   65607.1
Step:  9    Diff:  12.5078
