In [1]:
import numpy as np
import re

In [2]:
# Download the gzipped Electronics review file into the working directory
# (about 473 MB according to the recorded wget output).
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz

--2020-09-13 14:15:55--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 495854086 (473M) [application/x-gzip]
Saving to: ‘reviews_Electronics_5.json.gz’


2020-09-13 14:16:31 (13.6 MB/s) - ‘reviews_Electronics_5.json.gz’ saved [495854086/495854086]



In [30]:
import json
import gzip

# Collect the 'reviewText' field of the first 50000 non-blank JSON lines
# found in the gzip archive.
filename = 'reviews_Electronics_5.json.gz'
json_content = []
with gzip.open(filename, 'rb') as gzip_file:
    for raw_line in gzip_file:
        stripped = raw_line.rstrip()
        if not stripped:
            # Nothing but whitespace on this line.
            continue

        review = json.loads(stripped)
        json_content.append(review['reviewText'])

        # Stop reading once enough reviews have been gathered.
        if len(json_content) == 50000:
            break

# print(json.dumps(json_content, indent=4))

In [None]:
# json_content

In [4]:
def tokenize(data):
    """Split ``data`` on runs of characters that are not ASCII letters or digits.

    The raw result of the split is returned, so a string that begins or ends
    with a separator produces an empty string at that end of the list.
    """
    separators = re.compile(r"[^A-Za-z0-9]+")
    return separators.split(data)

In [None]:
# print("25".lower())

In [5]:
# Treat '!' and '?' as sentence terminators too, split every review on '.',
# and keep the whitespace-stripped pieces that are not empty.
data = [
    piece
    for review in json_content
    for piece in (chunk.strip() for chunk in re.sub(r'[!?]', '.', review).split("."))
    if piece
]
# data

In [6]:
# For each sentence: replace the listed punctuation with '.', tokenise, then
# keep only purely alphabetic tokens (or a literal '.'), lower-cased.
data = [
    [
        tok.lower()
        for tok in tokenize(re.sub(r'[,!?;-]', '.', sent))
        if tok.isalpha() or tok == '.'
    ]
    for sent in data
]

from collections import defaultdict

# Token -> occurrence count, filled in by getfrequencies().
freq = defaultdict(int)

def getfrequencies(data):
    """Add one to the module-level ``freq`` table for every token in ``data``.

    ``data`` is iterated as a sequence of sentences, each of which is a
    sequence of tokens. Nothing is returned; the counts accumulate in ``freq``.
    """
    for word in (tok for sentence in data for tok in sentence):
        freq[word] = freq[word] + 1
        
# Fill the global `freq` table with the token counts of `data`.
getfrequencies(data)

def remove_less_frequent(data_old, threshold=5, frequencies=None):
    """Drop rare tokens from every sentence and discard sentences left empty.

    Args:
        data_old: iterable of sentences, each an iterable of tokens.
        threshold: a token is kept only when its count is strictly greater
            than this value. Defaults to 5, the previously hard-coded cut-off.
        frequencies: mapping from token to count. When omitted, the
            module-level ``freq`` table is used.

    Returns:
        A new list of token lists; the input is not modified.
    """
    counts = freq if frequencies is None else frequencies
    data = []
    for sent in data_old:
        # .get() avoids inserting unseen tokens into a defaultdict as a
        # side effect of merely looking them up.
        cur_sent = [token for token in sent if counts.get(token, 0) > threshold]
        if cur_sent:
            data.append(cur_sent)
    return data
    
# Keep only sufficiently frequent tokens, then report how many tokens remain
# across all sentences.
data = remove_less_frequent(data)

number_of_tokens = sum(len(sent) for sent in data)
print("Number of tokens:", number_of_tokens)

# data

Number of tokens: 2079908


In [None]:
# Show the first five entries of `data`.
data[:5]

In [8]:
import numpy as np
from scipy import linalg
from collections import defaultdict

# Number of passes over the data started so far (-1 before the first pass).
epoch = -1
def get_vectors(data, word2Ind, V, C, keep_prob=0.4):
    """Endlessly yield CBOW training pairs ``(x, y)`` built from ``data``.

    For every position ``i`` of every sampled sentence, ``y`` is the one-hot
    vector of the centre word and ``x`` is the average of the one-hot vectors
    of the words at most ``C`` positions away from it (``x`` stays all-zero
    when the sentence has no other word in range).

    Args:
        data: re-iterable sequence of sentences (sequences of tokens); it is
            traversed once per pass.
        word2Ind: mapping from token to its index in the length-``V`` vectors.
        V: length of the yielded vectors.
        C: context half-width (number of neighbours considered on each side).
        keep_prob: a sentence is used in a given pass when a fresh uniform
            draw from [0, 1) is not greater than this value. Defaults to 0.4,
            the previously hard-coded value.

    Yields:
        ``(x, y)``: two 1-D float arrays of length ``V``.

    Raises:
        ValueError: if ``keep_prob`` is not in (0, 1]. Because this is a
            generator, the error surfaces on the first ``next()``.

    Side effects:
        Increments the global ``epoch`` and prints a progress line at the
        start of every pass.
    """
    global epoch
    if not 0 < keep_prob <= 1:
        raise ValueError(f"keep_prob must be in (0, 1], got {keep_prob!r}")

    # With no token anywhere, no pass could ever yield and the loop below
    # would spin forever; end the generator instead.
    if not any(len(sent) > 0 for sent in data):
        return

    samples = 0
    while True:
        epoch += 1
        print(f"new epoch {epoch}, samples are {samples}")

        for sent in data:
            # One random draw per sentence decides whether it is used this pass.
            if np.random.uniform(0, 1) > keep_prob:
                continue

            len_sent = len(sent)
            for i in range(len_sent):
                # One-hot target for the centre word.
                y = np.zeros(V)
                y[word2Ind[sent[i]]] = 1

                # Averaged one-hot encoding of the surrounding words.
                x = np.zeros(V)
                context = [
                    j
                    for j in range(max(0, i - C), min(i + C + 1, len_sent))
                    if j != i
                ]
                for j in context:
                    x[word2Ind[sent[j]]] += 1
                if context:
                    x /= len(context)

                samples += 1
                yield x, y

# Running count of mini-batches produced by get_batches().
batches = 0
def get_batches(data, word2Ind, V, C, batch_size):
    """Group the samples produced by ``get_vectors`` into mini-batches.

    Each batch is made of ``batch_size`` consecutive, distinct samples. The
    previous implementation filled a batch with copies of a single sample and
    never cleared it, so the very first batch was yielded over and over.

    Args:
        data, word2Ind, V, C: forwarded unchanged to ``get_vectors``.
        batch_size: number of samples per batch; must be at least 1.

    Yields:
        ``(X, Y)``: two arrays holding one sample per column, i.e. the
        stacked samples transposed.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised on the first
            ``next()`` because this is a generator).

    Side effects:
        Increments the global ``batches`` for every batch yielded.

    Note:
        If the sample stream ends, samples that do not fill a whole batch
        are dropped so every yielded batch has the same number of columns.
    """
    global batches
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        batch_x.append(x)
        batch_y.append(y)
        if len(batch_x) >= batch_size:
            batches += 1
            yield np.array(batch_x).T, np.array(batch_y).T
            # Start the next batch from scratch.
            batch_x = []
            batch_y = []

def get_dict(data):
    """Build the two vocabulary lookup tables for ``data``.

    The distinct tokens found in the sentences of ``data`` are sorted, and
    each one is numbered with its position in that sorted order.

    Returns:
        ``(word2Ind, Ind2word)``: token -> index and index -> token dicts.
    """
    vocabulary = sorted({token for sent in data for token in sent})
    word2Ind = {word: position for position, word in enumerate(vocabulary)}
    Ind2word = {position: word for position, word in enumerate(vocabulary)}
    return word2Ind, Ind2word

In [9]:
# Build the token <-> index lookup tables from the filtered sentences;
# V is the number of distinct tokens.
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  9019


In [None]:
# def get_training_data(data, word2Ind, V, C):
#     global epoch
#     X = []
#     Y = []
#     epoch = -1
    
#     for x, y in get_vectors(data, word2Ind, V, C):
#       if epoch == 1:
#         break
#       X.append(x)
#       Y.append(Y)

#     X = np.array(X)
#     Y = np.array(Y)

#     return X, Y

In [None]:
# X, Y = get_training_data(data, word2Ind, V, 3)

In [None]:
# epoch = -1
# for x, y in get_batches(data, word2Ind, V, 3, 128):
#   if epoch == 6:
#     break
#     # pass

In [None]:
# print(epoch)

In [None]:
# # example of word to index mapping
# print("Index of the word 'king' :  ",word2Ind['king'] )
# print("Word which has index 2743:  ",Ind2word[2743] )

In [10]:
def initialize_model(N, V, random_seed=None):
    """Create the four parameter arrays, filled with uniform samples from [0, 1).

    Args:
        N: size of the hidden layer.
        V: size of the vocabulary.
        random_seed: optional seed. When given, the values come from a private
            ``RandomState`` so repeated calls with the same seed return
            identical arrays and NumPy's global random stream is untouched.
            When ``None`` (default), NumPy's global generator is used, as
            before.

    Returns:
        ``(W1, W2, b1, b2)`` with shapes ``(N, V)``, ``(V, N)``, ``(N, 1)``
        and ``(V, 1)``.
    """
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)

    # The draw order is kept (W1, W2, b1, b2) so unseeded results match the
    # sequence a caller would have obtained previously.
    W1 = rng.rand(N, V)
    W2 = rng.rand(V, N)
    b1 = rng.rand(N, 1)
    b2 = rng.rand(V, 1)

    return W1, W2, b1, b2

In [11]:
def softmax(z):
    """Softmax along axis 0 (each column of a 2-D ``z`` is normalised on its own).

    The maximum along axis 0 is subtracted before exponentiating. The common
    factor cancels in the ratio, so the result is mathematically unchanged,
    but ``np.exp`` can no longer overflow to ``inf`` (which produced ``nan``)
    for large inputs.

    Args:
        z: array of scores.

    Returns:
        Array of the same shape whose entries along axis 0 sum to 1.
    """
    if np.size(z) == 0:
        # np.max rejects zero-size input; there is nothing to shift anyway.
        shifted = z
    else:
        shifted = z - np.max(z, axis=0, keepdims=True)
    e_z = np.exp(shifted)
    yhat = e_z / np.sum(e_z, axis=0)
    return yhat

In [12]:
def forward_prop(x, W1, W2, b1, b2):
    """Run the two affine layers of the network on ``x``.

    The hidden activation is ``W1 . x + b1`` with no non-linearity applied,
    and the output scores are ``W2 . h + b2``.

    Returns:
        ``(z, h)``: the output scores and the hidden activation.
    """
    hidden = b1 + np.dot(W1, x)
    scores = b2 + np.dot(W2, hidden)
    return scores, hidden

In [13]:
# Small constant added inside the logarithms so log(0) is never evaluated.
eps = 1e-10
def compute_cost(y, yhat, batch_size):
    """Cross-entropy cost of predictions ``yhat`` against targets ``y``.

    Every entry contributes ``y*log(yhat) + (1-y)*log(1-yhat)``; the negated
    sum is divided by ``batch_size``. The previous implementation ignored
    ``batch_size`` and multiplied the sum by a fixed ``-10``, so the reported
    cost grew with the batch size.

    Args:
        y: array of targets (same shape as ``yhat``).
        yhat: array of predicted probabilities.
        batch_size: divisor applied to the summed cost (the number of samples
            in the batch).

    Returns:
        The cost as a 0-d NumPy value.
    """
    logprobs = np.multiply(np.log(yhat + eps), y) + np.multiply(np.log(1 - yhat + eps), 1 - y)
    cost = -np.sum(logprobs) / batch_size
    cost = np.squeeze(cost)
    return cost

In [14]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    """Compute the batch-averaged gradients for both layers.

    The output error ``yhat - y`` is pushed back through ``W2`` to obtain the
    hidden-layer error; weight gradients are the products of each error with
    the corresponding layer input, and bias gradients are the row sums of the
    errors. Everything is divided by ``batch_size``. ``W1``, ``b1`` and
    ``b2`` are accepted but not read.

    Returns:
        ``(grad_W1, grad_W2, grad_b1, grad_b2)``.
    """
    output_error = yhat - y
    hidden_error = np.dot(W2.T, output_error)

    def row_sum_over_batch(err):
        # Sum across the columns, keeping a column-vector shape.
        return np.sum(err, axis=1, keepdims=True) / batch_size

    grad_W1 = np.dot(hidden_error, x.T) / batch_size
    grad_W2 = np.dot(output_error, h.T) / batch_size
    grad_b1 = row_sum_over_batch(hidden_error)
    grad_b2 = row_sum_over_batch(output_error)

    return grad_W1, grad_W2, grad_b1, grad_b2

In [23]:
# Cost of every training step, appended to by gradient_descent().
costs = []

def gradient_descent(data, word2Ind, N, V, num_iters, alpha=0.03, batch_size=256, C=3):
    """Train the model with mini-batch gradient descent.

    Args:
        data: tokenised sentences, forwarded to ``get_batches``.
        word2Ind: token -> index mapping, forwarded to ``get_batches``.
        N: hidden-layer size.
        V: vocabulary size.
        num_iters: number of parameter updates to perform. A value of zero or
            less returns the freshly initialised parameters; previously it
            never terminated because the stop test required exact equality.
        alpha: initial learning rate; multiplied by 0.66 after every 100
            updates.
        batch_size: samples per mini-batch (default 256, the value that used
            to be fixed inside the function).
        C: context half-width (default 3, the value that used to be fixed
            inside the function).

    Returns:
        ``(W1, W2, b1, b2)`` after training.

    Side effects:
        Appends the cost of every step to the global ``costs`` list and
        prints the cost on the 1st, 11th, 21st, ... step.
    """
    global costs
    W1, W2, b1, b2 = initialize_model(N, V)
    if num_iters <= 0:
        # Nothing to train; avoid entering the endless batch stream.
        return W1, W2, b1, b2

    iters = 0
    for x, y in get_batches(data, word2Ind, V, C, batch_size):
        z, h = forward_prop(x, W1, W2, b1, b2)
        yhat = softmax(z)
        cost = compute_cost(y, yhat, batch_size)
        costs.append(cost)

        if ((iters+1) % 10 == 1):
            print(f"iters: {iters + 1} cost: {cost:.12f}")

        grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size)

        # In-place parameter update.
        W1 -= alpha * grad_W1
        W2 -= alpha * grad_W2
        b1 -= alpha * grad_b1
        b2 -= alpha * grad_b2

        iters += 1
        # ">=" also stops for non-integer num_iters, which "==" never matched.
        if iters >= num_iters:
            break
        if iters % 100 == 0:
            alpha *= 0.66

    return W1, W2, b1, b2

In [None]:
# Training configuration: hidden size N, vocabulary size V and iteration count.
C = 3  # NOTE(review): not passed to gradient_descent in the call below
N = 100
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
num_iters = 240000
print("Call gradient_descent")
# The last positional argument is the initial learning rate (alpha).
W1, W2, b1, b2 = gradient_descent(data, word2Ind, N, V, num_iters, 0.03)

# Plot every cost recorded in the global `costs` list; the y-axis is
# clipped to the range [0, 10].
import matplotlib.pyplot as plt
axes = plt.plot(costs)
plt.ylim(0, 10)
plt.show()

In [20]:
# Display the W2 weight matrix.
W2

array([[0.38435372, 0.42447199, 0.82103233, ..., 0.71661822, 0.54792843,
        0.97807928],
       [0.44532166, 0.68786715, 0.09882211, ..., 0.32850059, 0.7522977 ,
        0.18998135],
       [0.22457577, 0.73915263, 0.91651081, ..., 0.97168771, 0.36568035,
        0.81263412],
       ...,
       [0.54045888, 0.04603386, 0.70633335, ..., 0.63795372, 0.3660147 ,
        0.63182383],
       [0.8279313 , 0.06105092, 0.55871708, ..., 0.11467812, 0.76364993,
        0.01626806],
       [0.15397381, 0.03347262, 0.89411425, ..., 0.27012268, 0.1518555 ,
        0.22288167]])

# New Section