In [3]:
import numpy as np
import re
from scipy.special import expit as sigmoid
import random

In [None]:
# Fetch the gzipped Electronics reviews file from snap.stanford.edu; wget
# saves it into the current working directory under its original name.
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz

In [None]:
import json
import gzip

# Read the gzipped JSON-lines dump and keep the 'reviewText' field of the
# first 50000 non-empty records.
filename = 'reviews_Electronics_5.json.gz'
json_content = []
with gzip.open(filename, 'rb') as gzip_file:
    for raw_line in gzip_file:
        record = raw_line.rstrip()
        if not record:
            # Blank line: nothing to parse.
            continue

        json_content.append(json.loads(record)['reviewText'])

        # Stop early once enough reviews have been collected.
        if len(json_content) == 50000:
            break

# print(json.dumps(json_content, indent=4))

In [4]:
# def tokenize(data):
#     return re.split(r"[^A-Za-z0-9]+", data)

In [12]:
# s = "my name is $jdbkgs!@#$%^UI}}}{{||~~"
# s = re.sub(r'[^\w\s]','', s)
# s

'my name is jdbkgsUI'

In [None]:
# Split every review into sentences on "." and reduce each sentence to a list
# of lower-cased, purely alphabetic tokens.
data = []
for review in json_content:
    for sent in review.split("."):
        sent = sent.strip()
        if not sent:
            continue
        # Strip punctuation first, then discard tokens that still contain
        # digits or underscores (str.isalpha rejects them).
        cleaned = re.sub(r'[^\w\s]', '', sent.lower())
        tokens = [token for token in cleaned.split() if token.isalpha()]
        # A sentence made only of non-alphabetic tokens (e.g. "5") would
        # otherwise be stored as an empty list.
        if tokens:
            data.append(tokens)

# Show only the number of sentences; echoing the whole corpus floods the
# notebook output.
len(data)

In [14]:
from collections import defaultdict

# Raw occurrence count of every token, filled in by getfrequencies().
freq_old = defaultdict(int)

def getfrequencies(data):
    """Add the occurrences of every token in `data` to the module-level `freq_old`.

    `data` is iterated as a sequence of sentences, each a sequence of tokens.
    """
    for word in (tok for sentence in data for tok in sentence):
        freq_old[word] += 1

getfrequencies(data)

# Counts restricted to the tokens that occur more than 5 times.
freq = defaultdict(int)

def remove_less_frequent(data):
    """Copy into `freq` the `freq_old` count of every token of `data` seen more than 5 times."""
    for sentence in data:
        for word in sentence:
            count = freq_old[word]
            if count <= 5:
                continue
            freq[word] = count

remove_less_frequent(data)

# Total token count over all sentences (rare tokens included).
number_of_tokens = sum(len(sentence) for sentence in data)
print("Number of tokens:", number_of_tokens)

# data

Number of tokens: 2053535


In [None]:
# Peek at the first five entries of the tokenized corpus.
data[:5]

In [16]:
def get_dict(data):
    """Build the word <-> index lookup tables for the vocabulary in `freq`.

    The vocabulary is the alphabetically sorted key set of the module-level
    `freq`; the `data` argument is accepted but not read.

    Returns
    -------
    (word2Ind, Ind2word) : tuple of dict
        `word2Ind` maps each word to its position in the sorted vocabulary,
        `Ind2word` is the inverse mapping.
    """
    vocab = sorted(freq.keys())
    word2Ind = {word: position for position, word in enumerate(vocab)}
    Ind2word = dict(enumerate(vocab))
    return word2Ind, Ind2word

word2Ind, Ind2word = get_dict(data)

In [17]:
def get_probs(data):
    """Return the sampling distribution over the vocabulary.

    Each word's weight is its count in the module-level `freq` raised to the
    power 3/4, stored at the word's `word2Ind` position; the weights are then
    normalised to sum to one. The `data` argument is accepted but not read.
    """
    weights = np.zeros(len(freq))
    for word, count in freq.items():
        weights[word2Ind[word]] = count ** 0.75

    return weights / weights.sum()

wordProbs = get_probs(data)
wordProbs

array([1.14488717e-02, 1.53770583e-04, 6.74578229e-05, ...,
       4.30828414e-05, 6.40560130e-05, 2.14228081e-05])

In [52]:
def get_context(sent, center, C, word2Ind):
    """Collect the vocabulary indices of the words surrounding position `center`.

    Up to `C` positions to the left and `C` positions to the right of
    `center` are inspected; words missing from `word2Ind` are skipped.

    Returns
    -------
    list of int
        The distinct indices found (duplicates removed via a set).
    """
    steps = range(1, C + 1)
    # Nearest neighbour first on each side, left side before right side.
    left_positions = [center - step for step in steps if center - step >= 0]
    right_positions = [center + step for step in steps if center + step < len(sent)]

    neighbours = [
        word2Ind[sent[pos]]
        for pos in left_positions + right_positions
        if sent[pos] in word2Ind
    ]
    return list(set(neighbours))

In [18]:
def initialize_model(N, V):
    """Draw the two weight matrices from a standard normal distribution.

    Returns
    -------
    (W1, W2) : tuple of np.ndarray
        `W1` has shape (V, N) and `W2` has shape (N, V); `W1` is sampled first.
    """
    input_weights = np.random.randn(V, N)
    output_weights = np.random.randn(N, V)
    return input_weights, output_weights

In [23]:
def forward_prop(x, t, W1, W2):
    """Score the target columns `t` of `W2` against row `x` of `W1`.

    Returns
    -------
    (z, h) : tuple
        `h` is `W1[x]`; `z` is the sigmoid of the dot product of `h` with
        each selected column of `W2`.
    """
    hidden = W1[x]
    target_columns = W2[:, t]
    scores = np.dot(hidden, target_columns)
    return sigmoid(scores), hidden

In [47]:
def back_prop(z, h, t, context, W2):
    """Compute the gradients for one training example.

    The label vector has one entry per element of `t`; its first
    `len(context)` entries are 1 and the rest are 0.

    Returns
    -------
    (dW1, dW2, y) : tuple of np.ndarray
        `dW1` is the error projected through the `t` columns of `W2`,
        `dW2` is the outer product of the error with `h` (one row per
        element of `t`), and `y` is the label vector.
    """
    labels = np.zeros(len(t))
    # Integer-array indexing keeps the out-of-range IndexError that an
    # element-by-element assignment would raise.
    labels[np.arange(len(context))] = 1

    error = z - labels

    grad_hidden = np.dot(error, W2[:, t].T)
    grad_output = np.outer(error, h)

    return grad_hidden, grad_output, labels

In [25]:
def compute_cost(y, yhat):
    """Return the summed binary cross-entropy between labels `y` and predictions `yhat`.

    A small epsilon (1e-10) is added inside both logarithms so that
    predictions of exactly 0 or 1 do not produce log(0).
    """
    eps = 1e-10
    positive_term = y * np.log(yhat + eps)
    negative_term = (1 - y) * np.log(1 - yhat + eps)
    total = -np.sum(positive_term + negative_term)
    return np.squeeze(total)

In [2]:
# Mean cost of every completed epoch, appended to by gradient_descent().
costs = []

def gradient_descent(data, N, V, num_epochs, C, word2Ind, NS_size, wordProbs, alpha):
    """Train skip-gram embeddings with negative sampling, one example at a time.

    Parameters
    ----------
    data : list
        Sentences, each a sequence of tokens. A shuffled copy is trained on;
        the caller's list is left in its original order.
    N : int
        Embedding dimension (W1 is V x N, W2 is N x V).
    V : int
        Vocabulary size; negatives are drawn from range(V), so it must match
        the length of `wordProbs`.
    num_epochs : int
        Number of passes over `data`.
    C : int
        Number of positions inspected on each side of the center word.
    word2Ind : dict
        Token -> vocabulary index; tokens missing from it are skipped.
    NS_size : int
        Number of negative samples drawn per training example.
    wordProbs : array-like
        Sampling probabilities for the negatives (passed as `p` to
        np.random.choice).
    alpha : float
        Initial learning rate; multiplied by 0.66 after every epoch.

    Returns
    -------
    (W1, W2) : tuple of np.ndarray
        The trained weight matrices.

    Side effects
    ------------
    Appends each epoch's mean cost to the module-level `costs`, prints
    progress, and plots `costs` when training ends.
    """
    global costs

    # Shuffle a copy so the caller's sentence order is not silently changed.
    data = list(data)
    random.shuffle(data)
    W1, W2 = initialize_model(N, V)

    max_sent = len(data)

    for epoch in range(num_epochs):
        total_cost = 0
        div = 0  # number of training examples seen in this epoch

        for ss, sent in enumerate(data, start=1):
            for index in np.random.permutation(np.arange(len(sent))):
                if sent[index] not in word2Ind:
                    continue

                center = word2Ind[sent[index]]
                context = get_context(sent, index, C, word2Ind)
                if len(context) == 0:
                    continue

                # Targets: the true context indices first, negatives after,
                # which is the layout back_prop assumes when labelling.
                NS = np.random.choice(V, size=NS_size, p=wordProbs)
                t = np.array(context + list(NS))

                z, h = forward_prop(center, t, W1, W2)
                dW1, dW2, y = back_prop(z, h, t, context, W2)
                total_cost += compute_cost(y, z)
                div += 1

                # dW1 has shape (N,): it is the gradient of the center word's
                # row only. Subtracting it from W1 as a whole would broadcast
                # the update onto every row of the matrix.
                W1[center] -= alpha * dW1
                # Column-by-column so that an index occurring more than once
                # in t receives every one of its updates.
                for col, grad in zip(t, dW2):
                    W2[:, col] -= alpha * grad

            # Progress: remaining sentences, reported every 1000 sentences.
            if ss % 1000 == 1:
                print(max_sent - ss)

        alpha *= 0.66
        # An epoch without any usable (center, context) pair has no mean cost.
        mean_cost = total_cost / div if div else float("nan")
        costs.append(mean_cost)
        print(mean_cost)
        print()

    # NOTE(review): `plt` is not imported in the cells visible here; this
    # assumes `import matplotlib.pyplot as plt` has been run elsewhere - confirm.
    plt.plot(costs)
    plt.show()

    return W1, W2

In [None]:
# Hyper-parameters of the training run.
N = 100          # embedding dimension
num_epochs = 2   # passes over the corpus
C = 3            # context positions inspected on each side of the center word
NS_size = 10     # negative samples drawn per training example

W1, W2 = gradient_descent(
    data=data,
    N=N,
    V=len(freq),
    num_epochs=num_epochs,
    C=C,
    word2Ind=word2Ind,
    NS_size=NS_size,
    wordProbs=wordProbs,
    alpha=0.03,
)