In [18]:
#Download IMDB dataset
import sys
f = open(r'C:\datasets\DS_projects\Numpy neural network\reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open(r'C:\datasets\DS_projects\Numpy neural network\labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(" ")), raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if len(word) > 0:
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i
    
input_dataset = list()
for sent in tokens:
    sent_indicies = list()
    for word in sent:
        try:
            sent_indicies.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indicies)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

In [22]:
import numpy as np
np.random.seed(1)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

alpha, iterations = (0.01, 2)
hidden_size = 100

weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

correct, total = (0, 0)
for iter in range(iterations):
    
    for i in range(len(input_dataset) - 1000):
        x, y = (input_dataset[i], target_dataset[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha
        
        if (np.abs(layer_2_delta) < 0.5):
            correct += 1
            
        total += 1
        if (i%10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write("\rIter:" + str(iter) + \
                             " Progress:" + progress[2:4] + \
                             "% Training Accuracy:" + str(correct/float(total)) + "%")
            
    print()
    
correct, total = (0, 0)
for i in range(len(input_dataset) - 1000, len(input_dataset)):
    x = input_dataset[i]
    y = target_dataset[i]
    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
    
    if (np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
print("Test Accuracy:" + str(correct / float(total)))

Iter:0 Progress:95% Training Accuracy:0.8342083333333333%%
Iter:1 Progress:95% Training Accuracy:0.8674791666666667%
Test Accuracy:0.849


In [29]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - weights_0_1[target_index]
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

In [32]:
similar('beautiful')

[('beautiful', -0.0),
 ('hooked', -0.7036775468304617),
 ('sent', -0.7257559354015641),
 ('available', -0.7423010232867517),
 ('performances', -0.7608041032075339),
 ('innocent', -0.7677762064852629),
 ('vhs', -0.7701434583144203),
 ('brilliant', -0.7763963613664883),
 ('realistic', -0.7776422633457915),
 ('scared', -0.777881979992764)]

In [34]:
similar('terrible')

[('terrible', -0.0),
 ('annoying', -0.73711902316873),
 ('poor', -0.774638080977361),
 ('boring', -0.7759687870622768),
 ('disappointment', -0.7866072410632217),
 ('mess', -0.7873729228507546),
 ('lacks', -0.7944383951131646),
 ('fails', -0.7978708143579716),
 ('laughable', -0.8084660895975864),
 ('dull', -0.8141379213570674)]

***
***

### Filling in the Blank

In [49]:
import sys,random,math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)
f = open(r'C:\datasets\DS_projects\Numpy neural network\reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x:(x.split(" ")),raw_reviews))
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda x:x[0],wordcnt.most_common())))

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

hidden_size,window,negative = (50,2,5)

weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size)*0

layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

for rev_i,review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        target_samples = [review[target_i]]+list(concatenated[(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])

        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]

        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])

        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1)*alpha

    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)*iterations)) + "   " + str(similar('terrible')))
    sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)*iterations)))
print(similar('terrible'))

Progress:0.99998 [('terrible', -0.0), ('horrible', -3.0699970180376566), ('brilliant', -3.0836860706177633), ('superb', -3.698666337213767), ('pathetic', -3.6993491374587237), ('masterful', -3.784516403029361), ('phenomenal', -3.78809427097417), ('mediocre', -4.007479449608766), ('terrific', -4.04351100476048), ('dreadful', -4.078279006504741)]71688)])]27)])][('terrible', -0.0), ('horrible', -3.036501672840957), ('brilliant', -3.34804353726919), ('pathetic', -3.68753126467379), ('masterful', -3.9105050572043845), ('phenomenal', -3.9152131749739616), ('bad', -3.91931700004378), ('superb', -3.943135735100445), ('mediocre', -4.042231348436223), ('dreadful', -4.123511404171038)]


### King - Man + Woman ~= Queen

In [52]:
def analogy(positive=['terrible', 'good'], negative=['bad']):
    norms = np.sum(weights_0_1 * weights_0_1, axis=1)
    norms.resize(norms.shape[0], 1)
    
    normed_weights = weights_0_1 * norms
    
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
        
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)[1:]

In [53]:
analogy(['terrible', 'good'], ['bad'])

[('superb', -214.0124288995147),
 ('terrific', -214.36184712569323),
 ('decent', -214.57047017319027),
 ('fine', -214.57066092372062),
 ('perfect', -214.86626991463712),
 ('nice', -214.88701742178608),
 ('worth', -214.89191979093732),
 ('brilliant', -214.89590278359077),
 ('great', -214.95816706622736)]

In [54]:
analogy(['elizabeth', 'he'], ['she'])

[('christopher', -197.7111753781856),
 ('it', -197.96910768210537),
 ('morgan', -197.96996951025025),
 ('mr', -197.9836120604252),
 ('de', -198.073321696136),
 ('this', -198.07410990655822),
 ('william', -198.07647336644882),
 ('him', -198.08241814594183),
 ('simon', -198.08486287733103)]