# Neural Networks that Understand Language:
## King - Man + Woman == ?

## Section 11.5

In [1]:
import numpy as np
onehosts = {
    'cat': np.array([1, 0, 0, 0]),
    'the': np.array([0, 1, 0, 0]),
    'dog': np.array([0, 0, 1, 0]),
    'sat': np.array([0, 0, 0, 1])
}

x = onehosts['the'] + onehosts['cat'] + onehosts['sat']
print("Sentence Encoding {}".format(x))

Sentence Encoding [1 1 0 1]


## Section 11.6

In [2]:
import pandas as pd

imdb = pd.read_csv('datasets/imdb_master.csv', encoding='latin-1')
imdb

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
5,5,test,"A funny thing happened to me while watching ""M...",neg,10004_2.txt
6,6,test,This German horror film has to be one of the w...,neg,10005_2.txt
7,7,test,"Being a long-time fan of Japanese film, I expe...",neg,10006_2.txt
8,8,test,"""Tokyo Eyes"" tells of a 17 year old Japanese g...",neg,10007_4.txt
9,9,test,Wealthy horse ranchers in Buenos Aires have a ...,neg,10008_4.txt


In [3]:
train_rows = imdb.loc[(imdb['type'] == 'train') & ((imdb['label'] == 'pos') | (imdb['label'] == 'neg'))]
with open('datasets/reviews.txt', 'w') as f:
    for r in train_rows['review']:
        f.write(r+'\n')
        
with open('datasets/labels.txt', 'w') as f:
    for l in train_rows['label']:
        f.write(l+'\n')

In [14]:
import sys
f = open('datasets/reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('datasets/labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))
vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)

vocab = list(vocab)
word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            pass
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'pos\n':
        target_dataset.append(1)
    elif label == 'neg\n':
        target_dataset.append(0)


In [21]:
from sklearn.utils import shuffle
input_dataset_shuffled, target_dataset_shuffled = shuffle(input_dataset, target_dataset)
print('Length dataset samples {}'.format(len(input_dataset_shuffled)))
print('Length positive samples {}'.format(len([t for t in target_dataset_shuffled if t == 1])))
print('Length negative samples {}'.format(len([t for t in target_dataset_shuffled if t == 0])))

Length dataset samples 25000
Length positive samples 12500
Length negative samples 12500


## Section 11.8

In [16]:
import numpy as np
np.random.seed(1)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

alpha, iterations = (0.01, 5)
hidden_size = 100
weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1

correct,total = (0,0)
for iter in range(iterations):
    # train on first 24,000
    for i in range(len(input_dataset_shuffled)-1000):
        x,y = (input_dataset_shuffled[i],target_dataset_shuffled[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) #embed + sigmoid
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2)) # linear + softmax
        layer_2_delta = layer_2 - y # compare pred with truth
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) #backprop
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1,layer_2_delta) * alpha
        
        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset_shuffled)))
            sys.stdout.write('\rIter:'+str(iter)\
            +' Progress:'+progress[2:4]\
            +'.'+progress[4:6]\
            +'% Training Accuracy:'\
            + str(correct/float(total)) + '%')
    print()

correct,total = (0,0)
for i in range(len(input_dataset_shuffled)-1000,len(input_dataset_shuffled)):
    x = input_dataset_shuffled[i]
    y = target_dataset_shuffled[i]
    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    if(np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
print("Test Accuracy:" + str(correct / float(total)))

Iter:0 Progress:95.99% Training Accuracy:0.8100416666666667%%
Iter:1 Progress:95.99% Training Accuracy:0.8575416666666666%
Iter:2 Progress:95.99% Training Accuracy:0.8866388888888889%
Iter:3 Progress:95.99% Training Accuracy:0.9082916666666667%
Iter:4 Progress:95.99% Training Accuracy:0.9245833333333333%
Test Accuracy:0.877


## Section 11.12

In [19]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

print(similar('awesome'))
print(similar('horrible'))

[('awesome', -0.0), ('negative', -0.6428268962317969), ('dismiss', -0.6587273863519271), ('everyone,', -0.6716502022368201), ('driven', -0.6752046573839212), ('relax', -0.6880980202842559), ('guys', -0.6951381178230869), ('9', -0.7047216587301881), ('Stanley', -0.7088872448337845), ('/>7', -0.7116472084817893)]
[('horrible', -0.0), ('forgettable', -0.7431769315015866), ('pointless', -0.7682601939836151), ('mess.', -0.7806290451098229), ('lacks', -0.7862819661498142), ('badly', -0.7864137199647989), ('terrible.', -0.7895563188012381), ('wonder', -0.7940850750996855), ('saving', -0.7948789805611245), ('1', -0.7964693908541256)]


## Section 11.14

In [9]:
import sys,random,math
from collections import Counter
import numpy as np
np.random.seed(1)
random.seed(1)

f = open('datasets/reviews.txt')
raw_reviews = f.readlines()
print(len(raw_reviews))
f.close()

raw_reviews = raw_reviews[:10000]
tokens = list(map(lambda x:(x.split(" ")),raw_reviews))
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda x:x[0],wordcnt.most_common())))

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

alpha, iterations = (0.05, 2)
hidden_size,window,negative = (50,2,5)
weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size)*0
layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]
    
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

for rev_i,review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):
        # since it's really expensive to predict every vocabulary
        # we're only going to predict a random subset
        target_samples = [review[target_i]]+list(concatenated[(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])
        left_context = review[max(0,target_i-window):target_i]
        right_context = review[target_i+1:min(len(review),target_i+window)]
        layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        weights_0_1[left_context+right_context] -= layer_1_delta * alpha
        weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1)*alpha
        
    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)*iterations)) + " " + str(similar('terrible')))
        #sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        #*iterations)))
print("Similar Terrible: {}".format(similar('terrible')))

25000
Progress:0.9875 [('terrible', -0.0), ('horrible', -3.5885288355003637), ('mediocre', -3.679223905936852), ('dreadful', -3.6989494935181813), ('wonderful', -3.759486794719543), ('laughable', -3.762810580774965), ('ridiculous', -3.929027547308048), ('lame', -3.9713006850989974), ('pathetic', -3.9780746143466046), ('fantastic', -4.060631101363255)]117)]95)]Similar Terrible: [('terrible', -0.0), ('horrible', -3.319912597693124), ('dreadful', -3.638214373670726), ('mediocre', -3.778918862039497), ('laughable', -3.8938659462641647), ('ridiculous', -3.8981728567788125), ('wonderful', -3.9186827631203864), ('pathetic', -4.009095940972478), ('lame', -4.01268017269171), ('lousy', -4.077510011610978)]


In [12]:
def analogy(positive=['terrible','good'],negative=['bad']):
    norms = np.sum(weights_0_1 * weights_0_1,axis=1)
    norms.resize(norms.shape[0],1)
    normed_weights = weights_0_1 * norms
    query_vect = np.zeros(len(weights_0_1[0]))
    
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)[1:]

print(analogy(['terrible','good'],['bad']))


[('decent', -286.1548836754392), ('terrible', -286.92683695688805), ('positive', -287.08144592769185), ('fine', -287.3772167731053), ('redeeming', -287.3860512507905), ('nice', -287.40956233753053), ('different', -287.4316921613575), ('small', -287.4348638694352), ('worth', -287.4910575403065)]
