# Neural Networks that Write like Shakesphere
## Recurrent Layers for Variable Length Data

## Data Preparation

In [1]:
import pandas as pd

imdb = pd.read_csv('datasets/imdb_master.csv', encoding='latin-1')
imdb

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
5,5,test,"A funny thing happened to me while watching ""M...",neg,10004_2.txt
6,6,test,This German horror film has to be one of the w...,neg,10005_2.txt
7,7,test,"Being a long-time fan of Japanese film, I expe...",neg,10006_2.txt
8,8,test,"""Tokyo Eyes"" tells of a 17 year old Japanese g...",neg,10007_4.txt
9,9,test,Wealthy horse ranchers in Buenos Aires have a ...,neg,10008_4.txt


In [2]:
train_rows = imdb.loc[(imdb['type'] == 'train') & ((imdb['label'] == 'pos') | (imdb['label'] == 'neg'))]
with open('datasets/reviews.txt', 'w') as f:
    for r in train_rows['review']:
        f.write(r+'\n')
        
with open('datasets/labels.txt', 'w') as f:
    for l in train_rows['label']:
        f.write(l+'\n')

In [3]:
import sys
f = open('datasets/reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('datasets/labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(" ")),raw_reviews))
vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)

vocab = list(vocab)
word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            pass
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'pos\n':
        target_dataset.append(1)
    elif label == 'neg\n':
        target_dataset.append(0)


In [4]:
from sklearn.utils import shuffle
input_dataset_shuffled, target_dataset_shuffled = shuffle(input_dataset, target_dataset)
print('Length dataset samples {}'.format(len(input_dataset_shuffled)))
print('Length positive samples {}'.format(len([t for t in target_dataset_shuffled if t == 1])))
print('Length negative samples {}'.format(len([t for t in target_dataset_shuffled if t == 0])))

Length dataset samples 25000
Length positive samples 12500
Length negative samples 12500


## Emdedding Creation

In [5]:
import numpy as np
np.random.seed(1)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

alpha, iterations = (0.01, 5)
hidden_size = 100
weights_0_1 = 0.2*np.random.random((len(vocab),hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size,1)) - 0.1

correct,total = (0,0)
for iter in range(iterations):
    # train on first 24,000
    for i in range(len(input_dataset_shuffled)-1000):
        x,y = (input_dataset_shuffled[i],target_dataset_shuffled[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0)) #embed + sigmoid
        layer_2 = sigmoid(np.dot(layer_1,weights_1_2)) # linear + softmax
        layer_2_delta = layer_2 - y # compare pred with truth
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) #backprop
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1,layer_2_delta) * alpha
        
        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset_shuffled)))
            sys.stdout.write('\rIter:'+str(iter)\
            +' Progress:'+progress[2:4]\
            +'.'+progress[4:6]\
            +'% Training Accuracy:'\
            + str(correct/float(total)) + '%')
    print()

correct,total = (0,0)
for i in range(len(input_dataset_shuffled)-1000,len(input_dataset_shuffled)):
    x = input_dataset_shuffled[i]
    y = target_dataset_shuffled[i]
    layer_1 = sigmoid(np.sum(weights_0_1[x],axis=0))
    layer_2 = sigmoid(np.dot(layer_1,weights_1_2))
    if(np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
print("Test Accuracy:" + str(correct / float(total)))

Iter:0 Progress:95.99% Training Accuracy:0.8130416666666667%%
Iter:1 Progress:95.99% Training Accuracy:0.8585833333333334%
Iter:2 Progress:95.99% Training Accuracy:0.8870277777777777%
Iter:3 Progress:95.99% Training Accuracy:0.9084166666666667%
Iter:4 Progress:95.99% Training Accuracy:0.9245583333333334%
Test Accuracy:0.864


## Section 12.3

In [8]:
import numpy as np
from collections import Counter
import math

norms = np.sum(weights_0_1 * weights_0_1,axis=1)
norms.resize(norms.shape[0],1)
normed_weights = weights_0_1 * norms

def make_sent_vect(words):
    indices = list(map(lambda x:word2index[x], filter(lambda x:x in word2index,words)))
    return np.mean(normed_weights[indices],axis=0)

reviews2vectors = list()
for review in tokens: # tokenized reviews
    reviews2vectors.append(make_sent_vect(review))
reviews2vectors = np.array(reviews2vectors)

def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i,val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()
    for idx,score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:100])
    return most_similar


In [9]:
most_similar_reviews(['boring','awful'])

["The characters are unlikeable and the script is awful. It's a waste of the talents of Deneuve and Au",
 'This movie is so bad, it can only be compared to the all-time worst "comedy": Police Academy 7. No l',
 'Most definitely the worst Columbo ever dreamt up. No murder and the abandonment of the tried and tes']

In [10]:
most_similar_reviews(['great','amazing'])

['Excellent episode movie ala Pulp Fiction. 7 days - 7 suicides. It doesnt get more depressing than th',
 'Smallville episode Justice is the best episode of Smallville ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !',
 'Smallville episode Justice is the best episode of Smallville ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !']

# Section 12.8

In [11]:
import numpy as np

a = np.array([1,2,3])
b = np.array([0.1,0.2,0.3])
c = np.array([-1,-0.5,0])
d = np.array([0,0,0])

identity = np.eye(3)
print(identity)

print(a.dot(identity))
print(b.dot(identity))
print(c.dot(identity))
print(d.dot(identity))

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
[1. 2. 3.]
[0.1 0.2 0.3]
[-1.  -0.5  0. ]
[0. 0. 0.]


In [12]:
this = np.array([2,4,6])
movie = np.array([10,10,10])
rocks = np.array([1,1,1])
print(this + movie + rocks)
print((this.dot(identity) + movie).dot(identity) + rocks)

[13 15 17]
[13. 15. 17.]


# Section 12.11

In [17]:
import numpy as np

def softmax(_x):
    x = np.atleast_2d(_x)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

word_vects = {}
word_vects['yankees'] = np.array([[0., 0., 0.]])
word_vects['bears'] = np.array([[0., 0., 0.]])
word_vects['braves'] = np.array([[0., 0., 0.]])
word_vects['red'] = np.array([[0., 0., 0.]])
word_vects['socks'] = np.array([[0., 0., 0.]])
word_vects['lose'] = np.array([[0., 0., 0.]])
word_vects['defeat'] = np.array([[0., 0., 0.]])
word_vects['beat'] = np.array([[0., 0., 0.]])
word_vects['tie'] = np.array([[0., 0., 0.]])

sent2output = np.random.rand(3, len(word_vects))
identity = np.eye(3)

layer_0 = word_vects['red']
layer_1 = np.dot(layer_0, identity) + word_vects['socks']
layer_2 = np.dot(layer_1, identity) + word_vects['defeat']

pred = softmax(np.dot(layer_2, sent2output))
print(pred)

[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111]]


# Section 12.12

In [18]:
y = np.array([1,0,0,0,0,0,0,0,0]) # target one-hot vector for "yankees"
pred_delta = pred - y
layer_2_delta = pred_delta.dot(sent2output.T)
defeat_delta = layer_2_delta * 1 # can ignore the "1" like prev. chapter
layer_1_delta = layer_2_delta.dot(identity.T)
socks_delta = layer_1_delta * 1 # again... can ignore the "1"
layer_0_delta = layer_1_delta.dot(identity.T)
alpha = 0.01
word_vects['red'] -= layer_0_delta * alpha
word_vects['socks'] -= socks_delta * alpha
word_vects['defeat'] -= defeat_delta * alpha
identity -= np.outer(layer_0,layer_1_delta) * alpha
identity -= np.outer(layer_1,layer_2_delta) * alpha
sent2output -= np.outer(layer_2,pred_delta) * alpha

# Section 12.13

In [None]:
! wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-1.tar.gz
! tar -xvf tasks_1-20_v1-1.tar.gz

In [21]:
import sys,random,math
from collections import Counter
import numpy as np

f = open('datasets/tasksv11/en/qa1_single-supporting-fact_train.txt','r')
raw = f.readlines()
f.close()
tokens = list()

for line in raw[0:1000]:
    tokens.append(line.lower().replace("\n","").split(" ")[1:])
print(tokens[0:3])

[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', '\tbathroom\t1']]


In [22]:
vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)
vocab = list(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [23]:
np.random.seed(1)

embed_size = 10
# word embeddings
embed = (np.random.rand(len(vocab),embed_size) - 0.5) * 0.1
# embedding -> embedding (initially the identity matrix)
recurrent = np.eye(embed_size)
# sentence embedding for empty sentence
start = np.zeros(embed_size)
# embedding -> output weights
decoder = (np.random.rand(embed_size, len(vocab)) - 0.5) * 0.1
# one hot lookups (for loss function)
one_hot = np.eye(len(vocab))

def predict(sent):
    layers = list()
    layer = {}
    layer['hidden'] = start
    layers.append(layer)
    loss = 0
    # forward propagate
    preds = list()
    for target_i in range(len(sent)):
        layer = {}
        # try to predict the next term
        layer['pred'] = softmax(layers[-1]['hidden'].dot(decoder))
        loss += -np.log(layer['pred'][sent[target_i]])
        # generate the next hidden state
        layer['hidden'] = layers[-1]['hidden'].dot(recurrent) + embed[sent[target_i]]
        layers.append(layer)
    return layers, loss

In [28]:
# forward
for iter in range(30000):
    alpha = 0.001
    sent = words2indices(tokens[iter%len(tokens)][1:])
    layers,loss = predict(sent)
    # back propagate
    for layer_idx in reversed(range(len(layers))):
        layer = layers[layer_idx]
        target = sent[layer_idx-1]
        if(layer_idx > 0): # if not the first layer
            layer['output_delta'] = layer['pred'] - one_hot[target]
            new_hidden_delta = layer['output_delta']\
            .dot(decoder.transpose())
            # if the last layer - don't pull from a
            # later one becasue it doesn't exist
            if(layer_idx == len(layers)-1):
                layer['hidden_delta'] = new_hidden_delta
            else:
                layer['hidden_delta'] = new_hidden_delta + \
                layers[layer_idx+1]['hidden_delta']\
                .dot(recurrent.transpose())
        else: # if the first layer
            layer['hidden_delta'] = layers[layer_idx+1]['hidden_delta']\
            .dot(recurrent.transpose())
            
    # update weights
    start -= layers[0]['hidden_delta'] * alpha / float(len(sent))
    for layer_idx,layer in enumerate(layers[1:]):
        decoder -= np.outer(layers[layer_idx]['hidden'],\
        layer['output_delta']) * alpha / float(len(sent))
        embed_idx = sent[layer_idx]
        embed[embed_idx] -= layers[layer_idx]['hidden_delta'] * \
        alpha / float(len(sent))
        recurrent -= np.outer(layers[layer_idx]['hidden'],\
        layer['hidden_delta']) * alpha / float(len(sent))
        if(iter % 1000 == 0):
            print("Perplexity:" + str(np.exp(loss/len(sent))))


Perplexity:82.01968947036016
Perplexity:82.01968947036016
Perplexity:82.01968947036016
Perplexity:82.01968947036016
Perplexity:81.88890996222612
Perplexity:81.88890996222612
Perplexity:81.88890996222612
Perplexity:81.88890996222612
Perplexity:81.66944062593234
Perplexity:81.66944062593234
Perplexity:81.66944062593234
Perplexity:81.66944062593234
Perplexity:81.2533657132904
Perplexity:81.2533657132904
Perplexity:81.2533657132904
Perplexity:81.2533657132904
Perplexity:80.41260197960652
Perplexity:80.41260197960652
Perplexity:80.41260197960652
Perplexity:80.41260197960652
Perplexity:78.57213147957657
Perplexity:78.57213147957657
Perplexity:78.57213147957657
Perplexity:78.57213147957657
Perplexity:73.79275536232552
Perplexity:73.79275536232552
Perplexity:73.79275536232552
Perplexity:73.79275536232552
Perplexity:54.61649580473904
Perplexity:54.61649580473904
Perplexity:54.61649580473904
Perplexity:54.61649580473904
Perplexity:28.740329201730873
Perplexity:28.740329201730873
Perplexity:28.74

# Section  12.18

In [29]:
sent_index = 4
l,_ = predict(words2indices(tokens[sent_index]))
print(tokens[sent_index])
for i,each_layer in enumerate(l[1:-1]):
    input = tokens[sent_index][i]
    true = tokens[sent_index][i+1]
    pred = vocab[each_layer['pred'].argmax()]
    print("Prev Input:" + input + (' ' * (12 - len(input))) +\
    "True:" + true + (" " * (15 - len(true))) + "Pred:" + pred)

['sandra', 'moved', 'to', 'the', 'garden.']
Prev Input:sandra      True:moved          Pred:is
Prev Input:moved       True:to             Pred:to
Prev Input:to          True:the            Pred:the
Prev Input:the         True:garden.        Pred:bedroom.
