# SECTION 1: Import

In [6]:
import numpy as np

In [4]:
with open('labels.txt') as labels:
    target_dataset = [1 if lbl[0]=='p' else 0 for lbl in labels.readlines()]

In [7]:
np.array(target_dataset).shape

(25000,)

In [111]:
with open('reviews.txt') as reviews:
    raw_reviews = [review.upper() for review in reviews.readlines()]

In [112]:
raw_reviews[0]

'BROMWELL HIGH IS A CARTOON COMEDY . IT RAN AT THE SAME TIME AS SOME OTHER PROGRAMS ABOUT SCHOOL LIFE  SUCH AS  TEACHERS  . MY   YEARS IN THE TEACHING PROFESSION LEAD ME TO BELIEVE THAT BROMWELL HIGH  S SATIRE IS MUCH CLOSER TO REALITY THAN IS  TEACHERS  . THE SCRAMBLE TO SURVIVE FINANCIALLY  THE INSIGHTFUL STUDENTS WHO CAN SEE RIGHT THROUGH THEIR PATHETIC TEACHERS  POMP  THE PETTINESS OF THE WHOLE SITUATION  ALL REMIND ME OF THE SCHOOLS I KNEW AND THEIR STUDENTS . WHEN I SAW THE EPISODE IN WHICH A STUDENT REPEATEDLY TRIED TO BURN DOWN THE SCHOOL  I IMMEDIATELY RECALLED . . . . . . . . . AT . . . . . . . . . . HIGH . A CLASSIC LINE INSPECTOR I  M HERE TO SACK ONE OF YOUR TEACHERS . STUDENT WELCOME TO BROMWELL HIGH . I EXPECT THAT MANY ADULTS OF MY AGE THINK THAT BROMWELL HIGH IS FAR FETCHED . WHAT A PITY THAT IT ISN  T   \n'

In [9]:
np.array(raw_reviews).shape

(25000,)

In [12]:
words = set(' '.join(raw_reviews).split(' '))

In [19]:
word2index = {word: i for i, word in enumerate(words)}

In [20]:
word2index

{'': 0,
 'HANDYMAN': 1,
 'BEDROOMS': 2,
 'CRAWL': 3,
 'TUSSLES': 4,
 'STUDIOUSLY': 5,
 'INTERMEDIATE': 6,
 'LONGEVITY': 7,
 'TELEPORTATION': 8,
 'DOPES': 9,
 'CRUCIFIES': 10,
 'MACMURPHY': 11,
 'INCINERATES': 12,
 'VALID': 13,
 'CAB': 14,
 'HODGES': 15,
 'HOLMAN': 16,
 'COGSWORTH': 17,
 'YAKMALLAH': 18,
 'SUZU': 19,
 'CAMPUS': 20,
 'YUMA': 21,
 'DEPRESSES': 22,
 'LEENA': 23,
 'HEADTRIPPING': 24,
 'DHRY': 25,
 'ISRAELO': 26,
 'OLDS': 27,
 'POCASNI': 28,
 'ARCHIPELAGO': 29,
 'ENABLING': 30,
 'VAPOORIZE': 31,
 'GLACIALLY': 32,
 'PREFERRED': 33,
 'KEEL': 34,
 'BIGTIME': 35,
 'BENNY': 36,
 'YOUNGEST': 37,
 'INTENSIFIES': 38,
 'WINDSWEPT': 39,
 'EMRAAN': 40,
 'GJON': 41,
 'GRAZIA': 42,
 'SQUIBS': 43,
 'SEQUITURS': 44,
 'OHANA': 45,
 'ZIVAGHO': 46,
 'UNPALATABLY': 47,
 'MASTERCARD': 48,
 'POUCHY': 49,
 'NATIVIDAD': 50,
 'SERVICEMEN': 51,
 'THICKENS': 52,
 'WAINRIGHTS': 53,
 'LEVENS': 54,
 'CHERIE': 55,
 'JOY': 56,
 'EYEBROWS': 57,
 'SOFTLY': 58,
 'CONSISTANCY': 59,
 'LOVEABILITY': 60,
 'BOYUM

In [21]:
input_dataset = [[word2index[word] for word in review.split(' ')] for review in raw_reviews]

In [22]:
input_dataset[0]

[34015,
 37545,
 36198,
 39465,
 58413,
 39928,
 57587,
 29660,
 72073,
 20640,
 58974,
 6934,
 47512,
 4349,
 69789,
 5769,
 4034,
 11025,
 62813,
 46733,
 0,
 18114,
 4349,
 0,
 17671,
 0,
 57587,
 42680,
 0,
 0,
 57067,
 43093,
 58974,
 44325,
 53121,
 64022,
 58854,
 63656,
 66166,
 63658,
 34015,
 37545,
 0,
 30902,
 57246,
 36198,
 60615,
 1514,
 63656,
 70326,
 42396,
 36198,
 0,
 17671,
 0,
 57587,
 58974,
 21732,
 63656,
 51122,
 73688,
 0,
 58974,
 50384,
 46905,
 52713,
 33789,
 19968,
 8088,
 54064,
 17865,
 36407,
 17671,
 0,
 38739,
 0,
 58974,
 543,
 65919,
 58974,
 27541,
 25338,
 0,
 17220,
 15297,
 58854,
 65919,
 58974,
 22705,
 57997,
 61394,
 70688,
 17865,
 46905,
 57587,
 24921,
 57997,
 63606,
 58974,
 37227,
 43093,
 30941,
 39465,
 57809,
 72916,
 69170,
 63656,
 6044,
 34291,
 58974,
 62813,
 0,
 57997,
 24999,
 48400,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 20640,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587,
 57587

# SECTION 2: network

In [23]:
np.random.seed(1)

In [24]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [26]:
ALPHA = 0.01
ITERATIONS = 2
HIDDEN_SIZE = 100

In [35]:
weights_0_1 = 0.2 * np.random.random((len(words), HIDDEN_SIZE)) - 0.1
weights_1_2 = 0.2 * np.random.random((HIDDEN_SIZE, 1)) - 0.1

In [39]:
correct = 0
total = 0

In [44]:
def fit(x_train, y_train):
    correct = 0
    total = 0
    
    for iter in range(ITERATIONS):
        for i in range(len(x_train)):
            x = input_dataset[i]
            y = target_dataset[i]
            
            global weights_0_1, weights_1_2
            
            layer_1 = sigmoid(np.sum(weights_0_1[x], axis = 0))
            layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
            
            layer_2_delta = layer_2 - y
            layer_1_delta = layer_2_delta.dot(weights_1_2.T)
            
            weights_0_1[x] -= layer_1_delta * ALPHA
            weights_1_2 -= np.outer(layer_1, layer_2_delta) * ALPHA
            
            if (np.abs(layer_2_delta) < 0.5):
                correct += 1
            total += 1
        print('iter: {}, train: {}'.format(iter, correct/total))

In [45]:
fit(input_dataset[:-1000], target_dataset[:-1000])

iter: 0, train: 0.8082083333333333
iter: 1, train: 0.8418958333333333


In [48]:
def predict(x_test, y_test):
    correct = 0
    total = 0

    for i in range(len(x_test)):
    
        x = input_dataset[i]
        y = target_dataset[i]

        global weights_0_1, weights_1_2

        layer_1 = sigmoid(np.sum(weights_0_1[x], axis = 0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        if (np.abs(layer_2 - y) < 0.5):
            correct += 1
        total += 1
    print('test: {}'.format(correct/total))

In [49]:
predict(input_dataset[-1000:], target_dataset[-1000:])

test: 0.877


# SECTION 3: euclide distance

In [53]:
from collections import Counter
import math

In [113]:
def similar(target='beautiful'):
    target_index = word2index[target.upper()]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - weights_0_1[target_index]
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)

In [56]:
print(similar('beautiful'))

[('BEAUTIFUL', -0.0), ('JOB', -0.8201713046101491), ('SUPERB', -0.8449179511029534), ('FAVORITE', -0.8460199579648245), ('BRILLIANT', -0.854275180211964), ('LOVED', -0.88704156764818), ('HIGHLY', -0.8900605679933914), ('FUN', -0.9270620827573745), ('AMAZING', -0.9435052233027476), ('SIMPLE', -0.9659135001324589)]


In [57]:
print(similar('terrible'))

[('TERRIBLE', -0.0), ('WORSE', -0.7988922180376138), ('UNFORTUNATELY', -0.8496862106433781), ('ANNOYING', -0.8772104500349881), ('POORLY', -0.8874868452617922), ('NOTHING', -0.9418814341251022), ('BORING', -0.9499432151141799), ('SUPPOSED', -0.9695315327332293), ('DULL', -1.0475177730908505), ('MINUTES', -1.0953818108290263)]


# SECTION 4: predict word

In [77]:
ALPHA = 0.05
ITERATIONS = 2
HIDDEN_SIZE = 50
WINDOW = 2
NEGATIVE = 5

In [78]:
raw_reviews = raw_reviews[:10000]
words = set(' '.join(raw_reviews).split(' '))
word2index = {word: i for i, word in enumerate(words)}

In [88]:
input_dataset = [[word2index[word] for word in review.split(' ')] for review in raw_reviews]
random.shuffle(input_dataset)

In [91]:
len(input_dataset)

10000

In [92]:
input_dataset[0]

[16800,
 41133,
 38340,
 24806,
 41606,
 42864,
 42219,
 16132,
 24806,
 6738,
 33198,
 45628,
 27083,
 29965,
 21506,
 16132,
 7515,
 38922,
 0,
 30032,
 16249,
 33812,
 29965,
 21506,
 17127,
 0,
 29801,
 26300,
 31317,
 0,
 20441,
 1968,
 1365,
 0,
 24806,
 45082,
 1503,
 442,
 38340,
 24806,
 9477,
 44860,
 42864,
 24806,
 0,
 24765,
 10438,
 40834,
 24347,
 24806,
 46939,
 38340,
 21506,
 6467,
 46939,
 40008,
 31,
 30679,
 0,
 29801,
 33500,
 21506,
 30497,
 25595,
 33198,
 48927,
 29729,
 0,
 29801,
 24520,
 15257,
 0,
 27819,
 25595,
 8639,
 8622,
 27831,
 38340,
 0,
 35814,
 0,
 0,
 0,
 35814,
 0,
 0,
 21506,
 3904,
 20996,
 24806,
 9477,
 898,
 29965,
 1653,
 1534,
 40908,
 21506,
 32375,
 22067,
 33805,
 29801,
 12391,
 37963,
 27203,
 21506,
 45720,
 12342,
 38340,
 0,
 35814,
 0,
 0,
 0,
 35814,
 0,
 0,
 28328,
 32906,
 14691,
 27953,
 32150,
 33198,
 4688,
 227,
 39840,
 37401,
 1653,
 17127,
 41133,
 21506,
 3823,
 38340,
 18569,
 0,
 20441,
 1503,
 49105,
 24817,
 8585,

In [89]:
concatenated = np.array([word2index[word] for word in ' '.join(raw_reviews).split(' ')])

In [90]:
concatenated = np.array(concatenated)
print(concatenated.shape)
concatenated

(2946251,)


array([   27, 18488, 40908, ...,     0,     0,  7782])

In [86]:
weights_0_1 = 0.4 * np.random.random((len(words), HIDDEN_SIZE)) - 0.2
weights_1_2 = np.zeros((len(words), HIDDEN_SIZE))
layer_2_target = np.zeros(NEGATIVE + 1)
layer_2_target[0] = 1

In [84]:
weights_1_2

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [95]:
indexes = (np.random.rand(NEGATIVE) * len(concatenated)).astype('int').tolist()

In [96]:
indexes

[2492819, 2766219, 190495, 518296, 2898945]

In [101]:
indexes = np.random.randint(0, len(concatenated), 5)

In [102]:
indexes

array([ 561892,  265739, 2881850,  295009,  884228])

In [107]:
for rev_i, review in enumerate(input_dataset * ITERATIONS):
    for target_i in range(len(review)):
        indexes = (np.random.rand(NEGATIVE) * len(concatenated)).astype('int').tolist()
        target_samples = [review[target_i]] + list(concatenated[indexes])
        left_context = review[max(0, target_i - WINDOW): target_i]
        right_context = review[target_i + 1: min(len(review), target_i + WINDOW)]
        layer_1 = np.mean(weights_0_1[left_context + right_context], axis=0)
        layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])
        weights_0_1[left_context + right_context] -= layer_1_delta * ALPHA
        weights_1_2[target_samples] -= np.outer(layer_2_delta, layer_1) * ALPHA

In [114]:
word2index['terrible']

32973

In [115]:
similar('terrible')

[('terrible', -0.0),
 ('horrible', -3.126533937324243),
 ('fantastic', -3.719115489220661),
 ('brilliant', -3.777787812152067),
 ('pathetic', -4.049689818470703),
 ('fascinating', -4.0532591329362955),
 ('hilarious', -4.101615663014112),
 ('dreadful', -4.204791068613334),
 ('superb', -4.206683249345387),
 ('ridiculous', -4.22409420046186)]

# SECTION 5: analogy

In [118]:
def analogy(positive=['terrible', 'good'], negative=['bad']):
    norms = np.sum(weights_0_1 * weights_0_1, axis=1)
    norms.resize(norms.shape[0], 1)
    normed_weights = weights_0_1 * norms
    
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
        
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)[1:]

In [119]:
analogy(['terrible', 'good'], ['bad'])

[('decent', -210.85176375271558),
 ('good', -210.98537731251614),
 ('nice', -211.20574197882056),
 ('terrible', -211.3653396575296),
 ('fantastic', -211.53084418312977),
 ('superb', -211.61523431397774),
 ('great', -211.75504401081193),
 ('memorable', -211.89102898437523),
 ('wonderful', -211.91471731270016)]

In [123]:
analogy(['elizabeth', 'he'], ['she'])

[('far', -152.2759074743223),
 ('doesn', -152.4300828491404),
 ('same', -152.5552086204698),
 ('top', -152.800186446065),
 ('didn', -152.9483096351835),
 ('wouldn', -152.99856289403237),
 ('isn', -153.09799607359355),
 ('can', -153.21395313665366),
 ('taking', -153.27459549214925)]