In [1]:
import gensim

In [2]:
from gensim.models import KeyedVectors

In [3]:
model = KeyedVectors.load_word2vec_format('../dataset/word2vec.6B.200d.txt')

In [4]:
model.most_similar(positive=['person', 'monarch', 'queen'], negative=['king'])

[('woman', 0.611224889755249),
 ('spouse', 0.5144810676574707),
 ('she', 0.5090979933738708),
 ('mother', 0.5086205005645752),
 ('only', 0.5061703324317932),
 ('her', 0.5001659989356995),
 ('someone', 0.494682252407074),
 ('one', 0.4940170645713806),
 ('normally', 0.4937121868133545),
 ('child', 0.48596322536468506)]

In [5]:
model.similar_by_word('asian')

[('asia', 0.7845684885978699),
 ('pacific', 0.6230911612510681),
 ('chinese', 0.5776212811470032),
 ('african', 0.5714053511619568),
 ('southeast', 0.5647538900375366),
 ('markets', 0.5640715956687927),
 ('economies', 0.5618596076965332),
 ('world', 0.5607767701148987),
 ('asean', 0.5589121580123901),
 ('countries', 0.5564254522323608)]

In [6]:
import numpy as np
import pandas as pd
import nltk

In [7]:
data = pd.read_csv('../dataset/imdb_labelled.txt', sep='\t', header=None, names=['Review', 'Sentiment'])
data.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [8]:
data.shape

(748, 2)

In [9]:
data.drop_duplicates(inplace=True)

In [10]:
data.shape

(745, 2)

In [11]:
X = data['Review']
y = data['Sentiment']

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [13]:
tokenizer = Tokenizer()

In [14]:
tokenizer.fit_on_texts(X)

In [15]:
vocab_size = len(tokenizer.word_counts.keys())

In [16]:
vocab_size

3133

In [17]:
word_index = tokenizer.word_index
word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'is': 5,
 'this': 6,
 'i': 7,
 'it': 8,
 'to': 9,
 'in': 10,
 'was': 11,
 'movie': 12,
 'film': 13,
 'that': 14,
 '0': 15,
 '1': 16,
 'for': 17,
 'as': 18,
 'but': 19,
 'with': 20,
 'one': 21,
 'on': 22,
 'you': 23,
 'are': 24,
 'not': 25,
 'bad': 26,
 "it's": 27,
 'very': 28,
 'all': 29,
 'just': 30,
 'so': 31,
 'good': 32,
 'at': 33,
 'an': 34,
 'be': 35,
 'there': 36,
 'about': 37,
 'have': 38,
 'by': 39,
 'like': 40,
 'from': 41,
 'if': 42,
 'acting': 43,
 'time': 44,
 'his': 45,
 'or': 46,
 'out': 47,
 'really': 48,
 'great': 49,
 'even': 50,
 'he': 51,
 'who': 52,
 'were': 53,
 'has': 54,
 'see': 55,
 'my': 56,
 'characters': 57,
 'well': 58,
 'most': 59,
 'how': 60,
 'more': 61,
 'no': 62,
 'only': 63,
 'when': 64,
 'ever': 65,
 'movies': 66,
 'plot': 67,
 'story': 68,
 'made': 69,
 'some': 70,
 '10': 71,
 'they': 72,
 'best': 73,
 'because': 74,
 'your': 75,
 'can': 76,
 'also': 77,
 "don't": 78,
 'films': 79,
 'than': 80,
 'its': 81,
 

In [18]:
tokenizer.word_counts

OrderedDict([('a', 433),
             ('very', 65),
             ('slow', 6),
             ('moving', 4),
             ('aimless', 1),
             ('movie', 181),
             ('about', 50),
             ('distressed', 1),
             ('drifting', 1),
             ('young', 4),
             ('man', 13),
             ('not', 72),
             ('sure', 3),
             ('who', 38),
             ('was', 185),
             ('more', 31),
             ('lost', 4),
             ('the', 848),
             ('flat', 2),
             ('characters', 35),
             ('or', 41),
             ('audience', 5),
             ('nearly', 1),
             ('half', 6),
             ('of', 377),
             ('whom', 2),
             ('walked', 2),
             ('out', 41),
             ('attempting', 1),
             ('artiness', 1),
             ('with', 90),
             ('black', 9),
             ('white', 8),
             ('and', 434),
             ('clever', 5),
             ('camera', 10),
       

In [19]:
tokens = tokenizer.texts_to_sequences(X)
tokens[0]

[3, 28, 28, 28, 287, 407, 1216, 12, 37, 3, 1217, 1218, 408, 143]

In [20]:
X[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [21]:
sentence_length = [len(x) for x in tokens]
min(sentence_length), max(sentence_length)

(1, 1400)

In [22]:
from collections import Counter
Counter(sentence_length)

Counter({14: 23,
         18: 18,
         29: 7,
         8: 37,
         21: 21,
         20: 30,
         3: 24,
         15: 30,
         10: 30,
         6: 38,
         11: 47,
         4: 26,
         16: 34,
         25: 14,
         17: 21,
         872: 1,
         12: 42,
         5: 36,
         19: 21,
         24: 15,
         34: 8,
         7: 35,
         23: 12,
         9: 37,
         2: 12,
         13: 25,
         26: 6,
         1: 3,
         37: 2,
         22: 14,
         27: 8,
         35: 4,
         200: 1,
         1400: 1,
         45: 4,
         28: 7,
         302: 1,
         43: 2,
         31: 8,
         55: 1,
         44: 2,
         33: 10,
         36: 5,
         69: 1,
         57: 1,
         32: 3,
         30: 6,
         73: 1,
         47: 1,
         38: 3,
         39: 1,
         53: 1,
         51: 1,
         42: 2,
         802: 1})

In [23]:
padded_tokens = pad_sequences(tokens, maxlen=50)

In [24]:
embedding_index = {}

f = open('../dataset/glove.6B.200d.txt', encoding='UTF-8')
for line in f:
    values = line.split()
    word = values[0]
    coef = np.asarray(values[1:], dtype='float32')
    embedding_index[word] = coef
f.close()

In [25]:
embedding_index['many']

array([ 2.0507e-01,  1.5127e-01, -2.5217e-01, -1.7233e-01,  2.6062e-01,
        7.0112e-02, -1.0398e+00,  6.2410e-01, -2.7643e-01,  3.3849e-01,
        3.3937e-01,  3.5320e-01, -2.1393e-01, -1.5521e-01, -8.0089e-02,
        6.7428e-02, -3.8797e-01,  7.9854e-01, -5.4901e-01, -1.1373e-01,
        1.0324e-01,  2.9706e+00, -6.3670e-02, -6.2094e-01,  2.2930e-01,
        2.0083e-01,  1.9280e-01, -3.9005e-01,  5.5081e-03, -1.3289e-01,
       -6.3410e-02, -2.0959e-02,  3.9320e-02,  3.3570e-01, -9.2012e-02,
       -2.1650e-01, -3.8748e-01, -3.6220e-01,  2.2562e-01,  5.0937e-02,
        4.0245e-01, -3.5205e-02,  6.2662e-02,  1.6478e-01,  6.6872e-02,
       -1.5084e-01,  1.1231e+00, -1.9255e-01, -1.2812e-01,  2.2183e-01,
       -4.8346e-01, -1.1292e-01, -2.9738e-01,  3.7365e-01,  6.3390e-01,
       -3.9455e-02,  9.9065e-02,  3.0907e-02, -7.2679e-02,  2.0678e-01,
        2.5039e-02,  4.2294e-01, -3.7570e-01,  4.5134e-01,  3.6297e-01,
        2.7407e-01, -3.6211e-01,  6.2673e-01,  2.0517e-01, -1.71