In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_20newsgroups

import nltk
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer

# Data Preprocessing

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print(len(documents))

11314


In [3]:
df = pd.DataFrame(documents, columns=['documents'])
df.head(5)

Unnamed: 0,documents
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ..."


In [4]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html
df["clean_doc"] = df["documents"].str.replace("[^a-zA-Z]", " ")
# https://www.w3schools.com/python/python_lambda.asp
df["clean_doc"] = df["clean_doc"].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3])) # remove words less than 3
df["clean_doc"] = df["clean_doc"].apply(lambda x: x.lower()) # lower case
df.head(3)

Unnamed: 0,documents,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased. what disagr...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...","yeah, expect people read faq, etc. actually ac..."
2,Although I realize that principle is not one o...,although realize that principle your strongest...


In [5]:
# check nullable
print(df.isnull().values.any())
# In Python's numpy library (which pandas is built upon), NaN is used to denote missing or undefined data.
# inPlace=True means that the changes are saved to the df right away (without having to assign it to another variable)
df.replace("", float("NaN"), inplace=True)
print(df.isnull().values.any())
df.dropna(inplace=True)
print(len(df))

False
True
11004


In [6]:
# download stopwords
!python -m nltk.downloader stopwords
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/godpeny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# apply stopwords
tokenized_doc = df['clean_doc'].apply(lambda x: x.split()) # tokenization
tokenized_doc = df['clean_doc'].apply(lambda x: [w for w in x.split() if w not in stopwords]) # remove stop-words
print(tokenized_doc[:5])
print(len(tokenized_doc))

0    [well, sure, story, seem, biased., disagree, s...
1    [yeah,, expect, people, read, faq,, etc., actu...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal,,...
4    [well,, change, scoring, playoff, pool., unfor...
Name: clean_doc, dtype: object
11004


In [8]:
short_sentence_indices = [index for index, sentence in enumerate(tokenized_doc) if len(sentence) <= 1]
print(short_sentence_indices)
tokenized_doc = np.delete(tokenized_doc, short_sentence_indices)
print(len(tokenized_doc))

# https://pandas.pydata.org/docs/reference/api/pandas.Series.to_list.html
# tokenized_doc = tokenized_doc.to_list()

[44, 353, 486, 1224, 1653, 2323, 2373, 2864, 3292, 3389, 3397, 3398, 3562, 3567, 3595, 3784, 3879, 4185, 4592, 4622, 4952, 4975, 5395, 5530, 6020, 6657, 6728, 6888, 7085, 7961, 8161, 8288, 8422, 8594, 8627, 9703, 10283, 10447, 10738, 10758, 10904, 10916, 10964]
10961


In [9]:
# tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

word_2_idx = tokenizer.word_index
idx_2_word = {v:k for k, v in word_2_idx.items()} # key is index, value is word
encoded = tokenizer.texts_to_sequences(tokenized_doc) 
print(encoded[:5])

vocab_size = len(word_2_idx) + 1 # index 0 is for padding

[[40, 53, 927, 143, 15889, 1684, 546, 279, 871, 12028, 17773, 24007, 29726, 279, 871, 63435, 871, 1128, 1103, 1998, 851, 29727, 913, 731, 20477, 279, 871, 170, 143, 1811, 149, 279, 20478, 17773, 6645, 5710, 76, 63436, 7, 36, 165, 614, 653, 29728, 6911, 24008, 2082, 829, 17774, 1119, 8790, 355, 1072, 15890, 671, 57, 163, 4231, 7206, 1933, 440, 56, 282, 4730, 9275, 2690, 39306], [1283, 429, 3, 52, 6164, 159, 112, 474, 89, 17775, 18, 63, 4731, 2865, 63437, 1042, 402, 39307, 8791, 902, 44, 8328, 316, 13041, 902, 3452, 5923, 533, 18, 87, 4732, 9872, 160, 1403, 120, 151, 5194, 63438, 63439, 17776, 63440, 13041, 903, 63441, 63442, 11172, 17777], [249, 851, 2773, 9276, 4033, 1, 26, 2, 5, 95, 295, 904, 5711, 17, 655, 7, 2549, 63443, 6165, 39308, 311, 30, 13042, 36, 151, 484, 295, 280, 904, 1204, 415, 851, 1, 1217, 904, 63444, 1431, 282, 35, 86, 4591, 39308, 311, 1, 1150, 56, 2, 4335, 743, 312, 152, 11173, 1192, 10475, 656, 15891, 128, 4127, 123, 20479, 14287, 2423], [29729, 1484, 15892, 8329, 1

# Negative Sampling

In [10]:
from keras.preprocessing.sequence import skipgrams

In [11]:
# test with samples
skip_grams = [skipgrams(sequence=sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded[:10]]
# show relationship
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(5):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        idx_2_word[pairs[i][0]], pairs[i][0],
        idx_2_word[pairs[i][1]], pairs[i][1],
        labels[i]))
    
print(len(skip_grams))
print(len(pairs))
print(len(labels))

(treating (4730), deport (93025)) -> 0
(u.s. (279), pro-israeli (63435)) -> 1
(inhuman (17774), rec.nude (139872)) -> 0
(might (36), kazhakstan, (149379)) -> 0
(government (57), mtm+2tm+2<7$9&1fpl%-3[>wm[8n+-#3%q<5g#tq,3$q,b8f)b<g)r186%a86 (172327)) -> 0
10
2460
2460


In [12]:
# do with all datasets
skip_grams = [skipgrams(sequence=sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded]

# Modeling

In [13]:
from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input
from keras.layers import Dot
from keras.utils import plot_model
from IPython.display import SVG

In [14]:
embedding_dims = 100

# embedding table for word
w_input = Input(shape=(1,), dtype='int32')
word_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dims)(w_input)

# embedding table for context
c_input = Input(shape=(1,), dtype='int32')
context_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dims)(c_input)

print(word_embedding.shape) # check shape
print(context_embedding.shape)

(None, 1, 100)
(None, 1, 100)


In [15]:
"""
Practice : Dot product with 2 embedding tables
"""
x = np.arange(12).reshape(1, 2, 6)
print(x)

y = np.arange(12, 24).reshape(1, 6, 2)
print(y)

result = Dot(axes=(1, 2))([y,x]) # x * y
print(result)

[[[ 0  1  2  3  4  5]
  [ 6  7  8  9 10 11]]]
[[[12 13]
  [14 15]
  [16 17]
  [18 19]
  [20 21]
  [22 23]]]
tf.Tensor(
[[[290 902]
  [305 953]]], shape=(1, 2, 2), dtype=int64)


In [None]:
! python -m pip install pydot
! python -m pip install graphviz

In [16]:
dot_product = Dot(axes=2)([word_embedding, context_embedding])
dot_product = Reshape((1,))(dot_product)
output = Activation('sigmoid')(dot_product)

model = Model(inputs=[w_input, c_input], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

plot_model(model, to_file='sgns.png', show_shapes=True, show_layer_names=True, rankdir='TB')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1, 100)               1818390   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 embedding_1 (Embedding)     (None, 1, 100)               1818390   ['input_2[0][0]']         

In [17]:
for epoch in range(1, 6): 
    loss = 0
    for _, elem in enumerate(skip_grams):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32') # zip(*elem[0]) : transpose
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X,Y)
    print('Epoch :',epoch, 'Loss :',loss)

Epoch : 1 Loss : 5588.079095847905
Epoch : 2 Loss : 4304.712773874402
Epoch : 3 Loss : 4064.503253623843
Epoch : 4 Loss : 3770.21881897375
Epoch : 5 Loss : 3455.9627989614382


# Save Embedding Vectors

In [18]:
import gensim

In [19]:
# save embedding vectors
f = open('./embeddings/vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embedding_dims))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

In [22]:
# load embedding vectors
w2v = gensim.models.KeyedVectors.load_word2vec_format('./embeddings/vectors.txt', binary=False)

In [23]:
w2v.most_similar(positive=['soldiers'])

[('villages', 0.7206360697746277),
 ('lebanese', 0.6771818995475769),
 ('shelling', 0.6653899550437927),
 ('arab', 0.6578678488731384),
 ('occupied', 0.6578179597854614),
 ('murdered', 0.6374198794364929),
 ('ottoman', 0.6369696259498596),
 ('lebanon', 0.6363499164581299),
 ('destruction', 0.6337578296661377),
 ('israelis', 0.6337222456932068)]