# **Autoencoder Model for Word Embedding**

# I. Import the required libraries

In [5]:
import tensorflow as tf
from tensorflow import keras
import re
import pandas as pd
import nltk
import numpy as np
from sklearn import model_selection, preprocessing
from scipy.spatial import distance

# Fetch the NLTK data used later in this notebook: the English stop-word list
# (read via stopwords.words) and the 'punkt' tokenizer data for word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Load the news dataset; only the 'Text' and 'Category' columns are used below.
df = pd.read_csv('bbc_news_train.csv')
# Split the dataset into training and test sets. The seed is fixed so the split
# (and therefore the vocabulary / word-index mapping built from train_x) is the
# same after every kernel restart; otherwise a previously saved model would no
# longer line up with a freshly rebuilt mapping.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    df['Text'], df['Category'], random_state=42
)

In [10]:
print(train_x.shape)
# train_x keeps the original (shuffled) row labels after the split, so the
# label 0 is not guaranteed to exist; use positional access for the first row.
print(train_x.iloc[0])

(1117,)


# II. Load the corpus and preprocess it

In [8]:
def text_cleaner(text):
    """Normalize a raw string before tokenization.

    The text is lower-cased, a trailing possessive "'s" is stripped from
    words, and every character that is not an ASCII letter is replaced by a
    single space (one space per removed character).
    """
    lowered = text.lower()
    # Drop possessive endings first so the apostrophe does not leave a stray "s".
    without_possessive = re.sub(r"'s\b", "", lowered)
    # Everything that is not a letter (digits, punctuation, whitespace) becomes a space.
    return re.sub("[^a-zA-Z]", " ", without_possessive)

In [11]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

def preprocess_text(corpus):
    """Clean, tokenize and stop-word-filter a collection of documents.

    Args:
        corpus: An iterable of document strings. A single string is also
            accepted and is treated as one document.

    Returns:
        A flat list with the tokens of all documents in order, with English
        stop words removed.
    """
    # A bare string would otherwise be iterated character by character,
    # producing single-letter "documents" instead of words.
    if isinstance(corpus, str):
        corpus = [corpus]

    # Build the stop-word set once, outside the loop; this also keeps it
    # defined when the corpus is empty.
    stop_words = set(stopwords.words('english'))

    all_tokens = []
    for doc in corpus:
        all_tokens.extend(word_tokenize(text_cleaner(doc)))
    return [w for w in all_tokens if w not in stop_words]

corpus_token = preprocess_text(train_x)
# Show the size and a short preview instead of dumping every token into the
# cell output.
print(len(corpus_token))
print(corpus_token[:20])



In [12]:
# Sorted list of the distinct tokens; a word's position is its integer id.
dictionary = sorted(set(corpus_token))
# Reverse lookup: word -> index in `dictionary`.
mapping = {word: index for index, word in enumerate(dictionary)}
# Preview only; the full vocabulary is too long to print usefully.
print(dictionary[:20])



# III. Convert the corpus to one-hot vectors & define the embedding dimension

In [13]:
# Number of distinct tokens; this is the width of every one-hot vector.
vocab_size = len(dictionary)
# Number of units in the hidden layer whose activations serve as the word embedding.
embedding_dim = 20
print(vocab_size)

20911


In [14]:
# Replace every token by its integer id from `mapping`.
corpus_encode = [mapping[token] for token in corpus_token]
# Expand the ids into one-hot rows of width vocab_size.
# NOTE(review): the result is a dense array with one row per token and
# vocab_size columns, which can be very large for this corpus -- confirm it
# fits in memory or consider a sparse / integer-label formulation.
onehot_corpus = keras.utils.to_categorical(
    corpus_encode,
    num_classes=vocab_size,
)
print(onehot_corpus)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# IV. Define the autoencoder model

In [15]:
# Autoencoder with a single hidden layer: a one-hot word vector is compressed
# to embedding_dim units and then expanded back to a softmax over the vocabulary.
ae_model = keras.Sequential()
ae_model.add(keras.Input(shape=(vocab_size,)))
# Layer names are pinned explicitly. Keras' automatic names keep counting up
# ('dense_2', 'dense_3', ...) when this cell is re-run in the same session,
# which would break the later get_layer('dense') lookup on the saved model.
# Both layers are named because layer names in a Sequential model must be unique.
ae_model.add(keras.layers.Dense(embedding_dim, activation='relu', name='dense'))
ae_model.add(keras.layers.Dense(vocab_size, activation='softmax', name='dense_1'))

# The target is the one-hot input itself, hence categorical cross-entropy.
ae_model.compile(optimizer='adam', loss='categorical_crossentropy')

ae_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                418240    
_________________________________________________________________
dense_1 (Dense)              (None, 20911)             439131    
Total params: 857,371
Trainable params: 857,371
Non-trainable params: 0
_________________________________________________________________


# V. Train the AE model

In [16]:
# Train the autoencoder to reproduce its own input: the one-hot corpus is
# passed as both the features and the targets.
ae_model.fit(
    x=onehot_corpus,
    y=onehot_corpus,
    batch_size=32,
    epochs=20,
)
# Persist the trained model so the embedding layer can be reloaded later.
ae_model.save("ae_model.h5")

Epoch 1/1000

KeyboardInterrupt: 

#=======================================================================

# **How to use trained model**

#=======================================================================

# VI. Load the trained model & get the output of the 1st FC layer

In [None]:
# Restore the autoencoder that was saved after training.
reconstructed_model = keras.models.load_model("ae_model.h5")
# Word-embedding model: same inputs as the autoencoder, but it stops at the
# layer named 'dense' and returns that layer's activations.
we_model = keras.models.Model(
    inputs=reconstructed_model.inputs,
    outputs=reconstructed_model.get_layer('dense').output,
)

Define a function that encodes a list of words as one-hot vectors

In [None]:
def encode_onehot(mapping, list_words, vector_size=None):
  """Encode words as one-hot vectors using a word -> index mapping.

  Args:
    mapping: Dict mapping each known word to its integer index.
    list_words: Iterable of words to encode.
    vector_size: Length of each output vector. Defaults to len(mapping), so
      the function no longer depends on a global defined in another cell.

  Returns:
    A list with one vector (a list of ints) per input word. A word that is
    not in `mapping` yields an all-zero vector.
  """
  if vector_size is None:
    vector_size = len(mapping)

  output = []
  for word in list_words:
    word_vector = [0] * vector_size

    # Unknown words keep the all-zero vector.
    word_index = mapping.get(word)
    if word_index is not None:
      word_vector[word_index] = 1

    output.append(word_vector)

  return output

# VII. Test the model

In [None]:
input_sentence = "Trump is the greatest President of American"

# preprocess_text iterates over documents, so the sentence is wrapped in a
# list; a bare string would be walked character by character and produce
# single-letter tokens instead of words.
preprocess_sentence = preprocess_text([input_sentence])
onehot_sentence = encode_onehot(mapping, preprocess_sentence)

# Pass an explicit 2-D array (one row per token) rather than a nested Python
# list, so the input is unambiguously a single batch.
embedded_sentence = we_model.predict(np.asarray(onehot_sentence, dtype="float32"))
print(embedded_sentence)

[[1.7612209e+00 1.9391452e+00 1.8000300e+00 1.9311217e+00 1.5777216e+00
  1.0722967e+00 1.7772222e+00 1.7033827e+00 1.9363596e+00 4.0368736e-02
  1.7155013e+00 1.6689301e-06 0.0000000e+00 0.0000000e+00 1.7384272e+00
  1.9088118e+00 1.8002540e+00 1.5481601e+00 0.0000000e+00 2.0573587e+00]
 [1.9139431e+00 1.8181293e+00 1.7403293e+00 1.8527267e+00 1.4071194e+00
  9.7701848e-02 5.8770180e-05 9.9241734e-05 1.8841062e+00 0.0000000e+00
  6.2035424e-01 1.8845162e+00 3.1578541e-04 2.7168483e-01 3.4743547e-02
  1.6944926e+00 0.0000000e+00 1.7678223e+00 8.3727962e-01 1.7130679e-01]
 [5.9425831e-05 0.0000000e+00 1.8357744e+00 1.8386316e+00 1.4022405e+00
  1.8568649e+00 0.0000000e+00 1.8167863e+00 1.8564149e+00 1.6960521e+00
  1.2040138e-03 0.0000000e+00 1.8318267e+00 1.6779935e+00 1.8676193e+00
  1.5178099e+00 2.0604765e+00 0.0000000e+00 2.0643063e+00 1.3053499e+00]
 [1.5184386e+00 1.1748892e-01 2.4961555e-01 1.6311193e+00 1.6476970e+00
  1.5744007e+00 2.1450889e-01 9.4129264e-02 1.6558611e+00 1.7

In [None]:
word_0 = "American"
word_1 = "Trump"
word_2 = "peace"

# Pass the words as a list of documents: preprocess_text iterates over its
# argument, so a single joined string would be split into characters.
preprocess_words = preprocess_text([word_0, word_1, word_2])
# The unpacking below needs exactly one embedding per query word; a word that
# is removed as a stop word would silently shift the results.
assert len(preprocess_words) == 3, "every query word must survive preprocessing"
onehot_words = encode_onehot(mapping, preprocess_words)

# One row per word, in the same order as the query words.
word_0_eb, word_1_eb, word_2_eb = we_model.predict(np.asarray(onehot_words, dtype="float32"))

print("OUTPUT EMBEDDING")
print(word_0_eb)
print(word_1_eb)
print(word_2_eb)

# Pairwise Euclidean distances between the three embeddings.
dst_0_1 = distance.euclidean(word_0_eb, word_1_eb)
dst_1_2 = distance.euclidean(word_1_eb, word_2_eb)
dst_2_0 = distance.euclidean(word_2_eb, word_0_eb)

print("OUTPUT DISTANCE")
print("0 vs 1: ", dst_0_1)
print("1 vs 2: ", dst_1_2)
print("2 vs 0: ", dst_2_0)

OUTPUT EMBEDDING
[5.9425831e-05 0.0000000e+00 1.8357744e+00 1.8386316e+00 1.4022405e+00
 1.8568649e+00 0.0000000e+00 1.8167863e+00 1.8564149e+00 1.6960521e+00
 1.2040138e-03 0.0000000e+00 1.8318267e+00 1.6779935e+00 1.8676193e+00
 1.5178099e+00 2.0604765e+00 0.0000000e+00 2.0643063e+00 1.3053499e+00]
[1.7612209e+00 1.9391452e+00 1.8000300e+00 1.9311217e+00 1.5777216e+00
 1.0722967e+00 1.7772222e+00 1.7033827e+00 1.9363596e+00 4.0368736e-02
 1.7155013e+00 1.6689301e-06 0.0000000e+00 0.0000000e+00 1.7384272e+00
 1.9088118e+00 1.8002540e+00 1.5481601e+00 0.0000000e+00 2.0573587e+00]
[0.8852233  0.8997967  0.87155277 0.90282    0.829422   0.8387903
 0.81097806 0.81453836 0.87210333 0.8350026  1.02293    0.98017836
 0.91022384 0.916189   0.8611091  0.8521709  0.8915332  0.8635101
 0.8960087  0.9158224 ]
OUTPUT DISTANCE
0 vs 1:  5.477977275848389
1 vs 2:  4.032898426055908
2 vs 0:  4.081043243408203
