## **Word2Vec Skip-gram**

In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [2]:
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Embedding, Reshape, Conv1D, MaxPool1D, Dropout
from tensorflow.keras.preprocessing.text import one_hot,Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# **Read Data**

In [3]:
data = pd.read_csv('data.txt')

In [4]:
data.head()

Unnamed: 0,1
0,ای رستخیز ناگهان، وی رحمت بی منتها\tای آتشی اف...
1,امروز خندان آمدی، مفتاح زندان آمدی\tبر مستمندا...
2,خورشید را حاجب تویی، امید را واجب تویی\tمطلب ت...
3,در سینه ها برخاسته، اندیشه را آراسته\tهم خویش ...
4,ای روح بخش بی بَدَل، وی لذتِ علم و عمل\tباقی ب...


**Read Stop-Words**

In [5]:
def read_stop_words(filename):
  """Read a stop-word list from a text file, one stop word per line.

  The file is decoded explicitly as UTF-8 so that non-ASCII (e.g. Persian)
  words are read the same way on every platform instead of depending on
  the locale's default encoding.

  Args:
    filename: path of the stop-word file.

  Returns:
    A list with one string per line of the file, in file order, with the
    line-ending newline removed. Blank lines are kept as empty strings.
  """
  with open(filename, encoding='utf-8') as stopwords_file:
    # Iterating the file yields lines that still carry their '\n'; strip it.
    stopwords = [line.replace('\n', '') for line in stopwords_file]
  return stopwords

In [6]:
stopwords = read_stop_words('stopwords.txt')
print(len(stopwords))

1421


**hazm library**

In [7]:
# install hazm library
!pip install hazm
from hazm import word_tokenize

Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 10.4 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 17.0 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 54.7 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394488 sha256=01a53434b51bc34dbd2d996462e13648669dc34b7a7a313539875b874922f93f
  Stored in directory: /root/.cache/pip/wheels/9b/fd/0c/d92302c876e5de87ebd7fc0979d82edb93e2d8d768bf71fac4
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86_64.whl size=154070 sha256=65c6e731d68884947ebd964828a41b3c212037d12555d1d8d7be95b16dfb8789
  Store

### **Preprocess the text**

In [8]:
# preprocess the text
def text_preprocess(data):
  """Turn the raw verse table into a list of cleaned, stop-word-free sentences.

  For every cell of `data` (flattened row by row) the function:
    1. replaces tabs and hyphens with spaces,
    2. deletes every run of digits,
    3. tokenizes the text with hazm's `word_tokenize`,
    4. drops tokens found in the module-level `stopwords` list,
    5. joins the remaining tokens with single spaces.

  Args:
    data: a pandas DataFrame whose cells are strings.

  Returns:
    A list of non-empty sentence strings; cells that end up with no
    tokens after filtering are omitted.
  """
  # A set makes each membership test O(1) instead of scanning the whole list.
  stopword_set = set(stopwords)

  sentences = []
  for line in data.values.flatten():
    cleaned = line.replace('\t', ' ').replace('-', ' ')
    # Raw string avoids the invalid-escape warning that "\d+" triggers.
    cleaned = re.sub(r"\d+", "", cleaned)

    kept = [w for w in word_tokenize(cleaned) if w not in stopword_set]
    sentence = ' '.join(kept)
    if sentence != '':
      sentences.append(sentence)

  return sentences


In [9]:
sentences = text_preprocess(data)

In [10]:
sentences[0]

'رستخیز رحمت منتها آتشی افروخته بیشه اندیشه'

### **Get less frequent words**

In [18]:
def get_all_sentences():
  """Concatenate every preprocessed sentence into one string.

  Re-runs `text_preprocess` on the module-level `data` and appends a single
  space after each sentence, so a non-empty result always ends with a space.

  Returns:
    One string containing all sentences, each followed by ' '.
  """
  return ''.join(sentence + ' ' for sentence in text_preprocess(data))

def get_word_freq(vocabularies):
  """Return, for every token position, how often that token occurs overall.

  Counting is done once with a `Counter` (linear time) instead of calling
  `list.count` for every token (quadratic time).

  Args:
    vocabularies: a list of hashable tokens (e.g. strings), duplicates included.

  Returns:
    A list of ints the same length as `vocabularies`; element i is the
    number of times `vocabularies[i]` appears in `vocabularies`.
  """
  from collections import Counter  # local import keeps this cell self-contained

  counts = Counter(vocabularies)
  return [counts[vocab] for vocab in vocabularies]

# get the words that occur fewer than 2 times in the corpus
def get_less_frequente_words():
  """Collect the tokens that occur fewer than 2 times in the whole corpus.

  The corpus text comes from `get_all_sentences()` and is split on single
  spaces. Because `split(' ')` keeps empty strings, an empty token can be
  part of the result when it occurs only once.

  Returns:
    A list of tokens whose corpus frequency is below 2, in corpus order.
  """
  tokens = get_all_sentences().split(' ')
  frequencies = get_word_freq(tokens)
  return [token for token, count in zip(tokens, frequencies) if count < 2]

In [20]:
less_frequente_words = get_less_frequente_words()

## **Remove less frequent words**

In [21]:
# remove the words that occur fewer than 2 times in the corpus
def remove_less_frequente_words(less_frequente_words, sentences):
  """Drop rare words from every sentence and return the resulting corpus.

  Each sentence is re-tokenized with hazm's `word_tokenize`, tokens listed in
  `less_frequente_words` are removed, and the survivors are joined with
  single spaces.

  Args:
    less_frequente_words: iterable of tokens (strings) to remove.
    sentences: list of sentence strings.

  Returns:
    A list of non-empty sentence strings; sentences left with no tokens
    are omitted.
  """
  # A set makes each membership test O(1) instead of scanning the rare-word list.
  rare_words = set(less_frequente_words)

  corpus = []
  for sentence in sentences:
    kept = [w for w in word_tokenize(sentence) if w not in rare_words]
    filtered = ' '.join(kept)
    if filtered != '':
      corpus.append(filtered)
  return corpus

In [22]:
corpus = remove_less_frequente_words(less_frequente_words, sentences)

In [23]:
corpus[0:10]

['رستخیز رحمت منتها آتشی افروخته بیشه اندیشه',
 'خندان آمدی مفتاح زندان آمدی آمدی بخشش فضل خدا',
 'خورشید حاجب امید واجب مطلب طالب منتها',
 'سینه اندیشه آراسته حاجت روا',
 'روح علم باقی بهانه دغل علت دوا',
 'دغل کژ گنه کین مست مست نان شوربا',
 'هل عقل هل نان نشاید ماجرا',
 'تدبیر رنگ افکنی روم زنگ افکنی جنگ افکنی',
 'پنهان گوش جان بهانه کسان جان رب زنان والله کیا',
 'خامش رفتم پای علم کاغذ بنه بشکن قلم ساقی درآمد الصلا']

## **Tokenizer**

In [24]:
# tokenizer
# Build the word -> integer index mapping from the filtered corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [25]:
# Persist the fitted tokenizer so it can be reloaded without refitting.
# NOTE: the file is written with pickle, so despite its '.h5' extension it is
# not an HDF5 file and must be read back with pickle.load.
with open('tokenizer.h5', 'wb') as f:
    pickle.dump(tokenizer, f)

In [26]:
list(tokenizer.word_index.items())[0:10]

[('جان', 1),
 ('دل', 2),
 ('عشق', 3),
 ('آب', 4),
 ('چشم', 5),
 ('شب', 6),
 ('جهان', 7),
 ('شمس', 8),
 ('دست', 9),
 ('مست', 10)]

In [29]:
encoded = tokenizer.texts_to_sequences(corpus)

In [30]:
encoded[0:10]

[[2692, 272, 1310, 385, 1583, 940, 133],
 [174, 551, 1097, 386, 551, 551, 803, 320, 23],
 [45, 2693, 618, 2000, 804, 247, 1310],
 [88, 133, 1311, 704, 705],
 [29, 210, 152, 387, 1584, 941, 233],
 [1584, 706, 2001, 552, 10, 10, 198, 2694],
 [451, 13, 451, 198, 1098, 407],
 [942, 98, 1585, 340, 1099, 1585, 181, 1585],
 [74, 40, 1, 387, 1586, 1, 248, 234, 273, 1100],
 [190, 707, 55, 210, 2695, 365, 708, 1101, 30, 553, 175]]

In [31]:
num_all_words = sum(len(s) for s in encoded) # total number of token occurrences across all encoded sentences
num_unique_words = len(tokenizer.word_index) + 1  # vocabulary size plus one (word_index values start at 1), used as the number of classes / embedding rows

In [32]:
num_all_words, num_unique_words

(31950, 4250)

### **Generate data**

In [33]:
# Parameters
window_size = 2

In [34]:
def generate_data(corpus, window_size, num_unique_words):
    """Build skip-gram (center word, context word) training pairs.

    For every position in every sentence, each other word at most
    `window_size` positions away becomes one training pair: the center word's
    index is the input and the context word's index, one-hot encoded, is the
    target.

    Args:
        corpus: list of sentences, each a list of integer word indices.
        window_size: number of neighbours considered on each side of a word.
        num_unique_words: width of each one-hot target row; every word index
            must be smaller than this.

    Returns:
        A tuple `(inputs, outputs)`: `inputs` is a 1-D array of center-word
        indices and `outputs` is a float32 array of shape
        `(len(inputs), num_unique_words)` holding the one-hot context rows.
        With no pairs, `outputs` has shape `(0, num_unique_words)`.

    Raises:
        IndexError: if a word index is not smaller than `num_unique_words`.
    """
    centers = []
    contexts = []
    for words in corpus:
        len_words = len(words)
        for index, center in enumerate(words):
            # Clamp the window to the sentence instead of testing every offset.
            first = max(0, index - window_size)
            last = min(len_words, index + window_size + 1)
            for i in range(first, last):
                if i != index:
                    centers.append(center)
                    contexts.append(words[i])

    # Allocate the one-hot matrix once and set all the ones in a single
    # indexed assignment, rather than creating one small array per pair.
    context_ids = np.asarray(contexts, dtype=np.int64)
    all_outputs = np.zeros((len(context_ids), num_unique_words), dtype=np.float32)
    all_outputs[np.arange(len(context_ids)), context_ids] = 1.0

    return (np.array(centers), all_outputs)

In [35]:
# Create training data
# X_train holds center-word indices; y_train holds the matching one-hot context-word rows.
X_train, y_train = generate_data(encoded, window_size, num_unique_words)
X_train.shape, y_train.shape

((96036,), (96036, 4250))

In [36]:
X_train, y_train

(array([2692, 2692,  272, ...,  601, 1782, 1782]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32))

## **Create Neural Network**

In [37]:
# Skip-gram network: embedding lookup for one word index, then a softmax over the vocabulary.
embed_size=100  # dimensionality of each learned word vector
model = Sequential()
# Maps a single word index (input_length=1) to an embed_size-dimensional vector.
model.add(Embedding(input_dim=num_unique_words, output_dim=embed_size, input_length=1, embeddings_initializer='glorot_uniform'))
# Drops the length-1 sequence axis: (1, embed_size) -> (embed_size,).
model.add(Reshape((embed_size, )))
# One output unit per vocabulary index; softmax turns the scores into a probability distribution.
model.add(Dense(num_unique_words, activation='softmax', kernel_initializer='glorot_uniform'))

In [38]:
# categorical_crossentropy pairs with the one-hot target rows in y_train.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [39]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 100)            425000    
                                                                 
 reshape (Reshape)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 4250)              429250    
                                                                 
Total params: 854,250
Trainable params: 854,250
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Train for 200 epochs in batches of 512 pairs; `history` keeps the object returned by fit.
history = model.fit(X_train, y_train, epochs=200, verbose=1, batch_size=512)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

## **Save Model**

In [41]:
model.save('model_skipgram.h5')

## **Load Model**

In [42]:
# Load the trained model back from the file written by model.save above.
model = load_model('model_skipgram.h5')

In [43]:
model

<keras.engine.sequential.Sequential at 0x7f2032c88210>

### **Get Most Similar Words**

In [44]:
# Reload the fitted tokenizer. The file is a pickle despite its '.h5' extension.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('tokenizer.h5', 'rb') as f:
    tokenizer = pickle.load(f)

In [45]:
def get_most_similarity(word, model=model, tokenizer=tokenizer, n=15):
  """Return the `n` words the model scores highest for the given word.

  The word is mapped to its integer index, passed through the model, and the
  vocabulary indices with the largest softmax outputs are mapped back to
  words. "Similarity" here therefore means the model's predicted context
  probability, not a distance between embedding vectors.

  Args:
    word: the query text; if it tokenizes to several words, only the
      prediction for the first one is used.
    model: trained model to query (defaults to the notebook's `model` as it
      was when this function was defined).
    tokenizer: fitted tokenizer used for both directions of the mapping
      (defaults to the notebook's `tokenizer` at definition time).
    n: number of top-scoring indices to look up.

  Returns:
    A list of words ordered from highest to lowest score. It can be shorter
    than `n` when a top-scoring index has no word in the tokenizer.

  Raises:
    ValueError: if `word` contains no token known to the tokenizer.
  """
  word_to_sequences = tokenizer.texts_to_sequences([word])[0]
  if not word_to_sequences:
    # Unknown words encode to an empty list, which the model cannot predict on.
    raise ValueError(f"'{word}' is not in the tokenizer vocabulary")

  # Pass an explicit array so the indices are treated as one batch of inputs.
  prediction = model.predict(np.array(word_to_sequences))[0]
  index = np.argsort(prediction)[::-1][:n]  # indices sorted by descending score
  sequences_to_word = tokenizer.sequences_to_texts([index])[0]
  most_similarity = sequences_to_word.split(' ')
  return most_similarity

In [46]:
get_most_similarity('عشق')

['عشق',
 'جان',
 'دل',
 'عقل',
 'آتش',
 'شاه',
 'جمله',
 'شمس',
 'عاشقان',
 'دست',
 'مست',
 'جهان',
 'گشت',
 'ملک',
 'روح']

In [47]:
get_most_similarity('شمس')

['تبریزی',
 'تبریز',
 'دین',
 'الدین',
 'مفخر',
 'الحق',
 'جان',
 'عشق',
 'خداوند',
 'نور',
 'الضحی',
 'دل',
 'ذره',
 'مخدوم',
 'خورشید']

In [48]:
get_most_similarity('کنج')

['عاشق',
 'درآرد',
 'زندان',
 'ای',
 'نرسم',
 'نشین',
 'کنجی',
 'خلا',
 'کنج',
 'مشین',
 'دمی',
 'بیدار',
 'کرده',
 'مطلق',
 'نه']

In [49]:
get_most_similarity('گل')

['آب',
 'خار',
 'گل',
 'بلبل',
 'دل',
 'باغ',
 'یار',
 'ریحان',
 'آتش',
 'گلزار',
 'نسرین',
 'رعنا',
 'چمن',
 'شکر',
 'رود']

In [50]:
get_most_similarity('مست')

['مست',
 'چشم',
 'جان',
 'آمدست',
 'خاک',
 'عشق',
 'عقل',
 'خراب',
 'ره',
 'دست',
 'خواجه',
 'جام',
 'یار',
 'جمله',
 'مخمور']

In [51]:
get_most_similarity('دست')

['دست',
 'جان',
 'دل',
 'پا',
 'پای',
 'عشق',
 'جام',
 'دهان',
 'مست',
 'شب',
 'شمس',
 'کف',
 'دلم',
 'باده',
 'یار']

In [52]:
get_most_similarity('عقل')

['عشق',
 'عقل',
 'جان',
 'مست',
 'خار',
 'دل',
 'قضا',
 'گل',
 'خط',
 'روح',
 'دین',
 'سلطان',
 'بلا',
 'نهد',
 'دام']

In [53]:
get_most_similarity('غم')

['دل',
 'غم',
 'جان',
 'شادی',
 'شاد',
 'اندوه',
 'جهان',
 'شب',
 'یار',
 'دست',
 'عشق',
 'باده',
 'آب',
 'عاشق',
 'بخسب']

In [54]:
get_most_similarity('انگور')

['غوره',
 'پخته',
 'باغبان',
 'خون',
 'گور',
 'ای',
 'روضه',
 'نفی',
 'مستم',
 'دل',
 'شد',
 'کدامین',
 'شور',
 'شوره',
 'برده']

In [55]:
get_most_similarity('آب')

['گل',
 'آب',
 'حیات',
 'جان',
 'سنگ',
 'آتش',
 'جوی',
 'روان',
 'بحر',
 'جو',
 'دل',
 'نان',
 'چشمه',
 'حیوان',
 'زندگانی']

In [56]:
get_most_similarity('آتش')

['آتش',
 'عشق',
 'آب',
 'جان',
 'گل',
 'دل',
 'برآوردم',
 'فلک',
 'زدی',
 'جگر',
 'خلیل',
 'خانه',
 'آفتاب',
 'مفرش',
 'سوز']

In [57]:
get_most_similarity('شب')

['شب',
 'مه',
 'گشت',
 'جان',
 'دل',
 'سحر',
 'ماه',
 'دست',
 'رخ',
 'تیره',
 'دام',
 'خلق',
 'شمع',
 'خواب',
 'سیاه']

In [58]:
get_most_similarity('جهان')

['جان',
 'جهان',
 'دل',
 'نهان',
 'عشق',
 'جمله',
 'تنگ',
 'غم',
 'گرد',
 'نقش',
 'شاه',
 'عاشق',
 'دید',
 'ذره',
 'بنگر']

In [59]:
get_most_similarity('دل')

['جان',
 'دل',
 'عشق',
 'خانه',
 'غم',
 'خون',
 'دست',
 'جهان',
 'گل',
 'تبریز',
 'رود',
 'سبک',
 'دین',
 'سخن',
 'نور']

In [60]:
get_most_similarity('مه')

['مه',
 'نور',
 'شب',
 'رخ',
 'فلک',
 'خورشید',
 'ماه',
 'جان',
 'لقا',
 'سجده',
 'لقایی',
 'چرخ',
 'پاره',
 'توبه',
 'استاره']

In [61]:
get_most_similarity('نور')

['مه',
 'نور',
 'شمع',
 'شمس',
 'شعله',
 'نار',
 'دل',
 'تبریزی',
 'آفتاب',
 'عالم',
 'دیدار',
 'سایه',
 'موسی',
 'چشم',
 'صبح']