In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
data =[]
sentences = ""
with open("/home/muzzammil/text_guten/final.txt",'r',encoding = "utf-8",errors  ="ignore") as corpus:
    for line in corpus:
        sentences = sentences + line.lower()


In [3]:
#!"#$%&'()*+, -./:;<=>?@[\]^_`{|}~
punc = ['#','"','$','%','&','(',')','*','+','-','/','<','=','>','@','[',']','\\','^','_','`','{','}','|','~','\n',',',"'"]
for p in punc:
    sentences = sentences.replace(p," ")
    
sentences = sentences.replace("?",".")
sentences = sentences.replace("!",".")
sentences = sentences.replace(":",".")
sentences = sentences.replace(";",".")

data = sentences.split(".")
data = [d for d in data if len(d.split()) >= 10]

In [4]:

print(len(data))

93181


In [5]:
vocab_size = 10000
embedding_dim = 12

tokenizer = Tokenizer(num_words = vocab_size,oov_token ="<oov>")
tokenizer.fit_on_texts(data)

In [6]:
print(tokenizer.word_index)



In [7]:
word_index = tokenizer.word_index
word_index = {word:index for word,index in word_index.items() if index<=vocab_size}
#rint(word_index["lapland"])
index_to_word = { index:word for word,index in word_index.items()}
print(len(word_index))

10000


In [8]:
refined_data = []
for s in data:
    tokens = s.split()
    sen = ""
    for word in tokens:
        if word in word_index:
            sen = sen+" "+ word
        else:
            sen = sen+" "+"<oov>"
    refined_data.append(sen)

print(refined_data[10:40])    

[' she leaves the outer door open after her and through it is seen a porter who is carrying a christmas tree and a basket which he gives to the maid who has opened the door', ' be sure the children do not see it until this evening when it is dressed', ' she is laughing to herself as she takes off her hat and coat', ' she takes a packet of <oov> from her pocket and eats one or two', ' then goes cautiously to her husband s door and <oov>', ' still humming she goes to the table on the right', ' puts the bag of <oov> into her pocket and <oov> her mouth', ' come in here torvald and see what i have bought', ' a little later he opens the door and looks into the room pen in hand', ' yes but torvald this year we really can let ourselves go a little', ' this is the first christmas that we have not needed to <oov>', ' yes torvald we may be a <oov> bit more reckless now <oov> t we', ' you are going to have a big salary and earn lots and lots of money', ' but then it will be a whole quarter before 

In [9]:
def generate_data(refined_data,word_index,window_size = 3):
    X =[]
    Y =[]
    for s in refined_data:
        tokens = [word_index[word] for word in s.split()]
        for i,context in enumerate(tokens):
            for j in range(max(i-window_size,0),i):
                X.append(tokens[i])
                Y.append(tokens[j])
            for j in range(i+1,min(i+window_size,len(tokens))):
                X.append(tokens[i])
                Y.append(tokens[j])
    return X,Y
                
            

In [10]:
X,Y = generate_data(refined_data,word_index,window_size = 3)

In [11]:
print(X[:30])
print(Y[:30])

[2, 2, 155, 155, 155, 156, 156, 156, 156, 1036, 1036, 1036, 1036, 1036, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5587, 5587, 5587, 5587, 5587, 31]
[155, 156, 2, 156, 1036, 2, 155, 1036, 5, 2, 155, 156, 5, 6, 155, 156, 1036, 6, 5587, 156, 1036, 5, 5587, 31, 1036, 5, 6, 31, 137, 5]


In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1,embedding_dim,input_length = 1),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(embedding_dim,activation = 'relu'),
    tf.keras.layers.Dense(vocab_size+1,activation = 'softmax')
])

In [14]:
model.compile(loss = 'sparse_categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 12)             120012    
_________________________________________________________________
flatten (Flatten)            (None, 12)                0         
_________________________________________________________________
dense (Dense)                (None, 12)                156       
_________________________________________________________________
dense_1 (Dense)              (None, 10001)             130013    
Total params: 250,181
Trainable params: 250,181
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.fit(X,Y,epochs=5)

 11439/292407 [>.............................] - ETA: 18:56 - loss: 6.4991 - accuracy: 0.0557

KeyboardInterrupt: 

In [18]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(1001, 12)


In [19]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [20]:
import io

out_v = io.open('/mnt/d/word2vec/vecs_10000.tsv', 'w', encoding='utf-8')
out_m = io.open('/mnt/d/word2vec/meta_10000.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()