In [46]:
import pandas as pd
import numpy as np

from gensim.models import KeyedVectors

In [47]:
from sklearn.model_selection import train_test_split

from keras import Model
from keras import Sequential
from keras.layers import Dense

In [48]:
# Load the pretrained embeddings stored in binary word2vec format in the file 'trmodel'.
word_vectors = KeyedVectors.load_word2vec_format(
    fname='trmodel',
    binary=True,
)

In [49]:
# Sanity check on the loaded vectors: analogy query geliyor + gitmek - gelmek.
word_vectors.most_similar(
    positive=["geliyor", "gitmek"],
    negative=["gelmek"],
)

[('gidiyor', 0.592621922492981),
 ('gidiyorum', 0.5690363645553589),
 ('gidelim', 0.5637925863265991),
 ('geldim', 0.5413458943367004),
 ('bakıyor', 0.5373592376708984),
 ('gittim', 0.5343413949012756),
 ('gideceğim', 0.5251941680908203),
 ('geldik', 0.505060076713562),
 ('geliyoruz', 0.5047824382781982),
 ('gider', 0.504159688949585)]

In [50]:
# Map every vocabulary word to its embedding vector, in the model's key order.
vector_dict = {
    word: word_vectors.get_vector(word)
    for word in word_vectors.index_to_key
}

# One row per word, one column per embedding dimension.
# The frame is built directly from a 2-D array instead of creating a frame with
# one column per word and transposing it: same index, columns and values, but
# without materialising a frame that is hundreds of thousands of columns wide.
df = pd.DataFrame(
    np.asarray(list(vector_dict.values())),
    index=list(vector_dict.keys()),
)
display(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
ve,0.296941,-0.269876,-1.180450,0.190302,-0.286231,1.286693,-0.886565,0.358893,-0.773943,1.468419,...,-0.650401,-0.742386,0.142566,0.559789,-0.762779,-0.337023,-0.248934,1.191360,0.110982,0.160032
kategori,-1.299320,0.990288,0.307114,-1.498320,1.659920,0.028483,-0.201647,2.227363,1.416063,0.608082,...,-0.449724,1.288062,-1.129284,-3.064066,-0.667573,-1.473625,-0.382927,-1.564002,1.952312,2.889835
bir,0.510168,-0.418357,-1.772496,0.227772,-0.475176,0.432043,-3.295489,0.403906,-0.536450,0.002459,...,1.942218,0.180768,0.778027,1.420458,-0.523086,0.714488,0.296618,1.671680,0.358448,-0.116159
da,0.203403,0.377098,-2.617118,-1.166068,0.350960,0.072562,-1.258488,0.911878,-0.665523,1.468329,...,2.019327,0.436097,-1.341626,2.594386,-1.235236,0.492994,0.000796,0.438153,-1.542385,0.250436
de,0.477729,0.802959,-2.586396,-1.524061,0.580028,-0.531550,-1.301819,0.316712,-0.331988,1.520155,...,1.572341,1.336991,0.004450,0.023776,0.022804,0.143684,-0.769812,1.441397,-1.471803,1.120255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zırhlardan,-0.037423,-0.064962,0.003443,-0.006851,-0.027573,0.048518,-0.011570,-0.002876,0.051829,-0.019547,...,-0.033733,-0.029264,0.033644,0.049926,-0.015837,-0.012362,-0.005236,-0.059078,-0.031783,-0.019363
tanklardı,-0.008692,0.008498,0.018832,0.068778,-0.043494,0.077601,0.027750,0.001146,0.020478,-0.033677,...,-0.016961,-0.052620,-0.019530,0.008937,0.067636,-0.028386,0.064698,0.000106,0.007457,-0.044515
bēkon,-0.043483,0.022171,0.011649,0.008683,0.004279,-0.005566,0.018467,-0.005685,0.000652,0.012918,...,-0.045089,-0.002602,0.001707,-0.005691,-0.009385,0.025023,-0.002974,-0.049444,0.004175,-0.027719
coosemans,-0.047166,-0.025059,0.031535,0.039888,-0.013063,0.030065,0.038028,0.020881,0.089070,-0.033487,...,-0.026820,-0.039541,-0.033091,0.044215,0.046135,-0.001221,-0.031015,-0.053000,-0.043469,-0.069313


In [51]:
class AutoEncoders(Model):
    """Fully connected autoencoder with a mirrored encoder/decoder pair.

    The encoder is a stack of ``Dense`` layers whose widths are given by
    ``hidden_units``; its last layer is the bottleneck. The decoder applies
    the same widths in reverse order and ends with a linear ``Dense`` layer
    of ``output_units`` units that produces the reconstruction.

    Args:
        output_units: Width of the final (linear) reconstruction layer.
        hidden_units: Encoder layer widths, from first layer to bottleneck.
            The default reproduces the original 300-200-100-40 stack.
        activation: Activation passed to every ``Dense`` layer except the
            final linear one.

    Raises:
        ValueError: If ``hidden_units`` is empty.
    """

    def __init__(self, output_units, hidden_units=(300, 200, 100, 40), activation="LeakyReLU"):
        super().__init__()
        layer_widths = tuple(hidden_units)
        if not layer_widths:
            raise ValueError("hidden_units must contain at least one layer width")

        # Encoder: progressively narrower layers ending at the bottleneck.
        self.encoder = Sequential(
            [Dense(width, activation=activation) for width in layer_widths]
        )

        # Decoder: the same widths mirrored, then a linear projection to the
        # requested output width.
        self.decoder = Sequential(
            [Dense(width, activation=activation) for width in reversed(layer_widths)]
            + [Dense(output_units, activation="linear")]
        )

    def call(self, inputs):
        """Encode ``inputs`` to the bottleneck and return the decoder's reconstruction."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [52]:
# Stack the embedding vectors into a 2-D array (one row per word) and keep the
# matching words alongside them.
INPUT = list(vector_dict.values())
OUTPUT = list(vector_dict.keys())
X = np.asarray(INPUT)
y = np.asarray(OUTPUT)

# Shuffle before splitting. With shuffle=False the random_state argument has no
# effect and the held-out 30% is simply the tail of the vocabulary, whose
# vectors (see the frame displayed above) have much smaller magnitudes than the
# head rows, so validation would not reflect the data the model trains on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    shuffle=True,
)

In [53]:
# The reconstruction layer must be as wide as the input vectors (columns of X).
n_features = X.shape[1]
auto_encoder = AutoEncoders(n_features)

# Mean squared error is both the training loss and the reported metric.
auto_encoder.compile(optimizer='adam', loss='mse', metrics=['mse'])

In [54]:
# Both training calls share the same data, batch size and validation set; only
# the epoch count differs. The autoencoder reconstructs its own input, so the
# same array is passed as features and as targets.
fit_options = dict(
    x=X_train,
    y=X_train,
    batch_size=64,
    validation_data=(X_test, X_test),
)

# A single epoch is run first, after which the summaries of the full model and
# of its two halves are printed.
history = auto_encoder.fit(epochs=1, **fit_options)
for model_part in (auto_encoder, auto_encoder.encoder, auto_encoder.decoder):
    model_part.summary()

# Main training run; `history` is rebound, so it only describes this run.
history = auto_encoder.fit(epochs=100, **fit_options)

# Push every word vector through the trained encoder, then reconstruct the
# encodings with the decoder; both results are converted to NumPy arrays.
encoded = auto_encoder.encoder(X).numpy()
decoded = auto_encoder.decoder(encoded).numpy()

print(encoded)

Model: "auto_encoders_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_6 (Sequential)   (None, 40)                204640    
                                                                 
 sequential_7 (Sequential)   (None, 400)               206640    
                                                                 
Total params: 411280 (1.57 MB)
Trainable params: 411280 (1.57 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_25 (Dense)            (None, 300)               120300    
                                                                 
 dense_26 (Dense)            (None, 200)               60200     
                                                           

In [67]:
# Shape check of the encoder output: rows correspond to the rows of X, columns to the encoder's last layer.
encoded.shape

(412457, 40)

In [68]:
# Shrink the model's vector table to the width of the encoder output, taken
# from `encoded` itself rather than repeating the bottleneck size as a literal.
encoded_dim = encoded.shape[1]

# Copy the slice into its own contiguous array: a plain column slice is a
# strided view that keeps the full-width original buffer alive.
word_vectors.vectors = np.ascontiguousarray(word_vectors.vectors[:, :encoded_dim])
word_vectors.vector_size = encoded_dim

In [69]:
# `encoded` was computed from the vectors in `index_to_key` order, so row i of
# `encoded` belongs to the key stored at index i. Guard that alignment, then
# overwrite the whole table in one vectorised assignment instead of looping
# over every key in Python.
assert encoded.shape == word_vectors.vectors.shape, (
    f"encoded has shape {encoded.shape}, "
    f"but the vector table has shape {word_vectors.vectors.shape}"
)
word_vectors.vectors[:] = encoded

In [70]:
# The vectors were replaced, so any cached norms are stale: force a recompute.
# (The former `syn0norm = None` reset was dropped; that attribute belongs to
# the pre-4.0 gensim API, while this notebook uses the 4.x API throughout.)
word_vectors.fill_norms(force=True)

# Display the unit-normalised vectors as a quick visual check.
word_vectors.get_normed_vectors()

array([[ 0.22479425,  0.10453413,  0.1206836 , ...,  0.13660567,
        -0.05555847, -0.08413158],
       [ 0.12412136,  0.06913043,  0.10093462, ...,  0.1114392 ,
        -0.06289022, -0.14442302],
       [ 0.07095607,  0.13517934,  0.18149143, ...,  0.15895435,
        -0.03559506, -0.07762313],
       ...,
       [ 0.16969655,  0.22771138,  0.12386845, ...,  0.05123984,
        -0.110131  , -0.18512909],
       [ 0.15769629,  0.25126234,  0.14403404, ...,  0.14322414,
        -0.10679551, -0.16430952],
       [ 0.18754789,  0.33141026,  0.13147567, ...,  0.15469831,
        -0.07907975, -0.20366178]], dtype=float32)

In [79]:
# Repeat the analogy query geliyor + gitmek - gelmek to inspect the current vectors.
word_vectors.most_similar(
    positive=["geliyor", "gitmek"],
    negative=["gelmek"],
)

[('geliyoruz', 0.9236549735069275),
 ('geldiğimde', 0.9195456504821777),
 ('rastladım', 0.9137952327728271),
 ('bağladım', 0.9092761278152466),
 ('yerleşin', 0.9069922566413879),
 ('gideceğim', 0.9050990343093872),
 ('yerleş', 0.9029061794281006),
 ('yardan', 0.901644766330719),
 ('gideceksin', 0.9004161357879639),
 ('yağmış', 0.8982545733451843)]

In [82]:
# Second analogy query: brokoli + patates - meyve.
word_vectors.most_similar(
    positive=["brokoli", "patates"],
    negative=["meyve"],
)

[('lahana', 0.9346223473548889),
 ('pancar', 0.9302788972854614),
 ('kızartması', 0.9274839758872986),
 ('ezmesi', 0.925815761089325),
 ('ıspanak', 0.923117995262146),
 ('turşusu', 0.9229980111122131),
 ('kartol', 0.9228886961936951),
 ('erişte', 0.9220452308654785),
 ('karalahana', 0.9209051728248596),
 ('miso', 0.9204742908477783)]

In [80]:
# Ten nearest neighbours of "internet", searched over the whole vocabulary.
word_vectors.similar_by_key(
    "internet",
    topn=10,
    restrict_vocab=None,
)

[('web', 0.9463964700698853),
 ('facebook', 0.9444456696510315),
 ('blog', 0.9443336129188538),
 ('paylaşım', 0.9389534592628479),
 ('mynet', 0.9324177503585815),
 ('Web', 0.9304092526435852),
 ('ınternet', 0.9287830591201782),
 ('Wordpress', 0.9243919253349304),
 ('ağ', 0.9243459701538086),
 ('tumblr', 0.9240301251411438)]

In [74]:
# Collect the current vector of every vocabulary word, in the model's key order.
vector_dict_new = {
    word: word_vectors.get_vector(word)
    for word in word_vectors.index_to_key
}

# One row per word, one column per dimension. The frame is built directly from
# a 2-D array rather than as a one-column-per-word frame that is then
# transposed; the resulting index, columns and values are the same.
df = pd.DataFrame(
    np.asarray(list(vector_dict_new.values())),
    index=list(vector_dict_new.keys()),
)
display(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
ve,2.206798,1.026208,1.184747,0.593272,2.577303,2.871521,-1.364984,1.016538,0.341405,0.308782,...,1.395383,2.191643,0.809908,-0.077335,1.542955,1.110401,1.876573,1.341053,-0.545416,-0.825917
kategori,2.382383,1.326888,1.937338,0.482670,-0.213720,2.293994,-1.004302,0.668673,1.220763,-0.061286,...,3.205154,2.362246,0.227907,6.962224,0.734765,-0.481548,0.574860,2.138962,-1.207114,-2.772053
bir,0.957867,1.824844,2.450032,1.767411,3.953114,4.714775,-2.716141,3.286252,-0.096642,0.705466,...,1.009294,2.402153,2.724217,-0.164441,2.067279,0.722448,1.331011,2.145794,-0.480513,-1.047868
da,2.150470,4.129030,2.423162,1.271311,4.176322,5.218581,-1.219997,1.913177,-1.945502,2.582092,...,-0.169896,1.646575,1.768944,1.060161,1.850431,1.426014,1.182015,2.510493,-1.137761,-1.906018
de,2.007289,3.745586,2.680676,1.159703,4.277326,5.012633,-1.324931,1.115746,-0.679559,2.133528,...,-0.286980,0.417521,1.825454,0.860004,1.629969,1.857992,0.412137,3.235987,-0.798860,-1.830661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zırhlardan,0.079502,0.298920,0.126667,0.079523,0.225341,0.169391,-0.179397,0.123934,-0.103940,0.102190,...,0.182464,0.126936,0.218056,0.130266,0.123055,0.151548,0.139524,0.175488,-0.031088,-0.156149
tanklardı,0.151478,0.252644,0.157573,0.140643,0.277283,0.220984,-0.169026,0.109878,-0.112233,0.074280,...,0.212731,0.148030,0.189897,0.032477,0.133748,0.231958,0.145211,0.180077,-0.109420,-0.229828
bēkon,0.141650,0.190076,0.103396,0.061164,0.195010,0.249845,-0.127987,0.057267,-0.146276,0.019158,...,0.123661,0.082751,0.078318,-0.001527,0.121922,0.143550,0.058766,0.042771,-0.091929,-0.154532
coosemans,0.158550,0.252623,0.144814,0.097701,0.218991,0.098823,-0.139720,0.169183,-0.072500,0.071288,...,0.150008,0.206961,0.141978,0.181806,0.196731,0.192109,0.187174,0.144000,-0.107374,-0.165199


In [76]:
# Write the current vectors to 'trmodel_truncated' in binary word2vec format.
word_vectors.save_word2vec_format(
    fname="trmodel_truncated",
    binary=True,
)

In [77]:
# Read the saved file back, rebinding `word_vectors` to the reloaded model.
word_vectors = KeyedVectors.load_word2vec_format(
    fname='trmodel_truncated',
    binary=True,
)