In [1]:
import logging
logging.basicConfig(level=logging.INFO)
from pathlib import Path

import numpy as np
import pandas as pd
from gensim.models import Word2Vec

from naruto_skills.new_voc import Voc
from naruto_skills.word_embedding import WordEmbedding

INFO:summarizer.preprocessing.cleaner:'pattern' package not found; tag filters are not available for English


# Wiki

In [None]:
root = '/dataset/'

In [None]:
class CsvSubdirsCorpus(object):
    """
    Iterable: on each iteration, return bag-of-words vectors,
    one vector for each document.
 
    Process one document at a time using generators, never
    load the entire corpus into RAM.
 
    """
    def __init__(self, top_dir):
        self.top_dir = top_dir
 
    def __iter__(self):
        """
        Again, __iter__ is a generator => TxtSubdirsCorpus is a streamed iterable.
        """
        for idx, p in enumerate(Path(self.top_dir).glob('wiki_clean.csv')):
            # read each document as one big string
            dfs = pd.read_csv(p, chunksize=5e5)
            for df in dfs:
                list_docs = list(df['target'].dropna())
                for doc in list_docs:
                    yield doc.lower().split()


In [None]:
corpus = CsvSubdirsCorpus(root)

In [None]:
model = Word2Vec(corpus, size=300, window=10, min_count=100, workers=8)

In [None]:
len(model.wv.index2word)

In [None]:
model.save('/source/main/vocab/output/word2vec_gensim_model_wiki')

In [None]:
model.wv.index2entity[-10:]

In [None]:
model.get_latest_training_loss()

In [6]:
model.most_similar('méo')
model.most_similar('gà')
model.most_similar('chó')
# model.most_similar('fuck')

  """Entry point for launching an IPython kernel.
INFO:gensim.models.keyedvectors:precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):
  
  This is separate from the ipykernel package so we can avoid doing imports until


[('mèo', 0.7636736631393433),
 ('thỏ', 0.6197304725646973),
 ('cún', 0.6055511236190796),
 ('khỉ', 0.6011084914207458),
 ('sói', 0.5494617223739624),
 ('nhím', 0.5479783415794373),
 ('husky', 0.5313003659248352),
 ('cọp', 0.5297122001647949),
 ('dê', 0.5252997875213623),
 ('trâu', 0.5251345634460449)]

# Training wiki + social

In [2]:
social = '/source/main/preprocess/output/topics/'
wiki = '/dataset/'

class CsvSubdirsCorpus(object):
    """
    Iterable: on each iteration, return bag-of-words vectors,
    one vector for each document.
 
    Process one document at a time using generators, never
    load the entire corpus into RAM.
 
    """
    def __init__(self, wiki, social):
        self.wiki = wiki
        self.social = social
 
    def __iter__(self):
        """
        Again, __iter__ is a generator => TxtSubdirsCorpus is a streamed iterable.
        """
        for idx, p in enumerate(Path(self.wiki).glob('wiki_clean.csv')):
            # read each document as one big string
            dfs = pd.read_csv(p, chunksize=5e5)
            for df in dfs:
                list_docs = list(df['target'].dropna())
                for doc in list_docs:
                    yield doc.lower().split()
                    
        for idx, p in enumerate(Path(self.social).glob('*.csv')):
            # read each document as one big string
            dfs = pd.read_csv(p, chunksize=5e5)
            for df in dfs:
                list_docs = list(df['mention'].dropna())
                for doc in list_docs:
                    yield doc.lower().split()
        
corpus = CsvSubdirsCorpus(wiki, social)


In [None]:
model = Word2Vec(corpus, size=300, window=20, min_count=100, workers=8)

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 380505 words, keeping 10358 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 762565 words, keeping 15922 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 1144077 words, keeping 20856 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 1528381 words, keeping 25453 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 1911770 words, keeping 29782 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 2295922 words, keeping 33795 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 2677760 words, keeping 37662 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 3059976 words, kee

INFO:gensim.models.word2vec:PROGRESS: at sentence #740000, processed 28305981 words, keeping 217781 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #750000, processed 28688886 words, keeping 220055 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #760000, processed 29072030 words, keeping 222208 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #770000, processed 29454435 words, keeping 224400 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #780000, processed 29837338 words, keeping 226598 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #790000, processed 30218195 words, keeping 228683 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #800000, processed 30601007 words, keeping 230802 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #810000, processed 30984724 words, keeping 232890 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #820000, processed 31369128 words, keeping 235064 word types
I

INFO:gensim.models.word2vec:PROGRESS: at sentence #1480000, processed 56627797 words, keeping 366594 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #1490000, processed 57011191 words, keeping 368477 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #1500000, processed 57393375 words, keeping 370214 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #1510000, processed 57776472 words, keeping 372025 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #1520000, processed 58159613 words, keeping 373891 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #1530000, processed 58542551 words, keeping 375739 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #1540000, processed 58927550 words, keeping 377566 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #1550000, processed 59312464 words, keeping 379349 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #1560000, processed 59695326 words, keeping 381171 wor

INFO:gensim.models.word2vec:PROGRESS: at sentence #2220000, processed 84982891 words, keeping 495191 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2230000, processed 85367636 words, keeping 496953 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2240000, processed 85750512 words, keeping 498576 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2250000, processed 86134475 words, keeping 500224 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2260000, processed 86516104 words, keeping 501843 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2270000, processed 86898727 words, keeping 503576 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2280000, processed 87282104 words, keeping 505261 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2290000, processed 87665955 words, keeping 506932 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2300000, processed 88050372 words, keeping 508534 wor

INFO:gensim.models.word2vec:PROGRESS: at sentence #2950000, processed 112989563 words, keeping 612466 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2960000, processed 113373672 words, keeping 614016 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2970000, processed 113756299 words, keeping 615615 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2980000, processed 114140322 words, keeping 617043 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #2990000, processed 114525848 words, keeping 618645 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3000000, processed 114908328 words, keeping 620255 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3010000, processed 115292355 words, keeping 621987 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3020000, processed 115677426 words, keeping 623582 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3030000, processed 116061898 words, keeping 6

INFO:gensim.models.word2vec:PROGRESS: at sentence #3680000, processed 141021608 words, keeping 722237 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3690000, processed 141405673 words, keeping 723562 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3700000, processed 141788748 words, keeping 725024 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3710000, processed 142175130 words, keeping 726490 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3720000, processed 142559439 words, keeping 727917 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3730000, processed 142942825 words, keeping 729312 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3740000, processed 143327584 words, keeping 730784 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3750000, processed 143708377 words, keeping 732269 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #3760000, processed 144093223 words, keeping 7

INFO:gensim.models.word2vec:PROGRESS: at sentence #4410000, processed 169057815 words, keeping 825262 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4420000, processed 169443619 words, keeping 826478 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4430000, processed 169828938 words, keeping 827845 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4440000, processed 170215583 words, keeping 829226 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4450000, processed 170600588 words, keeping 830580 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4460000, processed 170984125 words, keeping 832021 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4470000, processed 171369736 words, keeping 833378 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4480000, processed 171752638 words, keeping 834848 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #4490000, processed 172138278 words, keeping 8

INFO:gensim.models.word2vec:PROGRESS: at sentence #5140000, processed 197121075 words, keeping 924061 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5150000, processed 197505657 words, keeping 925348 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5160000, processed 197891406 words, keeping 926581 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5170000, processed 198276504 words, keeping 927870 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5180000, processed 198659483 words, keeping 929155 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5190000, processed 199043738 words, keeping 930459 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5200000, processed 199428593 words, keeping 931724 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5210000, processed 199811307 words, keeping 932975 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5220000, processed 200195343 words, keeping 9

INFO:gensim.models.word2vec:PROGRESS: at sentence #5870000, processed 225194019 words, keeping 1018528 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5880000, processed 225577883 words, keeping 1019821 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5890000, processed 225962277 words, keeping 1021143 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5900000, processed 226344067 words, keeping 1022418 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5910000, processed 226729246 words, keeping 1023719 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5920000, processed 227110472 words, keeping 1025002 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5930000, processed 227493626 words, keeping 1026313 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5940000, processed 227879059 words, keeping 1027628 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #5950000, processed 228266359 words, k

INFO:gensim.models.word2vec:PROGRESS: at sentence #6590000, processed 252900068 words, keeping 1108992 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6600000, processed 253285732 words, keeping 1110269 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6610000, processed 253670500 words, keeping 1111591 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6620000, processed 254055201 words, keeping 1112927 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6630000, processed 254441895 words, keeping 1114127 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6640000, processed 254826407 words, keeping 1115457 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6650000, processed 255211800 words, keeping 1116712 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6660000, processed 255596467 words, keeping 1118072 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #6670000, processed 255981725 words, k

INFO:gensim.models.word2vec:PROGRESS: at sentence #7310000, processed 280628891 words, keeping 1197826 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #7320000, processed 281015127 words, keeping 1199066 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #7330000, processed 281399208 words, keeping 1200241 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #7340000, processed 281783852 words, keeping 1201467 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #7350000, processed 282169171 words, keeping 1202707 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #7360000, processed 282554330 words, keeping 1203875 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #7370000, processed 282939128 words, keeping 1205030 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #7380000, processed 283324196 words, keeping 1206235 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #7390000, processed 283710420 words, k

INFO:gensim.models.word2vec:PROGRESS: at sentence #8030000, processed 308364627 words, keeping 1283619 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8040000, processed 308748454 words, keeping 1284788 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8050000, processed 309133312 words, keeping 1285999 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8060000, processed 309520197 words, keeping 1287128 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8070000, processed 309906263 words, keeping 1288306 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8080000, processed 310291923 words, keeping 1289409 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8090000, processed 310677603 words, keeping 1290531 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8100000, processed 311063988 words, keeping 1291714 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8110000, processed 311449887 words, k

INFO:gensim.models.word2vec:PROGRESS: at sentence #8750000, processed 336108261 words, keeping 1366779 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8760000, processed 336495371 words, keeping 1367892 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8770000, processed 336880581 words, keeping 1368992 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8780000, processed 337265900 words, keeping 1370194 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8790000, processed 337651942 words, keeping 1371334 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8800000, processed 338038584 words, keeping 1372483 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8810000, processed 338424157 words, keeping 1373662 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8820000, processed 338809222 words, keeping 1374779 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #8830000, processed 339195778 words, k

INFO:gensim.models.word2vec:PROGRESS: at sentence #9470000, processed 363876622 words, keeping 1448320 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #9480000, processed 364265820 words, keeping 1449503 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #9490000, processed 364650448 words, keeping 1450561 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #9500000, processed 365033128 words, keeping 1451603 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #9510000, processed 365420240 words, keeping 1452667 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #9520000, processed 365804518 words, keeping 1453802 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #9530000, processed 366188724 words, keeping 1454917 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #9540000, processed 366574770 words, keeping 1456046 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #9550000, processed 366960169 words, k

INFO:gensim.models.word2vec:PROGRESS: at sentence #10190000, processed 391638751 words, keeping 1528746 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10200000, processed 392023286 words, keeping 1529846 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10210000, processed 392409424 words, keeping 1530969 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10220000, processed 392796184 words, keeping 1532066 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10230000, processed 393183478 words, keeping 1533150 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10240000, processed 393570130 words, keeping 1534245 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10250000, processed 393955923 words, keeping 1535443 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10260000, processed 394341080 words, keeping 1536517 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10270000, processed 394726780

INFO:gensim.models.word2vec:PROGRESS: at sentence #10910000, processed 419420934 words, keeping 1608158 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10920000, processed 419805994 words, keeping 1609271 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10930000, processed 420192987 words, keeping 1610502 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10940000, processed 420581113 words, keeping 1611597 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10950000, processed 420966262 words, keeping 1612643 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10960000, processed 421350011 words, keeping 1613738 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10970000, processed 421736665 words, keeping 1614784 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10980000, processed 422122267 words, keeping 1615843 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10990000, processed 422507646

INFO:gensim.models.word2vec:PROGRESS: at sentence #11630000, processed 447225628 words, keeping 1685586 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #11640000, processed 447611704 words, keeping 1686696 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #11650000, processed 447997277 words, keeping 1687687 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #11660000, processed 448383418 words, keeping 1688762 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #11670000, processed 448770635 words, keeping 1689834 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #11680000, processed 449158796 words, keeping 1690867 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #11690000, processed 449545244 words, keeping 1691913 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #11700000, processed 449931967 words, keeping 1692969 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #11710000, processed 450317794

INFO:gensim.models.word2vec:PROGRESS: at sentence #12350000, processed 475023800 words, keeping 1761906 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #12360000, processed 475407581 words, keeping 1762977 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #12370000, processed 475794139 words, keeping 1764083 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #12380000, processed 476179846 words, keeping 1765081 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #12390000, processed 476565480 words, keeping 1766163 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #12400000, processed 476949709 words, keeping 1767259 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #12410000, processed 477335922 words, keeping 1768342 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #12420000, processed 477721474 words, keeping 1769386 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #12430000, processed 478109485

INFO:gensim.models.word2vec:PROGRESS: at sentence #13070000, processed 502839349 words, keeping 1837349 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13080000, processed 503226979 words, keeping 1838336 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13090000, processed 503613014 words, keeping 1839318 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13100000, processed 503999961 words, keeping 1840393 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13110000, processed 504385844 words, keeping 1841476 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13120000, processed 504771815 words, keeping 1842529 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13130000, processed 505159413 words, keeping 1843646 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13140000, processed 505546739 words, keeping 1844669 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13150000, processed 505932864

INFO:gensim.models.word2vec:PROGRESS: at sentence #13790000, processed 530656320 words, keeping 1911539 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13800000, processed 531040195 words, keeping 1912552 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13810000, processed 531428320 words, keeping 1913543 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13820000, processed 531815356 words, keeping 1914520 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13830000, processed 532200265 words, keeping 1915589 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13840000, processed 532585120 words, keeping 1916605 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13850000, processed 532973406 words, keeping 1917672 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13860000, processed 533360122 words, keeping 1918727 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #13870000, processed 533747460

INFO:gensim.models.word2vec:PROGRESS: at sentence #14510000, processed 558493029 words, keeping 1984987 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #14520000, processed 558878812 words, keeping 1985996 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #14530000, processed 559265046 words, keeping 1986952 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #14540000, processed 559653444 words, keeping 1988021 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #14550000, processed 560039465 words, keeping 1989015 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #14560000, processed 560426435 words, keeping 1990064 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #14570000, processed 560812225 words, keeping 1991080 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #14580000, processed 561199446 words, keeping 1992093 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #14590000, processed 561585407

INFO:gensim.models.word2vec:PROGRESS: at sentence #15230000, processed 586331351 words, keeping 2057876 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15240000, processed 586717862 words, keeping 2058885 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15250000, processed 587103844 words, keeping 2059861 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15260000, processed 587490602 words, keeping 2060778 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15270000, processed 587876226 words, keeping 2061733 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15280000, processed 588260431 words, keeping 2062691 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15290000, processed 588646634 words, keeping 2063748 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15300000, processed 589031274 words, keeping 2064777 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15310000, processed 589417815

INFO:gensim.models.word2vec:PROGRESS: at sentence #15950000, processed 614183698 words, keeping 2129069 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15960000, processed 614570697 words, keeping 2130039 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15970000, processed 614959353 words, keeping 2130985 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15980000, processed 615345888 words, keeping 2131978 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #15990000, processed 615733562 words, keeping 2133011 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16000000, processed 616120090 words, keeping 2134003 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16010000, processed 616508632 words, keeping 2135061 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16020000, processed 616896634 words, keeping 2136073 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16030000, processed 617282639

INFO:gensim.models.word2vec:PROGRESS: at sentence #16670000, processed 642045207 words, keeping 2200159 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16680000, processed 642431853 words, keeping 2201058 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16690000, processed 642818090 words, keeping 2202053 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16700000, processed 643204147 words, keeping 2202995 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16710000, processed 643593422 words, keeping 2204037 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16720000, processed 643979388 words, keeping 2204921 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16730000, processed 644365912 words, keeping 2205930 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16740000, processed 644752963 words, keeping 2206827 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #16750000, processed 645140744

INFO:gensim.models.word2vec:PROGRESS: at sentence #17390000, processed 669894212 words, keeping 2270161 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #17400000, processed 670281082 words, keeping 2271101 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #17410000, processed 670666929 words, keeping 2272056 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #17420000, processed 671052725 words, keeping 2273047 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #17430000, processed 671440548 words, keeping 2274040 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #17440000, processed 671828116 words, keeping 2275071 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #17450000, processed 672215004 words, keeping 2276042 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #17460000, processed 672602168 words, keeping 2277056 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #17470000, processed 672990372

INFO:gensim.models.word2vec:PROGRESS: at sentence #18110000, processed 697747070 words, keeping 2339693 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18120000, processed 698134869 words, keeping 2340593 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18130000, processed 698522317 words, keeping 2341516 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18140000, processed 698911799 words, keeping 2342423 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18150000, processed 699298506 words, keeping 2343420 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18160000, processed 699684423 words, keeping 2344347 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18170000, processed 700072315 words, keeping 2345275 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18180000, processed 700457894 words, keeping 2346208 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18190000, processed 700845432

INFO:gensim.models.word2vec:PROGRESS: at sentence #18830000, processed 725630325 words, keeping 2408584 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18840000, processed 726017854 words, keeping 2409646 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18850000, processed 726406619 words, keeping 2410581 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18860000, processed 726795198 words, keeping 2411598 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18870000, processed 727181635 words, keeping 2412572 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18880000, processed 727569107 words, keeping 2413508 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18890000, processed 727956899 words, keeping 2414432 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18900000, processed 728344427 words, keeping 2415341 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #18910000, processed 728732652

INFO:gensim.models.word2vec:PROGRESS: at sentence #19550000, processed 747471331 words, keeping 2518259 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #19560000, processed 747640697 words, keeping 2519666 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #19570000, processed 747935671 words, keeping 2521744 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #19580000, processed 748023968 words, keeping 2523265 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #19590000, processed 748202414 words, keeping 2524984 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #19600000, processed 748372922 words, keeping 2526129 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #19610000, processed 748778244 words, keeping 2528850 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #19620000, processed 749053512 words, keeping 2531741 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #19630000, processed 749290189

INFO:gensim.models.word2vec:PROGRESS: at sentence #20270000, processed 766394214 words, keeping 2642720 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20280000, processed 766963165 words, keeping 2645403 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20290000, processed 767486790 words, keeping 2647879 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20300000, processed 767895633 words, keeping 2650234 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20310000, processed 768152044 words, keeping 2651536 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20320000, processed 768362540 words, keeping 2652717 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20330000, processed 768452270 words, keeping 2653239 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20340000, processed 768891627 words, keeping 2654647 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20350000, processed 769209764

INFO:gensim.models.word2vec:PROGRESS: at sentence #20970000, processed 784983830 words, keeping 2737673 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20980000, processed 785159417 words, keeping 2738481 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20990000, processed 785285970 words, keeping 2739648 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #21000000, processed 785515820 words, keeping 2741342 word types
INFO:gensim.models.word2vec:collected 2742529 word types from a corpus of 785635402 raw words and 21008964 sentences
INFO:gensim.models.word2vec:Loading a fresh vocabulary
INFO:gensim.models.word2vec:effective_min_count=100 retains 26636 unique words (0% of original 2742529, drops 2715893)
INFO:gensim.models.word2vec:effective_min_count=100 leaves 777854008 word corpus (99% of original 785635402, drops 7781394)
INFO:gensim.models.word2vec:deleting the raw counts dictionary of 2742529 items
INFO:gensim.models.word2vec:sample=0.001 down

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 10.23% examples, 1011614 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 10.41% examples, 1014361 words/s, in_qsize 15, out_qsize 4
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 10.59% examples, 1017507 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 10.78% examples, 1020317 words/s, in_qsize 13, out_qsize 5
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 10.96% examples, 1023295 words/s, in_qsize 13, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 11.14% examples, 1025707 words/s, in_qsize 16, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 11.32% examples, 1028814 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 11.50% examples, 1031448 words/s, in_qsize 15, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 11.68% examples, 1033845 words/s,

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 22.49% examples, 1030774 words/s, in_qsize 16, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 22.67% examples, 1032374 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 22.85% examples, 1033692 words/s, in_qsize 16, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 23.04% examples, 1035051 words/s, in_qsize 16, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 23.22% examples, 1036327 words/s, in_qsize 16, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 23.40% examples, 1037627 words/s, in_qsize 16, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 23.58% examples, 1039100 words/s, in_qsize 15, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 23.77% examples, 1040639 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 23.80% examples, 1026083 words/s,

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 34.75% examples, 1038118 words/s, in_qsize 15, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 34.92% examples, 1038889 words/s, in_qsize 14, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 35.10% examples, 1039633 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 35.27% examples, 1040251 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 35.44% examples, 1040804 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 35.62% examples, 1041572 words/s, in_qsize 15, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 35.70% examples, 1031742 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 35.89% examples, 1032734 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 36.07% examples, 1033558 words/s,

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 47.03% examples, 1040636 words/s, in_qsize 16, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 47.20% examples, 1041052 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 47.37% examples, 1041639 words/s, in_qsize 12, out_qsize 6
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 47.56% examples, 1042514 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 47.60% examples, 1035319 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 47.77% examples, 1035596 words/s, in_qsize 10, out_qsize 5
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 47.95% examples, 1036403 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 48.14% examples, 1037070 words/s, in_qsize 13, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 48.31% examples, 1037627 words/s,

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 59.29% examples, 1043551 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 59.47% examples, 1044037 words/s, in_qsize 12, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 59.50% examples, 1037972 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 59.66% examples, 1038226 words/s, in_qsize 15, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 59.84% examples, 1038598 words/s, in_qsize 16, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 60.02% examples, 1039088 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 60.20% examples, 1039565 words/s, in_qsize 13, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 60.37% examples, 1040005 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 60.55% examples, 1040407 words/s,

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 71.40% examples, 1039383 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 71.57% examples, 1039654 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 71.75% examples, 1040032 words/s, in_qsize 15, out_qsize 4
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 71.93% examples, 1040538 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 72.11% examples, 1040961 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 72.28% examples, 1041207 words/s, in_qsize 13, out_qsize 5
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 72.46% examples, 1041609 words/s, in_qsize 14, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 72.65% examples, 1042146 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 72.84% examples, 1042646 words/s,

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 83.66% examples, 1041963 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 83.84% examples, 1042318 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 84.02% examples, 1042664 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 84.20% examples, 1042978 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 84.38% examples, 1043320 words/s, in_qsize 16, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 84.56% examples, 1043764 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 84.74% examples, 1044090 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 84.92% examples, 1044361 words/s, in_qsize 14, out_qsize 6
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 85.10% examples, 1044701 words/s,

INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 95.88% examples, 1030548 words/s, in_qsize 0, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 96.07% examples, 1029857 words/s, in_qsize 12, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 96.09% examples, 1029152 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 96.09% examples, 1028199 words/s, in_qsize 16, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 96.09% examples, 1027345 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 96.23% examples, 1027022 words/s, in_qsize 16, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 96.40% examples, 1026648 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 96.49% examples, 1025965 words/s, in_qsize 16, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 1 - PROGRESS: at 96.63% examples, 1026361 words/s, i

INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 6.01% examples, 1004942 words/s, in_qsize 13, out_qsize 3
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 6.19% examples, 1010046 words/s, in_qsize 13, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 6.37% examples, 1014662 words/s, in_qsize 14, out_qsize 3
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 6.55% examples, 1020356 words/s, in_qsize 13, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 6.73% examples, 1024903 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 6.91% examples, 1029025 words/s, in_qsize 16, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 7.09% examples, 1032958 words/s, in_qsize 16, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 7.14% examples, 987052 words/s, in_qsize 15, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 7.31% examples, 990410 words/s, in_qsize 1

INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 18.42% examples, 1033115 words/s, in_qsize 15, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 18.60% examples, 1034569 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 18.78% examples, 1036201 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 18.96% examples, 1037434 words/s, in_qsize 12, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 19.04% examples, 1019848 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 19.22% examples, 1021273 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 19.39% examples, 1022793 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 19.57% examples, 1024039 words/s, in_qsize 14, out_qsize 4
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 19.74% examples, 1025411 words/s,

INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 30.69% examples, 1038855 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 30.87% examples, 1039713 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 30.94% examples, 1028625 words/s, in_qsize 10, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 31.12% examples, 1029459 words/s, in_qsize 14, out_qsize 3
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 31.30% examples, 1030544 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 31.48% examples, 1031551 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 31.65% examples, 1032210 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 31.83% examples, 1033019 words/s, in_qsize 13, out_qsize 5
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 32.01% examples, 1033944 words/s,

INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 42.84% examples, 1033079 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 43.02% examples, 1033843 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 43.21% examples, 1034571 words/s, in_qsize 14, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 43.38% examples, 1035094 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 43.56% examples, 1035802 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 43.75% examples, 1036650 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 43.93% examples, 1037433 words/s, in_qsize 16, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 44.11% examples, 1038246 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 44.29% examples, 1038713 words/s,

INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 55.09% examples, 1037552 words/s, in_qsize 15, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 55.27% examples, 1037948 words/s, in_qsize 13, out_qsize 7
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 55.45% examples, 1038461 words/s, in_qsize 15, out_qsize 3
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 55.62% examples, 1038805 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 55.80% examples, 1039334 words/s, in_qsize 13, out_qsize 3
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 55.99% examples, 1039989 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 56.16% examples, 1040450 words/s, in_qsize 16, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 56.35% examples, 1041154 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 56.52% examples, 1041594 words/s,

INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 67.35% examples, 1040091 words/s, in_qsize 16, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 67.54% examples, 1040570 words/s, in_qsize 13, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 67.72% examples, 1041085 words/s, in_qsize 13, out_qsize 2
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 67.89% examples, 1041396 words/s, in_qsize 15, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 68.08% examples, 1042012 words/s, in_qsize 16, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 68.26% examples, 1042351 words/s, in_qsize 12, out_qsize 4
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 68.43% examples, 1042598 words/s, in_qsize 14, out_qsize 5
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 68.60% examples, 1042934 words/s, in_qsize 15, out_qsize 7
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 68.78% examples, 1043413 words/s,

In [None]:
model.save('/source/main/vocab/output/word2vec_gensim_model_wiki_social')

In [5]:
model.

<gensim.models.word2vec.Word2Vec at 0x7f557e904630>

In [12]:
model.most_similar('sống')

  """Entry point for launching an IPython kernel.


[('sống…', 0.5705943703651428),
 ('đời', 0.49514040350914),
 ('sống..', 0.45615914463996887),
 ('sống-', 0.43332797288894653),
 ('sống.-', 0.3902968168258667),
 ('sống.', 0.3377140760421753),
 ('đời…', 0.32389163970947266),
 ('rốt', 0.3238018751144409),
 ('sống', 0.3073164224624634),
 ('mít-tinh', 0.3070906102657318)]

In [14]:
len(model.wv.index2word)

26636

In [16]:
voc = Voc(tokenize_func=Voc.WORD_LV_TOK_FUNC, space_char=Voc.WORD_LV_SPACE_CHR)

In [2]:
voc.add_embedding_weights()

NameError: name 'voc' is not defined

# Create Voc

In [2]:
model = Word2Vec.load('/source/main/vocab/output/word2vec_gensim_model_wiki_social')

INFO:gensim.utils:loading Word2Vec object from /source/main/vocab/output/word2vec_gensim_model_wiki_social
INFO:gensim.utils:loading wv recursively from /source/main/vocab/output/word2vec_gensim_model_wiki_social.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute vectors_norm to None
INFO:gensim.utils:loading vocabulary recursively from /source/main/vocab/output/word2vec_gensim_model_wiki_social.vocabulary.* with mmap=None
INFO:gensim.utils:loading trainables recursively from /source/main/vocab/output/word2vec_gensim_model_wiki_social.trainables.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:loaded /source/main/vocab/output/word2vec_gensim_model_wiki_social


In [12]:
print(model.wv.syn0[0][:10])
print(model.wv.syn0[-1][:10])

[ 0.09677044 -0.5980865   1.6606522   0.3972973   0.00939904 -0.01667785
 -1.4936599  -1.1991605  -0.29754373 -0.17177866]
[ 0.0013401  -0.04555359  0.11077558 -0.02073835  0.08600322 -0.01954939
 -0.02022713  0.00361845  0.1397001   0.04248903]


In [15]:
model.wv.add(['__p__', '__o__'], np.random.normal(loc=model.wv.syn0.mean(), scale=model.wv.syn0.std(),
                                             size=(2, model.wv.syn0.shape[1])))

  """Entry point for launching an IPython kernel.
  


In [16]:
vocabs = model.wv.index2word
weights = model.wv.syn0

  


In [17]:
len(vocabs)

26638

In [18]:
vocabs[-10:]

['vuitetnhuy',
 'rén..',
 'chamnhethemyeulanda',
 'fiala',
 'antamhungthinhtoandien',
 'bifidusbl',
 'stra',
 'lgsuperuhdtv',
 '__p__',
 '__o__']

In [25]:
voc = Voc(tokenize_func=Voc.WORD_LV_TOK_FUNC, space_char=Voc.WORD_LV_SPACE_CHR)

In [21]:
len(vocabs)

26638

In [26]:
voc.build_from_tokens(vocabs, padding_idx=len(vocabs)-2, oov_idx=len(vocabs)-1)

In [27]:
voc.add_embedding_weights(model.wv.syn0)

  """Entry point for launching an IPython kernel.


In [28]:
voc.freeze()

In [29]:
voc.dump('/source/main/vocab/output/voc.pkl')

# Check it

In [30]:
del voc

In [3]:
voc = Voc.load('/source/main/vocab/output/voc.pkl')

In [4]:
voc.docs2idx(['đá'])

[[624]]

In [5]:
model.wv.get_vector('đá')[:10]

array([ 0.75781786,  1.6542717 ,  0.163088  , -0.04168139,  4.2450223 ,
        2.8398998 ,  2.542663  , -1.376479  ,  1.4295504 ,  1.575918  ],
      dtype=float32)

In [6]:
voc.get_embedding_weights()[624][:10]

array([ 0.75781786,  1.65427172,  0.16308799, -0.04168139,  4.2450223 ,
        2.83989978,  2.5426631 , -1.37647903,  1.42955041,  1.57591796])