In [1]:
import numpy as np
import random
import string
import math
from collections import defaultdict
import json

In [2]:
def gen_word(word_length):
    """Return a random lowercase word whose letters are all distinct.

    ``word_length`` is unpacked straight into ``np.random.randint``, so a
    ``(low, high)`` pair gives a length drawn from the half-open range
    ``[low, high)``. The letters are then drawn without replacement from
    ``string.ascii_lowercase`` using the stdlib ``random`` module.
    """
    n_letters = np.random.randint(*word_length)
    letters = random.sample(string.ascii_lowercase, n_letters)
    return ''.join(letters)

In [3]:
# Layout of the synthetic vocabulary: `num_topics` topics, each holding
# `topic_size` random words of 2-5 letters (gen_word's upper bound is exclusive).
num_topics = 20
topic_size = 6

topics = []
for topic_index in range(num_topics):
    topics.append([gen_word((2, 6)) for _ in range(topic_size)])

In [4]:
# Show the generated vocabulary: num_topics lists of topic_size words each.
topics

[['atipl', 'uy', 'ehby', 'hs', 'tclv', 'qifph'],
 ['xzmtq', 'cg', 'nv', 'wc', 'nj', 'reh'],
 ['dfv', 'cgu', 'mhp', 'rfu', 'myeka', 'kxm'],
 ['wovgr', 'rez', 'kiw', 'kjan', 'xcklr', 'hf'],
 ['btzoj', 'viqjy', 'gfkr', 'puwvs', 'bqvl', 'dt'],
 ['ivkjl', 'rln', 'xfhvu', 'ot', 'pctl', 'tqn'],
 ['jmw', 'dfai', 'nzrjo', 'hatue', 'ljzae', 'my'],
 ['esrpk', 'oe', 'edih', 'eosi', 'dywu', 'mun'],
 ['yf', 'wzrd', 'sqek', 'jpe', 'uia', 'soq'],
 ['jpvs', 'gyvdp', 'nlry', 'xzb', 'vk', 'cb'],
 ['mctf', 'sahzm', 'vqxh', 'bcal', 'detw', 'brace'],
 ['dxmg', 'jdfe', 'ohm', 'yg', 'wi', 'yshdp'],
 ['fnx', 'kxhen', 'sd', 'jf', 'opk', 'knel'],
 ['elh', 'gvt', 'nc', 'pw', 'onuij', 'ak'],
 ['ynrqm', 'jx', 'qsw', 'vonet', 'ku', 'upib'],
 ['mpey', 'jpzdy', 'ji', 'nwjk', 'ops', 'pneuc'],
 ['khgpj', 'ns', 'tzo', 'wnrf', 'mqs', 'vgbwe'],
 ['apg', 'leu', 'ki', 'tvk', 'yazcx', 'fkiqe'],
 ['pwzgc', 'hewa', 'szg', 'eyn', 'skd', 'bs'],
 ['guv', 'dnbe', 'ipc', 'kdf', 'ikbsh', 'hmy']]

In [5]:
def generate_sentence(m_topics, prefix_topics=1, suffix_topics=1):
    """Build one synthetic sentence (a list of tokens) around the marker 'M'.

    Layout of the result, left to right:

    1. ``prefix_topics`` draws of 0-2 words, each from a uniformly chosen topic;
    2. 0-2 words from a topic picked out of ``m_topics``;
    3. the literal token ``'M'``;
    4. 0-2 words from a (freshly picked) topic out of ``m_topics``;
    5. ``suffix_topics`` draws of 0-2 words, each from a uniformly chosen topic.

    Every draw may be empty, so the sentence can be just ``['M']``.

    Args:
        m_topics: non-empty sequence of indices into the module-level
            ``topics`` list; the words adjacent to 'M' come from these topics.
        prefix_topics: number of random-topic draws placed before the
            ``m_topics`` words (default 1, the previously hard-coded value).
        suffix_topics: number of random-topic draws placed after the
            ``m_topics`` words (default 1, the previously hard-coded value).

    Returns:
        list[str]: the tokens of the sentence.

    Reads the module-level ``topics`` and ``num_topics``. With the default
    arguments the sequence of calls into ``random`` is the same as before the
    counts became parameters, so seeded runs produce the same sentences.
    """
    def draw_words(topic_id):
        # 0, 1 or 2 distinct words from the given topic.
        return random.sample(topics[topic_id], random.randint(0, 2))

    sent = []

    for _ in range(prefix_topics):
        sent += draw_words(random.choice(range(num_topics)))

    # Words on both sides of the marker come from the caller-supplied topics;
    # the topic is re-drawn for each side, so the two sides may differ.
    sent += draw_words(random.choice(m_topics))
    sent.append('M')
    sent += draw_words(random.choice(m_topics))

    for _ in range(suffix_topics):
        sent += draw_words(random.choice(range(num_topics)))

    return sent
    

In [6]:
# Smoke test: draw two topic ids and build one sentence from them.
# random.choices samples WITH replacement, so the two ids may be identical.
m_topics = random.choices(range(num_topics), k=2)
m_topics, generate_sentence(m_topics)

([14, 6], ['M', 'dfai', 'jmw', 'yg', 'yshdp'])

In [7]:
# Labelled corpus: one "<sentence>\t<topicA>_<topicB>" line per sentence.
# Each of the 1000 groups draws a topic pair (with replacement) and emits
# 2-4 sentences that share that pair as their label.
with open('../data/fake_data.tsv', 'w') as out:
    for i in range(1000):
        m_topics = random.choices(range(num_topics), k=2)
        topics_str = "_".join(str(topic_id) for topic_id in m_topics)
        n_sentences = random.randint(2, 4)
        for j in range(n_sentences):
            sent = generate_sentence(m_topics)
            line = "\t".join([' '.join(sent), topics_str])
            out.write(line + '\n')

In [8]:
# Unlabelled corpus for fastText training: one space-joined sentence per line,
# with no topic column (the unused topics_str computation was removed).
# NOTE(review): this draws a fresh random corpus; it does not reuse the
# sentences written to fake_data.tsv — confirm that is intended.
with open('../data/fake_ft_data.tsv', 'w') as out:
    for i in range(1000):
        m_topics = random.choices(range(num_topics), k=2)
        for j in range(random.randint(2, 4)):
            sent = generate_sentence(m_topics)
            out.write(' '.join(sent) + '\n')

In [9]:
#!pip install git+https://github.com/facebookresearch/fastText.git

In [10]:
import fastText

In [11]:
# Train unsupervised fastText embeddings on the unlabelled corpus.
# minCount=0 presumably keeps every word regardless of frequency and
# bucket=1000 limits the n-gram hash table to 1000 rows (the shape check
# near the end of the notebook shows 1000 n-gram vectors) — confirm against
# the fastText option docs.
model = fastText.train_unsupervised(
    input='../data/fake_ft_data.tsv', minCount=0, bucket=1000
)

In [12]:
# Number of entries returned by the trained model's word list.
len(model.get_words())

122

In [13]:
# Inspect the character n-grams fastText derives for a word, together with
# their ids. NOTE(review): 'bpxre' is presumably out-of-vocabulary — confirm.
model.get_subwords('bpxre')

(['<bp',
  '<bpx',
  '<bpxr',
  '<bpxre',
  'bpx',
  'bpxr',
  'bpxre',
  'bpxre>',
  'pxr',
  'pxre',
  'pxre>',
  'xre',
  'xre>',
  're>'],
 array([ 963, 1117,  165,  196,  367,  815,  962,  676,  599,  842,  988,
         728,  658,  620]))

In [14]:
# Persist the trained model to disk; this .bin file is loaded into gensim below.
model.save_model("../data/fake_ft_model.bin")

In [53]:
from gensim.models import FastText
from gensim.models.utils_any2vec import ft_ngram_hashes
from gensim.models.utils_any2vec import compute_ngrams

In [28]:
# Load the fastText .bin model into gensim.
# NOTE(review): load_fasttext_format is reportedly deprecated in later
# gensim 3.x releases in favour of load_facebook_model — confirm against the
# installed gensim version.
ft = FastText.load_fasttext_format("../data/fake_ft_model.bin")

In [45]:
# Re-save the loaded model through gensim's own save().
ft.save('../data/gensim_fake_ft.model')

In [54]:
# Reload the gensim-saved model; ft2 is the object the remaining cells use.
ft2 = FastText.load('../data/gensim_fake_ft.model')

In [90]:
# N-gram hashing settings copied from the loaded model. The keys are the
# keyword names that get_ids forwards to ft_ngram_hashes via **params.
params = dict(
    minn=ft2.wv.min_n,
    maxn=ft2.wv.max_n,
    num_buckets=ft2.wv.bucket,
    fb_compatible=ft2.wv.compatible_hash,
)

# word -> row index of that word in the model's vocabulary.
vocab = {word: entry.index for word, entry in ft2.wv.vocab.items()}

# Lookup applied to n-gram hashes in get_ids (falls back to the raw hash).
hash2index = ft2.wv.hash2index

In [93]:
def get_ids(word):
    """Map a word to row ids of the stacked [vocab vectors; n-gram vectors] matrix.

    * In-vocabulary word: a one-element array holding its vocab index.
    * Any other word: one id per n-gram hash from ``ft_ngram_hashes``. Each
      hash is translated through ``hash2index`` (falling back to the raw hash
      when it has no entry) and shifted by the vocabulary size, so it
      addresses the n-gram rows that follow the vocab rows.

    Args:
        word: the token to look up.

    Returns:
        np.ndarray: 1-D array of integer ids. It is empty, but still of
        integer dtype, when an out-of-vocabulary word yields no n-grams.

    Reads the module-level ``vocab``, ``params``, ``hash2index`` and ``ft2``.
    """
    if word in vocab:
        return np.array([vocab[word]])

    # Loop-invariant: n-gram rows start right after the vocabulary rows.
    ngram_offset = len(ft2.wv.vocab)
    ids = [
        hash2index.get(ngram_hash, ngram_hash) + ngram_offset
        for ngram_hash in ft_ngram_hashes(word, **params)
    ]

    if not ids:
        # np.array([]) would default to float64, which cannot be used as an
        # index array; return an empty integer array instead.
        return np.array([], dtype=np.int_)
    return np.array(ids)

In [98]:
# Try get_ids on a word from the generated topics; an in-vocabulary word
# yields a single id (its vocab index).
get_ids('brace')

array([87])

In [105]:
# Stack the vocab vectors on top of the n-gram vectors. get_ids offsets
# n-gram ids by the vocabulary size, which matches this row layout.
np.concatenate([ft2.wv.vectors_vocab, ft2.wv.vectors_ngrams], axis=0).shape

(1122, 100)

In [102]:
# Shape of the per-word vector matrix (rows, embedding dimension).
ft2.wv.vectors_vocab.shape

(122, 100)

In [108]:
# Sanity check: total rows = vocab rows + n-gram rows of the stacked matrix.
ft2.wv.vectors_vocab.shape[0] + ft2.wv.vectors_ngrams.shape[0]

1122