In [2]:
from gensim.models import FastText
sentences = [["你", "是", "誰"], ["我", "是", "臺灣人"]]

model = FastText(sentences,  size=4, window=3, min_count=1, iter=10,min_n = 3 , max_n = 6,word_ngrams = 0)


# 參數意義:  

+ 常規引數:  

    + model: Training architecture. Allowed values: cbow, skipgram (Default cbow)  
    + size: Size of embeddings to be learnt (Default 100)
    + alpha: Initial learning rate (Default 0.025)
    + window: Context window size (Default 5)
    + min_count: Ignore words with number of occurrences below this (Default 5)
    + loss: Training objective. Allowed values: ns, hs, softmax (Default ns)
    + sample: Threshold for downsampling higher-frequency words (Default 0.001)
    + negative: Number of negative words to sample, for ns (Default 5)
    + iter: Number of epochs (Default 5)
    + sorted_vocab: Sort vocab by descending frequency (Default 1)
    + threads: Number of threads to use (Default 12)
+ fasttext附加引數

    + min_n: min length of char ngrams (Default 3)
    + max_n: max length of char ngrams (Default 6)
    + bucket: number of buckets used for hashing ngrams (Default 2000000)
+ 額外引數：

   + word_ngrams ({1,0}, optional)
   + If 1, uses enriches word vectors with subword(n-grams) information. If 0, this is equivalent to Word2Vec.


In [3]:
sentences

[['你', '是', '誰'], ['我', '是', '中國人']]

In [4]:
model.wv['你'] # 詞向量獲得的方式

array([-0.15942173, -0.12655328,  0.00290119,  0.0401443 ], dtype=float32)

# Tutorial
https://radimrehurek.com/gensim/models/fasttext.html  
https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb

In [3]:
from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [21]:
model = FastText(size=4, window=3, min_count=1)  # instantiate
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train

In [23]:
print(model)

FastText(vocab=12, size=4, alpha=0.025)


### 用Corpus的方式餵給model

In [22]:
from gensim.test.utils import datapath

corpus_file = datapath('lee_background.cor')  # absolute path to corpus


#### corpus_file:   
.\anaconda3\\lib\\site-packages\\gensim\\test\\test_data\\lee_background.cor

In [24]:
model3 = FastText(size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary

total_words = model3.corpus_total_words  # number of words in the corpus #59890 words
model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5)

In [25]:
print(model3)

FastText(vocab=10781, size=4, alpha=0.025)


## Save & Load model  

In [55]:
from gensim.test.utils import get_tmpfile
import os
#fname = get_tmpfile("fasttext.model") # .\\AppData\\Local\\Temp\\fasttext.model
fname=os.getcwd()+'/fasttext_model/model3.model'
model3.save(fname)
model = FastText.load(fname)

In [56]:
print(model)

FastText(vocab=10781, size=4, alpha=0.025)


## Training hyperparameters

Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the following parameters from the original word2vec -

 - model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)
 - size: Size of embeddings to be learnt (Default 100)
 - alpha: Initial learning rate (Default 0.025)
 - window: Context window size (Default 5)
 - min_count: Ignore words with number of occurrences below this (Default 5)
 - loss: Training objective. Allowed values: `ns`, `hs`, `softmax` (Default `ns`)
 - sample: Threshold for downsampling higher-frequency words (Default 0.001)
 - negative: Number of negative words to sample, for `ns` (Default 5)
 - iter: Number of epochs (Default 5)
 - sorted_vocab: Sort vocab by descending frequency (Default 1)
 - threads: Number of threads to use (Default 12)

In addition, FastText has three additional parameters -

- min_n: min length of char ngrams (Default 3)
- max_n: max length of char ngrams (Default 6)
- bucket: number of buckets used for hashing ngrams (Default 2000000)
Parameters min_n and max_n control the lengths of character ngrams that each word is broken down into while training and looking up embeddings. If max_n is set to 0, or to be lesser than min_n, no character ngrams are used, and the model effectively reduces to Word2Vec.

To bound the memory requirements of the model being trained, a hashing function is used that maps ngrams to integers in 1 to K. For hashing these character sequences, the Fowler-Noll-Vo hashing function (FNV-1a variant) is employed.

Note: As in the case of Word2Vec, you can continue to train your model while using Gensim's native implementation of fastText.

### 讀入的Model可以繼續練 

In [57]:
import numpy as np

'computation' in model.wv.vocab  # New word, currently out of vocab





False

In [58]:
old_vector = np.copy(model.wv['computation'])  # Grab the existing vector
old_vector

array([-5.0932827 ,  0.73385763,  0.35021338,  2.3630817 ], dtype=float32)

In [62]:
new_vector

array([-5.0932827 ,  0.73385763,  0.35021338,  2.3630817 ], dtype=float32)

In [63]:
new_sentences = [
    ['computer', 'aided', 'design'],
    ['computer', 'science'],
    ['computational', 'complexity'],
    ['military', 'supercomputer'],
    ['central', 'processing', 'unit'],
    ['onboard', 'car', 'computer'],
]
model.build_vocab(new_sentences, update=True)  # Update the vocabulary
model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)






In [64]:
print(model)

FastText(vocab=10786, size=4, alpha=0.025)


In [65]:
'computation' in model.wv.vocab  # Word is still out of vocab

False

In [76]:
model['axe']

  """Entry point for launching an IPython kernel.


array([-0.11457483, -0.1710916 , -0.0534538 ,  0.01892457], dtype=float32)

In [77]:
# Raises a KeyError since none of the character ngrams of the word `axe` are present in the training data
try:
    model['axe']
except KeyError:
    #
    # trap the error here so it does not interfere
    # with the execution of the cells below
    #
    pass
else:
    assert False, 'the above code should have raised a KeyError'

  This is separate from the ipykernel package so we can avoid doing imports until


AssertionError: the above code should have raised a KeyError

#### 可以看到上述只有檢查到Axe是否有向量
若要檢查Axe是否在vocab中:

In [80]:
# Tests if word present in vocab
print("Axe" in model.wv.vocab)
# Tests if vector present for word
print("Axe" in model)

False
True


  after removing the cwd from sys.path.


### Similarity 

In [6]:
from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
import os

In [7]:
fname=os.getcwd()+'/fasttext_model/model3.model'
model = FastText.load(fname)

In [8]:
print("night" in model.wv.vocab)
print("nights" in model.wv.vocab)
model.similarity("night","nights")

True
True


  This is separate from the ipykernel package so we can avoid doing imports until


0.99994737

In [102]:
model.most_similar("nights")

  """Entry point for launching an IPython kernel.


[('overs', 0.9999988675117493),
 ('easy', 0.999997615814209),
 ('overjoyed', 0.9999966025352478),
 ('quest', 0.9999966025352478),
 ('leaders', 0.9999938011169434),
 ('fighter-bombers', 0.9999935626983643),
 ('surprising.', 0.9999934434890747),
 ('numbers', 0.9999932646751404),
 ('ought', 0.9999927282333374),
 ('Tourism', 0.9999924898147583)]

In [104]:
model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])

  """Entry point for launching an IPython kernel.


0.9992425

In [105]:
model.doesnt_match("breakfast cereal dinner lunch".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'breakfast'

In [2]:
import nltk
#nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [13]:
# Word Movers distance 值越小Documents越相似
sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
sentence_president = 'The president greets the press in Chicago'.lower().split()

# Remove their stopwords.
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
sentence_obama = [w for w in sentence_obama if w not in stopwords]
sentence_president = [w for w in sentence_president if w not in stopwords]

# Compute WMD.
distance = model.wmdistance(sentence_obama, sentence_president)
distance

  if sys.path[0] == '':


2.1226722137179377