Gensim word2vec tutorial: http://rare-technologies.com/word2vec-tutorial/

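Dependencies:
* gensim (`pip install gensim`)
* a built fastText command-line binary, located in the directory assigned to `FT_HOME` below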

In [3]:
# Root of the local fastText directory, i.e. the directory that contains the
# `fasttext` binary. Export FT_HOME in the environment before starting Jupyter
# to override it without editing the notebook; otherwise the default below is used.
import os

FT_HOME = os.environ.get('FT_HOME', '/Users/fib1123/Desktop/nlp/fastText/')
# The training commands build the binary path as `{FT_HOME}fasttext`, so the
# value must end with a path separator; joining with '' appends one if missing.
FT_HOME = os.path.join(FT_HOME, '')

In [4]:
import os.path

# Directory (relative to the notebook's working directory) where trained
# models are written and where cached `.vec` files are looked up.
MODELS_DIR = 'models/'
# Shell magic: create the directory; `-p` avoids an error if it already exists.
!mkdir -p {MODELS_DIR}

# Training hyperparameters. Each value is interpolated into the fastText command
# line in train_models and (except `loss`) mirrored in the gensim `params` dict.
lr = 0.05       # fastText -lr       / gensim 'alpha'
dim = 100       # fastText -dim      / gensim 'size'
ws = 5          # fastText -ws       / gensim 'window'
epoch = 5       # fastText -epoch    / gensim 'iter'
minCount = 5    # fastText -minCount / gensim 'min_count'
neg = 5         # fastText -neg      / gensim 'negative'
loss = 'ns'     # fastText -loss only; presumably matched by 'hs': 0 plus 'negative' in `params` -- TODO confirm
t = 1e-4        # fastText -t        / gensim 'sample'

from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus

# Keyword arguments for gensim's Word2Vec, reusing the hyperparameters that
# train_models also hands to the fastText command line.
params = dict(
    alpha=lr,
    size=dim,
    window=ws,
    iter=epoch,
    min_count=minCount,
    sample=t,
    sg=1,
    hs=0,
    negative=neg
)

def train_models(corpus_file, output_name):
    """Train three embedding models on `corpus_file`, skipping any that already exist.

    For each of the three variants below, a model is trained only when
    `MODELS_DIR/<output_name><suffix>.vec` is not already a file:

    * `<output_name>_ft`       -- fastText skipgram via the `fasttext` binary under FT_HOME
    * `<output_name>_ft_no_ng` -- the same command with `-maxn 0` (no char n-grams)
    * `<output_name>_gs`       -- gensim Word2Vec, saved in word2vec text format

    Reads the notebook-level names MODELS_DIR, FT_HOME, lr, dim, ws, epoch,
    minCount, neg, loss, t and params. Must run inside IPython because it uses
    the `%time` and `!` magics.

    Parameters:
        corpus_file: path to the training corpus; gensim reads it through
            Text8Corpus, i.e. as a file of space-separated words.
        output_name: base name for the output files (suffixes are appended).

    Returns:
        None. Results are files under MODELS_DIR plus progress printed to stdout.
    """
    # --- Variant 1: fastText with its default n-gram settings (no -maxn given) ---
    output_file = '{:s}_ft'.format(output_name)
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('Training fasttext on {:s} corpus..'.format(corpus_file))
        # Presumably fastText writes `<output>.vec` next to the given -output prefix,
        # which is what the existence check above looks for -- TODO confirm.
        %time !{FT_HOME}fasttext skipgram -input {corpus_file} -output {MODELS_DIR+output_file}  -lr {lr} -dim {dim} -ws {ws} -epoch {epoch} -minCount {minCount} -neg {neg} -loss {loss} -t {t}
    else:
        print('\nUsing existing model file {:s}.vec'.format(output_file))
        
    # --- Variant 2: fastText with `-maxn 0`, i.e. without character n-grams ---
    output_file = '{:s}_ft_no_ng'.format(output_name)
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('\nTraining fasttext on {:s} corpus (without char n-grams)..'.format(corpus_file))
        %time !{FT_HOME}fasttext skipgram -input {corpus_file} -output {MODELS_DIR+output_file}  -lr {lr} -dim {dim} -ws {ws} -epoch {epoch} -minCount {minCount} -neg {neg} -loss {loss} -t {t} -maxn 0
    else:
        print('\nUsing existing model file {:s}.vec'.format(output_file))
        
    # --- Variant 3: gensim Word2Vec trained with the shared `params` ---
    output_file = '{:s}_gs'.format(output_name)
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('\nTraining word2vec on {:s} corpus..'.format(corpus_file))
        
        # Text8Corpus class for reading space-separated words file
        %time gs_model = Word2Vec(Text8Corpus(corpus_file), **params); gs_model
        # Direct local variable lookup doesn't work properly with magic statements (%time):
        # the assignment happens inside the magic, so `gs_model` is fetched through
        # locals() rather than referenced by name. Do not "simplify" this to a bare
        # `gs_model` without re-testing under the target IPython version.
        locals()['gs_model'].save_word2vec_format(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file)))
        print('\nSaved gensim model as {:s}.vec'.format(output_file))
    else:
        print('\nUsing existing model file {:s}.vec'.format(output_file))

# Container for per-corpus evaluation results. In the cells visible here it is
# only referenced from a commented-out line, so it stays empty in this run.
evaluation_data = {}
# Train (or reuse cached) models for the parsedTrimmed corpus; output files are
# named parsed_trimmed_ft / parsed_trimmed_ft_no_ng / parsed_trimmed_gs under MODELS_DIR.
train_models('../data/parsedTrimmed', 'parsed_trimmed')

Training fasttext on ../data/parsedTrimmed corpus..
Read 272M words
Progress: 100.0%  words/sec/thread: 41247  lr: 0.000001  loss: 1.354188  eta: 0h0m 
Train time: 5577.000000 sec
CPU times: user 4min 15s, sys: 1min 48s, total: 6min 3s
Wall time: 1h 34min 57s

Training fasttext on ../data/parsedTrimmed corpus (without char n-grams)..
Read 272M words
Progress: 100.0%  words/sec/thread: 80323  lr: 0.000001  loss: 1.364331  eta: 0h0m 
Train time: 2301.000000 sec
CPU times: user 2min 25s, sys: 1min 18s, total: 3min 43s
Wall time: 40min 14s

Training word2vec on ../data/parsedTrimmed corpus..
CPU times: user 3h 43min 13s, sys: 1min 21s, total: 3h 44min 34s
Wall time: 2h 57min 20s

Saved gensim model as parsed_trimmed_gs.vec


In [None]:
import logging
# Emit INFO-level log records (timestamp : level : message) from the root logger,
# presumably so gensim's progress during evaluation is visible -- TODO confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def print_accuracy(model, questions_file):
    """Evaluate `model` on an analogy questions file and print grouped accuracy.

    Calls `model.accuracy(questions_file)`, which is expected to return a list
    of sections, each a mapping with 'correct' and 'incorrect' lists. The first
    five sections are reported as "Semantic"; the sections from index 5 up to
    (but excluding) the last one are reported as "Syntactic". The last entry is
    skipped -- presumably it is the library's aggregate section (TODO confirm).

    A group with no evaluated questions (for example when none of the question
    words are in the model's vocabulary) is reported as 0.00% instead of
    raising ZeroDivisionError. A result with fewer than five sections is
    handled the same way instead of raising IndexError.

    Parameters:
        model: object exposing `accuracy(questions_file)`.
        questions_file: path to the analogy questions file.

    Returns:
        Tuple `(sem_acc, syn_acc)` of percentages as floats.
    """
    print('Evaluating...\n')
    acc = model.accuracy(questions_file)

    def _tally(sections):
        # Count answered questions in a group and derive the percentage correct.
        correct = sum(len(section['correct']) for section in sections)
        total = correct + sum(len(section['incorrect']) for section in sections)
        # Nothing evaluated in this group: report 0% rather than dividing by zero.
        pct = 100 * float(correct) / total if total else 0.0
        return correct, total, pct

    # Slicing (rather than indexing 0..4) tolerates results with fewer sections.
    sem_correct, sem_total, sem_acc = _tally(acc[:5])
    print('\nSemantic: {:d}/{:d}, Accuracy: {:.2f}%'.format(sem_correct, sem_total, sem_acc))

    # Everything after the first five sections, except the trailing entry.
    syn_correct, syn_total, syn_acc = _tally(acc[5:-1])
    print('Syntactic: {:d}/{:d}, Accuracy: {:.2f}%\n'.format(syn_correct, syn_total, syn_acc))
    return (sem_acc, syn_acc)

# Analogy questions file handed to print_accuracy (relative to the working directory).
word_analogies_file = 'questions-words.txt'
# Intended to collect the (semantic, syntactic) tuples returned by print_accuracy,
# one per model; the appends in the cells visible here are currently commented out.
accuracies = []

In [5]:
print('\nLoading Gensim embeddings')
# Load the vectors that train_models saved in word2vec format. The path is built
# with os.path.join, the same way train_models builds it, so it does not depend
# on MODELS_DIR ending in a slash.
parsed_trimmed_gs = Word2Vec.load_word2vec_format(os.path.join(MODELS_DIR, 'parsed_trimmed_gs.vec'))
# print('Accuracy for Word2Vec:')
# accuracies.append(print_accuracy(parsed_trimmed_gs, word_analogies_file))


Loading Gensim embeddings


In [6]:
print('\nLoading FastText embeddings')
# Load the fastText (with char n-grams) vectors from the `.vec` file that
# train_models checks for. The path is built with os.path.join, the same way
# train_models builds it, so it does not depend on MODELS_DIR ending in a slash.
parsed_trimmed_ft = Word2Vec.load_word2vec_format(os.path.join(MODELS_DIR, 'parsed_trimmed_ft.vec'))
# print('Accuracy for FastText (with n-grams):')
# accuracies.append(print_accuracy(parsed_trimmed_ft, word_analogies_file))


Loading FastText embeddings


In [7]:
print('Loading FastText embeddings')
# Load the fastText variant trained with `-maxn 0` (without char n-grams). The
# path is built with os.path.join, the same way train_models builds it, so it
# does not depend on MODELS_DIR ending in a slash.
parsed_trimmed_ft_no_ng = Word2Vec.load_word2vec_format(os.path.join(MODELS_DIR, 'parsed_trimmed_ft_no_ng.vec'))
# print('Accuracy for FastText (without n-grams):')
# accuracies.append(print_accuracy(parsed_trimmed_ft_no_ng, word_analogies_file))
# NOTE(review): if the next line is re-enabled, evaluation_data is created as an
# empty dict, so `+=` on the 'parsed_trimmed' key would raise KeyError unless the
# key is initialised first.
# evaluation_data['parsed_trimmed'] += [[acc[0] for acc in accuracies], [acc[1] for acc in accuracies]]

Loading FastText embeddings


In [13]:
# Probe the gensim word2vec vectors: an analogy query, an odd-one-out query and
# a pairwise similarity. print(...) with a single argument prints the same under
# Python 2 and matches the call style used in the rest of the notebook.
print(parsed_trimmed_gs.most_similar(positive=[u'kobieta', u'król'], negative=[u'mężczyzna'], topn=1))
print(parsed_trimmed_gs.doesnt_match("breakfast cereal dinner lunch".split()))
print(parsed_trimmed_gs.similarity(u'kobieta', u'mężczyzna'))

[(u'kr\xf3lowa', 0.8168740272521973)]
cereal
0.787359108935


In [18]:
# Nearest neighbours of u'czerwony' in the gensim word2vec vectors. The
# single-argument print(...) call prints the same under Python 2 and matches the
# call style used in the rest of the notebook.
print(parsed_trimmed_gs.most_similar(positive=[u'czerwony']))

[(u'zielony', 0.8876147866249084), (u'niebieski', 0.8857795596122742), (u'\u017c\xf3\u0142ty', 0.8776800036430359), (u'bia\u0142y', 0.845734179019928), (u'czarny', 0.8080854415893555), (u'pomara\u0144czowy', 0.7869120240211487), (u'zielono', 0.7528743743896484), (u'r\xf3\u017cowy', 0.7518884539604187), (u'czerwono', 0.7458332777023315), (u'b\u0142\u0119kitny', 0.7254201173782349)]


In [19]:
# The same three probes as for the gensim model, run against the fastText
# (with n-grams) vectors. print(...) with a single argument prints the same under
# Python 2 and matches the call style used in the rest of the notebook.
print(parsed_trimmed_ft.most_similar(positive=[u'kobieta', u'król'], negative=[u'mężczyzna'], topn=1))
print(parsed_trimmed_ft.doesnt_match("breakfast cereal dinner lunch".split()))
print(parsed_trimmed_ft.similarity(u'kobieta', u'mężczyzna'))

[(u'kr\xf3lowa', 0.8570606708526611)]
cereal
0.78283114675
