In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Append the project source directories to the import search path.
# The entries are relative, so they resolve against the kernel's current
# working directory (presumably the repository root -- confirm before running).
sys.path += ['flock', 'flock-conf', 'src/poultry']

In [4]:
import itertools
from random import random

from gensim.models.doc2vec import LabeledSentence, Doc2Vec

import pandas as pd
import numpy as np
import sqlalchemy as sa

from flock import model
import db

In [5]:
import logging
# Notebook-wide logger; the helper functions below report through it.
logger = logging.getLogger('flock')

# Route log records at DEBUG level and above into a local file rather than
# into the notebook output.
logging.basicConfig(
    level=logging.DEBUG,
    filename='flock.log',
)

In [16]:
# Raw SQL used as the training corpus source: for every tweet whose
# `search_vector` matches the English full-text query 'france', return the
# tweet id together with the 'doc2vec' entry of its `features` column.
select_query = """
select tweet_id, features->'doc2vec'
from tweet
where tweet.search_vector @@ to_tsquery('pg_catalog.english', 'france')
"""

In [17]:
# Sanity check: how many rows does the corpus query return?
# Counted by streaming over the result instead of materialising a list.
sum(1 for _ in db.conn.execute(select_query))

19707

In [18]:
# print(db.tweet_select_stmt)

In [19]:
def sentences():
    """Yield one ``LabeledSentence`` per usable row of ``select_query``.

    The query is executed afresh on every call, so each call produces a new,
    independent pass over the corpus.

    Each row is unpacked as ``(tweet_id, doc2vec_features)``. The features
    value is expanded as keyword arguments into ``LabeledSentence``. Rows
    whose features are empty or missing are logged at WARNING level and
    skipped.

    Once the result set is exhausted, the number of rows read from the DB
    (skipped rows included) is logged at INFO level.
    """
    tweets = db.conn.execute(select_query)

    # Start from 0 so the summary log below still works when the query
    # matches no rows; the loop variable alone would be unbound in that case.
    row_count = 0

    for row_count, (tweet_id, doc2vec_features) in enumerate(tweets, start=1):
        if not doc2vec_features:
            logger.warning('Empty features for tweet %s.', tweet_id)
            continue

        yield LabeledSentence(**doc2vec_features)

    logger.info('%s tweets were selected from the DB.', row_count)

In [21]:
# Untrained Doc2Vec model. See the gensim Doc2Vec documentation for the exact
# meaning of each hyperparameter.
doc2vec_model = Doc2Vec(
    # Learning rate: alpha and min_alpha start out equal; the Epochs
    # generator lowers both by hand between training passes.
    alpha=0.025,
    min_alpha=0.025,
    # Model / vocabulary settings.
    size=100,
    min_count=10,
    sample=1e-5,
    negative=15,
    # Number of worker threads used for training.
    workers=8,
)

In [22]:
%%time
# One full pass over the corpus (a fresh DB query via sentences()) to build
# the model's vocabulary; this runs before any training pass.
doc2vec_model.build_vocab(sentences())

CPU times: user 848 ms, sys: 15 ms, total: 863 ms
Wall time: 1.31 s


In [25]:
def Epochs(model):
    """Endless generator that performs one training epoch per ``next()``.

    On every step it:

    1. prints the epoch number (starting at 0) and the current ``model.alpha``;
    2. trains ``model`` on a fresh ``sentences()`` pass;
    3. lowers ``model.alpha`` by 0.001, never going below 0.001, and sets
       ``model.min_alpha`` to the same value;
    4. saves the model to ``'france.model'``, overwriting the previous save.

    Nothing useful is yielded (the yielded value is ``None``); the generator
    exists only so the caller decides how many epochs to run.
    """
    for epoch_number in itertools.count():
        banner = 'Epoch {}. Alpha {:.3f}'.format(epoch_number, model.alpha)
        print(banner, end=' ')

        model.train(sentences())

        # Manual learning-rate decay with a floor of 0.001; min_alpha is kept
        # equal to alpha for the next pass.
        model.alpha = max(model.alpha - 0.001, 0.001)
        model.min_alpha = model.alpha

        # Checkpoint after every epoch.
        print('Saving...', end=' ')
        model.save('france.model')
        print('Saved.')

        yield

In [26]:
# Create the training generator. Epochs is a generator function, so nothing
# is trained or saved until next() is called on the result.
epochs = Epochs(doc2vec_model)

In [27]:
%%time
# Run 25 epochs. Each next() call does one training pass, one alpha decay
# step and one save to disk (see Epochs).
for _ in range(25):
    next(epochs)

Epoch 0. Alpha 0.025 Saving... Saved.
Epoch 1. Alpha 0.024 Saving... Saved.
Epoch 2. Alpha 0.023 Saving... Saved.
Epoch 3. Alpha 0.022 Saving... Saved.
Epoch 4. Alpha 0.021 Saving... Saved.
Epoch 5. Alpha 0.020 Saving... Saved.
Epoch 6. Alpha 0.019 Saving... Saved.
Epoch 7. Alpha 0.018 Saving... Saved.
Epoch 8. Alpha 0.017 Saving... Saved.
Epoch 9. Alpha 0.016 Saving... Saved.
Epoch 10. Alpha 0.015 Saving... Saved.
Epoch 11. Alpha 0.014 Saving... Saved.
Epoch 12. Alpha 0.013 Saving... Saved.
Epoch 13. Alpha 0.012 Saving... Saved.
Epoch 14. Alpha 0.011 Saving... Saved.
Epoch 15. Alpha 0.010 Saving... Saved.
Epoch 16. Alpha 0.009 Saving... Saved.
Epoch 17. Alpha 0.008 Saving... Saved.
Epoch 18. Alpha 0.007 Saving... Saved.
Epoch 19. Alpha 0.006 Saving... Saved.
Epoch 20. Alpha 0.005 Saving... Saved.
Epoch 21. Alpha 0.004 Saving... Saved.
Epoch 22. Alpha 0.003 Saving... Saved.
Epoch 23. Alpha 0.002 Saving... Saved.
Epoch 24. Alpha 0.001 Saving... Saved.
CPU times: user 3min 59s, sys: 7.94