In [1]:
from random import random

from gensim.models.doc2vec import LabeledSentence, Doc2Vec

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects import postgresql
from  sqlalchemy.sql.expression import func

from poultry import readline_dir
from poultry import tweet

In [2]:
import logging

# Send DEBUG-and-above records to flock.log, truncating the file on each run.
# The keyword is `filemode`, not `mode`: basicConfig does not accept `mode`
# (Python 3 raises ValueError for it; Python 2 silently ignores it and keeps
# appending to the existing log).
logging.basicConfig(filename='flock.log', filemode='w', level=logging.DEBUG)

# Notebook-wide logger used by the loading cell.
logger = logging.getLogger('flock')

In [3]:
# Engine for the PostgreSQL database `sample` on localhost, connecting as user `dm303`.
engine = create_engine('postgresql://dm303@localhost/sample')

In [4]:
# Session factory bound to `engine`; calling Session() produces a new ORM session.
Session = sessionmaker(bind=engine)

In [5]:
# The single session used by the loading and query cells in this notebook.
session = Session()

In [6]:
# Declarative base class; models that subclass it are registered on Base.metadata.
Base = declarative_base()

In [7]:
class Tweet(Base):
    """ORM mapping for the ``tweets`` table: one row per stored tweet."""

    __tablename__ = 'tweets'
    
    # Integer primary key of the row. It is never set explicitly in this
    # notebook — presumably assigned by the database; confirm if it matters.
    id = Column(Integer, primary_key=True)
    
    # Tweet payload kept in a PostgreSQL JSONB column, which is what allows
    # the `.contains({...})` filters used by the query cells.
    raw_data = Column(postgresql.JSONB)

In [8]:
# WARNING: destructive. Drops every table registered on Base.metadata
# (here: `tweets`), discarding rows loaded by an earlier run of this notebook.
Base.metadata.drop_all(engine)

In [9]:
# Create the tables for all models registered on Base.metadata (here: `tweets`).
Base.metadata.create_all(engine)

In [None]:
%%time

for i, tweet in enumerate(readline_dir('/import/dima-scratch/flock/data/brexit.jul23-24')):

    if (i  % 100000) == 1:
        logger.debug('Processed %s tweets, it\'s time to commit %s items.', i, len(session.new))
        session.commit()

    session.add(Tweet(raw_data=tweet.parsed))

session.commit()

In [None]:
# Total number of rows in the `tweets` table.
session.query(Tweet).count()

In [None]:
# Number of tweets whose JSON payload contains {'lang': 'en'}.
session.query(Tweet).filter(Tweet.raw_data.contains({'lang': 'en'})).count()

In [None]:
# Number of tweets whose JSON payload contains {'lang': 'it'}.
session.query(Tweet).filter(Tweet.raw_data.contains({'lang': 'it'})).count()

In [None]:
# Number of tweets whose JSON payload contains {'lang': 'de'}.
session.query(Tweet).filter(Tweet.raw_data.contains({'lang': 'de'})).count()

In [None]:
%%time
# Random selection of tweets: list the `id` field of 10 rows, with the whole
# table ordered by func.random() before the limit is applied.
[t.raw_data['id'] for t in session.query(Tweet).order_by(func.random()).limit(10)]

In [None]:
# Inspect the `id_str` field of one randomly chosen stored tweet.
# The row is fetched here explicitly: relying on a `t` left behind by the list
# comprehension in the previous cell only works on Python 2 (comprehension
# variables do not leak on Python 3) and only if that cell has just been run.
session.query(Tweet).order_by(func.random()).first().raw_data['id_str']

In [None]:
def labeled_sentences(tweets):
    """Lazily turn stored tweets into ``LabeledSentence`` objects.

    Args:
        tweets: iterable of objects exposing a ``raw_data`` mapping with the
            keys ``text``, ``id_str``, ``user.screen_name`` and
            ``entities.hashtags`` (each hashtag being a mapping with ``text``).

    Yields:
        One ``LabeledSentence`` per tweet. ``words`` is the tweet text split on
        whitespace; ``tags`` is ``id:<id_str>``, ``@<screen_name>`` and then one
        ``#<hashtag>`` entry per hashtag, in that order.

    Raises:
        KeyError: if a tweet's ``raw_data`` lacks one of the keys above.
    """
    for status in tweets:
        payload = status.raw_data

        tokens = payload['text'].split()  # TODO: tokenise.

        # The tweet itself, then its author, then every hashtag it carries.
        labels = [
            u'id:{}'.format(payload['id_str']),
            u'@{}'.format(payload['user']['screen_name']),
        ]
        for hashtag in payload['entities']['hashtags']:
            labels.append(u'#{}'.format(hashtag['text']))

        yield LabeledSentence(words=tokens, tags=labels)

In [None]:
# Disabled smoke test: materialise the labelled sentences for 10 random tweets.
# list(
#     labeled_sentences(
#         session.query(Tweet).order_by(func.random()).limit(10)
#     )
# )

In [None]:
def sentences():
    """Return a new generator of labelled sentences over all stored tweets.

    Each call builds a new query ordered by ``func.random()``, so every pass
    over the data (vocabulary build, each training epoch) gets its own stream.
    """
    shuffled_tweets = session.query(Tweet).order_by(func.random())
    return labeled_sentences(shuffled_tweets)

In [None]:
# alpha and min_alpha start equal; the training cell lowers both by hand.
# NOTE(review): workers=64 is hard-coded — confirm it suits the host machine.
model = Doc2Vec(alpha=0.025, min_alpha=0.025, workers=64)  # use fixed learning rate

In [None]:
%%time
# Vocabulary pass over one freshly queried stream of labelled tweets.
model.build_vocab(sentences())

In [None]:
%%time
# Manual learning-rate schedule: one train() call per epoch over a newly
# queried, randomly ordered stream of tweets, lowering alpha by 0.002 after
# each call and pinning min_alpha to it so there is no decay within a call.
# NOTE(review): newer gensim releases expect explicit total_examples/epochs
# arguments to train() — confirm against the installed gensim version.
for epoch in range(10):
    print('Epoch: {}'.format(epoch))
    model.train(sentences())
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay