In [1]:
from random import random

from gensim.models.doc2vec import LabeledSentence, Doc2Vec

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects import postgresql
from  sqlalchemy.sql.expression import func

from poultry import readline_dir
from poultry import tweet

In [2]:
import logging

# Send DEBUG-and-above records to flock.log, truncating the file on every run.
# The keyword that selects the open mode is ``filemode``; ``mode`` is not a
# basicConfig option (Python 2 silently ignores it, so the log kept growing,
# and Python 3 rejects it with ValueError).
logging.basicConfig(filename='flock.log', filemode='w', level=logging.DEBUG)
logger = logging.getLogger('flock')

In [3]:
# Connect to the PostgreSQL database "sample" on localhost as user dm303.
engine = create_engine('postgresql://dm303@localhost/sample')

In [4]:
# Session factory; every session it produces is bound to `engine`.
Session = sessionmaker(bind=engine)

In [5]:
# The one working session shared by the loading and querying cells below.
session = Session()

In [6]:
# Declarative base class that the ORM model(s) in this notebook inherit from.
Base = declarative_base()

In [7]:
class Tweet(Base):
    """ORM model for one row of the ``tweets`` table.

    Each row stores a tweet's parsed JSON payload in a PostgreSQL JSONB column.
    """
    __tablename__ = 'tweets'
    
    # Integer primary key (the loading cell sets only raw_data, not this column).
    id = Column(Integer, primary_key=True)
    
    # The tweet payload as JSONB; later cells filter on it with .contains()
    # and read keys such as 'id', 'id_str', 'text', 'user' and 'entities'.
    raw_data = Column(postgresql.JSONB)

In [8]:
# WARNING: destructive. Drops every table registered on Base.metadata, so
# re-running this cell discards any rows that were loaded previously.
Base.metadata.drop_all(engine)

In [None]:
# Create the tables declared on Base.metadata (here: tweets).
Base.metadata.create_all(engine)

In [None]:
%%time

# Load every tweet from the dump directory into the tweets table, committing
# in fixed-size batches so the set of pending objects stays bounded.
BATCH_SIZE = 100000

try:
    # Count from 1 so `count` is the number of tweets added so far. The loop
    # variable is deliberately not called `tweet`: that name is the module
    # imported from poultry and must not be rebound at notebook scope.
    for count, record in enumerate(
        readline_dir('/import/dima-scratch/flock/data/brexit.jul23-24'), start=1
    ):
        session.add(Tweet(raw_data=record.parsed))

        # Commit once a full batch is pending. The previous check
        # (i % 100000 == 1) committed a lone first row before settling
        # into full batches.
        if count % BATCH_SIZE == 0:
            logger.debug('Processed %s tweets, it\'s time to commit %s items.', count, len(session.new))
            session.commit()

    # Commit whatever is left from the final, partial batch.
    session.commit()
except Exception:
    # Keep the shared session usable by later cells after a failed load.
    session.rollback()
    raise

In [11]:
# Total number of rows in the tweets table.
session.query(Tweet).count()

3703196L

In [12]:
# Number of stored tweets whose JSON payload contains {"lang": "en"}.
(
    session.query(Tweet)
    .filter(Tweet.raw_data.contains({'lang': 'en'}))
    .count()
)

2459735L

In [None]:
# Number of stored tweets whose JSON payload contains {"lang": "it"}.
(
    session.query(Tweet)
    .filter(Tweet.raw_data.contains({'lang': 'it'}))
    .count()
)

116503L

In [None]:
# Number of stored tweets whose JSON payload contains {"lang": "de"}.
(
    session.query(Tweet)
    .filter(Tweet.raw_data.contains({'lang': 'de'}))
    .count()
)

In [None]:
%%time
# Random selection of tweets: list the 'id' field of 10 rows picked by
# ordering the query on the database's random() function.
# NOTE: under Python 2 the comprehension variable `t` leaks into the notebook
# namespace after this cell runs, so it should not be renamed casually.
[t.raw_data['id'] for t in session.query(Tweet).order_by(func.random()).limit(10)]

In [None]:
# Show the string form of one randomly chosen tweet's id. The tweet is fetched
# explicitly rather than read from `t`, a name that only exists when a Python 2
# list comprehension in another cell has leaked its loop variable (hidden
# state; a NameError on a fresh kernel or under Python 3).
session.query(Tweet).order_by(func.random()).first().raw_data['id_str']

In [None]:
def labeled_sentences(tweets):
    """Lazily convert stored tweets into ``LabeledSentence`` items.

    For each element of ``tweets`` (an iterable of objects exposing a
    ``raw_data`` mapping) one ``LabeledSentence`` is yielded with:

    * ``words``: the tweet text split on whitespace;
    * ``tags``: ``id:<id_str>``, ``@<screen_name>`` of the author, then one
      ``#<hashtag>`` entry per hashtag listed under ``entities``.
    """
    for item in tweets:
        payload = item.raw_data

        words = payload['text'].split()  # TODO: tokenise.

        tags = [
            u'id:{}'.format(payload['id_str']),
            u'@{}'.format(payload['user']['screen_name']),
        ]
        for hashtag in payload['entities']['hashtags']:
            tags.append(u'#{}'.format(hashtag['text']))

        yield LabeledSentence(words=words, tags=tags)

In [None]:
# list(
#     labeled_sentences(
#         session.query(Tweet).order_by(func.random()).limit(10)
#     )
# )

In [None]:
def sentences():
    """Return a fresh stream of labelled sentences over all stored tweets.

    Every call builds a new query that orders the tweets by the database's
    random() function and wraps it with ``labeled_sentences``.
    """
    shuffled_tweets = session.query(Tweet).order_by(func.random())
    return labeled_sentences(shuffled_tweets)

In [None]:
# Use a fixed learning rate: alpha and min_alpha start out equal.
model = Doc2Vec(
    alpha=0.025,
    min_alpha=0.025,
    workers=64,
)

In [None]:
%%time
# Build the model's vocabulary from one stream of labelled tweets before training.
model.build_vocab(sentences())

In [None]:
%%time
# Ten manual training passes; each pass pulls a fresh sentences() stream and
# the learning rate is stepped down by hand afterwards (0.025 minus 0.002 per pass).
for epoch in range(10):
    print('Epoch: {}'.format(epoch))
    model.train(sentences())
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

In [25]:
# Entries the trained model ranks closest to the token '#leave'.
model.most_similar('#leave')

[(u'Leave', 0.3594135344028473),
 (u'#leave?', 0.3591740131378174),
 (u'#Rexit', 0.34052973985671997),
 (u'https://t.co/Bdbs3ZfY9D', 0.3333190083503723),
 (u'#Remain', 0.33204835653305054),
 (u'#leave.', 0.3283088207244873),
 (u'#leave!', 0.3253001570701599),
 (u'#EuropeanUnion', 0.3220774233341217),
 (u'counter', 0.3179582953453064),
 (u'#Bremain', 0.30828213691711426)]

In [26]:
# Entries the trained model ranks closest to the token '#remain'.
model.most_similar('#remain')

[(u'#UK', 0.4775325059890747),
 (u'#remain,', 0.4151751399040222),
 (u'Leave', 0.38699883222579956),
 (u'#remain.', 0.38196349143981934),
 (u'#EURef', 0.3802741765975952),
 (u'Remain', 0.355394184589386),
 (u'https://t.co/IQGSj1V3pU', 0.35470372438430786),
 (u'#Remain', 0.34487384557724),
 (u'#ShoutNews', 0.3439306616783142),
 (u'#canneslions', 0.3398371636867523)]

In [29]:
# Entries the trained model ranks closest to the token 'boris'.
model.most_similar('boris')

[(u'Boris', 0.3845505118370056),
 (u'https://t.co/IFdi5TuKqE', 0.3593224883079529),
 (u'POTUS', 0.3321494162082672),
 (u'he', 0.3153676688671112),
 (u'EWING', 0.306194543838501),
 (u'trump', 0.30460125207901),
 (u'@Nivo0o0:', 0.294739305973053),
 (u'naus\xe9abond', 0.2929772138595581),
 (u'https://t.co/Z8hp1DQO2j', 0.2872544527053833),
 (u'she', 0.28533172607421875)]

In [30]:
# Entries the trained model ranks closest to the token 'bus'.
model.most_similar('bus')

[(u'fire', 0.5007041692733765),
 (u'table', 0.39386045932769775),
 (u'sun', 0.3847666382789612),
 (u'air', 0.38156476616859436),
 (u'weekend', 0.37769317626953125),
 (u'kettle', 0.37703073024749756),
 (u'band', 0.3667536973953247),
 (u'island', 0.35691404342651367),
 (u'water', 0.35450342297554016),
 (u'bag', 0.35022270679473877)]