In [0]:
import json
import itertools
import numpy as np
import os
import pandas as pd
import random
import torch
import twitter

from fastai.text import TextLMDataBunch, URLs
from fastai.text import language_model_learner
from sklearn.model_selection import train_test_split
from torch import nn, optim

random.seed(2)

## Loading history of tweets

You will need to create your own developer account here https://developer.twitter.com/en/apps and register a Twitter application to get neccessary tokens and save them in *credentials.json* in JSON format or put directly in code bellow



In [0]:
def remove_links(tweet):
    splited = tweet.split(' ')
    for i, word in enumerate(splited):
        if 'http' in word or 't.co' in word:
            splited[i] = 'HYPERLINK'
    return ' '.join(splited)

def remove_nonascii(tweet):
    return ''.join([char if ord(char) < 128 else '' for char in tweet])

def clean_tweet(tweet):
    processed = remove_links(tweet)
    processed = remove_nonascii(processed)
    return processed


BATCH_SIZE = 200


class TweetLoader:
    def __init__(self, credentials, name):
        self.name = name
        self.api = twitter.Api(**credentials)
        self.api.VerifyCredentials()


    def get_last_tweet_id(self):
        statuses = self.api.GetUserTimeline(
            screen_name=self.name,
            count=1,
            include_rts=False
        )
        first_tweet = statuses[0]
        return first_tweet.id

    def load_batch(self, id):
        statuses = self.api.GetUserTimeline(
            screen_name=self.name,
            count=BATCH_SIZE,
            include_rts=False,
            max_id=id
        )
        tweets = [status.full_text for status in statuses]
        last_tweet = statuses[-1]
        return tweets, last_tweet.id

    def load(self):
        prev_id = None
        id = self.get_last_tweet_id()
        raw_tweets = []
        batch_tweets, id = self.load_batch(id)
        while id != prev_id:
            raw_tweets += batch_tweets[1:]
            prev_id = id
            batch_tweets, id = self.load_batch(prev_id)
        clean_data = list(map(clean_tweet, raw_tweets))
        return clean_data

In [5]:
PATH_TO_CREDENTIALS = 'credentials.json'

if os.path.isfile(PATH_TO_CREDENTIALS):
    with open(PATH_TO_CREDENTIALS, 'r') as f:
        credentials = json.load(f)
else:
    raise FileNotFoundError('Please save tokens in `{}`'.format(PATH_TO_CREDENTIALS))

tweet_loader = TweetLoader(credentials, name='realDonaldTrump')
real_tweets = tweet_loader.load()

print('Loaded {} tweets'.format(len(real_tweets)))

Loaded 2752 tweets


## Generating tweets

In [0]:
# Inspired by https://github.com/paraschopra/generating-text-small-corpus

class TweetGenerator:
    def __init__(self, data, dropout=0.5):
        self.dropout = 0.5
        train_data, validation_data = train_test_split(
            list(map(lambda x: x.lower(), data)),
            test_size=0.05,
            random_state=1
        )
        self.train_df = pd.DataFrame({'tweet': train_data})
        self.validation_df = pd.DataFrame({'tweet': validation_data})
        self.trained = False
  
    def train(self, epochs=8, batch_size=32):
        self.data_lm = TextLMDataBunch.from_df(
            'data',
            self.train_df,
            self.validation_df,
            text_cols='tweet',
            bs=batch_size
        )
        if not self.trained:
            self.model = language_model_learner(self.data_lm, pretrained_model=URLs.WT103, drop_mult=self.dropout)
            self.model.fit_one_cycle(1, 1e-2)
            self.model.unfreeze()
            self.model.fit_one_cycle(1, 1e-3)
            self.trained = True
            self.model.fit(epochs, lr=1e-3, wd=1e-7)
    
    def generate(self, count=10, max_words=70):
        generated_tweets = []
        while len(generated_tweets) < count:
            raw_generated = self.model.predict("xxbos", n_words=max_words, temperature=0.8)
            raw_tweets = raw_generated.split("xxbos ")[1:]
            for tweet in raw_tweets:
                tweet = tweet.replace('hyperlink', '')[:-1]
                if tweet:
                    generated_tweets.append(tweet)
        return generated_tweets

In [7]:
tweet_generator = TweetGenerator(real_tweets)
tweet_generator.train(epochs=3, batch_size=32)
tweet_generator.train(epochs=2, batch_size=64)

generated_tweets = tweet_generator.generate(5)
print('\n'.join(generated_tweets))

the united states is with great progress from being corrupt on china , but i m looking forward to doing very well . but then it does , and can not do , it is dumb against china . this is the worst number they have ever seen . there are no barriers to china , and all over the world , yet i ve always been very positive abou

the u.s . , however , is not strong on trade , the economy and the economy . we have the right to leakers and the border ?

today , we have in the 2017 presidential election in california . we won in november and be a great senator for ohio . when we have meeting and vote for governor david j. trump , you will be leaving th

the washington post had some big quotes since the election for governor of the great state of michigan . if you have to wait until you see your speech , they will be free to do so . but did nt care about that money . it is a really big job !

the fake news is on frame . no official news was leaked . the story i


## Capitalizing words and cleaning sentences

Let's train char-RNN for proper word capitalizing

In [0]:
PAD_token = 2
EOS_token = 1 
UNK_token = 0

class Vocabulary:
    def __init__(self):
        self.char2index = {'UNK': UNK_token, 'EOS': EOS_token, 'PAD': PAD_token}
        self.num_chars = 3  # Count PAD, EOS, UNK

    def add_tweet(self, tweet):
        for char in tweet:
            self.add_char(char.lower())

    def add_char(self, char):
        if char not in self.char2index:
            self.char2index[char] = self.num_chars
            self.num_chars += 1


In [0]:
vocabulary = Vocabulary()
for tweet in real_tweets:
    vocabulary.add_tweet(tweet)

In [0]:
def prettify_tweet(tweet):
    while tweet.find('xxrep') != -1:
        rep_pos = tweet.find('xxrep')
        try:
            count = int(tweet[rep_pos + len('xxrep') + 1])
            char_to_rep = tweet[rep_pos + len('xxrep ') + 2]
            tweet = tweet[:rep_pos] + char_to_rep * count + tweet[rep_pos + len('xxrep ') + 3:]
        except:
            tweet = tweet.replace('xxrep', '')
    prepositions =['?', '!', ',', '.', '\'', '”', 'n\'t', '%', '$', ')', ':', '& ']
    postpositions = ['$', '#', '“', '(']
    for char in prepositions:
        tweet = tweet.replace(' ' + char, char)
    for char in postpositions:
        tweet = tweet.replace(' ' + char, char)
    return tweet

In [0]:
def get_indexes(voc, sentence):
    return [voc.char2index.get(char.lower(), UNK_token) for char in sentence]

def padd_batch(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def construct_input_batch(input_sentences, voc):
    indexes_batch = [get_indexes(voc, sentence) for sentence in input_sentences]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padded_input = padd_batch(indexes_batch)
    padded_input = torch.LongTensor(padded_input)
    return padded_input, lengths

def construct_target_batch(target_batch):
    padded_target = np.array(padd_batch(target_batch))
    mask = np.array(padded_target != PAD_token, dtype='int')
    mask = torch.FloatTensor(mask)
    padded_target = torch.FloatTensor(padded_target)
    return padded_target, mask

def construct_batch(voc, tweets):
    tweets.sort(key=lambda x: len(x), reverse=True)
    inp_batch, target_batch = [], []
    for tweet in tweets:
        inp_batch.append(tweet)
        target_batch.append(list(map(lambda x: int(x.isupper()), tweet)))
    inp_batch, lengths = construct_input_batch(inp_batch, voc)
    target_batch, mask = construct_target_batch(target_batch)
    return inp_batch, lengths, target_batch, mask

class CaseRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(CaseRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            dropout=(0 if n_layers == 1 else dropout),
            bidirectional=True
        )
        self.fc = nn.Linear(hidden_size * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_seq, input_lengths, hidden=None):
        embedded = self.embedding(input_seq)
        outputs, hidden = self.gru(embedded, hidden)
        outputs = self.fc(outputs)
        outputs = outputs.view((input_lengths[0], -1))
        outputs = self.sigmoid(outputs)
        return outputs, hidden
  
    def capitalize(self, tweets, threshold=0.6):
        pretty_tweets = list(map(prettify_tweet, tweets))
        inp, lengths, target, mask = construct_batch(vocabulary, pretty_tweets)
        outputs, _ = self.forward(inp.to(device), lengths)
        outputs = outputs.transpose(0, 1)
  
        capitalized_tweets = []
        for i, tweet in enumerate(pretty_tweets):
            is_upper = outputs[i] > threshold
            capitalized_tweet = ''
            for j, char in enumerate(tweet):
                if is_upper[j]:
                    capitalized_tweet += char.upper()
                else:
                    capitalized_tweet += char
            capitalized_tweets.append(capitalized_tweet)
        return capitalized_tweets

EPS = 1e-8

def masked_loss(output, target, mask):
    loss =  mask * (-target * torch.log2(outputs + EPS) - (1 - target) * torch.log2(1 - outputs + EPS))
    average_loss = loss.sum(0).mean()
    return average_loss



In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_SIZE = 70

embedding = nn.Embedding(vocabulary.num_chars, HIDDEN_SIZE)

tweet_capitalizer = CaseRNN(HIDDEN_SIZE, embedding, n_layers=2, dropout=0.3)
tweet_capitalizer.to(device)
optimizer = optim.Adam(tweet_capitalizer.parameters())

In [0]:
loss_history = []

N_ITERATIONS = 800

for i in range(N_ITERATIONS):
    inp, lengths, target, mask = construct_batch(vocabulary, [random.choice(real_tweets) for _ in range(48)])
    inp = inp.to(device)
    target = target.to(device)
    mask = mask.to(device)
  
    optimizer.zero_grad()
    outputs, _ = tweet_capitalizer(inp, lengths)
    loss = masked_loss(outputs, target, mask)
    loss.backward()
    optimizer.step()
  
    loss_history.append(loss.item())


In [14]:
tweet_capitalizer.capitalize(generated_tweets)

['The United States is with great progress from being corrupt on China, but I m looking forward to doing very well. But then it does, and can not do, it is dumb against China. This is the worst number they have ever seen. There are no barriers to China, and all over the world, yet I ve always been very positive abou',
 'The Washington post had some big quotes since the Election for Governor of the great state of Michigan. If you have to wait until you see your speech, they will be free to do so. But did nt care about that money. It is a really big job!',
 'Today, we have in the 2017 Presidential Election in California. We won in November and be a great Senator for Ohio. When we have meeting and vote for Governor David J. Trump, you will be leaving th',
 'The U.S., however, is not strong on trade, the economy and the economy. We have the right to leakers and the Border?',
 'The Fake News is on frame. No official News was leaked. The story I']