# Markov Chain based text generation for Tweets

In [83]:
# pip install tweepy
import tweepy

CONSUMER_KEY = ''
CONSUMER_SECRET = ''

OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''


def get_user_tweets(screen_name):

    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
    api = tweepy.API(auth)

    ans = []
    for p in tweepy.Cursor(api.user_timeline, screen_name=screen_name, count=200, include_rts=True).pages():
        for t in p:
            ans.append(t)
    return [tweet.text for tweet in ans]

In [84]:
tweets = get_user_tweets('RealDonaldTrump')



In [85]:
len(tweets)

3205

In [86]:
import re

# text cleanup
def tweet_cleanup(tw):
    tw = re.sub('["]','',tw)
    
    cleaned = []
    for w in tw.split():
        if w.startswith('@'):
            continue
        if 'RT' in w:
            continue
        if 'http' in w:
            continue
        if 'via' in w:
            continue
        cleaned.append(w)
        
    return cleaned

In [87]:
t = tweets[0]
t

u'"@giatny:  Rubio an orator/liar like Obama but totally unqualified. Rubio visa bill did NOT protect American workers. See Disney."'

In [89]:
' '.join(tweet_cleanup(t))

u'Rubio an orator/liar like Obama but totally unqualified. Rubio visa bill did NOT protect American workers. See Disney.'

In [90]:
# place all the words in one long array

all_words = (' '.join([' '.join(tweet_cleanup(tw)) for tw in tweets])).split()
len(all_words)

46896

In [92]:
all_words[:10]

[u'Rubio',
 u'an',
 u'orator/liar',
 u'like',
 u'Obama',
 u'but',
 u'totally',
 u'unqualified.',
 u'Rubio',
 u'visa']

In [None]:
# triples - create combinations of three consecutive words from long array

def triples(words):
    for i in range(len(words) - 2):
        yield (words[i], words[i+1], words[i+2])

In [93]:
for i in triples(all_words[:10]):
    print i

(u'Rubio', u'an', u'orator/liar')
(u'an', u'orator/liar', u'like')
(u'orator/liar', u'like', u'Obama')
(u'like', u'Obama', u'but')
(u'Obama', u'but', u'totally')
(u'but', u'totally', u'unqualified.')
(u'totally', u'unqualified.', u'Rubio')
(u'unqualified.', u'Rubio', u'visa')


In [94]:
from collections import defaultdict

# create a dictionary of all transitions: keys are "source" state (tuple of two words), and the list represents 
# the "destination" state, which includes all options for the next word, given the existing two words

transitions = defaultdict(list)
for w1, w2, w3 in triples(all_words):
    transitions[(w1,w2)].append(w3)

In [96]:
transitions.keys()[:10]

[(u'benefits', u'than'),
 (u'Trump..Please..He', u'is'),
 (u'MONEY,', u"HE'S"),
 (u'has', u'awakened'),
 (u'yesterday,', u'massive'),
 (u'#trump2016', u'So'),
 (u'ppl', u'that'),
 (u'I', u"don't"),
 (u'ridiculous', u'$1,000,000'),
 (u'us', u'out')]

In [98]:
transitions[('I','am')]

[u'now',
 u'sure',
 u'leaving',
 u'fighting',
 u'winning',
 u'giving',
 u'not',
 u'going',
 u'Hispanic',
 u'now',
 u'having',
 u'attracting',
 u'in',
 u'having',
 u'ahead',
 u'with',
 u'72',
 u'not',
 u'only',
 u'self-funding',
 u'very',
 u'in',
 u'self',
 u'trying',
 u'with',
 u'going',
 u'looking',
 u'elected',
 u'very',
 u'a',
 u'so',
 u'rapidly',
 u'#1',
 u'now',
 u'a',
 u'going',
 u'not',
 u'no',
 u'now',
 u'Protestant',
 u'#1',
 u'#1',
 u'gonna',
 u'so',
 u'campaigning',
 u'standing',
 u'right.',
 u'the',
 u'the',
 u'a',
 u'number',
 u'sure',
 u'with',
 u'really',
 u'really',
 u'alone',
 u'so',
 u'running',
 u'behind',
 u'stunned',
 u'officially',
 u'a',
 u'really',
 u'betting',
 u'to']

In [109]:
import numpy as np

# let's pick a start position by random (called: seed)
seed = np.random.choice(len(all_words)-2)

# our beginning two words
w1, w2 = all_words[seed], all_words[seed+1]

# some parameters for our generated text
max_length = 20
min_length = 5
generated = []

# choose a somewhat random length for our text
chosen_length = np.random.random_integers(min_length, max_length)

# create the text using our transitions dictionary
while len(generated)<max_length:
    
    generated.append(w1)
    w1, w2 = w2, np.random.choice(transitions[(w1, w2)])
    
    # if our generated string is longer or equal to our wanted length
    if len(generated)>=chosen_length:
        
        # if our current last word doesn't end with punctuation
        if w2[-1] not in '.!?-:':
            
            # simply add punctuation to it
            w2+=np.random.choice(['!','.','?','-'])
        break

# attach the last word to our string
generated.append(w2)    


print ' '.join(generated).capitalize()

Went really well. big and wonderful crowd. just arrived in l.a. aboard the - the battleship of join-
