# Markov Chain based text generation for Tweets

In [None]:
# pip install tweepy
import tweepy

# Twitter API credentials read by get_user_tweets. They are blank here;
# presumably real values must be supplied before the API calls can succeed.
# The consumer key/secret pair is handed to tweepy.OAuthHandler.
CONSUMER_KEY = ''
CONSUMER_SECRET = ''

# The access token/secret pair is handed to the handler's set_access_token.
OAUTH_TOKEN = ''
OAUTH_TOKEN_SECRET = ''


def get_user_tweets(screen_name):
    """Return the text of every status the timeline cursor yields for a user.

    Builds an OAuth handler from the module-level credentials, walks the
    pages of ``api.user_timeline`` (requested with ``count=200`` and
    ``include_rts=True``) and collects the ``text`` attribute of each status.

    :param screen_name: handle forwarded to ``api.user_timeline``.
    :return: list of tweet texts, in the order the pages deliver them.
    """
    handler = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    handler.set_access_token(OAUTH_TOKEN, OAUTH_TOKEN_SECRET)
    client = tweepy.API(handler)

    timeline_pages = tweepy.Cursor(
        client.user_timeline,
        screen_name=screen_name,
        count=200,
        include_rts=True
    ).pages()

    # Flatten page by page, keeping only the text of each status.
    return [status.text for page in timeline_pages for status in page]

In [None]:
# Fetch the tweet texts of the account whose timeline serves as the corpus.
tweets = get_user_tweets('RealDonaldTrump')

In [None]:
# Number of tweet texts that were fetched.
len(tweets)

In [None]:
import re

# text cleanup

# Matches the retweet marker "RT" or the attribution word "via" only when they
# stand as whole words inside a token ("RT", "RT:", "(via").  A plain substring
# test would also throw away ordinary words such as "SMART", "PARTY" or
# "aviation".
_NOISE_TOKEN = re.compile(r'\b(?:RT|via)\b')


def tweet_cleanup(tw):
    """Split a tweet into words, dropping tokens that are not real content.

    Double quotes are removed from the text first.  A whitespace-separated
    token is then discarded when it

    * starts with ``@`` (a user mention),
    * contains ``http`` (a link), or
    * contains ``RT`` or ``via`` as a whole word (retweet / attribution
      markers).

    :param tw: raw tweet text.
    :return: list of the remaining tokens, in their original order.
    """
    cleaned = []
    for w in tw.replace('"', '').split():
        if w.startswith('@') or 'http' in w or _NOISE_TOKEN.search(w):
            continue
        cleaned.append(w)
    return cleaned

In [None]:
# Take the first element of the fetched list as a sample tweet and echo it.
t = tweets[0]
t

In [None]:
# Run the cleanup on the sample tweet to inspect which words it keeps.
tweet_cleanup(t)

In [None]:
# Build one flat list holding the cleaned words of every tweet, in tweet order.

all_words = []
for tw in tweets:
    # tweet_cleanup already returns whitespace-free tokens, so they can be
    # appended directly.
    all_words.extend(tweet_cleanup(tw))
len(all_words)

In [None]:
# triples - sliding window of three consecutive words over the long array

def triples(words):
    """Yield every run of three consecutive items of *words* as a tuple.

    Nothing is yielded when *words* holds fewer than three items.
    """
    # Pair the sequence with itself shifted by one and by two positions; zip
    # stops at the shortest input, i.e. after len(words) - 2 windows.
    for window in zip(words, words[1:], words[2:]):
        yield window

In [None]:
# Sanity check: show the triples produced from the first ten words.
for triple in triples(all_words[:10]):
    # print(...) with a single argument gives the same output on Python 2 and
    # Python 3, whereas the bare print statement is a syntax error on Python 3.
    print(triple)

In [None]:
from collections import defaultdict

# Transition table of the chain: each key is a "source" state (a tuple of two
# consecutive words) and its value lists every word seen right after that pair.
# Repeated followers are kept, so drawing uniformly from a list reproduces the
# observed frequencies.

transitions = defaultdict(list)
for first, second, follower in triples(all_words):
    state = (first, second)
    transitions[state].append(follower)

In [None]:
import numpy as np

# Generate one pseudo-tweet by walking the chain stored in `transitions`.

# let's pick a start position by random (called: seed); stopping two words
# short of the end guarantees the seed pair has at least one recorded follower
seed = np.random.choice(len(all_words) - 2)

# our beginning two words
w1, w2 = all_words[seed], all_words[seed + 1]

# some parameters for our generated text (lengths are counted in words)
max_length = 20
min_length = 5

# choose a somewhat random length for our text; randint excludes its upper
# bound, hence the +1 so that max_length itself can still be drawn
# (random_integers, used previously, is deprecated in NumPy)
chosen_length = np.random.randint(min_length, max_length + 1)

# the text starts with the seed pair, so no word of the walk is skipped
generated = [w1, w2]

# create the text using our transitions dictionary
while len(generated) < chosen_length:

    # .get() avoids inserting an empty entry into the defaultdict
    followers = transitions.get((w1, w2))
    if not followers:
        # dead end: this pair only occurs at the very end of the corpus, and
        # np.random.choice would raise on an empty list
        break

    # slide the two-word state forward by one word
    w1, w2 = w2, np.random.choice(followers)
    generated.append(w2)

# if our last word doesn't end with punctuation, simply add some to it
if generated[-1][-1] not in '.!?-:':
    generated[-1] += np.random.choice(['!', '.', '?', '-'])

# NOTE: capitalize() upper-cases the first character and lower-cases the rest
# of the sentence.
print(' '.join(generated).capitalize())