In [10]:
import csv
import re

In [21]:
# Patterns are raw strings so that `\s`, `\w`, ... reach the regex engine
# verbatim instead of being parsed as (invalid) string escape sequences.
_TCO_LINK_RE = re.compile(r'https?://t\.co/\w+')
_MENTION_RE = re.compile(r'@\w+')
_USER_RUN_RE = re.compile(r'(?:USER\s*)+')
_URL_RUN_RE = re.compile(r'(?:URL\s*)+')


def normalize_tweets(jack_tweets):
    """Turn raw tweet rows into normalized text for the language model.

    Despite the parameter name, this is applied to every account's tweets,
    not only jack's.

    Processing, per row:
      1. Rows with fewer than two fields (e.g. the empty list csv.reader
         yields for a blank line) are skipped.
      2. Rows whose text (field 1) starts with 'RT' are dropped.
      3. The text is lowercased.
      4. t.co links become "URL" and @mentions become "USER".
      5. Each run of consecutive USER (or URL) placeholders, together with
         the whitespace after it, collapses to a single "USER " / "URL ".
         The replacement always ends with one space, so "USER!" becomes
         "USER !" and a placeholder at the end of a tweet leaves a
         trailing space.

    Args:
        jack_tweets: iterable of rows (sequences of strings); index 1 of
            each row is the tweet text.

    Returns:
        list[str]: normalized tweet texts, in input order.
    """
    normalized = []
    for row in jack_tweets:
        if len(row) < 2:
            # Malformed or blank row: there is no text column to read.
            continue

        text = row[1]
        if text.startswith('RT'):
            continue

        # Lowercasing happens before substitution, so the uppercase
        # placeholders can only come from the replacements below.
        text = text.lower()

        # Canonicalize links to "URL" and @mentions to "USER".
        text = _TCO_LINK_RE.sub('URL', text)
        text = _MENTION_RE.sub('USER', text)

        # Merge repeated placeholders ("USER USER" -> "USER ") so runs of
        # mentions or links do not bias the language model.
        text = _USER_RUN_RE.sub('USER ', text)
        text = _URL_RUN_RE.sub('URL ', text)

        normalized.append(text)

    return normalized

In [23]:
# Build a language model of all the tweets
tweet_corpus = ['data/jack.csv', 'data/peterthiel.csv', 'data/realDonaldTrump.csv']

all_tweets = []
for someone in tweet_corpus:
    # `with` closes the file handle once the rows are read. newline='' is
    # what the csv module documentation asks for, so that newlines embedded
    # in quoted fields are parsed correctly.
    with open(someone, 'r', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)

        # The first row is kept apart from the tweets; the None default
        # avoids StopIteration on an empty file.
        columns = next(csv_reader, None)
        someones_tweets = list(csv_reader)

    someones_normalized_tweets = normalize_tweets(someones_tweets)

    all_tweets.extend(someones_normalized_tweets)

    # Per-file count of tweets that survived normalization.
    print(len(someones_normalized_tweets))

# Total size of the combined corpus.
print(len(all_tweets))


527
5
1491
2023


In [25]:
# Dump the combined corpus to disk, each tweet followed by a newline, as
# input for the KenLM command-line tools listed below.
with open("data/all_tweets.txt", "w") as f:
    f.writelines("{}\n".format(tweet) for tweet in all_tweets)

# Shell commands used to turn the text file into the binary model:
# cat data/all_tweets.txt | python process.py | ./kenlm/bin/lmplz -o 3 > data/tweets.arpa
# ./kenlm/bin/build_binary data/tweets.arpa data/tweets.klm

In [26]:
import kenlm

In [27]:
# Load the binary model file named in the build_binary command above.
model = kenlm.LanguageModel('data/tweets.klm')
# NOTE(review): KenLM's Python docs describe score() as the sentence's log10
# probability, so values closer to 0 mean "more likely" — confirm.
model.score('in the beginning was the word a')

-15.884644508361816

In [29]:
# Probe: same "thank you" prefix as the next cell, different place name.
model.score("thank you reno")

-6.473952293395996

In [30]:
# Probe: compare with the "thank you reno" score from the previous cell.
model.score("thank you nevada")

-5.952385425567627

In [32]:
# Probe: score a short two-word phrase.
model.score("crooked hillary")

-4.437318801879883

In [33]:
# Probe: score a four-word sentence for comparison with the shorter phrase above.
model.score("hillary is a liar")

-10.082124710083008

In [34]:
# Exploration only: list the attributes exposed by the model object
# (the public ones shown are full_scores, order, path and score).
dir(model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'full_scores',
 'order',
 'path',
 'score']

In [35]:
# Location the model was loaded from.
# NOTE(review): the rendered output is an absolute path on the author's
# machine; consider clearing it before sharing the notebook.
model.path


b'/Users/what/Documents/code/legomystego/data/tweets.klm'

In [36]:
# n-gram order of the loaded model; the value 3 presumably comes from the
# `lmplz -o 3` build command — confirm.
model.order

3

In [43]:
# Per-token breakdown of a sentence score. full_scores() is wrapped in
# list() so all entries are displayed at once.
# NOTE(review): KenLM documents each entry as (log10 prob, matched n-gram
# length, ...), with one extra entry for the end-of-sentence token — confirm.
list(model.full_scores("make america safe and great again"))

[(-2.605342388153076, 2),
 (-0.13421054184436798, 3),
 (-0.47079798579216003, 3),
 (-0.2677556574344635, 3),
 (-0.06159810349345207, 3),
 (-0.10894644260406494, 3),
 (-3.1698803901672363, 1)]

In [54]:
# Probe: a longer sentence combining words from the phrases scored above.
model.score("crooked and we are going to make america great again")

-12.686223983764648

In [59]:
# Probe: note the input reads "thank for ..." (no "you") — presumably
# deliberate, to see how the model scores it; verify.
model.score("thank for the incredible support this morning tampa")

-15.567790985107422