In [173]:
import pandas as pd
import json
import csv
import re
import nltk

# In this file we take the raw data (found in trumps twitter archive) and 
# combine it into a single file, which we manipulate accordingly.

In [174]:
def get_raw_data(filename):
    '''Get the raw json data downloaded from the tweepy API'''
    data = None
    with open(filename) as f:
        data = json.load(f)
    return data

In [175]:
data_files = ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']
tweet_years = [get_raw_data('./data/trump_tweet_data_archive/master_' + f + '.json') for f in data_files]
tweets = []
for tweet_year in tweet_years:
    tweets += tweet_year    

In [176]:
def get_input_fields(tweets):
    '''Get all of the fields in the data (rows in the csv header). Note that for the csv 
    file we only include fields that we can put in a csv file
    '''
    all_fields = []
    csv_fields = []
    for key, value in tweets[0].iteritems():
        valid_types = (int, float, unicode, str, bool)
        if any([isinstance(value, valid_type) for valid_type in valid_types]):
            csv_fields.append(key)
        all_fields.append(key)
    return (all_fields, csv_fields)

In [177]:
all_fields, csv_fields = get_input_fields(tweets)
print(all_fields)
print(csv_fields)

[u'contributors', u'truncated', u'text', u'is_quote_status', u'in_reply_to_status_id', u'id', u'favorite_count', u'source', u'retweeted', u'coordinates', u'entities', u'in_reply_to_screen_name', u'in_reply_to_user_id', u'retweet_count', u'id_str', u'favorited', u'user', u'geo', u'in_reply_to_user_id_str', u'lang', u'created_at', u'in_reply_to_status_id_str', u'place']
[u'truncated', u'text', u'is_quote_status', u'id', u'favorite_count', u'source', u'retweeted', u'retweet_count', u'id_str', u'favorited', u'lang', u'created_at']


In [178]:
def json_2_csv(json, filename, fields):
    '''Convert a python dict to csv and write it to a file'''
    fail_count = 0
    with open(filename, 'w') as f:
        # First erase all the previous contents of the file
        f.truncate()
        # create the csv writer object
        writer = csv.writer(f)
        rows = 0
        for entry in json:
            if rows == 0:
                header = fields
                writer.writerow(header)
                rows += 1
            try:
                vals = [entry[key] for key in fields]
                writer.writerow(vals)
            except UnicodeEncodeError:
                fail_count += 1
    print "Could not parse the contents of %d entries" % fail_count

In [179]:
# Save the tweets in a csv format
json_2_csv(tweets, "trumps_tweets.csv", csv_fields)

Could not parse the contents of 3854 entries


In [180]:
# Now lets read the contents of the file with pandas
data = pd.read_csv("trumps_tweets.csv", header=0)

In [181]:
# Let's examine the data...everything looks good
data

Unnamed: 0,truncated,text,is_quote_status,id,favorite_count,source,retweeted,retweet_count,id_str,favorited,lang,created_at
0,False,Trump International Tower in Chicago ranked 6t...,False,6312794445,6,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,33,6312794445,False,en,Thu Dec 03 19:39:09 +0000 2009
1,False,Wishing you and yours a very Happy and Bountif...,False,6090839867,11,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,13,6090839867,False,en,Thu Nov 26 19:55:38 +0000 2009
2,False,Donald Trump Partners with TV1 on New Reality ...,False,5775731054,3,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,5,5775731054,False,en,Mon Nov 16 21:06:10 +0000 2009
3,False,"Hear Donald Trump discuss big gov spending, ba...",False,5069623974,2,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,2,5069623974,False,en,Thu Oct 22 13:57:04 +0000 2009
4,False,Watch video of Ivanka Trump sharing business a...,False,4862580190,10,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,4,4862580190,False,en,Wed Oct 14 14:13:17 +0000 2009
5,False,- Read what Donald Trump has to say about daug...,False,4629116949,4,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,1,4629116949,False,en,Mon Oct 05 14:37:38 +0000 2009
6,False,"""A lot of people have imagination, but can't e...",False,4472353826,30,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,23,4472353826,False,en,Tue Sep 29 15:28:23 +0000 2009
7,False,Read Donald Trump's Top Ten Tips for Success: ...,False,3982428551,30,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,22,3982428551,False,en,Mon Sep 14 15:50:14 +0000 2009
8,False,- More hysterical DSRL videos featuring Donald...,False,3688564134,6,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,7,3688564134,False,en,Tue Sep 01 13:55:34 +0000 2009
9,False,- Donald Trump bids to buy the Oreo Double Stu...,False,3627328938,3,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,5,3627328938,False,en,Sat Aug 29 16:23:56 +0000 2009


In [182]:
# The shape of the data (num_rows, num_fields)
data.shape

(26745, 12)

In [183]:
# Now lets create a bag of words representation of our data for basic NLP
# Start by trying the process on one of our text entries. We bascially 
# replace anything that is not a letter with a space.
letters_only = re.sub("[^a-zA-Z]", " ", data["text"][0]).lower().split()  # The text to search
print letters_only

['trump', 'international', 'tower', 'in', 'chicago', 'ranked', 'th', 'tallest', 'building', 'in', 'world', 'by', 'council', 'on', 'tall', 'buildings', 'urban', 'habitat', 'http', 'bit', 'ly', 'sqvqq']


In [184]:
# Hmm looks like there was a link which we will need to deal with. We'll
# get back to that later.

# We need to get rid of stop words (common words that don't carry much
# meaning). To do this we use nltk: (download the stopwords corpus)
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [185]:
# Let's check to see that the stopwords were downloaded
from nltk.corpus import stopwords # Import the stop word list
print stopwords.words("english") 

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [186]:
# Looks good. Now we need to filter out these words from our representation
letters_only = [w for w in letters_only if w not in stopwords.words('english')]
print(letters_only)

['trump', 'international', 'tower', 'chicago', 'ranked', 'th', 'tallest', 'building', 'world', 'council', 'tall', 'buildings', 'urban', 'habitat', 'http', 'bit', 'ly', 'sqvqq']


In [187]:
# Let's put it all together in a function
def clean_text(text):
    '''Function to remove punctuation and stopwords from text. Note that we keep
    the @ symbol as a hashtag adds meaning'''
    # First let's get rid of links, as they will mess up our bag or words...for 
    # example, one of the links is "https://t.co/BSp685Q9Qf https://t.co/K7yeBZsf6r'",
    # which just turns into gibberish
    # source: http://stackoverflow.com/questions/6883049/regex-to-find-urls-in-string-in-python
    link_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    no_links = re.sub(link_regex, " ", text)
    word_list = re.sub("[^@a-zA-Z]", " ", no_links).lower().split()
    # Searching a set is faster in python
    stops = set(stopwords.words("english"))
    # blacklist of words that we should ignore...for example amp is really just &,
    # so it appears a ton but has no semantic value
    blacklist = ['amp']
    clean_list = [w for w in word_list if (w not in stops and w not in blacklist)]
    return " ".join( clean_list )

In [188]:
# Checking that it works:
print data["text"][2]
print clean_text(data["text"][2])

Donald Trump Partners with TV1 on New Reality Series Entitled, Omarosa's Ultimate Merger: http://tinyurl.com/yk5m3lc
donald trump partners tv new reality series entitled omarosa ultimate merger


In [189]:
# We now add a clean_text column to our data frame
data['clean_text'] = data.apply(lambda x: clean_text(x['text']), axis=1)

In [190]:
# Let's also change the source column to just be "android" or "iphone",
# as this is easier to parse. First we check all the different possible
# sources
set(data['source'])

{'<a href="http://beta.twitlonger.com" rel="nofollow">TwitLonger Beta</a>',
 '<a href="http://instagram.com" rel="nofollow">Instagram</a>',
 '<a href="http://twitter.com" rel="nofollow">Twitter QandA</a>',
 '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
 '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>',
 '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>',
 '<a href="http://www.Neatly.me" rel="nofollow">Neatly For BlackBerry 10</a>',
 '<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>',
 '<a href="http://www.twitlonger.com" rel="nofollow">Twitlonger</a>',
 '<a href="http://www.twitter.com" rel="nofollow">Twitter Mirror for iPad</a>',
 '<a href="http://www.twitter.com" rel="nofollow">Twitter for BlackBerry</a>',
 '<a href="

In [191]:
# Now that we know all of the sources, let's map them to a shorter string
def get_phone(source):
    '''Return what medium the tweet was sent from'''
    if source is None:
        return ""
    # The source will now be one of the following text fields
    source_list = ['instagram', 'web', 'ipad', 'android', 'iphone', 'ads', \
                   'periscope', 'media', 'qanda', 'twitlonger', 'vine', 'blackberry' \
                   'facebook', 'tweetdeck', 'websites', 'm5']
    stripped = re.match('<a .*>(.*?)</a>', source).group(1).lower().split()
    return "".join([w for w in stripped if w in source_list])
# Apply this to the source column
data['source'] = data["source"].apply(lambda x: get_phone(x))

In [192]:
# Let's also get rid of the id_str field, as it is pretty useless
data.drop('id_str', axis=1, inplace=True)

In [193]:
data

Unnamed: 0,truncated,text,is_quote_status,id,favorite_count,source,retweeted,retweet_count,favorited,lang,created_at,clean_text
0,False,Trump International Tower in Chicago ranked 6t...,False,6312794445,6,web,False,33,False,en,Thu Dec 03 19:39:09 +0000 2009,trump international tower chicago ranked th ta...
1,False,Wishing you and yours a very Happy and Bountif...,False,6090839867,11,web,False,13,False,en,Thu Nov 26 19:55:38 +0000 2009,wishing happy bountiful thanksgiving
2,False,Donald Trump Partners with TV1 on New Reality ...,False,5775731054,3,web,False,5,False,en,Mon Nov 16 21:06:10 +0000 2009,donald trump partners tv new reality series en...
3,False,"Hear Donald Trump discuss big gov spending, ba...",False,5069623974,2,web,False,2,False,en,Thu Oct 22 13:57:04 +0000 2009,hear donald trump discuss big gov spending ban...
4,False,Watch video of Ivanka Trump sharing business a...,False,4862580190,10,web,False,4,False,en,Wed Oct 14 14:13:17 +0000 2009,watch video ivanka trump sharing business advi...
5,False,- Read what Donald Trump has to say about daug...,False,4629116949,4,web,False,1,False,en,Mon Oct 05 14:37:38 +0000 2009,read donald trump say daughter ivanka upcoming...
6,False,"""A lot of people have imagination, but can't e...",False,4472353826,30,web,False,23,False,en,Tue Sep 29 15:28:23 +0000 2009,lot people imagination execute execute imagina...
7,False,Read Donald Trump's Top Ten Tips for Success: ...,False,3982428551,30,web,False,22,False,en,Mon Sep 14 15:50:14 +0000 2009,read donald trump top ten tips success
8,False,- More hysterical DSRL videos featuring Donald...,False,3688564134,6,web,False,7,False,en,Tue Sep 01 13:55:34 +0000 2009,hysterical dsrl videos featuring donald trump ...
9,False,- Donald Trump bids to buy the Oreo Double Stu...,False,3627328938,3,web,False,5,False,en,Sat Aug 29 16:23:56 +0000 2009,donald trump bids buy oreo double stuf racing ...


In [194]:
# Looks good. Now let's save the cleaned data to both csv and json format in a separate
# clean_data directory. Note that we need to specify the records format for the json
# or else it will output in a strange way
data.to_json('./clean_data/clean_tweets.json', orient="records")
data.to_csv('./clean_data/clean_tweets.csv', index=False)

In [195]:
len(data)

26745