In [8]:
import sqlite3 as db
conn = db.connect('pol_tweets.db')

In [9]:
c = conn.cursor();
c.execute('SELECT t_id, handle, body FROM politics_tweets')
tweets = c.fetchall()

In [13]:
#Imports and global helper functions

import re
import string
from nltk.tokenize import TweetTokenizer
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from nltk.corpus import wordnet

# Auxiliary downloads for NLTK to do lemmatization and stopword removal
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

def treebank_to_wordnet(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return False

def lemmatize_tagged_token(tt):
    wn_tag = treebank_to_wordnet(tt[1])
    if not wn_tag:
        return(WordNetLemmatizer().lemmatize(tt[0]))
    else:
        return(WordNetLemmatizer().lemmatize(tt[0], wn_tag))

#English stopwords from wordnet
eng_stop_list = set(nltk.corpus.stopwords.words('english'))

# Preprocessing



In [3]:
#regular expressions
tco_regex = re.compile(r"[:]?\s*http[s]?://[a-zA-Z0-9?/:.]*\b", re.IGNORECASE)
#tco_regex = re.compile(r"\s*http[s]?://.*\b", re.IGNORECASE)

emoji_regex = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       u"\U0001f926-\U0001f937"
                       u"\U00002192"
                       "]+",flags=re.UNICODE)
pm_regex = re.compile('p\.m', flags=re.IGNORECASE)
am_regex = re.compile('a\.m', flags=re.IGNORECASE)
apostrophe_regex = re.compile('(' + str(chr(8217)) + '|' + str(chr(8220)) + ')')
hrs_abbv_regex = re.compile('hrs\.', flags=re.IGNORECASE)
end_quote_regex = re.compile(u"\U0000201D", flags=re.UNICODE)
hillary_abbv_regex = re.compile(r"(-|—)h(\s|(illary(clinton)?)+)", flags=re.IGNORECASE)
hillary_apostrophe_regex = re.compile(r"hillary('s)?", flags=re.IGNORECASE)
maga_regex = re.compile(r"[#]?make\s*america\s*great\s*again|#maga", flags=re.IGNORECASE)
double_hyphens_regex = re.compile(r"(--|——)")

#apply these BEFORE lowercasing in tokenization
usa1_regex = re.compile(r"(^|\s+)U[.]?[\s]?S[.]?[\s]?[A]?")
usa2_regex = re.compile(r"united\s+states(\s*of\s+america)?", flags=re.IGNORECASE)
la_regex = re.compile("L\.A[.]?")
gb_regex = re.compile(r"((George|george)[\sa-zA-Z.]*)?Bush('s|\s+II)?")

number_regex = re.compile(r"([0-9]+(,|.)?)+")
usernames_regex = re.compile(r"@[a-zA-Z_.0-9]+")
hashtags_regex = re.compile(r"#[a-zA-Z_.0-9]+")

In [5]:
#Get list of top referenced usernames and hashtags
min_user_references = 10
min_hashtag_references = 10

list_users = [];
list_hashtags = [];
for r in tweets:
    list_users.extend(usernames_regex.findall(r[2]))
    list_hashtags.extend(hashtags_regex.findall(r[2]))
    
[users, ucounts] = np.unique(list_users, return_counts=True)
[hashtags, hcounts] = np.unique(list_hashtags, return_counts=True)
users = np.column_stack((users, ucounts)).tolist()
hashtags = np.column_stack((hashtags, hcounts)).tolist()
users = sorted(users, key= lambda x: int(x[1]), reverse=True)
hashtags = sorted(hashtags, key= lambda x: int(x[1]), reverse=True)

frequent_users = [u[0].lower() for u in users if int(u[1]) > min_user_references]
frequent_hashtags = [h[0].lower() for h in hashtags if int(h[1]) > min_hashtag_references]

In [6]:
replacements = [
            [tco_regex, ' token_hyperlink '],
            [emoji_regex, ''],
            [pm_regex, ' pm '],
            [am_regex, ' am '],
            [apostrophe_regex, str(chr(39))],
            [hrs_abbv_regex, ' hours '],
            [end_quote_regex, str(chr(34))],
            [maga_regex, ' token_maga '],
            [usa1_regex, " token_unitedstates "],
            [usa2_regex, " token_unitedstates "],
            [gb_regex, " token_georgebush "],
            [la_regex, " token_losangeles "],
            [hillary_abbv_regex, " token_quotehillary "],
            [double_hyphens_regex, " -- "],
            [hillary_apostrophe_regex, "hillary"]
]
removed_punc = [symb for symb in string.punctuation]+['...']+['..']+['—']+['…']
removed_punc.remove('?')
removed_punc.remove('!')
removed_punc.remove('&')

twtok = TweetTokenizer(preserve_case=False)
split_tweets = []

for i, tweet in enumerate(tweets):
    line = list(tweet)
    line.append(tweet[2])
    for ctrlr in replacements:
        line[3] = ctrlr[0].sub(ctrlr[1], line[3])
        
    raw_token_list = twtok.tokenize(line[3])
    lda_tokens = [lemmatize_tagged_token(toktag) for toktag in nltk.pos_tag(raw_token_list)]
    lda_tokens = [tok for tok in lda_tokens if tok not in removed_punc]
    wordvec_tokens = [tok for tok in raw_token_list if tok not in removed_punc]
    lda_removed_count = 0
    for j, tok in enumerate(wordvec_tokens):
        if number_regex.match(tok) != None:
            wordvec_tokens[j] = 'token_number'
        elif usernames_regex.match(tok) and tok not in frequent_users:
            wordvec_tokens[j] = 'token_rare_user'
        elif hashtags_regex.match(tok) and tok not in frequent_hashtags:
            wordvec_tokens[j] = 'token_rare_hashtag'
        if number_regex.match(tok) or usernames_regex.match(tok) or hashtags_regex.match(tok)or tok in ['token_quotehillary','token_hyperlink','!','?','&']:
            del lda_tokens[j - lda_removed_count]
            lda_removed_count += 1
    
    lda_tokens = [tok for tok in lda_tokens if tok not in eng_stop_list and tok not in ['!','?','&'] and len(tok) > 1]    
    line.append(wordvec_tokens)
    line.append(lda_tokens)

    split_tweets.append(line)

In [167]:
#'261192751935279104'
[r for r in split_tweets if r[0] == '626140315116830721']
#split_tweets[1195]

[['626140315116830721',
  'realDonaldTrump',
  'A nation WITHOUT BORDERS is not a nation at all. We must have a wall. The rule of law matters. Jeb just doesn’t get it.',
  "A nation WITHOUT BORDERS is not a nation at all. We must have a wall. The rule of law matters. Jeb just doesn't get it.",
  ['a',
   'nation',
   'without',
   'borders',
   'is',
   'not',
   'a',
   'nation',
   'at',
   'all',
   'we',
   'must',
   'have',
   'a',
   'wall',
   'the',
   'rule',
   'of',
   'law',
   'matters',
   'jeb',
   'just',
   "doesn't",
   'get',
   'it'],
  ['nation',
   'without',
   'border',
   'nation',
   'must',
   'wall',
   'rule',
   'law',
   'matter',
   'jeb',
   'get']]]

# LDA benchmark

In [7]:
import gensim.corpora as corpora
import gensim.models as gm

lda_bow = [r[5] for r in split_tweets]
word2id_dict = corpora.Dictionary([tweet for tweet in lda_bow])
lda_bow_ids = [word2id_dict.doc2bow(tweet) for tweet in lda_bow]

def get_max_topic(topic_list):
    running_topic = topic_list[0][0]
    running_max = 0
    for topic in topic_list:
        if topic[1] > running_max:
            running_topic = topic[0]
            running_max = topic[1]
    return(running_topic)

def get_tweets_for_topic(k):
    return([lda_bow[i] for i in [i for i, bow in enumerate(lda_bow_ids) if get_max_topic(lda_obj[bow]) == k]])


In [8]:
num_topics = 80
LDA = gm.ldamulticore.LdaMulticore(corpus=lda_bow_ids,
                                   num_topics=num_topics,
                                   id2word=word2id_dict,
                                   workers=3)

In [9]:
topic_keywords = [[word2id_dict.get(word_prob[0]) for word_prob in LDA.get_topic_terms(n_topic,topn=15)] for n_topic in range(num_topics)]

In [10]:
topic_keywords

[['restaurant',
  'trump',
  'great',
  'president',
  'know',
  'really',
  'win',
  'geraldo',
  'bad',
  'donald',
  'one',
  'thank',
  'tv',
  'run',
  'get'],
 ['thanks',
  'think',
  'track',
  'great',
  'go',
  'trump',
  'obama',
  'big',
  'fast',
  'country',
  'give',
  'never',
  'new',
  'one',
  'make'],
 ['congrats',
  'great',
  'trump',
  'want',
  "that's",
  'last',
  'like',
  'true',
  'never',
  'know',
  'others',
  'really',
  'lawyer',
  'well',
  'good'],
 ['celebrity',
  "i'll",
  'fan',
  'say',
  'iraq',
  "i'm",
  'love',
  'yes',
  'view',
  'watch',
  'go',
  'opportunity',
  'get',
  'start',
  'trump'],
 ['cont',
  'trump',
  'great',
  'new',
  "china's",
  'imagine',
  'yet',
  'say',
  'girl',
  'hillary',
  'like',
  'short',
  'make',
  'competitor',
  'best'],
 ['course',
  'trump',
  'thanks',
  'star',
  'get',
  'look',
  'like',
  'president',
  'one',
  'thank',
  'run',
  'obama',
  'mr',
  'back',
  'watch'],
 ['trump',
  'weiner',
  're