In [1]:
import sqlite3 as db
conn = db.connect('pol_tweets.db')

In [2]:
c = conn.cursor();
c.execute('SELECT t_id, handle, body FROM politics_tweets')
tweets = c.fetchall()

In [3]:
import re
import string
from nltk.tokenize import TweetTokenizer
import numpy as np
#from nltk.tokenize import sent_tokenize

tco_regex = re.compile(r"[:]?\s*http[s]?://[a-zA-Z0-9?/:.]*\b", re.IGNORECASE)
#tco_regex = re.compile(r"\s*http[s]?://.*\b", re.IGNORECASE)

emoji_regex = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       u"\U0001f926-\U0001f937"
                       u"\U00002192"
                       "]+",flags=re.UNICODE)
pm_regex = re.compile('p\.m', flags=re.IGNORECASE)
am_regex = re.compile('a\.m', flags=re.IGNORECASE)
apostrophe_regex = re.compile(str(chr(8217)))
hrs_abbv_regex = re.compile('hrs\.', flags=re.IGNORECASE)
end_quote_regex = re.compile(u"\U0000201D", flags=re.UNICODE)
hillary_abbv_regex = re.compile(r"(-|—)h(\s|(illary(clinton)?)+)", flags=re.IGNORECASE)
maga_regex = re.compile(r"[#]?make\s*america\s*great\s*again|#maga", flags=re.IGNORECASE)
double_hyphens_regex = re.compile(r"(--|——)")

#apply these BEFORE lowercasing in tokenization
usa1_regex = re.compile(r"(^|\s+)U[.]?[\s]?S[.]?[\s]?[A]?")
usa2_regex = re.compile(r"united\s+states(\s*of\s+america)?", flags=re.IGNORECASE)
la_regex = re.compile("L\.A[.]?")
gb_regex = re.compile(r"((George|george)[\sa-zA-Z.]*)?Bush('s|\s+II)?")

number_regex = re.compile(r"([0-9]+(,|.)?)+")

replacements = [
            [tco_regex, ' token_hyperlink '],
            [emoji_regex, ''],
            [pm_regex, ' pm '],
            [am_regex, ' am '],
            [apostrophe_regex, str(chr(39))],
            [hrs_abbv_regex, ' hours '],
            [end_quote_regex, str(chr(34))],
            [maga_regex, ' token_maga '],
            [usa1_regex, " token_unitedstates "],
            [usa2_regex, " token_unitedstates "],
            [hillary_abbv_regex, " token_quotehillary "],
            [gb_regex, " token_georgebush "],
            [la_regex, " token_losangeles "],
            [double_hyphens_regex, " -- "]
]

removed_punc = [symb for symb in string.punctuation]+['...']+['—']
removed_punc.remove('?')
removed_punc.remove('!')
removed_punc.remove('&')


usernames_regex = re.compile(r"@[a-zA-Z_.0-9]+")
hashtags_regex = re.compile(r"#[a-zA-Z_.0-9]+")
list_users = [];
list_hashtags = [];
for r in tweets:
    list_users.extend(usernames_regex.findall(r[2]))
    list_hashtags.extend(hashtags_regex.findall(r[2]))
    
[users, ucounts] = np.unique(list_users, return_counts=True)
[hashtags, hcounts] = np.unique(list_hashtags, return_counts=True)
users = np.column_stack((users, ucounts)).tolist()
hashtags = np.column_stack((hashtags, hcounts)).tolist()
users = sorted(users, key= lambda x: int(x[1]), reverse=True)
hashtags = sorted(hashtags, key= lambda x: int(x[1]), reverse=True)

min_user_references = 10
min_hashtag_references = 10

frequent_users = [u[0].lower() for u in users if int(u[1]) > min_user_references]
frequent_hashtags = [h[0].lower() for h in hashtags if int(h[1]) > min_hashtag_references]

twtok = TweetTokenizer(preserve_case=False)
split_tweets = []

for i, tweet in enumerate(tweets):
    line = list(tweet)
    line.append(tweet[2])
    for ctrlr in replacements:
        line[3] = ctrlr[0].sub(ctrlr[1], line[3])
        
    line.append(twtok.tokenize(line[3])) #add fourth column of tokenized and lowercased tweet
    line[4] = [tok for tok in line[4] if tok not in removed_punc]
    for j, tok in enumerate(line[4]):
        if number_regex.match(tok) != None:
            line[4][j] = 'token_number'
        if usernames_regex.match(tok) and tok not in frequent_users:
            line[4][j] = 'token_rare_user'
        if hashtags_regex.match(tok) and tok not in frequent_hashtags:
            line[4][j] = 'token_rare_hashtag'
    split_tweets.append(line)

In [4]:
#'261192751935279104'
#[r for r in split_tweets if r[0] == '830047626414477312']
split_tweets[10]

['780859319319400448',
 'realDonaldTrump',
 'Great afternoon in Little Havana with Hispanic community leaders. Thank you for your support! #ImWithYou https://t.co/vxWZ2tyJTF',
 'Great afternoon in Little Havana with Hispanic community leaders. Thank you for your support! #ImWithYou token_hyperlink ',
 ['great',
  'afternoon',
  'in',
  'little',
  'havana',
  'with',
  'hispanic',
  'community',
  'leaders',
  'thank',
  'you',
  'for',
  'your',
  'support',
  '!',
  '#imwithyou',
  'token_hyperlink']]

In [15]:
split_tweets[1125]

['765247533727834112',
 'HillaryClinton',
 '“She’s strong. She’s respected. She’s admired. There’s nothing that she doesn’t understand about America’s place in the world." —@JoeBiden',
 '“She\'s strong. She\'s respected. She\'s admired. There\'s nothing that she doesn\'t understand about America\'s place in the world." —@JoeBiden',
 ['“',
  "she's",
  'strong',
  "she's",
  'respected',
  "she's",
  'admired',
  "there's",
  'nothing',
  'that',
  'she',
  "doesn't",
  'understand',
  'about',
  "america's",
  'place',
  'in',
  'the',
  'world',
  '@joebiden']]