In [2]:
import re
import pandas as pd
import string
import json
import nltk
from collections import Counter
import operator
import numpy as np

In [3]:
config = json.load(open('../config.json', 'r'))
DATA_DIR = config['DATA_DIR']
TWEET_DIR = config['TWEET_DIR']
events = open(DATA_DIR + 'event_names.txt', 'r').read().splitlines()

## Identify retweets and tweets to be removed

Note: 
- we remove tweets with no user ids and tweets that were posted before the events
- the reason why the percentage of removed tweets varies is because some event files don't have many retweets (this is the result of the firehose storing mostly original tweets)

In [37]:
event_times = json.load(open(DATA_DIR + "event_times.json","r"))

In [40]:
for e in events:
    data = pd.read_csv(TWEET_DIR + e + '/' + e + '.csv', sep='\t', encoding='utf-8', lineterminator='\n')
    data['remove'] = ((data['timestamp'].astype(int) < event_times[e]) | data['user_id'].isnull())
    
    # we also remove duplicate tweets because oftentimes people / institutions tweet the same using multiple accounts
    if 'is_retweet' not in data.columns:
        data['isRT'] = (data['text'].duplicated() | data['text'].str.startswith('RT') | data['text'].str.contains('\bvia\b', regex=True))
    else:
        data['isRT'] = (data['text'].duplicated() | (data['is_retweet'] == True) | data['text'].str.contains('\bvia\b', regex=True))
    print(e + ':', '%.2f%% removed' % ((data['remove'] | data['isRT']).sum() * 100 / len(data)))
    data.to_csv(TWEET_DIR +e+'/'+e+'.csv', sep='\t', encoding='utf-8', index=False, line_terminator='\n')

chattanooga: 30.36% removed
roseburg: 6.64% removed
colorado_springs: 3.67% removed
san_bernardino: 28.31% removed
kalamazoo: 5.46% removed
orlando: 26.99% removed


  interactivity=interactivity, compiler=compiler, result=result)


dallas: 2.88% removed
baton_rouge: 4.84% removed
burlington: 46.65% removed
fort_lauderdale: 86.47% removed
fresno: 55.72% removed
san_francisco: 41.59% removed
vegas: 40.61% removed
thornton: 40.17% removed
sutherland_springs: 47.97% removed


  interactivity=interactivity, compiler=compiler, result=result)


parkland: 49.53% removed
nashville: 55.37% removed
santa_fe: 48.19% removed
annapolis: 59.86% removed
pittsburgh: 14.91% removed
thousand_oaks: 6.70% removed


## Build joint vocabulary

In [4]:
punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}) - set(['#']))
punct_chars.sort()
punctuation = ''.join(punct_chars)
replace = re.compile('[%s]' % re.escape(punctuation))

In [5]:
sno = nltk.stem.SnowballStemmer('english')

In [6]:
def clean_text(text):
    # lower case
    text = text.lower()
    # eliminate urls
    text = re.sub(r'http\S*|\S*\.com\S*|\S*www\S*', ' ', text)
    # eliminate @mentions
    text = re.sub(r'\s@\S+', ' ', text)
    # substitute all other punctuation with whitespace
    text = replace.sub(' ', text)
    # replace all whitespace with a single space
    text = re.sub(r'\s+', ' ', text)
    # strip off spaces on either end
    text = text.strip()
    # stem words
    return [sno.stem(w) for w in text.split()]

In [9]:
vocabs = []
for e in events:
    data = pd.read_csv(TWEET_DIR +e+'/'+e+'.csv', sep='\t', encoding='utf-8', lineterminator='\n', usecols=['text', 'remove', 'isRT'])
    data = data[~data['remove'] & ~data['isRT']] # ignore retweets
    # sample an equal number of tweets from each event 
    # this has to be done to eliminate words that are too specific to a particular event
    data = data.sample(8000) 
    print(e)
    word_counts = Counter(clean_text(' '.join(data['text'])))
    vocab = []
    for k, v in word_counts.items():
        if v > 10:  # keep words that occur more than 10 times
            vocab.append(k)
    vocabs.append(set(vocab))

chattanooga
roseburg
colorado_springs
san_bernardino
kalamazoo
orlando
dallas
baton_rouge
burlington
fort_lauderdale
fresno
san_francisco
vegas
thornton
sutherland_springs
parkland
nashville
santa_fe
annapolis
pittsburgh
thousand_oaks


In [10]:
word_event_count = {}
for vocab in vocabs:
    for w in vocab:
        if w in word_event_count:
            word_event_count[w] += 1
        else:
            word_event_count[w] = 1

In [11]:
# Keep all words that occur in at least three events' tweets. Note that we keep stopwords.
keep = [k for k, v in sorted(word_event_count.items(), key=operator.itemgetter(1), reverse=True) if (v > 2 and not k.isdigit())]
print(len(keep))

1788


In [12]:
with open(DATA_DIR + 'joint_vocab.txt', 'w') as f:
    f.write('\n'.join(keep))

## Clean up tweets

In [13]:
vocab = open(DATA_DIR + 'joint_vocab.txt', 'r').read().splitlines()
vocab_set = set(vocab)

In [14]:
def clean_tweet(text):
    cleaned = clean_text(text)
    return ' '.join([w for w in cleaned if w in vocab_set])

In [15]:
for e in events:
    print(e)
    data = pd.read_csv(TWEET_DIR + e + '/' + e + '.csv', sep='\t', lineterminator='\n', usecols=['text', 'remove', 'isRT', 'dem_follows', 'rep_follows'])
    data = data[~data['remove'] & ~data['isRT']]  # ignore retweets
    with open(TWEET_DIR + e + '/' + e + '_cleaned_text.txt', 'w') as f:
        # clean tweets
        data['text'] = data['text'].astype(str).apply(clean_tweet)
        data = data[data['text'].str.contains(' ')]
        f.write('\n'.join(data['text']))
        
        # store the indices of cleaned tweets
        np.save(TWEET_DIR + e + '/' + e + '_cleaned_indices.npy', np.array(data.index))
        
        # store indices of cleaned AND partisan tweets
        partisan = data[~data['dem_follows'].isnull() & ~data['rep_follows'].isnull() & (data['dem_follows'] != data['rep_follows'])]
        np.save(TWEET_DIR + e + '/' + e + '_cleaned_and_partisan_indices.npy', np.array(partisan.index))
        
        # store indices of partisan tweets AMONG cleaned tweets
        data.reset_index(drop=True, inplace=True)
        data = data[~data['dem_follows'].isnull() & ~data['rep_follows'].isnull() & (data['dem_follows'] != data['rep_follows'])]
        np.save(TWEET_DIR + e + '/' + e + '_partisan_indices_among_cleaned_indices.npy', np.array(data.index))
        assert(len(partisan.index) == len(data.index))
        
        

chattanooga
roseburg
colorado_springs
san_bernardino
kalamazoo
orlando
dallas
baton_rouge
burlington
fort_lauderdale
fresno
san_francisco
vegas
thornton
sutherland_springs
parkland
nashville
santa_fe
annapolis
pittsburgh
thousand_oaks


# Build separate vocabulary

In [20]:
stop = set(open(DATA_DIR + 'stopwords.txt', 'r').read().splitlines())
event_stopwords = json.load(open(DATA_DIR + "event_stopwords.json","r"))

In [24]:
def clean_no_stop(text, event):
    stopwords = stop | set(event_stopwords[event])
    # lower case
    text = text.lower()
    # eliminate urls
    text = re.sub(r'http\S*|\S*\.com\S*|\S*www\S*', ' ', text)
    # eliminate @mentions
    text = re.sub(r'\s@\S+', ' ', text)
    # substitute all other punctuation with whitespace
    text = replace.sub(' ', text)
    # replace all whitespace with a single space
    text = re.sub(r'\s+', ' ', text)
    # strip off spaces on either end
    text = text.strip()
    # stem words
    return [sno.stem(w) for w in text.split() if w not in stopwords]

In [27]:
def build_vocab(corpus):
    freq = {}
    for words in corpus:
        prev = ''
        count = 0
        for i, w in enumerate(words):
            if w in freq:
                freq[w] += 1
            else:
                freq[w] = 1
            if count > 0:
                bigram = prev + ' ' + w
                if bigram in freq:
                    freq[bigram] += 1
                else:
                    freq[bigram] = 1
            count += 1
            prev = w
    cutoff = 50  # keep unigrams / bigrams that occur at least a 50 times
    vocab = [k for k,v in sorted(freq.items(), key=operator.itemgetter(1), reverse=True) if v > cutoff]
    return vocab

In [28]:
for i, e in enumerate(events):
    print(e)
    data = pd.read_csv(TWEET_DIR +e+'/'+e+'.csv', sep='\t', encoding='utf-8', lineterminator='\n', usecols=['text', 'remove', 'isRT'])
    data = data[~data['remove'] & ~data['isRT']]
    cleaned = data['text'].apply(clean_no_stop, args=(e,))
    vocab = build_vocab(cleaned)
    print(len(vocab))
    with open(TWEET_DIR +e+'/' + e+'_vocab.txt', 'w') as f:
        f.write('\n'.join(vocab))

chattanooga
1021
roseburg
592
colorado_springs
1620
san_bernardino
1971
kalamazoo
350
orlando
22682
dallas
5872
baton_rouge
1333
burlington
253
fort_lauderdale
367
fresno
297
san_francisco
315
vegas
26604
thornton
430
sutherland_springs
4531
parkland
9893
nashville
1733
santa_fe
3016
annapolis
1248
pittsburgh
2023
thousand_oaks
3120
