In [7]:
from __future__ import division
import numpy as np
import json
import pandas as pd
import nltk

In [8]:
# English Snowball stemmer, used below to stem the stopword list.
sno = nltk.stem.SnowballStemmer('english')

# Load the project configuration; `with` makes sure the handle is closed.
with open('../config.json', 'r') as f:
    config = json.load(f)

# Directory prefixes read from the config. They are concatenated directly with
# file names below, so each is expected to end with a path separator.
INPUT_DIR = config['INPUT_DIR']
OUTPUT_DIR = config['OUTPUT_DIR']
TWEET_DIR = config['TWEET_DIR']
BTM_DIR = config['BTM']

# One event name per line.
with open(INPUT_DIR + 'event_names.txt', 'r') as f:
    events = f.read().splitlines()

In [9]:
# Stemmed stopword set (one stopword per line in the input file). The file is
# opened in a `with` block so the handle is closed once it has been read.
with open(INPUT_DIR + 'stopwords.txt', 'r') as f:
    stopwords = {sno.stem(w) for w in f.read().splitlines()}

In [13]:
def remove_stopwords(tweet):
    """Return `tweet` with every token found in `stopwords` removed.

    The tweet is split on whitespace, tokens are compared as-is against the
    module-level `stopwords` set, and the surviving tokens are re-joined with
    single spaces.
    """
    kept_tokens = filter(lambda token: token not in stopwords, tweet.split())
    return ' '.join(kept_tokens)

## Get tweets for inducing topics

In [14]:
def get_samples():
    """Collect, across all events, the tweets sampled for inducing topics.

    For every name in the module-level `events` list this reads
    `<event>_cleaned_text.txt` under `TWEET_DIR`, then composes two index
    arrays stored next to it:

    * `<event>_partisan_indices_among_cleaned_indices.npy` -- positions in the
      cleaned text of the tweets with embeddings;
    * `<event>_indices_among_embeddings_for_getting_topics.npy` -- positions,
      within that first selection, of the sample used for determining topics.

    Returns:
        pandas.DataFrame with columns `tweet` (stopwords removed), `event`
        and `index_in_clean_text` (line index of the tweet in the cleaned
        text file). Rows whose tweet contains no space after stopword removal
        are dropped.
    """
    all_tweets = []
    all_events = []
    all_indices = []
    for event in events:
        prefix = TWEET_DIR + event + '/' + event

        # Read the cleaned tweets; the `with` block closes the handle promptly.
        with open(prefix + '_cleaned_text.txt', 'r') as f:
            cleaned = f.read().splitlines()
        print(event, len(cleaned))

        # get tweets with embeddings
        idx1 = np.load(prefix + '_partisan_indices_among_cleaned_indices.npy')

        # get sample for determining topics
        idx2 = np.load(prefix + '_indices_among_embeddings_for_getting_topics.npy')

        # get indices among cleaned ones: position j of the first selection is
        # line idx1[j] of the cleaned text. Composing the two index arrays
        # directly keeps the indices aligned with the tweets whatever the order
        # of idx1, and avoids building a set over every line of the file.
        sampled_indices = [idx1[j] for j in idx2]
        sampled_tweets = [cleaned[i] for i in sampled_indices]

        all_indices.extend(sampled_indices)
        all_events.extend([event] * len(sampled_tweets))
        all_tweets.extend(sampled_tweets)

    df = pd.DataFrame({'tweet': all_tweets, 'event': all_events, 'index_in_clean_text': all_indices})
    df['tweet'] = df['tweet'].apply(remove_stopwords)
    # Keep only tweets that still contain a space (at least two tokens).
    df = df[df['tweet'].str.contains(' ')]
    return df

In [15]:
# Build the topic-induction sample, then write the two training inputs for BTM:
# a CSV mapping each kept tweet to its event and cleaned-text index, and the
# tweets themselves, one per line, with no trailing newline.
df = get_samples()
df.to_csv(BTM_DIR + 'indices_train.csv', columns=['event', 'index_in_clean_text'], index=False)
with open(BTM_DIR + 'tweets_train.txt', 'w') as f:
    f.write('\n'.join(df['tweet'].tolist()))

chattanooga 29550
roseburg 18042
colorado_springs 55792
san_bernardino 70452
kalamazoo 10950
orlando 1825759
dallas 259736
baton_rouge 46036
burlington 8148
fort_lauderdale 12514
fresno 8850
san_francisco 10484
vegas 1284855
thornton 14296
sutherland_springs 153889
parkland 272213
nashville 38638
santa_fe 73563
annapolis 27673
pittsburgh 59869
thousand_oaks 117580


## get all tweets

In [19]:
def combine_all_tweets():
    """Collect, across all events, every tweet for which we have embeddings.

    For every name in the module-level `events` list this reads
    `<event>_cleaned_text.txt` under `TWEET_DIR`, keeps the lines listed in
    `<event>_partisan_indices_among_cleaned_indices.npy`, and pairs them with
    the indices stored in `<event>_cleaned_and_partisan_indices.npy`.

    Returns:
        pandas.DataFrame with columns `tweet` (stopwords removed),
        `indices_in_original` and `event`. Rows whose tweet contains no space
        after stopword removal are dropped.

    Raises:
        ValueError: if, for some event, the number of original indices differs
            from the number of selected tweets. Checking per event stops
            mismatches in different events from cancelling out and silently
            shifting the tweet/index pairing.
    """
    all_tweets = []
    all_indices = []
    all_events = []
    for event in events:
        print(event)
        prefix = TWEET_DIR + event + '/' + event

        # get tweets for which we have embeddings
        with open(prefix + '_cleaned_text.txt', 'r') as f:
            cleaned = f.read().splitlines()
        idx1 = np.load(prefix + '_partisan_indices_among_cleaned_indices.npy')
        tweets = [cleaned[i] for i in idx1]

        # get indices of these tweets in the **original data** (with no filtering)
        original_indices = np.load(prefix + '_cleaned_and_partisan_indices.npy')
        if len(original_indices) != len(tweets):
            raise ValueError(
                '{}: {} original indices for {} selected tweets'.format(
                    event, len(original_indices), len(tweets)))

        all_tweets.extend(tweets)
        all_indices.extend(original_indices)
        all_events.extend([event] * len(tweets))

    df = pd.DataFrame({'tweet': all_tweets, 'indices_in_original': all_indices, 'event': all_events})
    df['tweet'] = df['tweet'].apply(remove_stopwords)
    # Keep only tweets that still contain a space (at least two tokens).
    df = df[df['tweet'].str.contains(' ')]
    return df

In [20]:
# Gather every tweet with embeddings, then write the full-corpus inputs for
# BTM: a CSV mapping each kept tweet to its original index and event, and the
# tweets themselves, one per line, with no trailing newline.
df = combine_all_tweets()
df.to_csv(BTM_DIR + 'indices_all.csv', columns=['indices_in_original', 'event'], index=False)
with open(BTM_DIR + 'tweets_all.txt', 'w') as f:
    f.write('\n'.join(df['tweet'].tolist()))

chattanooga
roseburg
colorado_springs
san_bernardino
kalamazoo
orlando
dallas
baton_rouge
burlington
fort_lauderdale
fresno
san_francisco
vegas
thornton
sutherland_springs
parkland
nashville
santa_fe
annapolis
pittsburgh
thousand_oaks


## Next step: run the BTM script with `sh myBTMexample.sh`

Make sure to set `K` in the parameters and to user